[PATCH] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread Juzhe-Zhong


This patch leverages this commit: 
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
to optimize COND_LEN_ADD reduction pattern.

This optimization folds VEC_COND_EXPR + COND_LEN_ADD into a single COND_LEN_ADD.
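
As a scalar sketch of the identity the fold relies on (editorial, not part of
the patch; the function names are made up): adding (mask2 ? b : 0) under mask1
is the same as adding b under (mask1 && mask2); the COND_LEN_ADD form simply
carries the length/bias operands through unchanged.

#include <stdint.h>

uint64_t
cond_add_split (int mask1, int mask2, uint64_t d, uint64_t b)
{
  uint64_t a = mask2 ? b : 0;           /* the VEC_COND_EXPR */
  return mask1 ? d + a : d;             /* the conditional add */
}

uint64_t
cond_add_fused (int mask1, int mask2, uint64_t d, uint64_t b)
{
  return (mask1 && mask2) ? d + b : d;  /* one conditional add */
}

For floating point the same identity needs the usual x + 0.0 caveats, which
the review later in this thread refines.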

Consider the following case:

#include <stdint.h>

void
pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
{
  uint64_t result = 0;

  for (int i = 0; i < loop_size; i++)
{
  if (b[i] <= a[i])
{
  result += a[i];
}
}

  a[0] = result;
}

Before this patch:
vsetvli a7,zero,e64,m1,ta,ma
vmv.v.i v2,0
vmv1r.v v3,v2            --- redundant
.L3:
vsetvli a5,a2,e64,m1,ta,ma
vle64.v v1,0(a3)
vle64.v v0,0(a1)
slli    a6,a5,3
vsetvli a7,zero,e64,m1,ta,ma
sub a2,a2,a5
vmsleu.vv   v0,v0,v1
add a1,a1,a6
vmerge.vvm  v1,v3,v1,v0  redundant.
add a3,a3,a6
vsetvli zero,a5,e64,m1,tu,ma
vadd.vv v2,v2,v1
bne a2,zero,.L3
li  a5,0
vsetvli a4,zero,e64,m1,ta,ma
vmv.s.x v1,a5
vredsum.vs  v2,v2,v1
vmv.x.s a5,v2
sd  a5,0(a0)
ret

After this patch:

vsetvli a6,zero,e64,m1,ta,ma
vmv.v.i v1,0
.L3:
vsetvli a5,a2,e64,m1,ta,ma
vle64.v v2,0(a4)
vle64.v v0,0(a1)
slli    a3,a5,3
vsetvli a6,zero,e64,m1,ta,ma
sub a2,a2,a5
vmsleu.vv   v0,v0,v2
add a1,a1,a3
vsetvli zero,a5,e64,m1,tu,mu
add a4,a4,a3
vadd.vv v1,v1,v2,v0.t
bne a2,zero,.L3
li  a5,0
vsetivli    zero,1,e64,m1,ta,ma
vmv.s.x v2,a5
vsetvli a5,zero,e64,m1,ta,ma
vredsum.vs  v1,v1,v2
vmv.x.s a5,v1
sd  a5,0(a0)
ret

Bootstrap && Regression is running.

Ok for trunk when testing passes ?

PR tree-optimization/111594
PR tree-optimization/110660

gcc/ChangeLog:

* match.pd: Optimize COND_LEN_ADD reduction.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
* gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.

---
 gcc/match.pd  | 13 +
 .../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++
 .../riscv/rvv/autovec/cond/pr111594.c | 22 ++
 3 files changed, 64 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c

diff --git a/gcc/match.pd b/gcc/match.pd
index a17778fbaa6..af8d12c138e 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8866,6 +8866,19 @@ and,
   (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
(IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
 
+/* Detect simplication for a conditional length reduction where
+
+   a = mask ? b : 0
+   c = i < len + bias ? d + a : d
+
+   is turned into
+
+   c = mask && i < len ? d + b : d.  */
+(simplify
+  (IFN_COND_LEN_ADD integer_minus_onep @0 (vec_cond @1 @2 zerop) @0 @3 @4)
+   (if (!HONOR_NANS (type) && !HONOR_SIGNED_ZEROS (type))
+(IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
+
 /* For pointers @0 and @2 and nonnegative constant offset @1, look for
expressions like:
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
new file mode 100644
index 000..db6f9d1ec6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d 
-fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
+
+#include 
+
+#define COND_REDUCTION(TYPE)   
\
+  TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size)   
\
+  {
\
+TYPE result = 0;   
\
+for (int i = 0; i < loop_size; i++)
\
+  if (b[i] <= a[i])
\
+   result += a[i];\
+return result; 
\
+  }
+
+COND_REDUCTION (int8_t)
+COND_REDUCTION (int16_t)
+COND_REDUCTION (int32_t)
+COND_REDUCTION (int64_t)
+COND_REDUCTION (uint8_t)
+COND_REDUCTION (uint16_t)
+COND_REDUCTION (uint32_t)
+COND_REDUCTION (uint64_t)
+COND_REDUCTION (_Float16)
+COND_REDUCTION (float)
+COND_REDUCTION (double)
+
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "COND_LEN_ADD" 11 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c 
b/

[PATCH v2] RISC-V: Support FP nearbyint auto-vectorization

2023-09-26 Thread pan2 . li
From: Pan Li 

This patch would like to support auto-vectorization for the
nearbyint API in math.h. It depends on the -ffast-math option.

When nearbyint/nearbyintf is called as v2 = nearbyint (v1),
we convert it into the insns below (referencing the LLVM implementation).

* frflags a5
* vfcvt.x.f v3, v1, RDN
* vfcvt.f.x v2, v3
* fsflags a5

However, the conversions above can be skipped when the value is already
integral. Take single precision floating point as an example:

Assume we have RTZ rounding mode

  +------------+---------------+-----------------+
  | raw float  | binary layout | after nearbyint |
  +------------+---------------+-----------------+
  | 8388607.5  | 0x4affffff    | 8388607.0       |
  | 8388608.0  | 0x4b000000    | 8388608.0       |
  | 8388609.0  | 0x4b000001    | 8388609.0       |
  +------------+---------------+-----------------+

Every single precision floating point value >= 8388608.0 is already integral
(its spacing is at least 1.0), so it needs no conversion. We leverage vmflt to
build a mask of the elements that still need the conversion and only do the
cvt under that mask.
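
As an editorial illustration (not part of the patch; the function name is
made up), the masked-conversion idea in scalar form, including the
frflags/fsflags handling that keeps nearbyint from raising the inexact
exception:

#include <fenv.h>
#include <math.h>

static float
nearbyintf_sketch (float x)
{
  float r = x;
  if (fabsf (x) < 0x1p23f)                       /* the vmflt mask */
    {
      fexcept_t saved;
      fegetexceptflag (&saved, FE_ALL_EXCEPT);   /* frflags */
      r = (float) lrintf (x);                    /* cvt with current rounding mode */
      fesetexceptflag (&saved, FE_ALL_EXCEPT);   /* fsflags */
    }
  return copysignf (r, x);                       /* vfsgnj */
}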

Before this patch:
math-nearbyint-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addi    s0,s0,4
  addi    s1,s1,4
  call    nearbyint
  fsw fa0,-4(s1)
  bne s0,s2,.L3

After this patch:
  vfabs.v v2,v1
  vmflt.vf    v0,v2,fa5
  frflags a7
  vfcvt.x.f.v v4,v1,v0.t
  vfcvt.f.x.v v2,v4,v0.t
  fsflags a7
  vfsgnj.vv   v2,v2,v1

Please note VLS mode is also involved in this patch and covered by the
test cases.

gcc/ChangeLog:

* config/riscv/autovec.md (nearbyint2): New pattern.
* config/riscv/riscv-protos.h (enum insn_type): New enum.
(expand_vec_nearbyint): New function decl.
* config/riscv/riscv-v.cc (expand_vec_nearbyint): New func impl.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/unop/test-math.h: Add helper function.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-3.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-nearbyint-1.c: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec.md   | 11 
 gcc/config/riscv/riscv-protos.h   |  2 +
 gcc/config/riscv/riscv-v.cc   | 29 ++
 .../riscv/rvv/autovec/unop/math-nearbyint-0.c | 20 +++
 .../riscv/rvv/autovec/unop/math-nearbyint-1.c | 20 +++
 .../riscv/rvv/autovec/unop/math-nearbyint-2.c | 20 +++
 .../riscv/rvv/autovec/unop/math-nearbyint-3.c | 22 +++
 .../rvv/autovec/unop/math-nearbyint-run-1.c   | 48 +++
 .../rvv/autovec/unop/math-nearbyint-run-2.c   | 48 +++
 .../riscv/rvv/autovec/unop/test-math.h| 33 +++
 .../riscv/rvv/autovec/vls/math-nearbyint-1.c  | 58 +++
 11 files changed, 311 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-0.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-3.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-run-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-run-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-nearbyint-1.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index a005e17457e..b47f086f5e6 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2210,6 +2210,7 @@ (define_expand "avg3_ceil"
 ;; Includes:
 ;; - ceil/ceilf
 ;; - floor/floorf
+;; - nearbyint/nearbyintf
 ;; -
 (define_expand "ceil2"
   [(match_operand:V_VLSF 0 "register_operand")
@@ -2230,3 +2231,13 @@ (define_expand "floor2"
 DONE;
   }
 )
+
+(define_expand "nearbyint2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_nearbyint (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 63eb2475705..f87bdef0f71 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -296,6 +296,7 @@ enum insn_type : unsigned int
   UNARY_OP_TAMA = __MASK_OP_TAMA | UNARY_OP_P,
   UNARY_OP_TAMU = __MASK_OP_TAMU | UNARY_OP_P,
   UNARY_OP_FRM_DYN = UNARY_OP | FRM_DYN_P,
+  UNARY_OP_TAMU_FRM_DYN

Re: [PATCH v2] RISC-V: Support FP nearbyint auto-vectorization

2023-09-26 Thread juzhe.zh...@rivai.ai
LGTM.



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-26 15:19
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v2] RISC-V: Support FP nearbyint auto-vectorization
From: Pan Li 
 
This patch would like to support auto-vectorization for the
nearbyint API in math.h. It depends on the -ffast-math option.
 
When we would like to call nearbyint/nearbyintf like v2 = nearbyint (v1),
we will convert it into below insns (reference the implementation of llvm).
 
* frflags a5
* vfcvt.x.f v3, v1, RDN
* vfcvt.f.x v2, v3
* fsflags a5
 
However, the floating point value may not need the cvt as above if
its mantissa is zero. Take single precision floating point as example:
 
Assume we have RTZ rounding mode
 
  ++---+-+
  | raw float  | binary layout | after nearbyint |
  ++---+-+
  | 8388607.5  | 0x4aff| 8388607.0   |
  | 8388608.0  | 0x4b00| 8388608.0   |
  | 8388609.0  | 0x4b01| 8388609.0   |
  ++---+-+
 
All single floating point >= 8388608.0 will have all zero mantisaa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.
 
Befor this patch:
math-nearbyint-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addis0,s0,4
  addis1,s1,4
  callnearbyint
  fsw fa0,-4(s1)
  bne s0,s2,.L3
 
After this patch:
  vfabs.v v2,v1
  vmflt.vfv0,v2,fa5
  frflags a7
  vfcvt.x.f.v v4,v1,v0.t
  vfcvt.f.x.v v2,v4,v0.t
  fsflags a7
  vfsgnj.vv   v2,v2,v1
 
Please note VLS mode is also involved in this patch and covered by the
test cases.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (nearbyint2): New pattern.
* config/riscv/riscv-protos.h (enum insn_type): New enum.
(expand_vec_nearbyint): New function decl.
* config/riscv/riscv-v.cc (expand_vec_nearbyint): New func impl.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/unop/test-math.h: Add helper function.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-3.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-nearbyint-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-nearbyint-1.c: New test.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/autovec.md   | 11 
gcc/config/riscv/riscv-protos.h   |  2 +
gcc/config/riscv/riscv-v.cc   | 29 ++
.../riscv/rvv/autovec/unop/math-nearbyint-0.c | 20 +++
.../riscv/rvv/autovec/unop/math-nearbyint-1.c | 20 +++
.../riscv/rvv/autovec/unop/math-nearbyint-2.c | 20 +++
.../riscv/rvv/autovec/unop/math-nearbyint-3.c | 22 +++
.../rvv/autovec/unop/math-nearbyint-run-1.c   | 48 +++
.../rvv/autovec/unop/math-nearbyint-run-2.c   | 48 +++
.../riscv/rvv/autovec/unop/test-math.h| 33 +++
.../riscv/rvv/autovec/vls/math-nearbyint-1.c  | 58 +++
11 files changed, 311 insertions(+)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-0.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-1.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-2.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-3.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-run-1.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-nearbyint-run-2.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-nearbyint-1.c
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index a005e17457e..b47f086f5e6 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2210,6 +2210,7 @@ (define_expand "avg3_ceil"
;; Includes:
;; - ceil/ceilf
;; - floor/floorf
+;; - nearbyint/nearbyintf
;; -
(define_expand "ceil2"
   [(match_operand:V_VLSF 0 "register_operand")
@@ -2230,3 +2231,13 @@ (define_expand "floor2"
 DONE;
   }
)
+
+(define_expand "nearbyint2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_nearbyint (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 63eb2475705..f87bdef0f71 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -296,6 +296,7 @@ enum insn_type : unsigned int
   UNARY_OP_TAMA = __MASK_OP_TAMA | UNARY_OP_P,
   UNARY_OP_TAMU

[PATCH v1] RISC-V: Support FP rint auto-vectorization

2023-09-26 Thread pan2 . li
From: Pan Li 

This patch would like to support auto-vectorization for the
rint API in math.h. It depends on the -ffast-math option.

When rint/rintf is called as v2 = rint (v1),
we convert it into the insns below (referencing the LLVM implementation).

* vfcvt.x.f v3, v1
* vfcvt.f.x v2, v3

However, the conversions above can be skipped when the value is already
integral. Take single precision floating point as an example:

Assume we have RTZ rounding mode

  +------------+---------------+-------------+
  | raw float  | binary layout | after rint  |
  +------------+---------------+-------------+
  | -8388607.5 | 0xcaffffff    | -8388607.0  |
  | 8388607.5  | 0x4affffff    | 8388607.0   |
  | 8388608.0  | 0x4b000000    | 8388608.0   |
  | 8388609.0  | 0x4b000001    | 8388609.0   |
  +------------+---------------+-------------+

Every single precision floating point value >= 8388608.0 is already integral
(its spacing is at least 1.0), so it needs no conversion. We leverage vmflt to
build a mask of the elements that still need the conversion and only do
the cvt under that mask.
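
The scalar shape is the same as for nearbyint, only without the flag
save/restore, because rint is allowed to raise the inexact exception
(editorial sketch, not part of the patch; the function name is made up):

#include <math.h>

static float
rintf_sketch (float x)
{
  if (fabsf (x) < 0x1p23f)                       /* the vmflt mask */
    return copysignf ((float) lrintf (x), x);    /* cvt + vfsgnj */
  return x;                                      /* already integral */
}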

Before this patch:
math-rint-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addi    s0,s0,4
  addi    s1,s1,4
  call    rint
  fsw fa0,-4(s1)
  bne s0,s2,.L3

After this patch:
  vfabs.v v2,v1
  vmflt.vf    v0,v2,fa5
  vfcvt.x.f.v v4,v1,v0.t
  vfcvt.f.x.v v2,v4,v0.t
  vfsgnj.vv   v2,v2,v1

Please note VLS mode is also involved in this patch and covered by the
test cases.

gcc/ChangeLog:

* config/riscv/autovec.md (rint2): New pattern.
* config/riscv/riscv-protos.h (expand_vec_rint): New function decl.
* config/riscv/riscv-v.cc (expand_vec_rint): New function impl.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/unop/math-rint-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-3.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-rint-1.c: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec.md   | 10 
 gcc/config/riscv/riscv-protos.h   |  1 +
 gcc/config/riscv/riscv-v.cc   | 22 +++
 .../riscv/rvv/autovec/unop/math-rint-0.c  | 18 ++
 .../riscv/rvv/autovec/unop/math-rint-1.c  | 18 ++
 .../riscv/rvv/autovec/unop/math-rint-2.c  | 18 ++
 .../riscv/rvv/autovec/unop/math-rint-3.c  | 20 +++
 .../riscv/rvv/autovec/unop/math-rint-run-1.c  | 48 +++
 .../riscv/rvv/autovec/unop/math-rint-run-2.c  | 48 +++
 .../riscv/rvv/autovec/vls/math-rint-1.c   | 58 +++
 10 files changed, 261 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-0.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-3.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-run-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-run-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-rint-1.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index b47f086f5e6..1d2fca60e98 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2241,3 +2241,13 @@ (define_expand "nearbyint2"
 DONE;
   }
 )
+
+(define_expand "rint2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_rint (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index f87bdef0f71..629adeea94c 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -462,6 +462,7 @@ void expand_reduction (unsigned, unsigned, rtx *, rtx);
 void expand_vec_ceil (rtx, rtx, machine_mode, machine_mode);
 void expand_vec_floor (rtx, rtx, machine_mode, machine_mode);
 void expand_vec_nearbyint (rtx, rtx, machine_mode, machine_mode);
+void expand_vec_rint (rtx, rtx, machine_mode, machine_mode);
 #endif
 bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
  bool, void (*)(rtx *, rtx));
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 5d3d458fa6c..445ed000f88 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3698,4 +3698,26 @@ expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode 
vec_fp_mode,
   emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
 }
 
+void
+expand_vec_rint (r

Re: Improve -Wflex-array-member-not-at-end changes.html wording |Plus: and warning bug?

2023-09-26 Thread Tobias Burnus

Hi Richard,

On 26.09.23 08:49, Richard Biener wrote:

On Mon, 25 Sep 2023, Tobias Burnus wrote:

First, the following gives only a -pedantic warning and not a
-Wflex-array-member-not-at-end:

   struct t { int b; int x[]; };
   struct q { int b; struct t a[2]; int c; };

warning: invalid use of structure with flexible array member [-Wpedantic]

If I remove the "[2]", it shows additionally:
   warning: structure containing a flexible array member is not at the end of
   another structure [-Wflex-array-member-not-at-end]

It seems as if it should print the latter warning for the array-member case as well.

I think an array element with a flex array is invalid and this is what
is diagnosed here.  Maybe it should say ' as array element'


My issue is not that it is invalid – but that it is *not* diagnosed
by Qing's -Wflex-array-member-not-at-end

And I believe it should be diagnosed.

(The example is diagnosed when changing 'struct t c[2]' to 'struct t c'
(i.e. array→scalar); it is also diagnosed by "-pedantic", but that diagnoses 
too much.)
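
For reference, the two variants under discussion side by side (editorial
example; the tags q1/q2 are made up):

struct t { int b; int x[]; };               /* flexible array member */

/* Array element of a flex-array struct: currently only the -Wpedantic
   "invalid use of structure with flexible array member" warning fires.  */
struct q1 { int b; struct t a[2]; int c; };

/* Scalar member not at the end: -Wflex-array-member-not-at-end fires in
   addition to the -Wpedantic warning.  */
struct q2 { int b; struct t a; int c; };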

* * *

Additionally in previous email:

* RFC whether -Wflex-array-member-not-at-end should be enabled by, e.g. -Wall,
  if we want to deprecate this feature

* Proposed wording improvement for the associated entry in the release notes 
(gcc-14/changes.html)

Tobias



Re: [PATCH v1] RISC-V: Support FP rint auto-vectorization

2023-09-26 Thread juzhe.zh...@rivai.ai
LGTM.



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-26 15:24
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Support FP rint auto-vectorization
From: Pan Li 
 
This patch would like to support auto-vectorization for the
rint API in math.h. It depends on the -ffast-math option.
 
When we would like to call rint/rintf like v2 = rint (v1),
we will convert it into below insns (reference the implementation of llvm).
 
* vfcvt.x.f v3, v1
* vfcvt.f.x v2, v3
 
However, the floating point value may not need the cvt as above if
its mantissa is zero. Take single precision floating point as example:
 
Assume we have RTZ rounding mode
 
  ++---+-+
  | raw float  | binary layout | after int   |
  ++---+-+
  | -8388607.5 | 0xcaff| -8388607.0  |
  | 8388607.5  | 0x4aff| 8388607.0   |
  | 8388608.0  | 0x4b00| 8388608.0   |
  | 8388609.0  | 0x4b01| 8388609.0   |
  ++---+-+
 
All single floating point >= 8388608.0 will have all zero mantisaa.
We leverage vmflt and mask to filter them out in vector and only do
the cvt on mask.
 
Befor this patch:
math-rint-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addis0,s0,4
  addis1,s1,4
  callrint
  fsw fa0,-4(s1)
  bne s0,s2,.L3
 
After this patch:
  vfabs.v v2,v1
  vmflt.vfv0,v2,fa5
  vfcvt.x.f.v v4,v1,v0.t
  vfcvt.f.x.v v2,v4,v0.t
  vfsgnj.vv   v2,v2,v1
 
Please note VLS mode is also involved in this patch and covered by the
test cases.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (rint2): New pattern.
* config/riscv/riscv-protos.h (expand_vec_rint): New function decl.
* config/riscv/riscv-v.cc (expand_vec_rint): New function impl.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/unop/math-rint-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-3.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-rint-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-rint-1.c: New test.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/autovec.md   | 10 
gcc/config/riscv/riscv-protos.h   |  1 +
gcc/config/riscv/riscv-v.cc   | 22 +++
.../riscv/rvv/autovec/unop/math-rint-0.c  | 18 ++
.../riscv/rvv/autovec/unop/math-rint-1.c  | 18 ++
.../riscv/rvv/autovec/unop/math-rint-2.c  | 18 ++
.../riscv/rvv/autovec/unop/math-rint-3.c  | 20 +++
.../riscv/rvv/autovec/unop/math-rint-run-1.c  | 48 +++
.../riscv/rvv/autovec/unop/math-rint-run-2.c  | 48 +++
.../riscv/rvv/autovec/vls/math-rint-1.c   | 58 +++
10 files changed, 261 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-3.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-run-1.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-rint-run-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-rint-1.c
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index b47f086f5e6..1d2fca60e98 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2241,3 +2241,13 @@ (define_expand "nearbyint2"
 DONE;
   }
)
+
+(define_expand "rint2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_rint (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index f87bdef0f71..629adeea94c 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -462,6 +462,7 @@ void expand_reduction (unsigned, unsigned, rtx *, rtx);
void expand_vec_ceil (rtx, rtx, machine_mode, machine_mode);
void expand_vec_floor (rtx, rtx, machine_mode, machine_mode);
void expand_vec_nearbyint (rtx, rtx, machine_mode, machine_mode);
+void expand_vec_rint (rtx, rtx, machine_mode, machine_mode);
#endif
bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
  bool, void (*)(rtx *, rtx));
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 5d3d458fa6c..445ed000f88 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3698,4 +3698,26 @@ expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode 
vec_fp_mode,

Re: [patch] invoke.texi: Update -fopenmp and -fopenmp-simd for omp::decl and loop semantic

2023-09-26 Thread Richard Biener
On Mon, Sep 25, 2023 at 8:25 PM Tobias Burnus  wrote:
>
> I stumbled over this during the ARM64 talk at the cauldron as they
> consider using -fopenmp-simd by default.

Ah, forgot to ask during the talk - isn't __attribute__((simd(..))) exactly to
avoid the need for -fopenmp-simd?

> → https://gcc.gnu.org/wiki/cauldron2023 (I put my talk/BoF slides up;
> others aren't, yet)
>
> I did stumble over 'omp loop' with SIMD. It turns out that -fopenmp-simd
> just turns 'loop' into 'simd', ignoring whatever value the user has
> specified for the bind value.
>
> Additionally, [[omp::decl(...)]] was missing.
>
> Any comment to that patch before I commit it?
>
> Tobias
>
> PS: the [[omp::...]] needs a 'C++' → 'C/C++' change once omp:: support
> with C23's attributes is in.
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955


Re: [PATCH] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread Richard Biener
On Tue, Sep 26, 2023 at 9:13 AM Juzhe-Zhong  wrote:
>
>
> This patch leverage this commit: 
> https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
> to optimize COND_LEN_ADD reduction pattern.
>
> We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.
>
> Consider thsi following case:
>
> #include 
>
> void
> pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> {
>   uint64_t result = 0;
>
>   for (int i = 0; i < loop_size; i++)
> {
>   if (b[i] <= a[i])
> {
>   result += a[i];
> }
> }
>
>   a[0] = result;
> }
>
> Before this patch:
> vsetvli a7,zero,e64,m1,ta,ma
> vmv.v.i v2,0
> vmv1r.v v3,v2--- redundant
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v1,0(a3)
> vle64.v v0,0(a1)
> sllia6,a5,3
> vsetvli a7,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv   v0,v0,v1
> add a1,a1,a6
> vmerge.vvm  v1,v3,v1,v0  redundant.
> add a3,a3,a6
> vsetvli zero,a5,e64,m1,tu,ma
> vadd.vv v2,v2,v1
> bne a2,zero,.L3
> li  a5,0
> vsetvli a4,zero,e64,m1,ta,ma
> vmv.s.x v1,a5
> vredsum.vs  v2,v2,v1
> vmv.x.s a5,v2
> sd  a5,0(a0)
> ret
>
> After this patch:
>
> vsetvli a6,zero,e64,m1,ta,ma
> vmv.v.i v1,0
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v2,0(a4)
> vle64.v v0,0(a1)
> sllia3,a5,3
> vsetvli a6,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv   v0,v0,v2
> add a1,a1,a3
> vsetvli zero,a5,e64,m1,tu,mu
> add a4,a4,a3
> vadd.vv v1,v1,v2,v0.t
> bne a2,zero,.L3
> li  a5,0
> vsetivlizero,1,e64,m1,ta,ma
> vmv.s.x v2,a5
> vsetvli a5,zero,e64,m1,ta,ma
> vredsum.vs  v1,v1,v2
> vmv.x.s a5,v1
> sd  a5,0(a0)
> ret
>
> Bootstrap && Regression is running.
>
> Ok for trunk when testing passes ?
>
> PR tree-optimization/111594
> PR tree-optimization/110660
>
> gcc/ChangeLog:
>
> * match.pd: Optimize COND_LEN_ADD reduction.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
> * gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.
>
> ---
>  gcc/match.pd  | 13 +
>  .../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++
>  .../riscv/rvv/autovec/cond/pr111594.c | 22 ++
>  3 files changed, 64 insertions(+)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a17778fbaa6..af8d12c138e 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8866,6 +8866,19 @@ and,
>(IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
>
> +/* Detect simplication for a conditional length reduction where
> +
> +   a = mask ? b : 0
> +   c = i < len + bias ? d + a : d
> +
> +   is turned into
> +
> +   c = mask && i < len ? d + b : d.  */
> +(simplify
> +  (IFN_COND_LEN_ADD integer_minus_onep @0 (vec_cond @1 @2 zerop) @0 @3 @4)

I think you want integer_truep instead of integer_minus_onep for
readability.  Since you use zerop here, can you also adjust the
preceding pattern?

> +   (if (!HONOR_NANS (type) && !HONOR_SIGNED_ZEROS (type))

it might be better to check ANY_INTEGRAL_TYPE_P (type) ||
fold_real_zero_addition_p (type, NULL_TREE, @5, 0);
your change misses HONOR_SIGN_DEPENDENT_ROUNDING, I think.

> +(IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
> +



>  /* For pointers @0 and @2 and nonnegative constant offset @1, look for
> expressions like:
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> new file mode 100644
> index 000..db6f9d1ec6c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d 
> -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
> +
> +#include 
> +
> +#define COND_REDUCTION(TYPE) 
>   \
> +  TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size) 
>   \
> +  {  
>   \
> +TYPE result = 0; 
>   \
> +for (int i = 0; i < loop_size; i++)  
>   \
> +  if (b[i] <= a[i])  
>   \
> +   result += 

Re: [patch] invoke.texi: Update -fopenmp and -fopenmp-simd for omp::decl and loop semantic

2023-09-26 Thread Jakub Jelinek
On Tue, Sep 26, 2023 at 09:34:15AM +0200, Richard Biener wrote:
> On Mon, Sep 25, 2023 at 8:25 PM Tobias Burnus  wrote:
> >
> > I stumbled over this during the ARM64 talk at the cauldron as they
> > consider using -fopenmp-simd by default.
> 
> Ah, forgot to ask during the talk - isn't __attribute__((simd(..))) exactly to
> avoid the need for -fopenmp-simd?

Yes.  Though, -fopenmp-simd can do slightly more than that,
#pragma GCC ivdep equivalent on various loops is the other important part
(though for both simd attribute and ivdep pragma the OpenMP counterparts
are significantly more flexible, in the declare simd case one can specify
certain arguments should be linear (with various variants for
references)/uniform, simd construct can specify not just everything is
independent or not, but how many consecutive iterations must be independent,
can specify reductions etc., preferred vectorization factors, ...).
Though, I don't think it is a good idea to enable -fopenmp-simd by default.

Jakub
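
For illustration (an editorial sketch, not part of the message above; the
function names are made up), the constructs being compared look roughly like
this:

/* GCC's plain simd attribute: request SIMD clones, nothing more.  */
__attribute__ ((simd)) float square (float x) { return x * x; }

/* OpenMP declare simd can additionally describe arguments as uniform
   (same value in every lane) or linear (fixed stride across lanes).  */
#pragma omp declare simd uniform(scale) linear(i)
float scale_elem (float *a, int i, float scale);

void
use (float *restrict x, float *restrict y, int n)
{
  float sum = 0.0f;
  /* The simd construct can bound how many consecutive iterations must be
     independent (safelen) and express reductions, which the plain
     #pragma GCC ivdep cannot.  */
#pragma omp simd safelen(8) reduction(+:sum)
  for (int i = 0; i < n; i++)
    {
      y[i] = square (x[i]);
      sum += y[i];
    }
  y[0] = sum;
}

The OpenMP pragmas above are only honoured with -fopenmp or -fopenmp-simd.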



Re: [patch] invoke.texi: Update -fopenmp and -fopenmp-simd for omp::decl and loop semantic

2023-09-26 Thread Richard Biener
On Tue, Sep 26, 2023 at 9:50 AM Jakub Jelinek  wrote:
>
> On Tue, Sep 26, 2023 at 09:34:15AM +0200, Richard Biener wrote:
> > On Mon, Sep 25, 2023 at 8:25 PM Tobias Burnus  
> > wrote:
> > >
> > > I stumbled over this during the ARM64 talk at the cauldron as they
> > > consider using -fopenmp-simd by default.
> >
> > Ah, forgot to ask during the talk - isn't __attribute__((simd(..))) exactly 
> > to
> > avoid the need for -fopenmp-simd?
>
> Yes.  Though, -fopenmp-simd can do slightly more than that,
> #pragma GCC ivdep equivalent on various loops is the other important part
> (though for both simd attribute and ivdep pragma the OpenMP counterparts
> are significantly more flexible, in the declare simd case one can specify
> certain arguments should be linear (with various variants for
> references)/uniform, simd construct can specify not just everything is
> independent or not, but how many consecutive iterations must be independent,
> can specify reductions etc., preferred vectorization factors, ...).
> Though, I don't think it is a good idea to enable -fopenmp-simd by default.

Yeah, writing invalid OMP simd pragmas (syntactically or semantically) shouldn't
be causing problems when not using -fopenmp-simd but would if that's
the default.

Richard.

>
> Jakub
>


Re: [patch] invoke.texi: Update -fopenmp and -fopenmp-simd for omp::decl and loop semantic

2023-09-26 Thread Jakub Jelinek
On Mon, Sep 25, 2023 at 08:24:14PM +0200, Tobias Burnus wrote:
> I stumbled over this during the ARM64 talk at the cauldron as they
> consider using -fopenmp-simd by default.
> 
> → https://gcc.gnu.org/wiki/cauldron2023 (I put my talk/BoF slides up;
> others aren't, yet)
> 
> I did stumble over 'omp loop' with SIMD. It turns out that -fopenmp-simd
> just turns 'loop' into 'simd', ignoring whatever value the user has
> specified for the bind value.

I think that is desirable, because with -fopenmp-simd binding to teams,
parallel and thread are the same thing, there is exactly one team and
one thread in parallel.  Sure, one can have multiple threads with
-fopenmp-simd, but those are POSIX threads, while the binding of loop is
to the OpenMP constructs.
Even loop binds(team) is a way to express the iterations are vectorizable...

Jakub
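
For illustration (an editorial sketch, not part of the message above): with
-fopenmp-simd the loop construct below is handled like "#pragma omp simd"
regardless of the bind clause, as discussed above.

void
f (float *restrict a, float *restrict b, int n)
{
#pragma omp loop bind(thread)
  for (int i = 0; i < n; i++)
    a[i] += b[i];
}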



[PATCH V2] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread Juzhe-Zhong
This patch leverages this commit: 
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
to optimize COND_LEN_ADD reduction pattern.

This optimization folds VEC_COND_EXPR + COND_LEN_ADD into a single COND_LEN_ADD.

Consider the following case:

#include <stdint.h>

void
pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
{
  uint64_t result = 0;

  for (int i = 0; i < loop_size; i++)
{
  if (b[i] <= a[i])
{
  result += a[i];
}
}

  a[0] = result;
}

Before this patch:
vsetvli a7,zero,e64,m1,ta,ma
vmv.v.i v2,0
vmv1r.v v3,v2            --- redundant
.L3:
vsetvli a5,a2,e64,m1,ta,ma
vle64.v v1,0(a3)
vle64.v v0,0(a1)
slli    a6,a5,3
vsetvli a7,zero,e64,m1,ta,ma
sub a2,a2,a5
vmsleu.vv   v0,v0,v1
add a1,a1,a6
vmerge.vvm  v1,v3,v1,v0  redundant.
add a3,a3,a6
vsetvli zero,a5,e64,m1,tu,ma
vadd.vv v2,v2,v1
bne a2,zero,.L3
li  a5,0
vsetvli a4,zero,e64,m1,ta,ma
vmv.s.x v1,a5
vredsum.vs  v2,v2,v1
vmv.x.s a5,v2
sd  a5,0(a0)
ret

After this patch:

vsetvli a6,zero,e64,m1,ta,ma
vmv.v.i v1,0
.L3:
vsetvli a5,a2,e64,m1,ta,ma
vle64.v v2,0(a4)
vle64.v v0,0(a1)
slli    a3,a5,3
vsetvli a6,zero,e64,m1,ta,ma
sub a2,a2,a5
vmsleu.vv   v0,v0,v2
add a1,a1,a3
vsetvli zero,a5,e64,m1,tu,mu
add a4,a4,a3
vadd.vv v1,v1,v2,v0.t
bne a2,zero,.L3
li  a5,0
vsetivli    zero,1,e64,m1,ta,ma
vmv.s.x v2,a5
vsetvli a5,zero,e64,m1,ta,ma
vredsum.vs  v1,v1,v2
vmv.x.s a5,v1
sd  a5,0(a0)
ret

Bootstrap && Regression is running.

Ok for trunk when testing passes ?

PR tree-optimization/111594
PR tree-optimization/110660

gcc/ChangeLog:

* match.pd: Optimize COND_LEN_ADD reduction.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
* gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.

---
 gcc/match.pd  | 14 +
 .../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++
 .../riscv/rvv/autovec/cond/pr111594.c | 22 ++
 3 files changed, 65 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c

diff --git a/gcc/match.pd b/gcc/match.pd
index a17778fbaa6..5061c19e086 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8866,6 +8866,20 @@ and,
   (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
(IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
 
+/* Detect simplication for a conditional length reduction where
+
+   a = mask ? b : 0
+   c = i < len + bias ? d + a : d
+
+   is turned into
+
+   c = mask && i < len + bias ? d + b : d.  */
+(simplify
+  (IFN_COND_LEN_ADD integer_truep @0 (vec_cond @1 @2 zerop@5) @0 @3 @4)
+   (if (ANY_INTEGRAL_TYPE_P (type)
+   || fold_real_zero_addition_p (type, NULL_TREE, @5, 0))
+(IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
+
 /* For pointers @0 and @2 and nonnegative constant offset @1, look for
expressions like:
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
new file mode 100644
index 000..db6f9d1ec6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d 
-fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
+
+#include 
+
+#define COND_REDUCTION(TYPE)   
\
+  TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size)   
\
+  {
\
+TYPE result = 0;   
\
+for (int i = 0; i < loop_size; i++)
\
+  if (b[i] <= a[i])
\
+   result += a[i];\
+return result; 
\
+  }
+
+COND_REDUCTION (int8_t)
+COND_REDUCTION (int16_t)
+COND_REDUCTION (int32_t)
+COND_REDUCTION (int64_t)
+COND_REDUCTION (uint8_t)
+COND_REDUCTION (uint16_t)
+COND_REDUCTION (uint32_t)
+COND_REDUCTION (uint64_t)
+COND_REDUCTION (_Float16)
+COND_REDUCTION (float)
+COND_REDUCTION (double)
+
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "COND_LEN_ADD" 11 "optimized" } } */
diff --git a/gcc/testsuite/gcc.ta

[PATCH] MATCH: Optimize COND_ADD reduction pattern

2023-09-26 Thread Juzhe-Zhong
The current COND_ADD reduction pattern can't optimize floating-point vectors.
As Richard suggested in
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631336.html,
allow the COND_ADD reduction pattern to optimize floating-point vectors as well.
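
For example (editorial illustration, analogous to the integer case earlier in
this thread), the relaxed pattern now also covers conditional float reductions
of this shape when -ffast-math permits it:

float
foo (float *restrict a, float *restrict b, int n)
{
  float result = 0.0f;
  for (int i = 0; i < n; i++)
    if (b[i] <= a[i])
      result += a[i];
  return result;
}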

Bootstrap and Regression is running.

Ok for trunk if tests pass ?

gcc/ChangeLog:

* match.pd: Optimize COND_ADD reduction pattern.

---
 gcc/match.pd | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 5061c19e086..398beaebd27 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8863,8 +8863,10 @@ and,
 
c = mask1 && mask2 ? d + b : d.  */
 (simplify
-  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
-   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
+  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 zerop@4) @1)
+   (if (ANY_INTEGRAL_TYPE_P (type)
+   || fold_real_zero_addition_p (type, NULL_TREE, @4, 0))
+   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1)))
 
 /* Detect simplication for a conditional length reduction where
 
-- 
2.36.3



Re: Re: [PATCH] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread juzhe.zh...@rivai.ai
Hi, Richi.

Addressed the comments.

One is the V2 patch for the COND_LEN_ADD reduction:
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631340.html 

The second one optimizes the COND_ADD reduction:
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631341.html 




juzhe.zh...@rivai.ai
 
From: Richard Biener
Date: 2023-09-26 15:46
To: Juzhe-Zhong
CC: gcc-patches; richard.sandiford; rguenther; pinskia
Subject: Re: [PATCH] MATCH: Optimize COND_ADD_LEN reduction pattern
On Tue, Sep 26, 2023 at 9:13 AM Juzhe-Zhong  wrote:
>
>
> This patch leverage this commit: 
> https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
> to optimize COND_LEN_ADD reduction pattern.
>
> We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.
>
> Consider thsi following case:
>
> #include 
>
> void
> pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> {
>   uint64_t result = 0;
>
>   for (int i = 0; i < loop_size; i++)
> {
>   if (b[i] <= a[i])
> {
>   result += a[i];
> }
> }
>
>   a[0] = result;
> }
>
> Before this patch:
> vsetvli a7,zero,e64,m1,ta,ma
> vmv.v.i v2,0
> vmv1r.v v3,v2--- redundant
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v1,0(a3)
> vle64.v v0,0(a1)
> sllia6,a5,3
> vsetvli a7,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv   v0,v0,v1
> add a1,a1,a6
> vmerge.vvm  v1,v3,v1,v0  redundant.
> add a3,a3,a6
> vsetvli zero,a5,e64,m1,tu,ma
> vadd.vv v2,v2,v1
> bne a2,zero,.L3
> li  a5,0
> vsetvli a4,zero,e64,m1,ta,ma
> vmv.s.x v1,a5
> vredsum.vs  v2,v2,v1
> vmv.x.s a5,v2
> sd  a5,0(a0)
> ret
>
> After this patch:
>
> vsetvli a6,zero,e64,m1,ta,ma
> vmv.v.i v1,0
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v2,0(a4)
> vle64.v v0,0(a1)
> sllia3,a5,3
> vsetvli a6,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv   v0,v0,v2
> add a1,a1,a3
> vsetvli zero,a5,e64,m1,tu,mu
> add a4,a4,a3
> vadd.vv v1,v1,v2,v0.t
> bne a2,zero,.L3
> li  a5,0
> vsetivlizero,1,e64,m1,ta,ma
> vmv.s.x v2,a5
> vsetvli a5,zero,e64,m1,ta,ma
> vredsum.vs  v1,v1,v2
> vmv.x.s a5,v1
> sd  a5,0(a0)
> ret
>
> Bootstrap && Regression is running.
>
> Ok for trunk when testing passes ?
>
> PR tree-optimization/111594
> PR tree-optimization/110660
>
> gcc/ChangeLog:
>
> * match.pd: Optimize COND_LEN_ADD reduction.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
> * gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.
>
> ---
>  gcc/match.pd  | 13 +
>  .../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++
>  .../riscv/rvv/autovec/cond/pr111594.c | 22 ++
>  3 files changed, 64 insertions(+)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a17778fbaa6..af8d12c138e 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8866,6 +8866,19 @@ and,
>(IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
>
> +/* Detect simplication for a conditional length reduction where
> +
> +   a = mask ? b : 0
> +   c = i < len + bias ? d + a : d
> +
> +   is turned into
> +
> +   c = mask && i < len ? d + b : d.  */
> +(simplify
> +  (IFN_COND_LEN_ADD integer_minus_onep @0 (vec_cond @1 @2 zerop) @0 @3 @4)
 
I think you want intger_truep instead of integer_minus_onep for
readability.  Since you
use zerop here can you also adjust the preceeding pattern?
 
> +   (if (!HONOR_NANS (type) && !HONOR_SIGNED_ZEROS (type))
 
it might be better to check ANY_INTEGRAL_TYPE_P (type) ||
fold_real_zero_addition_p (type, NULL_TREE, @5, 0)
your change misses HONOR_SIGN_DEPENDENT_ROUNDING I think.
 
> +(IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
> +
 
 
 
>  /* For pointers @0 and @2 and nonnegative constant offset @1, look for
> expressions like:
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> new file mode 100644
> index 000..db6f9d1ec6c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d 
> -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
> +
> +#include 
> +
> +#define COND_REDUCTION(TYPE)  

RE: [PATCH v3] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-26 Thread Kyrylo Tkachov


> -Original Message-
> From: Andrew Pinski 
> Sent: Monday, September 25, 2023 9:05 PM
> To: Philipp Tomsich 
> Cc: Manos Anagnostakis ; gcc-
> patc...@gcc.gnu.org; Kyrylo Tkachov 
> Subject: Re: [PATCH v3] aarch64: Fine-grained policies to control ldp-stp
> formation.
> 
> On Mon, Sep 25, 2023 at 12:59 PM Philipp Tomsich
>  wrote:
> >
> > On Mon, 25 Sept 2023 at 21:54, Andrew Pinski  wrote:
> > >
> > > On Mon, Sep 25, 2023 at 12:50 PM Manos Anagnostakis
> > >  wrote:
> > > >
> > > > This patch implements the following TODO in
> gcc/config/aarch64/aarch64.cc
> > > > to provide the requested behaviour for handling ldp and stp:
> > > >
> > > >   /* Allow the tuning structure to disable LDP instruction formation
> > > >  from combining instructions (e.g., in peephole2).
> > > >  TODO: Implement fine-grained tuning control for LDP and STP:
> > > >1. control policies for load and store separately;
> > > >2. support the following policies:
> > > >   - default (use what is in the tuning structure)
> > > >   - always
> > > >   - never
> > > >   - aligned (only if the compiler can prove that the
> > > > load will be aligned to 2 * element_size)  */
> > > >
> > > > It provides two new and concrete target-specific command-line
> parameters
> > > > -param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
> > > > to give the ability to control load and store policies seperately as
> > > > stated in part 1 of the TODO.
> > > >
> > > > The accepted values for both parameters are:
> > > > - 0: Use the policy of the tuning structure (default).
> > > > - 1: Emit ldp/stp regardless of alignment.
> > > > - 2: Do not emit ldp/stp.
> > > > - 3: In order to emit ldp/stp, first check if the load/store will
> > > >   be aligned to 2 * element_size.
> > >
> > > Instead of a number, does it make sense to instead use an string
> > > (ENUM) for this param.
> > > Also I think using --param is a bad idea if it is going to be
> > > documented in the user manual.
> > > Maybe a -m option should be used instead.
> >
> > See https://gcc.gnu.org/pipermail/gcc-patches/2023-
> September/631283.html
> > for the discussion triggering the change from -m... to --param and the
> > change to using a number instead of a string.
> 
> That is the opposite of the current GCC practice across all targets.
> Things like this should be consistent and if one target decides to do
> it different, then maybe it should NOT.
> Anyways we should document the correct coding style for options so we
> don't have these back and forths again.

My rationale for having this as a param rather than an -m* option is that
this is just an override for a codegen heuristic that the compiler should be
getting correct on its own when used by a normal user.
Having a way to force an explicit LDP/STP policy can be useful for testing
the compiler and for some power user experimentation, but I wouldn't
want to see it make its way into any user makefiles.

Good point on having it accept an enum; it is definitely more readable to have
a string argument.
Thanks,
Kyrill

> 
> 
> Thanks,
> Andrew
> 
> >
> > Thanks,
> > Philipp.
> >
> > >
> > > Thanks,
> > > Andrew
> > >
> > > >
> > > > gcc/ChangeLog:
> > > > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> > > > appropriate enums for the policies.
> > > > * config/aarch64/aarch64-tuning-flags.def
> > > > (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> > > > options.
> > > > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> > > > function to parse ldp-policy parameter.
> > > > (aarch64_parse_stp_policy): New function to parse stp-policy
> parameter.
> > > > (aarch64_override_options_internal): Call parsing functions.
> > > > (aarch64_operands_ok_for_ldpstp): Add parameter-value check
> and
> > > > alignment check and remove superseded ones.
> > > > (aarch64_operands_adjust_ok_for_ldpstp): Add parameter-value
> check and
> > > > alignment check and remove superseded ones.
> > > > * config/aarch64/aarch64.opt: Add options.
> > > > * doc/invoke.texi: Document the parameters accordingly.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > > * gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
> > > > * gcc.target/aarch64/ldp_aligned.c: New test.
> > > > * gcc.target/aarch64/ldp_always.c: New test.
> > > > * gcc.target/aarch64/ldp_never.c: New test.
> > > > * gcc.target/aarch64/stp_aligned.c: New test.
> > > > * gcc.target/aarch64/stp_always.c: New test.
> > > > * gcc.target/aarch64/stp_never.c: New test.
> > > >
> > > > Signed-off-by: Manos Anagnostakis 
> > > > ---
> > > > Changes in v3:
> > > > - Changed command-line options to target-specific parameters
> > > >   and documented them accordingly in doc/invoke.texi.
>

RE: [PATCH v3] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-26 Thread Kyrylo Tkachov
Hi Manos,

Thank you for the quick turnaround; please post the patch that uses a --param
with an enum. I think that's the direction we should be going with this patch.
Thanks,
Kyrill

From: Manos Anagnostakis  
Sent: Tuesday, September 26, 2023 7:06 AM
To: gcc-patches@gcc.gnu.org
Cc: Philipp Tomsich ; Kyrylo Tkachov 
; Andrew Pinski 
Subject: Re: [PATCH v3] aarch64: Fine-grained policies to control ldp-stp 
formation.

Thank you Andrew for the input.

I've prepared a patch using --param with enum, which seems a more suitable 
approach to me as strings are more descriptive as well.

The current patch needed an adjustment on how to call the parsing functions to 
match the compiler coding style.

Both are bootstrapped and regstested.

I can send a V4 of whichever is preferred.

Thanks!

Manos.

On Mon, Sep 25, 2023 at 11:57 PM Andrew Pinski  wrote:
On Mon, Sep 25, 2023 at 1:04 PM Andrew Pinski  wrote:
>
> On Mon, Sep 25, 2023 at 12:59 PM Philipp Tomsich
>  wrote:
> >
> > On Mon, 25 Sept 2023 at 21:54, Andrew Pinski  
> > wrote:
> > >
> > > On Mon, Sep 25, 2023 at 12:50 PM Manos Anagnostakis
> > >  wrote:
> > > >
> > > > This patch implements the following TODO in 
> > > > gcc/config/aarch64/aarch64.cc
> > > > to provide the requested behaviour for handling ldp and stp:
> > > >
> > > >   /* Allow the tuning structure to disable LDP instruction formation
> > > >      from combining instructions (e.g., in peephole2).
> > > >      TODO: Implement fine-grained tuning control for LDP and STP:
> > > >            1. control policies for load and store separately;
> > > >            2. support the following policies:
> > > >               - default (use what is in the tuning structure)
> > > >               - always
> > > >               - never
> > > >               - aligned (only if the compiler can prove that the
> > > >                 load will be aligned to 2 * element_size)  */
> > > >
> > > > It provides two new and concrete target-specific command-line parameters
> > > > -param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
> > > > to give the ability to control load and store policies seperately as
> > > > stated in part 1 of the TODO.
> > > >
> > > > The accepted values for both parameters are:
> > > > - 0: Use the policy of the tuning structure (default).
> > > > - 1: Emit ldp/stp regardless of alignment.
> > > > - 2: Do not emit ldp/stp.
> > > > - 3: In order to emit ldp/stp, first check if the load/store will
> > > >   be aligned to 2 * element_size.
> > >
> > > Instead of a number, does it make sense to instead use an string
> > > (ENUM) for this param.
> > > Also I think using --param is a bad idea if it is going to be
> > > documented in the user manual.
> > > Maybe a -m option should be used instead.
> >
> > See https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631283.html
> > for the discussion triggering the change from -m... to --param and the
> > change to using a number instead of a string.
>
> That is the opposite of the current GCC practice across all targets.
> Things like this should be consistent and if one target decides to do
> it different, then maybe it should NOT.
> Anyways we should document the correct coding style for options so we
> don't have these back and forths again.

Kyrylo:
>  It will have to take a number rather than a string but that should be okay, 
>as long as the right values are documented in invoke.texi.

No it does not need to be a number. --param=ranger-debug= does not
take a number, it takes an enum .
One of the benefits of moving --param support over to .opt to allow
more than just numbers even.

Thanks,
Andrew


>
>
> Thanks,
> Andrew
>
> >
> > Thanks,
> > Philipp.
> >
> > >
> > > Thanks,
> > > Andrew
> > >
> > > >
> > > > gcc/ChangeLog:
> > > >         * config/aarch64/aarch64-protos.h (struct tune_params): Add
> > > >         appropriate enums for the policies.
> > > >         * config/aarch64/aarch64-tuning-flags.def
> > > >         (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> > > >         options.
> > > >         * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> > > >         function to parse ldp-policy parameter.
> > > >         (aarch64_parse_stp_policy): New function to parse stp-policy 
> > > >parameter.
> > > >         (aarch64_override_options_internal): Call parsing functions.
> > > >         (aarch64_operands_ok_for_ldpstp): Add parameter-value check and
> > > >         alignment check and remove superseded ones.
> > > >         (aarch64_operands_adjust_ok_for_ldpstp): Add parameter-value 
> > > >check and
> > > >         alignment check and remove superseded ones.
> > > >         * config/aarch64/aarch64.opt: Add options.
> > > >         * doc/invoke.texi: Document the parameters accordingly.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >         * gcc.target/aarch

RE: [PATCH v3] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-26 Thread Kyrylo Tkachov


> -Original Message-
> From: Kyrylo Tkachov 
> Sent: Tuesday, September 26, 2023 9:36 AM
> To: Manos Anagnostakis ; gcc-
> patc...@gcc.gnu.org
> Cc: Philipp Tomsich ; Andrew Pinski
> 
> Subject: RE: [PATCH v3] aarch64: Fine-grained policies to control ldp-stp
> formation.
> 
> Hi Manos,
> 
> Thank you for the quick turnaround, please post the patch that uses a --
> param with an enum. I think that's the direction we should be going with this
> patch.

Ah, and please address Tamar's feedback from 
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631343.html
Thanks,
Kyrill

> 
> From: Manos Anagnostakis 
> Sent: Tuesday, September 26, 2023 7:06 AM
> To: gcc-patches@gcc.gnu.org
> Cc: Philipp Tomsich ; Kyrylo Tkachov
> ; Andrew Pinski 
> Subject: Re: [PATCH v3] aarch64: Fine-grained policies to control ldp-stp
> formation.
> 
> Thank you Andrew for the input.
> 
> I've prepared a patch using --param with enum, which seems a more suitable
> approach to me as strings are more descriptive as well.
> 
> The current patch needed an adjustment on how to call the parsing functions
> to match the compiler coding style.
> 
> Both are bootstrapped and regstested.
> 
> I can send a V4 of whichever is preferred.
> 
> Thanks!
> 
> Manos.
> 
> On Mon, Sep 25, 2023 at 11:57 PM Andrew Pinski
>  wrote:
> On Mon, Sep 25, 2023 at 1:04 PM Andrew Pinski 
> wrote:
> >
> > On Mon, Sep 25, 2023 at 12:59 PM Philipp Tomsich
> >  wrote:
> > >
> > > On Mon, 25 Sept 2023 at 21:54, Andrew Pinski
>  wrote:
> > > >
> > > > On Mon, Sep 25, 2023 at 12:50 PM Manos Anagnostakis
> > > >  wrote:
> > > > >
> > > > > This patch implements the following TODO in
> gcc/config/aarch64/aarch64.cc
> > > > > to provide the requested behaviour for handling ldp and stp:
> > > > >
> > > > >   /* Allow the tuning structure to disable LDP instruction formation
> > > > >      from combining instructions (e.g., in peephole2).
> > > > >      TODO: Implement fine-grained tuning control for LDP and STP:
> > > > >            1. control policies for load and store separately;
> > > > >            2. support the following policies:
> > > > >               - default (use what is in the tuning structure)
> > > > >               - always
> > > > >               - never
> > > > >               - aligned (only if the compiler can prove that the
> > > > >                 load will be aligned to 2 * element_size)  */
> > > > >
> > > > > It provides two new and concrete target-specific command-line
> parameters
> > > > > -param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
> > > > > to give the ability to control load and store policies seperately as
> > > > > stated in part 1 of the TODO.
> > > > >
> > > > > The accepted values for both parameters are:
> > > > > - 0: Use the policy of the tuning structure (default).
> > > > > - 1: Emit ldp/stp regardless of alignment.
> > > > > - 2: Do not emit ldp/stp.
> > > > > - 3: In order to emit ldp/stp, first check if the load/store will
> > > > >   be aligned to 2 * element_size.
> > > >
> > > > Instead of a number, does it make sense to instead use an string
> > > > (ENUM) for this param.
> > > > Also I think using --param is a bad idea if it is going to be
> > > > documented in the user manual.
> > > > Maybe a -m option should be used instead.
> > >
> > > See https://gcc.gnu.org/pipermail/gcc-patches/2023-
> September/631283.html
> > > for the discussion triggering the change from -m... to --param and the
> > > change to using a number instead of a string.
> >
> > That is the opposite of the current GCC practice across all targets.
> > Things like this should be consistent and if one target decides to do
> > it different, then maybe it should NOT.
> > Anyways we should document the correct coding style for options so we
> > don't have these back and forths again.
> 
> Kyrylo:
> >  It will have to take a number rather than a string but that should be 
> >okay, as
> long as the right values are documented in invoke.texi.
> 
> No it does not need to be a number. --param=ranger-debug= does not
> take a number, it takes an enum .
> One of the benefits of moving --param support over to .opt to allow
> more than just numbers even.
> 
> Thanks,
> Andrew
> 
> 
> >
> >
> > Thanks,
> > Andrew
> >
> > >
> > > Thanks,
> > > Philipp.
> > >
> > > >
> > > > Thanks,
> > > > Andrew
> > > >
> > > > >
> > > > > gcc/ChangeLog:
> > > > >         * config/aarch64/aarch64-protos.h (struct tune_params): Add
> > > > >         appropriate enums for the policies.
> > > > >         * config/aarch64/aarch64-tuning-flags.def
> > > > >         (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> > > > >         options.
> > > > >         * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> > > > >         function to parse ldp-policy parameter.
> > > > >         (aarch64_parse_stp_po

[pushed] Darwin: Handle -dynamiclib on cc1 lines.

2023-09-26 Thread Iain Sandoe
Tested on x86_64-darwin21, pushed to trunk, thanks
Iain

--- 8< ---

The changes of r14-4172 missed a case where we accept -dynamiclib on the
command line and then pass it to cc1 (which does not accept it).

This prunes the -dynamiclib from cc1 lines.

gcc/ChangeLog:

* config/darwin.h (DARWIN_CC1_SPEC): Remove -dynamiclib.

Signed-off-by: Iain Sandoe 
---
 gcc/config/darwin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h
index 61e46f76b22..2ee66c1a3d1 100644
--- a/gcc/config/darwin.h
+++ b/gcc/config/darwin.h
@@ -307,7 +307,7 @@ extern GTY(()) int darwin_ms_struct;
%:version-compare(>= 10.7 mmacosx-version-min= -no_pie) }"
 
 #define DARWIN_CC1_SPEC
\
-  "%

Re: [PATCH] RISC-V/testsuite: Fix ILP32 RVV failures from missing

2023-09-26 Thread Maciej W. Rozycki
On Fri, 22 Sep 2023, Jeff Law wrote:

> > gcc/testsuite/
> > * gcc.target/riscv/rvv/autovec/vmv-imm-template.h: Remove
> >  inclusion.
> OK

 Now committed, thanks.

  Maciej


Re: [PATCH] MATCH: Optimize COND_ADD reduction pattern

2023-09-26 Thread Richard Biener
On Tue, 26 Sep 2023, Juzhe-Zhong wrote:

> Current COND_ADD reduction pattern can't optimize floating-point vector.
> As Richard suggested: 
> https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631336.html
> Allow COND_ADD reduction pattern to optimize floating-point vector.
> 
> Bootstrap and Regression is running.
> 
> Ok for trunk if tests pass ?

I just wondered about fixed point - zerop seems to also allow
fixed_zerop.  Maybe do

 if (ANY_INTEGRAL_TYPE_P (type)
 || (FLOAT_TYPE_P (type)
 && fold_real_zero_addition_p (type, NULL_TREE, @4, 0)))

(also for the other patch) to avoid touching the fixed point case.
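
As a small aside, the reason fold_real_zero_addition_p has to gate the float
case at all is the usual signed-zero pitfall; this tiny example only
illustrates that IEEE rule and is not taken from the thread:

#include <stdio.h>

int
main (void)
{
  double x = -0.0;
  /* Under default rounding, -0.0 + 0.0 is +0.0, so "x + 0.0" is not an
     identity when signed zeros are honoured.  Prints "-0 0".  */
  printf ("%g %g\n", x, x + 0.0);
  return 0;
}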

Richard.

> gcc/ChangeLog:
> 
>   * match.pd: Optimize COND_ADD reduction pattern.
> 
> ---
>  gcc/match.pd | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 5061c19e086..398beaebd27 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8863,8 +8863,10 @@ and,
>  
> c = mask1 && mask2 ? d + b : d.  */
>  (simplify
> -  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> -   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
> +  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 zerop@4) @1)
> +   (if (ANY_INTEGRAL_TYPE_P (type)
> + || fold_real_zero_addition_p (type, NULL_TREE, @4, 0))
> +   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1)))
>  
>  /* Detect simplification for a conditional length reduction where
>  
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


[PATCH V3] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread Juzhe-Zhong
This patch leverages this commit: 
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
to optimize COND_LEN_ADD reduction pattern.

We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.

Consider the following case:

#include 

void
pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
{
  uint64_t result = 0;

  for (int i = 0; i < loop_size; i++)
{
  if (b[i] <= a[i])
{
  result += a[i];
}
}

  a[0] = result;
}

Before this patch:
vsetvli a7,zero,e64,m1,ta,ma
vmv.v.i v2,0
vmv1r.v v3,v2   --- redundant
.L3:
vsetvli a5,a2,e64,m1,ta,ma
vle64.v v1,0(a3)
vle64.v v0,0(a1)
slli a6,a5,3
vsetvli a7,zero,e64,m1,ta,ma
sub a2,a2,a5
vmsleu.vv   v0,v0,v1
add a1,a1,a6
vmerge.vvm  v1,v3,v1,v0   --- redundant
add a3,a3,a6
vsetvli zero,a5,e64,m1,tu,ma
vadd.vv v2,v2,v1
bne a2,zero,.L3
li  a5,0
vsetvli a4,zero,e64,m1,ta,ma
vmv.s.x v1,a5
vredsum.vs  v2,v2,v1
vmv.x.s a5,v2
sd  a5,0(a0)
ret

After this patch:

vsetvli a6,zero,e64,m1,ta,ma
vmv.v.i v1,0
.L3:
vsetvli a5,a2,e64,m1,ta,ma
vle64.v v2,0(a4)
vle64.v v0,0(a1)
slli a3,a5,3
vsetvli a6,zero,e64,m1,ta,ma
sub a2,a2,a5
vmsleu.vv   v0,v0,v2
add a1,a1,a3
vsetvli zero,a5,e64,m1,tu,mu
add a4,a4,a3
vadd.vv v1,v1,v2,v0.t
bne a2,zero,.L3
li  a5,0
vsetivli zero,1,e64,m1,ta,ma
vmv.s.x v2,a5
vsetvli a5,zero,e64,m1,ta,ma
vredsum.vs  v1,v1,v2
vmv.x.s a5,v1
sd  a5,0(a0)
ret

Bootstrap && Regression is running.

Ok for trunk when testing passes ?

PR tree-optimization/111594
PR tree-optimization/110660

gcc/ChangeLog:

* match.pd: Optimize COND_LEN_ADD reduction.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
* gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.

---
 gcc/match.pd  | 15 ++
 .../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++
 .../riscv/rvv/autovec/cond/pr111594.c | 22 ++
 3 files changed, 66 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c

diff --git a/gcc/match.pd b/gcc/match.pd
index a17778fbaa6..3ce90cb 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8866,6 +8866,21 @@ and,
   (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
(IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
 
+/* Detect simplification for a conditional length reduction where
+
+   a = mask ? b : 0
+   c = i < len + bias ? d + a : d
+
+   is turned into
+
+   c = mask && i < len + bias ? d + b : d.  */
+(simplify
+  (IFN_COND_LEN_ADD integer_truep @0 (vec_cond @1 @2 zerop@5) @0 @3 @4)
+   (if (ANY_INTEGRAL_TYPE_P (type)
+   || (FLOAT_TYPE_P (type)
+   && fold_real_zero_addition_p (type, NULL_TREE, @5, 0)))
+(IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
+
 /* For pointers @0 and @2 and nonnegative constant offset @1, look for
expressions like:
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
new file mode 100644
index 000..db6f9d1ec6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d 
-fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
+
+#include 
+
+#define COND_REDUCTION(TYPE)                                                \
+  TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size)        \
+  {                                                                         \
+    TYPE result = 0;                                                        \
+    for (int i = 0; i < loop_size; i++)                                     \
+      if (b[i] <= a[i])                                                     \
+        result += a[i];                                                     \
+    return result;                                                          \
+  }
+
+COND_REDUCTION (int8_t)
+COND_REDUCTION (int16_t)
+COND_REDUCTION (int32_t)
+COND_REDUCTION (int64_t)
+COND_REDUCTION (uint8_t)
+COND_REDUCTION (uint16_t)
+COND_REDUCTION (uint32_t)
+COND_REDUCTION (uint64_t)
+COND_REDUCTION (_Float16)
+COND_REDUCTION (float)
+COND_REDUCTION (double)
+
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "COND_LEN_ADD" 11 "optimized" } } */

[PATCH V2] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread Juzhe-Zhong
Current COND_ADD reduction pattern can't optimize floating-point vector.
As Richard suggested: 
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631336.html
Allow COND_ADD reduction pattern to optimize floating-point vector.

Bootstrap and Regression is running.

Ok for trunk if tests pass ?

gcc/ChangeLog:

* match.pd: Optimize COND_ADD reduction pattern.

---
 gcc/match.pd | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 3ce90cb..790d956fe69 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8863,8 +8863,11 @@ and,
 
c = mask1 && mask2 ? d + b : d.  */
 (simplify
-  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
-   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
+  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 zerop@4) @1)
+   (if (ANY_INTEGRAL_TYPE_P (type)
+   || (FLOAT_TYPE_P (type)
+   && fold_real_zero_addition_p (type, NULL_TREE, @4, 0)))
+   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1)))
 
 /* Detect simplification for a conditional length reduction where
 
-- 
2.36.3



[PATCH V2] MATCH: Optimize COND_ADD reduction pattern

2023-09-26 Thread Juzhe-Zhong
Current COND_ADD reduction pattern can't optimize floating-point vector.
As Richard suggested: 
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631336.html
Allow COND_ADD reduction pattern to optimize floating-point vector.
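
For instance, a floating-point variant of the usual conditional reduction now
qualifies; this sketch merely mirrors the shape of the testcase added with the
related COND_LEN_ADD patch (built with -ffast-math, so adding +0.0f in the
masked-off lanes is harmless) and is not itself part of this patch:

float
cond_sum_float (float *restrict a, float *restrict b, int n)
{
  float result = 0.0f;
  for (int i = 0; i < n; i++)
    if (b[i] <= a[i])
      result += a[i];   /* i.e. result += (b[i] <= a[i] ? a[i] : 0.0f) */
  return result;
}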

Bootstrap and Regression is running.

Ok for trunk if tests pass ?

gcc/ChangeLog:

* match.pd: Optimize COND_ADD reduction pattern.

---
 gcc/match.pd | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 3ce90cb..790d956fe69 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8863,8 +8863,11 @@ and,
 
c = mask1 && mask2 ? d + b : d.  */
 (simplify
-  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
-   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
+  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 zerop@4) @1)
+   (if (ANY_INTEGRAL_TYPE_P (type)
+   || (FLOAT_TYPE_P (type)
+   && fold_real_zero_addition_p (type, NULL_TREE, @4, 0)))
+   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1)))
 
 /* Detect simplification for a conditional length reduction where
 
-- 
2.36.3



Re: Re: [PATCH] MATCH: Optimize COND_ADD reduction pattern

2023-09-26 Thread juzhe.zh...@rivai.ai
Address comments:

V3 COND_LEN_ADD: https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631350.html
V2 COND_ADD: https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631352.html

Thanks.


juzhe.zh...@rivai.ai
 
From: Richard Biener
Date: 2023-09-26 17:41
To: Juzhe-Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH] MATCH: Optimize COND_ADD reduction pattern
On Tue, 26 Sep 2023, Juzhe-Zhong wrote:
 
> Current COND_ADD reduction pattern can't optimize floating-point vector.
> As Richard suggested: 
> https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631336.html
> Allow COND_ADD reduction pattern to optimize floating-point vector.
> 
> Bootstrap and Regression is running.
> 
> Ok for trunk if tests pass ?
 
I just wondered about fixed point - zerop seems to also allow
fixed_zerop.  Maybe do
 
if (ANY_INTEGRAL_TYPE_P (type)
 || (FLOAT_TYPE_P (type)
 && fold_real_zero_addition_p (type, NULL_TREE, @4, 0)))
 
(also for the other patch) to avoid touching the fixed point case.
 
Richard.
 
> gcc/ChangeLog:
> 
> * match.pd: Optimize COND_ADD reduction pattern.
> 
> ---
>  gcc/match.pd | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 5061c19e086..398beaebd27 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8863,8 +8863,10 @@ and,
>  
> c = mask1 && mask2 ? d + b : d.  */
>  (simplify
> -  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> -   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
> +  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 zerop@4) @1)
> +   (if (ANY_INTEGRAL_TYPE_P (type)
> + || fold_real_zero_addition_p (type, NULL_TREE, @4, 0))
> +   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1)))
>  
>  /* Detect simplification for a conditional length reduction where
>  
> 
 
-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
 


[PATCH v1] RISC-V: Support FP round auto-vectorization

2023-09-26 Thread pan2 . li
From: Pan Li 

This patch would like to support auto-vectorization for the
round API in math.h. It depends on the -ffast-math option.

When we would like to call round/roundf like v2 = round (v1),
we will convert it into the insns below (referencing the LLVM implementation).

* vfcvt.x.f v3, v1, RMM
* vfcvt.f.x v2, v3

However, the floating-point value may not need the conversion above if
it is already integral. Take single-precision floating point as an example:

  +------------+---------------+-------------+
  | raw float  | binary layout | after round |
  +------------+---------------+-------------+
  | -8388607.5 | 0xcaffffff    | -8388608.0  |
  | 8388607.5  | 0x4affffff    | 8388608.0   |
  | 8388608.0  | 0x4b000000    | 8388608.0   |
  | 8388609.0  | 0x4b000001    | 8388609.0   |
  +------------+---------------+-------------+

All single-precision floating-point values >= 8388608.0 (i.e. 2^23) are
already integral: the 24-bit significand leaves no bits below the units
place. We leverage vmflt and a mask to filter them out in the vector and
only do the conversion under the mask.
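
A scalar model of that masked lowering, per element, looks roughly like the
following; the function name and the use of roundf as a stand-in for the RMM
conversion are illustrative assumptions, not code from the patch:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static float
round_one (float x)
{
  const float limit = 8388608.0f;        /* 2^23, the vfabs/vmflt threshold */
  float r = x;
  if (fabsf (x) < limit)                 /* mask computed by vmflt.vf */
    {
      int32_t i = (int32_t) roundf (x);  /* vfcvt.x.f.v under RMM */
      r = (float) i;                     /* vfcvt.f.x.v back to float */
    }
  return copysignf (r, x);               /* vfsgnj.vv restores the sign */
}

int
main (void)
{
  /* Prints "3.0 -8388608.0 8388609.0".  */
  printf ("%.1f %.1f %.1f\n", round_one (2.5f), round_one (-8388607.5f),
          round_one (8388609.0f));
  return 0;
}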

Before this patch:
math-round-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addi s0,s0,4
  addi s1,s1,4
  call round
  fsw fa0,-4(s1)
  bne s0,s2,.L3

After this patch:
  ...
  fsrmi   4   // RMM, rounding to nearest, ties to max magnitude
.L4:
  vfabs.v v2,v1
  vmflt.vf v0,v2,fa5
  vfcvt.x.f.v v4,v1,v0.t
  vfcvt.f.x.v v2,v4,v0.t
  vfsgnj.vv   v2,v2,v1
  bne .L4
.L14:
  fsrm a6
  ret

Please note VLS mode is also involved in this patch and covered by the
test cases.

gcc/ChangeLog:

* config/riscv/autovec.md (round2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_round): New function decl.
* config/riscv/riscv-v.cc (expand_vec_round): New function impl.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/unop/math-round-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-3.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-round-1.c: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec.md   | 10 
 gcc/config/riscv/riscv-protos.h   |  5 ++
 gcc/config/riscv/riscv-v.cc   | 24 
 .../riscv/rvv/autovec/unop/math-round-0.c | 23 
 .../riscv/rvv/autovec/unop/math-round-1.c | 23 
 .../riscv/rvv/autovec/unop/math-round-2.c | 23 
 .../riscv/rvv/autovec/unop/math-round-3.c | 25 +
 .../riscv/rvv/autovec/unop/math-round-run-1.c | 39 +
 .../riscv/rvv/autovec/unop/math-round-run-2.c | 39 +
 .../riscv/rvv/autovec/vls/math-round-1.c  | 56 +++
 10 files changed, 267 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-0.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-3.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-run-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-run-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-round-1.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 1d2fca60e98..798cf1272c5 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2251,3 +2251,13 @@ (define_expand "rint2"
 DONE;
   }
 )
+
+(define_expand "round2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_round (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 629adeea94c..70ca244c591 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -256,6 +256,9 @@ enum insn_flags : unsigned int
 
   /* Means INSN has FRM operand and the value is FRM_RDN.  */
   FRM_RDN_P = 1 << 17,
+
+  /* Means INSN has FRM operand and the value is FRM_RMM.  */
+  FRM_RMM_P = 1 << 18,
 };
 
 enum insn_type : unsigned int
@@ -299,6 +302,7 @@ enum insn_type : unsigned int
   UNARY_OP_TAMU_FRM_DYN = UNARY_OP_TAMU | FRM_DYN_P,
   UNARY_OP_TAMU_FRM_RUP = UNARY_OP_TAMU | FRM_RUP_P,
   UNARY_OP_TAMU_FRM_RDN = UNARY_OP_TAMU | FRM_RDN_P,
+  UNARY_OP_TAMU_FRM_RMM = UNARY_OP_TAMU | FRM_RMM_P,
 
   /* Binary operator.  */
   BINARY_OP = __NORMAL_OP | BINARY_OP_P,
@@ 

Re: [PATCH v1] RISC-V: Support FP round auto-vectorization

2023-09-26 Thread juzhe.zh...@rivai.ai
LGTM



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-26 19:00
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Support FP round auto-vectorization
From: Pan Li 
 
This patch would like to support auto-vectorization for the
round API in math.h. It depends on the -ffast-math option.
 
When we would like to call round/roundf like v2 = round (v1),
we will convert it into the insns below (referencing the LLVM implementation).
 
* vfcvt.x.f v3, v1, RMM
* vfcvt.f.x v2, v3
 
However, the floating-point value may not need the conversion above if
it is already integral. Take single-precision floating point as an example:
 
  +------------+---------------+-------------+
  | raw float  | binary layout | after round |
  +------------+---------------+-------------+
  | -8388607.5 | 0xcaffffff    | -8388608.0  |
  | 8388607.5  | 0x4affffff    | 8388608.0   |
  | 8388608.0  | 0x4b000000    | 8388608.0   |
  | 8388609.0  | 0x4b000001    | 8388609.0   |
  +------------+---------------+-------------+
 
All single-precision floating-point values >= 8388608.0 (i.e. 2^23) are
already integral: the 24-bit significand leaves no bits below the units
place. We leverage vmflt and a mask to filter them out in the vector and
only do the conversion under the mask.
 
Before this patch:
math-round-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addi s0,s0,4
  addi s1,s1,4
  call round
  fsw fa0,-4(s1)
  bne s0,s2,.L3
 
After this patch:
  ...
  fsrmi   4   // RMM, rounding to nearest, ties to max magnitude
.L4:
  vfabs.v v2,v1
  vmflt.vf v0,v2,fa5
  vfcvt.x.f.v v4,v1,v0.t
  vfcvt.f.x.v v2,v4,v0.t
  vfsgnj.vv   v2,v2,v1
  bne .L4
.L14:
  fsrm a6
  ret
 
Please note VLS mode is also involved in this patch and covered by the
test cases.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (round2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_round): New function decl.
* config/riscv/riscv-v.cc (expand_vec_round): New function impl.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/unop/math-round-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-3.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-round-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-round-1.c: New test.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/autovec.md   | 10 
gcc/config/riscv/riscv-protos.h   |  5 ++
gcc/config/riscv/riscv-v.cc   | 24 
.../riscv/rvv/autovec/unop/math-round-0.c | 23 
.../riscv/rvv/autovec/unop/math-round-1.c | 23 
.../riscv/rvv/autovec/unop/math-round-2.c | 23 
.../riscv/rvv/autovec/unop/math-round-3.c | 25 +
.../riscv/rvv/autovec/unop/math-round-run-1.c | 39 +
.../riscv/rvv/autovec/unop/math-round-run-2.c | 39 +
.../riscv/rvv/autovec/vls/math-round-1.c  | 56 +++
10 files changed, 267 insertions(+)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-0.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-1.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-2.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-3.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-run-1.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-round-run-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-round-1.c
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 1d2fca60e98..798cf1272c5 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2251,3 +2251,13 @@ (define_expand "rint2"
 DONE;
   }
)
+
+(define_expand "round2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_round (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 629adeea94c..70ca244c591 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -256,6 +256,9 @@ enum insn_flags : unsigned int
   /* Means INSN has FRM operand and the value is FRM_RDN.  */
   FRM_RDN_P = 1 << 17,
+
+  /* Means INSN has FRM operand and the value is FRM_RMM.  */
+  FRM_RMM_P = 1 << 18,
};
enum insn_type : unsigned int
@@ -299,6 +302,7 @@ enum insn_type : unsigned int
   UNARY_OP_TAMU_FRM_DYN = UNARY_OP_TAMU | FRM_DYN_P,
   UNARY_OP_TAMU_FRM_RUP = UNARY_OP_TAMU | FRM_RUP_P,
   UNARY_OP_TAMU_FRM_RDN = UNARY_OP_TAMU | FRM_RDN_P,
+  UNARY_OP_TAMU_FRM_RMM = UNARY_OP

Re: [PATCH V3] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread Richard Biener
On Tue, 26 Sep 2023, Juzhe-Zhong wrote:

> This patch leverages this commit: 
> https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
> to optimize COND_LEN_ADD reduction pattern.
> 
> We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.
> 
> Consider the following case:
> 
> #include 
> 
> void
> pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> {
>   uint64_t result = 0;
> 
>   for (int i = 0; i < loop_size; i++)
> {
>   if (b[i] <= a[i])
>   {
> result += a[i];
>   }
> }
> 
>   a[0] = result;
> }
> 
> Before this patch:
> vsetvli a7,zero,e64,m1,ta,ma
> vmv.v.i v2,0
> vmv1r.v v3,v2   --- redundant
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v1,0(a3)
> vle64.v v0,0(a1)
> slli a6,a5,3
> vsetvli a7,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv   v0,v0,v1
> add a1,a1,a6
> vmerge.vvm  v1,v3,v1,v0   --- redundant
> add a3,a3,a6
> vsetvli zero,a5,e64,m1,tu,ma
> vadd.vv v2,v2,v1
> bne a2,zero,.L3
> li  a5,0
> vsetvli a4,zero,e64,m1,ta,ma
> vmv.s.x v1,a5
> vredsum.vs  v2,v2,v1
> vmv.x.s a5,v2
> sd  a5,0(a0)
> ret
> 
> After this patch:
> 
>   vsetvli a6,zero,e64,m1,ta,ma
>   vmv.v.i v1,0
> .L3:
>   vsetvli a5,a2,e64,m1,ta,ma
>   vle64.v v2,0(a4)
>   vle64.v v0,0(a1)
>   slli a3,a5,3
>   vsetvli a6,zero,e64,m1,ta,ma
>   sub a2,a2,a5
>   vmsleu.vv   v0,v0,v2
>   add a1,a1,a3
>   vsetvli zero,a5,e64,m1,tu,mu
>   add a4,a4,a3
>   vadd.vv v1,v1,v2,v0.t
>   bne a2,zero,.L3
>   li  a5,0
>   vsetivli zero,1,e64,m1,ta,ma
>   vmv.s.x v2,a5
>   vsetvli a5,zero,e64,m1,ta,ma
>   vredsum.vs  v1,v1,v2
>   vmv.x.s a5,v1
>   sd  a5,0(a0)
>   ret
> 
> Bootstrap && Regression is running.
> 
> Ok for trunk when testing passes ?

OK

>   PR tree-optimization/111594
> PR tree-optimization/110660
> 
> gcc/ChangeLog:
> 
>   * match.pd: Optimize COND_LEN_ADD reduction.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
>   * gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.
> 
> ---
>  gcc/match.pd  | 15 ++
>  .../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++
>  .../riscv/rvv/autovec/cond/pr111594.c | 22 ++
>  3 files changed, 66 insertions(+)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a17778fbaa6..3ce90cb 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8866,6 +8866,21 @@ and,
>(IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
>  
> +/* Detect simplification for a conditional length reduction where
> +
> +   a = mask ? b : 0
> +   c = i < len + bias ? d + a : d
> +
> +   is turned into
> +
> +   c = mask && i < len + bias ? d + b : d.  */
> +(simplify
> +  (IFN_COND_LEN_ADD integer_truep @0 (vec_cond @1 @2 zerop@5) @0 @3 @4)
> +   (if (ANY_INTEGRAL_TYPE_P (type)
> + || (FLOAT_TYPE_P (type)
> + && fold_real_zero_addition_p (type, NULL_TREE, @5, 0)))
> +(IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
> +
>  /* For pointers @0 and @2 and nonnegative constant offset @1, look for
> expressions like:
>  
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> new file mode 100644
> index 000..db6f9d1ec6c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d 
> -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
> +
> +#include 
> +
> +#define COND_REDUCTION(TYPE)                                              \
> +  TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size)      \
> +  {                                                                       \
> +    TYPE result = 0;                                                      \
> +    for (int i = 0; i < loop_size; i++)                                   \
> +      if (b[i] <= a[i])                                                   \
> +        result += a[i];                                                   \
> +    return result;                                                        \
> +  }
> +
> +COND_REDUCTION (int8_t)
> +COND_REDUCTION (int16_t)
> +COND_REDUCTION (int32_t)
> +COND_REDUCTION (int64_t)
> +COND_REDUCTION 

Re: [PATCH V2] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread Richard Biener
On Tue, 26 Sep 2023, Juzhe-Zhong wrote:

> Current COND_ADD reduction pattern can't optimize floating-point vector.
> As Richard suggested: 
> https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631336.html
> Allow COND_ADD reduction pattern to optimize floating-point vector.
> 
> Bootstrap and Regression is running.
> 
> Ok for trunk if tests pass ?

OK.

> gcc/ChangeLog:
> 
>   * match.pd: Optimize COND_ADD reduction pattern.
> 
> ---
>  gcc/match.pd | 7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3ce90cb..790d956fe69 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8863,8 +8863,11 @@ and,
>  
> c = mask1 && mask2 ? d + b : d.  */
>  (simplify
> -  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> -   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
> +  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 zerop@4) @1)
> +   (if (ANY_INTEGRAL_TYPE_P (type)
> + || (FLOAT_TYPE_P (type)
> + && fold_real_zero_addition_p (type, NULL_TREE, @4, 0)))
> +   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1)))
>  
>  /* Detect simplification for a conditional length reduction where
>  
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH] PHIOPT: Fix minmax_replacement for three way

2023-09-26 Thread Richard Biener
On Sat, Sep 23, 2023 at 2:55 AM Andrew Pinski  wrote:
>
> So when diamond bb support was added to minmax_replacement in 
> r13-1950-g9bb19e143cfe,
> the code was not expecting the alt_middle_bb not to exist if it was empty 
> (for threeway_p).
> So when factor_out_conditional_conversion was used to factor out conversions, 
> it turns out
> the assumption for alt_middle_bb to be wrong and we ended up with threeway_p 
> being true but
> having middle_bb being empty but alt_middle_bb not being empty which causes 
> wrong code in
> many cases.
>
> This patch fixes the issue by adding a test for the 2 cases where the 
> assumption on
> threeway_p case having the other bb being empty.
>
> Changes made:
> v2: Fix test for `(a <= u) b = MAX(a, d) else b = u`.
>
> Note my plan for GCC 15 is remove minmax_replacement as match.pd will catch 
> all cases
> at that point.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

OK.

> PR tree-optimization/111469
>
> gcc/ChangeLog:
>
> * tree-ssa-phiopt.cc (minmax_replacement): Fix
> the assumption for the `non-diamond` handling cases
> of diamond code.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.c-torture/execute/pr111469-1.c: New test.
> ---
>  .../gcc.c-torture/execute/pr111469-1.c| 38 +++
>  gcc/tree-ssa-phiopt.cc|  9 -
>  2 files changed, 45 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr111469-1.c
>
> diff --git a/gcc/testsuite/gcc.c-torture/execute/pr111469-1.c 
> b/gcc/testsuite/gcc.c-torture/execute/pr111469-1.c
> new file mode 100644
> index 000..b68d5989eac
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/execute/pr111469-1.c
> @@ -0,0 +1,38 @@
> +/* PR tree-optimization/111469 */
> +
> +long f;
> +char *g;
> +__attribute__((noinline))
> +char o() {
> +  char l;
> +  while (f)
> +;
> +  l = *g;
> +  return l;
> +}
> +
> +/* factor_out_conditional_conversion is able to remove the casts
> +   from the 2 bbs (correctly)
> +   but then minmax_replacement should not optimize this to a MIN_EXPR
> +   as o has side effects. */
> +
> +__attribute__((noinline))
> +unsigned short gg(unsigned short a, unsigned short b)
> +{
> +  short d;
> +  if (a > b)
> +  {
> +d= b;
> +  }
> +  else
> +  {
> +o();
> +d = a;
> +  }
> +  return d;
> +}
> +
> +int main(void)
> +{
> +  gg(3, 2);
> +}
> diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
> index 3835d25d08c..312a6f9082b 100644
> --- a/gcc/tree-ssa-phiopt.cc
> +++ b/gcc/tree-ssa-phiopt.cc
> @@ -1823,7 +1823,9 @@ minmax_replacement (basic_block cond_bb, basic_block 
> middle_bb, basic_block alt_
>arg_false = arg0;
>  }
>
> -  if (empty_block_p (middle_bb))
> +  if (empty_block_p (middle_bb)
> +  && (!threeway_p
> + || empty_block_p (alt_middle_bb)))
>  {
>if ((operand_equal_for_phi_arg_p (arg_true, smaller)
>|| (alt_smaller
> @@ -2006,7 +2008,8 @@ minmax_replacement (basic_block cond_bb, basic_block 
> middle_bb, basic_block alt_
>
>return true;
>  }
> -  else
> +  else if (!threeway_p
> +  || empty_block_p (alt_middle_bb))
>  {
>/* Recognize the following case, assuming d <= u:
>
> @@ -2182,6 +2185,8 @@ minmax_replacement (basic_block cond_bb, basic_block 
> middle_bb, basic_block alt_
>   SSA_OP_DEF));
>gsi_move_before (&gsi_from, &gsi);
>  }
> +  else
> +return false;
>
>/* Emit the statement to compute min/max.  */
>gimple_seq stmts = NULL;
> --
> 2.31.1
>


Re: [PATCH] PHIOPT: Fix minmax_replacement for three way

2023-09-26 Thread Richard Biener
On Thu, Sep 21, 2023 at 10:10 AM Andrew Pinski  wrote:
>
> So when diamond bb support was added to minmax_replacement in 
> r13-1950-g9bb19e143cfe,
> the code was not expecting the alt_middle_bb not to exist if it was empty 
> (for threeway_p).
> So when factor_out_conditional_conversion was used to factor out conversions,
> it turns out the assumption about alt_middle_bb was wrong and we ended up with
> threeway_p being true but middle_bb being empty while alt_middle_bb was not
> empty, which causes wrong code in many cases.
>
> This patch fixes the issue by adding a test for the 2 cases where the 
> assumption on
> threeway_p case having the other bb being empty.
>
> Note my plan for GCC 15 is remove minmax_replacement as match.pd will catch 
> all cases
> at that point.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

OK.

> PR tree-optimization/111469
>
> gcc/ChangeLog:
>
> * tree-ssa-phiopt.cc (minmax_replacement): Fix
> the assumption for the `non-diamond` handling cases
> of diamond code.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.c-torture/execute/pr111469-1.c: New test.
> ---
>  .../gcc.c-torture/execute/pr111469-1.c| 38 +++
>  gcc/tree-ssa-phiopt.cc| 10 -
>  2 files changed, 46 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr111469-1.c
>
> diff --git a/gcc/testsuite/gcc.c-torture/execute/pr111469-1.c 
> b/gcc/testsuite/gcc.c-torture/execute/pr111469-1.c
> new file mode 100644
> index 000..b68d5989eac
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/execute/pr111469-1.c
> @@ -0,0 +1,38 @@
> +/* PR tree-optimization/111469 */
> +
> +long f;
> +char *g;
> +__attribute__((noinline))
> +char o() {
> +  char l;
> +  while (f)
> +;
> +  l = *g;
> +  return l;
> +}
> +
> +/* factor_out_conditional_conversion is able to remove the casts
> +   from the 2 bbs (correctly)
> +   but then minmax_replacement should not optimize this to a MIN_EXPR
> +   as o has side effects. */
> +
> +__attribute__((noinline))
> +unsigned short gg(unsigned short a, unsigned short b)
> +{
> +  short d;
> +  if (a > b)
> +  {
> +d= b;
> +  }
> +  else
> +  {
> +o();
> +d = a;
> +  }
> +  return d;
> +}
> +
> +int main(void)
> +{
> +  gg(3, 2);
> +}
> diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
> index 3835d25d08c..96901a40444 100644
> --- a/gcc/tree-ssa-phiopt.cc
> +++ b/gcc/tree-ssa-phiopt.cc
> @@ -1823,7 +1823,9 @@ minmax_replacement (basic_block cond_bb, basic_block 
> middle_bb, basic_block alt_
>arg_false = arg0;
>  }
>
> -  if (empty_block_p (middle_bb))
> +  if (empty_block_p (middle_bb)
> +  && (!threeway_p
> + || empty_block_p (alt_middle_bb)))
>  {
>if ((operand_equal_for_phi_arg_p (arg_true, smaller)
>|| (alt_smaller
> @@ -2006,7 +2008,9 @@ minmax_replacement (basic_block cond_bb, basic_block 
> middle_bb, basic_block alt_
>
>return true;
>  }
> -  else
> +  else if (middle_bb == alt_middle_bb
> +   && (!threeway_p
> +  || empty_block_p (alt_middle_bb)))
>  {
>/* Recognize the following case, assuming d <= u:
>
> @@ -2182,6 +2186,8 @@ minmax_replacement (basic_block cond_bb, basic_block 
> middle_bb, basic_block alt_
>   SSA_OP_DEF));
>gsi_move_before (&gsi_from, &gsi);
>  }
> +  else
> +return false;
>
>/* Emit the statement to compute min/max.  */
>gimple_seq stmts = NULL;
> --
> 2.31.1
>


Re: [PATCH 1/3] librust: Add libproc_macro and build system

2023-09-26 Thread Richard Biener
On Wed, Sep 20, 2023 at 2:04 PM Arthur Cohen  wrote:
>
> From: Pierre-Emmanuel Patry 
>
> This patch series adds the build system changes to allow the Rust
> frontend to develop and distribute its own libraries. The first library
> we have been working on is the `proc_macro` library, comprised of a C++
> library as well as a user-facing Rust library.
>
> Follow up commits containing the actual library code will be committed.
> Should I submit patches to the MAINTAINERS file to allow Philip and me to
> commit to this folder without prior approval?

I think the Rust language frontend maintainership implicitly includes
the rust runtime libraries.

> This first commit adds a simple `libgrust` folder with on top of which the
> full library will be built.

OK.

> All the best,
>
> Arthur
>
> -
>
> Add some dummy files in libproc_macro along with its build system.
>
> ChangeLog:
>
> * libgrust/Makefile.am: New file.
> * libgrust/configure.ac: New file.
> * libgrust/libproc_macro/Makefile.am: New file.
> * libgrust/libproc_macro/proc_macro.cc: New file.
> * libgrust/libproc_macro/proc_macro.h: New file.
>
> Signed-off-by: Pierre-Emmanuel Patry 
> ---
>  libgrust/Makefile.am |  68 
>  libgrust/configure.ac| 113 +++
>  libgrust/libproc_macro/Makefile.am   |  58 ++
>  libgrust/libproc_macro/proc_macro.cc |   7 ++
>  libgrust/libproc_macro/proc_macro.h  |   7 ++
>  5 files changed, 253 insertions(+)
>  create mode 100644 libgrust/Makefile.am
>  create mode 100644 libgrust/configure.ac
>  create mode 100644 libgrust/libproc_macro/Makefile.am
>  create mode 100644 libgrust/libproc_macro/proc_macro.cc
>  create mode 100644 libgrust/libproc_macro/proc_macro.h
>
> diff --git a/libgrust/Makefile.am b/libgrust/Makefile.am
> new file mode 100644
> index 000..8e5274922c5
> --- /dev/null
> +++ b/libgrust/Makefile.am
> @@ -0,0 +1,68 @@
> +AUTOMAKE_OPTIONS = 1.8 foreign
> +
> +SUFFIXES = .c .rs .def .o .lo .a
> +
> +ACLOCAL_AMFLAGS = -I . -I .. -I ../config
> +
> +AM_CFLAGS = -I $(srcdir)/../libgcc -I $(MULTIBUILDTOP)../../gcc/include
> +
> +TOP_GCCDIR := $(shell cd $(top_srcdir) && cd .. && pwd)
> +
> +GCC_DIR = $(TOP_GCCDIR)/gcc
> +RUST_SRC = $(GCC_DIR)/rust
> +
> +toolexeclibdir=@toolexeclibdir@
> +toolexecdir=@toolexecdir@
> +
> +SUBDIRS = libproc_macro
> +
> +RUST_BUILDDIR := $(shell pwd)
> +
> +# Work around what appears to be a GNU make bug handling MAKEFLAGS
> +# values defined in terms of make variables, as is the case for CC and
> +# friends when we are called from the top level Makefile.
> +AM_MAKEFLAGS = \
> +"GCC_DIR=$(GCC_DIR)" \
> +"RUST_SRC=$(RUST_SRC)" \
> +   "AR_FLAGS=$(AR_FLAGS)" \
> +   "CC_FOR_BUILD=$(CC_FOR_BUILD)" \
> +   "CC_FOR_TARGET=$(CC_FOR_TARGET)" \
> +   "RUST_FOR_TARGET=$(RUST_FOR_TARGET)" \
> +   "CFLAGS=$(CFLAGS)" \
> +   "CXXFLAGS=$(CXXFLAGS)" \
> +   "CFLAGS_FOR_BUILD=$(CFLAGS_FOR_BUILD)" \
> +   "CFLAGS_FOR_TARGET=$(CFLAGS_FOR_TARGET)" \
> +   "INSTALL=$(INSTALL)" \
> +   "INSTALL_DATA=$(INSTALL_DATA)" \
> +   "INSTALL_PROGRAM=$(INSTALL_PROGRAM)" \
> +   "INSTALL_SCRIPT=$(INSTALL_SCRIPT)" \
> +   "LDFLAGS=$(LDFLAGS)" \
> +   "LIBCFLAGS=$(LIBCFLAGS)" \
> +   "LIBCFLAGS_FOR_TARGET=$(LIBCFLAGS_FOR_TARGET)" \
> +   "MAKE=$(MAKE)" \
> +   "MAKEINFO=$(MAKEINFO) $(MAKEINFOFLAGS)" \
> +   "PICFLAG=$(PICFLAG)" \
> +   "PICFLAG_FOR_TARGET=$(PICFLAG_FOR_TARGET)" \
> +   "SHELL=$(SHELL)" \
> +   "RUNTESTFLAGS=$(RUNTESTFLAGS)" \
> +   "exec_prefix=$(exec_prefix)" \
> +   "infodir=$(infodir)" \
> +   "libdir=$(libdir)" \
> +   "includedir=$(includedir)" \
> +   "prefix=$(prefix)" \
> +   "tooldir=$(tooldir)" \
> +   "gxx_include_dir=$(gxx_include_dir)" \
> +   "AR=$(AR)" \
> +   "AS=$(AS)" \
> +   "LD=$(LD)" \
> +   "RANLIB=$(RANLIB)" \
> +   "NM=$(NM)" \
> +   "NM_FOR_BUILD=$(NM_FOR_BUILD)" \
> +   "NM_FOR_TARGET=$(NM_FOR_TARGET)" \
> +   "DESTDIR=$(DESTDIR)" \
> +   "WERROR=$(WERROR)" \
> +"TARGET_LIB_PATH=$(TARGET_LIB_PATH)" \
> +"TARGET_LIB_PATH_librust=$(TARGET_LIB_PATH_librust)" \
> +   "LIBTOOL=$(RUST_BUILDDIR)/libtool"
> +
> +include $(top_srcdir)/../multilib.am
> diff --git a/libgrust/configure.ac b/libgrust/configure.ac
> new file mode 100644
> index 000..7aed489a643
> --- /dev/null
> +++ b/libgrust/configure.ac
> @@ -0,0 +1,113 @@
> +AC_INIT([libgrust], version-unused,,librust)
> +AC_CONFIG_SRCDIR(Makefile.am)
> +AC_CONFIG_FILES([Makefile])
> +
> +# AM_ENABLE_MULTILIB(, ..)
> +
> +# Do not delete or change the following two lines.  For why, see
> +# http://gcc.gnu.org/ml/libstdc++/2003-07/msg00451.html
> +AC_CANONICAL_SYSTEM
> +target_alias=${target_alias-$host_alias}
> +AC_SUBST(target_alias)
> +
> +# Automake should never attempt to rebuild configure
> +AM_MAINTAINER_MODE

Re: [PATCH 2/3] build: Add libgrust as compilation modules

2023-09-26 Thread Richard Biener
On Wed, Sep 20, 2023 at 2:04 PM Arthur Cohen  wrote:
>
> From: Pierre-Emmanuel Patry 
>
> Define the libgrust directory as a host compilation module as well as
> for targets.

OK if you tested this doesn't break build when rust is enabled on trunk
(and doesn't build libgrust if not).

Richard.

> ChangeLog:
>
> * Makefile.def: Add libgrust as host & target module.
> * configure.ac: Add libgrust to host tools list.
>
> gcc/rust/ChangeLog:
>
> * config-lang.in: Add libgrust as a target module for the rust
> language.
>
> Signed-off-by: Pierre-Emmanuel Patry 
> ---
>  Makefile.def| 2 ++
>  configure.ac| 3 ++-
>  gcc/rust/config-lang.in | 2 ++
>  3 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/Makefile.def b/Makefile.def
> index 870150183b9..3df3fc18d14 100644
> --- a/Makefile.def
> +++ b/Makefile.def
> @@ -149,6 +149,7 @@ host_modules= { module= libcc1; 
> extra_configure_flags=--enable-shared; };
>  host_modules= { module= gotools; };
>  host_modules= { module= libctf; bootstrap=true; };
>  host_modules= { module= libsframe; bootstrap=true; };
> +host_modules= { module= libgrust; };
>
>  target_modules = { module= libstdc++-v3;
>bootstrap=true;
> @@ -192,6 +193,7 @@ target_modules = { module= libgm2; lib_path=.libs; };
>  target_modules = { module= libgomp; bootstrap= true; lib_path=.libs; };
>  target_modules = { module= libitm; lib_path=.libs; };
>  target_modules = { module= libatomic; bootstrap=true; lib_path=.libs; };
> +target_modules = { module= libgrust; };
>
>  // These are (some of) the make targets to be done in each subdirectory.
>  // Not all; these are the ones which don't have special options.
> diff --git a/configure.ac b/configure.ac
> index 1d16530140a..036e5945905 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -133,7 +133,7 @@ build_tools="build-texinfo build-flex build-bison 
> build-m4 build-fixincludes"
>
>  # these libraries are used by various programs built for the host environment
>  #f
> -host_libs="intl libiberty opcodes bfd readline tcl tk itcl libgui zlib 
> libbacktrace libcpp libcody libdecnumber gmp mpfr mpc isl libiconv libctf 
> libsframe"
> +host_libs="intl libiberty opcodes bfd readline tcl tk itcl libgui zlib 
> libbacktrace libcpp libcody libdecnumber gmp mpfr mpc isl libiconv libctf 
> libsframe libgrust "
>
>  # these tools are built for the host environment
>  # Note, the powerpc-eabi build depends on sim occurring before gdb in order 
> to
> @@ -164,6 +164,7 @@ target_libraries="target-libgcc \
> target-libada \
> target-libgm2 \
> target-libgo \
> +   target-libgrust \
> target-libphobos \
> target-zlib"
>
> diff --git a/gcc/rust/config-lang.in b/gcc/rust/config-lang.in
> index aac66c9b962..8f071dcb0bf 100644
> --- a/gcc/rust/config-lang.in
> +++ b/gcc/rust/config-lang.in
> @@ -29,4 +29,6 @@ compilers="rust1\$(exeext)"
>
>  build_by_default="no"
>
> +target_libs="target-libffi target-libbacktrace target-libgrust"
> +
>  gtfiles="\$(srcdir)/rust/rust-lang.cc"
> --
> 2.42.0
>


[COMMITTED] ada: Clarify RM references that justify a constraint check

2023-09-26 Thread Marc Poulhiès
From: Yannick Moy 

gcc/ada/

* exp_ch5.adb (Expand_N_Case_Statement): Reference both sections
of the Ada RM that deal with case statements and case expressions
to justify the insertion of a runtime check.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_ch5.adb | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/ada/exp_ch5.adb b/gcc/ada/exp_ch5.adb
index d55fdb3e2e5..cd3b02b9360 100644
--- a/gcc/ada/exp_ch5.adb
+++ b/gcc/ada/exp_ch5.adb
@@ -4092,8 +4092,9 @@ package body Exp_Ch5 is
  end if;
 
  --  First step is to worry about possible invalid argument. The RM
- --  requires (RM 5.4(13)) that if the result is invalid (e.g. it is
- --  outside the base range), then Constraint_Error must be raised.
+ --  requires (RM 4.5.7 (21/3) and 5.4 (13)) that if the result is
+ --  invalid (e.g. it is outside the base range), then Constraint_Error
+ --  must be raised.
 
  --  Case of validity check required (validity checks are on, the
  --  expression is not known to be valid, and the case statement
@@ -4274,7 +4275,7 @@ package body Exp_Ch5 is
 
 --  If Predicates_Ignored is true the value does not satisfy the
 --  predicate, and there is no Others choice, Constraint_Error
---  must be raised (4.5.7 (21/3)).
+--  must be raised (RM 4.5.7 (21/3) and 5.4 (13)).
 
 if Predicates_Ignored (Etype (Expr)) then
declare
-- 
2.42.0



[COMMITTED] ada: Make minor corrections to CUDA-related comments

2023-09-26 Thread Marc Poulhiès
From: Ronan Desplanques 

gcc/ada/

* exp_prag.adb: Make minor corrections in comments.
* rtsfind.ads: Remove unused element from RTU_Id definition.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_prag.adb | 8 
 gcc/ada/rtsfind.ads  | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/gcc/ada/exp_prag.adb b/gcc/ada/exp_prag.adb
index 1cc4653a3b0..d2807cdc7ef 100644
--- a/gcc/ada/exp_prag.adb
+++ b/gcc/ada/exp_prag.adb
@@ -685,7 +685,7 @@ package body Exp_Prag is
--Blocks_Id'address,
--Mem_Id'address,
--Stream_Id'address),
-   --  CUDA.Runtime_Api.Launch_Kernel (
+   --  CUDA.Internal.Launch_Kernel (
--My_Proc'Address,
--Blocks_Id,
--Grids_Id,
@@ -703,7 +703,7 @@ package body Exp_Prag is
  Decls  : List_Id;
  Copies : Elist_Id);
   --  For each parameter in list Params, create an object declaration of
-  --  the followinng form:
+  --  the following form:
   --
   --Copy_Id : Param_Typ := Param_Val;
   --
@@ -755,8 +755,8 @@ package body Exp_Prag is
  Kernel_Arg : Entity_Id;
  Memory : Entity_Id;
  Stream : Entity_Id) return Node_Id;
-  --  Builds and returns a call to CUDA.Launch_Kernel using the given
-  --  arguments. Proc is the entity of the procedure passed to the
+  --  Builds and returns a call to CUDA.Internal.Launch_Kernel using the
+  --  given arguments. Proc is the entity of the procedure passed to the
   --  CUDA_Execute pragma. Grid_Dims and Block_Dims are entities of the
   --  generated declarations that hold the kernel's dimensions. Args is the
   --  entity of the temporary array that holds the arguments of the kernel.
diff --git a/gcc/ada/rtsfind.ads b/gcc/ada/rtsfind.ads
index 881f723dfa9..669f6df79cb 100644
--- a/gcc/ada/rtsfind.ads
+++ b/gcc/ada/rtsfind.ads
@@ -179,7 +179,6 @@ package Rtsfind is
 
   CUDA_Driver_Types,
   CUDA_Internal,
-  CUDA_Runtime_Api,
   CUDA_Vector_Types,
 
   --  Interfaces
-- 
2.42.0



[COMMITTED] ada: Define CHERI exception types

2023-09-26 Thread Marc Poulhiès
From: Daniel King 

These exception types map to the CHERI hardware exceptions that are
triggered due to misuse of capabilities.

gcc/ada/

* libgnat/i-cheri.ads (Capability_Bound_Error)
(Capability_Permission_Error, Capability_Sealed_Error)
(Capability_Tag_Error): New, define CHERI exception types.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/libgnat/i-cheri.ads | 16 
 1 file changed, 16 insertions(+)

diff --git a/gcc/ada/libgnat/i-cheri.ads b/gcc/ada/libgnat/i-cheri.ads
index 547b033dbaf..80985212589 100644
--- a/gcc/ada/libgnat/i-cheri.ads
+++ b/gcc/ada/libgnat/i-cheri.ads
@@ -467,4 +467,20 @@ is
  External_Name => "__builtin_cheri_stack_get";
--  Get the Capability Stack Pointer (CSP)
 
+   ---
+   -- Capability Exceptions --
+   ---
+
+   Capability_Bound_Error : exception;
+   --  An out-of-bounds access was attempted
+
+   Capability_Permission_Error : exception;
+   --  An attempted access exceeded the permissions granted by a capability
+
+   Capability_Sealed_Error : exception;
+   --  A sealed capability was dereferenced
+
+   Capability_Tag_Error : exception;
+   --  An invalid capability was dereferenced
+
 end Interfaces.CHERI;
-- 
2.42.0



[COMMITTED] ada: Fix conversions between addresses and integers

2023-09-26 Thread Marc Poulhiès
From: Daniel King 

On CHERI targets the size of System.Address and Integer_Address
(or similar) are not the same. The operations in System.Storage_Elements
should be used to convert between integers and addresses.

gcc/ada/

* libgnat/a-tags.adb (To_Tag): Use System.Storage_Elements for
integer to address conversion.
* libgnat/s-putima.adb (Put_Image_Pointer): Likewise.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/libgnat/a-tags.adb   | 5 -
 gcc/ada/libgnat/s-putima.adb | 6 ++
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/gcc/ada/libgnat/a-tags.adb b/gcc/ada/libgnat/a-tags.adb
index 3590785aa44..1ffc78ed1e8 100644
--- a/gcc/ada/libgnat/a-tags.adb
+++ b/gcc/ada/libgnat/a-tags.adb
@@ -93,7 +93,10 @@ package body Ada.Tags is
--  Disable warnings on possible aliasing problem
 
function To_Tag is
- new Unchecked_Conversion (Integer_Address, Tag);
+ new Unchecked_Conversion (System.Address, Tag);
+
+   function To_Tag (S : Integer_Address) return Tag is
+ (To_Tag (To_Address (S)));
 
function To_Dispatch_Table_Ptr is
   new Ada.Unchecked_Conversion (Tag, Dispatch_Table_Ptr);
diff --git a/gcc/ada/libgnat/s-putima.adb b/gcc/ada/libgnat/s-putima.adb
index 1d6e6085928..bcc7af2ebf5 100644
--- a/gcc/ada/libgnat/s-putima.adb
+++ b/gcc/ada/libgnat/s-putima.adb
@@ -32,7 +32,7 @@
 with Ada.Strings.Text_Buffers.Utils;
 use Ada.Strings.Text_Buffers;
 use Ada.Strings.Text_Buffers.Utils;
-with Ada.Unchecked_Conversion;
+with System.Storage_Elements; use System.Storage_Elements;
 
 package body System.Put_Images is
 
@@ -132,15 +132,13 @@ package body System.Put_Images is
procedure Put_Image_Pointer
  (S : in out Sink'Class; X : Pointer; Type_Kind : String)
is
-  function Cast is new Ada.Unchecked_Conversion
-(System.Address, Unsigned_Address);
begin
   if X = null then
  Put_UTF_8 (S, "null");
   else
  Put_UTF_8 (S, "(");
  Put_UTF_8 (S, Type_Kind);
- Hex.Put_Image (S, Cast (X.all'Address));
+ Hex.Put_Image (S, Unsigned_Address (To_Integer (X.all'Address)));
  Put_UTF_8 (S, ")");
   end if;
end Put_Image_Pointer;
-- 
2.42.0



[COMMITTED] ada: Dimensional analysis when used with elementary functions

2023-09-26 Thread Marc Poulhiès
From: Derek Schacht 

gcc/ada/

* doc/gnat_ugn/gnat_and_program_execution.rst: Add more details on
using Generic Elementary Functions with dimensional analysis.
* gnat_ugn.texi: Regenerate.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 .../gnat_ugn/gnat_and_program_execution.rst   | 12 
 gcc/ada/gnat_ugn.texi | 19 +--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/gcc/ada/doc/gnat_ugn/gnat_and_program_execution.rst 
b/gcc/ada/doc/gnat_ugn/gnat_and_program_execution.rst
index 62abca24f41..35e34772658 100644
--- a/gcc/ada/doc/gnat_ugn/gnat_and_program_execution.rst
+++ b/gcc/ada/doc/gnat_ugn/gnat_and_program_execution.rst
@@ -3294,6 +3294,18 @@ requires ``DV(Source)`` = ``DV(Target)``, and 
analogously for parameter
 passing (the dimension vector for the actual parameter must be equal to the
 dimension vector for the formal parameter).
 
+When using dimensioned types with elementary functions it is necessary to
+instantiate the ``Ada.Numerics.Generic_Elementary_Functions`` package using
+the ``Mks_Type`` and not any of the derived subtypes such as ``Distance``.
+For functions such as ``Sqrt`` the dimensional analysis will fail when using
+the subtypes because both the parameter and return are of the same type.
+
+An example instantiation
+
+  .. code-block:: ada
+  
+     package Mks_Numerics is new
+        Ada.Numerics.Generic_Elementary_Functions (System.Dim.Mks.Mks_Type);
 
 .. _Stack_Related_Facilities:
 
diff --git a/gcc/ada/gnat_ugn.texi b/gcc/ada/gnat_ugn.texi
index 7c5926eba64..1562bee1f64 100644
--- a/gcc/ada/gnat_ugn.texi
+++ b/gcc/ada/gnat_ugn.texi
@@ -19,7 +19,7 @@
 
 @copying
 @quotation
-GNAT User's Guide for Native Platforms , Aug 31, 2023
+GNAT User's Guide for Native Platforms , Sep 26, 2023
 
 AdaCore
 
@@ -15510,7 +15510,6 @@ Linker to be used. The default is @code{bfd} for 
@code{ld.bfd}; @code{gold}
 (for @code{ld.gold}) and @code{mold} (for @code{ld.mold}) are more
 recent and faster alternatives, but only available on GNU/Linux
 platforms.
-
 @end table
 
 @node Binding with gnatbind,Linking with gnatlink,Linker Switches,Building 
Executable Programs with GNAT
@@ -22093,6 +22092,22 @@ requires @code{DV(Source)} = @code{DV(Target)}, and 
analogously for parameter
 passing (the dimension vector for the actual parameter must be equal to the
 dimension vector for the formal parameter).
 
+When using dimensioned types with elementary functions it is necessary to
+instantiate the @code{Ada.Numerics.Generic_Elementary_Functions} package using
+the @code{Mks_Type} and not any of the derived subtypes such as 
@code{Distance}.
+For functions such as @code{Sqrt} the dimensional analysis will fail when using
+the subtypes because both the parameter and return are of the same type.
+
+An example instantiation
+
+@quotation
+
+@example
+package Mks_Numerics is new
+   Ada.Numerics.Generic_Elementary_Functions (System.Dim.Mks.Mks_Type);
+@end example
+@end quotation
+
 @node Stack Related Facilities,Memory Management Issues,Performing 
Dimensionality Analysis in GNAT,GNAT and Program Execution
 @anchor{gnat_ugn/gnat_and_program_execution 
id52}@anchor{14d}@anchor{gnat_ugn/gnat_and_program_execution 
stack-related-facilities}@anchor{1aa}
 @section Stack Related Facilities
-- 
2.42.0



[COMMITTED] ada: Fix missing finalization of extended return object on abnormal completion

2023-09-26 Thread Marc Poulhiès
From: Eric Botcazou 

This happens in the case of a nonlimited return type and is a fallout of the
optimization recently implemented for them.

gcc/ada/

* einfo.ads (Status_Flag_Or_Transient_Decl): Remove ??? comment.
* exp_ch6.adb (Expand_N_Extended_Return_Statement): Extend the
handling of finalizable return objects to the non-BIP case.
* exp_ch7.adb (Build_Finalizer.Process_Declarations): Adjust the
comment accordingly.
* exp_util.adb (Requires_Cleanup_Actions): Likewise.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/einfo.ads| 8 
 gcc/ada/exp_ch6.adb  | 4 ++--
 gcc/ada/exp_ch7.adb  | 6 +++---
 gcc/ada/exp_util.adb | 6 +++---
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/gcc/ada/einfo.ads b/gcc/ada/einfo.ads
index 977392899f9..9165fb7485d 100644
--- a/gcc/ada/einfo.ads
+++ b/gcc/ada/einfo.ads
@@ -4518,11 +4518,11 @@ package Einfo is
 --Status_Flag_Or_Transient_Decl
 --   Defined in constant, loop, and variable entities. Applies to objects
 --   that require special treatment by the finalization machinery, such as
---   extended return results, IF and CASE expression results, and objects
+--   extended return objects, conditional expression results, and objects
 --   inside N_Expression_With_Actions nodes. The attribute contains the
---   entity of a flag which specifies particular behavior over a region of
---   code or the declaration of a "hook" object.
---   In which case is it a flag, or a hook object???
+--   entity of a flag which specifies a particular behavior over a region
+--   of the extended return for the return objects, or the declaration of a
+--   hook object for conditional expressions and N_Expression_With_Actions.
 
 --Storage_Size_Variable [implementation base type only]
 --   Defined in access types and task type entities. This flag is set
diff --git a/gcc/ada/exp_ch6.adb b/gcc/ada/exp_ch6.adb
index a16dfe2d57e..beb2e2f90f0 100644
--- a/gcc/ada/exp_ch6.adb
+++ b/gcc/ada/exp_ch6.adb
@@ -5607,7 +5607,7 @@ package body Exp_Ch6 is
   --  with the scope finalizer. There is one flag per each return object
   --  in case of multiple returns.
 
-  if Is_BIP_Func and then Needs_Finalization (Etype (Ret_Obj_Id)) then
+  if Needs_Finalization (Etype (Ret_Obj_Id)) then
  declare
 Flag_Decl : Node_Id;
 Flag_Id   : Entity_Id;
@@ -5706,7 +5706,7 @@ package body Exp_Ch6 is
  --  Update the state of the function right before the object is
  --  returned.
 
- if Is_BIP_Func and then Needs_Finalization (Etype (Ret_Obj_Id)) then
+ if Needs_Finalization (Etype (Ret_Obj_Id)) then
 declare
Flag_Id : constant Entity_Id :=
Status_Flag_Or_Transient_Decl (Ret_Obj_Id);
diff --git a/gcc/ada/exp_ch7.adb b/gcc/ada/exp_ch7.adb
index 4ea5e6ede64..271dfd22618 100644
--- a/gcc/ada/exp_ch7.adb
+++ b/gcc/ada/exp_ch7.adb
@@ -2381,9 +2381,9 @@ package body Exp_Ch7 is
elsif Is_Ignored_Ghost_Entity (Obj_Id) then
   null;
 
-   --  Return object of a build-in-place function. This case is
-   --  recognized and marked by the expansion of an extended return
-   --  statement (see Expand_N_Extended_Return_Statement).
+   --  Return object of extended return statements. This case is
+   --  recognized and marked by the expansion of extended return
+   --  statements (see Expand_N_Extended_Return_Statement).
 
elsif Needs_Finalization (Obj_Typ)
  and then Is_Return_Object (Obj_Id)
diff --git a/gcc/ada/exp_util.adb b/gcc/ada/exp_util.adb
index 2e6a1cf892e..9ac64fe9381 100644
--- a/gcc/ada/exp_util.adb
+++ b/gcc/ada/exp_util.adb
@@ -13127,9 +13127,9 @@ package body Exp_Util is
 elsif Is_Ignored_Ghost_Entity (Obj_Id) then
null;
 
---  Return object of a build-in-place function. This case is
---  recognized and marked by the expansion of an extended return
---  statement (see Expand_N_Extended_Return_Statement).
+--  Return object of extended return statements. This case is
+--  recognized and marked by the expansion of extended return
+--  statements (see Expand_N_Extended_Return_Statement).
 
 elsif Needs_Finalization (Obj_Typ)
   and then Is_Return_Object (Obj_Id)
-- 
2.42.0



[COMMITTED] ada: Add CHERI variant of System.Stream_Attributes

2023-09-26 Thread Marc Poulhiès
From: Daniel King 

Reading and writing System.Address to a stream on CHERI targets does
not preserve the capability tag; it will always be invalid since
a valid capability cannot be created out of thin air. Reading an Address
from a stream would therefore never yield a capability that can be
dereferenced.

This patch introduces a CHERI variant of System.Stream_Attributes that
raises Program_Error when attempting to read a System.Address from a stream.

gcc/ada/

* libgnat/s-stratt__cheri.adb: New file

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/libgnat/s-stratt__cheri.adb | 1019 +++
 1 file changed, 1019 insertions(+)
 create mode 100644 gcc/ada/libgnat/s-stratt__cheri.adb

diff --git a/gcc/ada/libgnat/s-stratt__cheri.adb 
b/gcc/ada/libgnat/s-stratt__cheri.adb
new file mode 100644
index 000..f753cf3bf00
--- /dev/null
+++ b/gcc/ada/libgnat/s-stratt__cheri.adb
@@ -0,0 +1,1019 @@
+--
+--  --
+-- GNAT RUN-TIME COMPONENTS --
+--  --
+-- S Y S T E M . S T R E A M _ A T T R I B U T E S  --
+--  --
+-- B o d y  --
+--  --
+--  Copyright (C) 1992-2023, Free Software Foundation, Inc. --
+--  --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
+-- or FITNESS FOR A PARTICULAR PURPOSE. --
+--  --
+-- As a special exception under Section 7 of GPL version 3, you are granted --
+-- additional permissions described in the GCC Runtime Library Exception,   --
+-- version 3.1, as published by the Free Software Foundation.   --
+--  --
+-- You should have received a copy of the GNU General Public License and--
+-- a copy of the GCC Runtime Library Exception along with this program; --
+-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see--
+-- .  --
+--  --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.  --
+--  --
+--
+
+--  This is the CHERI variant of this package
+
+with Ada.IO_Exceptions;
+with Ada.Streams; use Ada.Streams;
+with Ada.Unchecked_Conversion;
+with System.Stream_Attributes.XDR;
+
+package body System.Stream_Attributes is
+
+   XDR_Stream : constant Integer;
+   pragma Import (C, XDR_Stream, "__gl_xdr_stream");
+   --  This imported value is used to determine whether the build had the
+   --  binder switch "-xdr" present which enables XDR streaming and sets this
+   --  flag to 1.
+
+   function XDR_Support return Boolean is (XDR_Stream = 1);
+   pragma Inline (XDR_Support);
+   --  Return True if XDR streaming should be used. Note that 128-bit integers
+   --  are not supported by the XDR protocol and will raise Device_Error.
+
+   Err : exception renames Ada.IO_Exceptions.End_Error;
+   --  Exception raised if insufficient data read (note that the RM implies
+   --  that Data_Error might be the appropriate choice, but AI95-00132
+   --  decides with a binding interpretation that End_Error is preferred).
+
+   SU : constant := System.Storage_Unit;
+
+   subtype SEA is Ada.Streams.Stream_Element_Array;
+   subtype SEO is Ada.Streams.Stream_Element_Offset;
+
+   generic function UC renames Ada.Unchecked_Conversion;
+
+   --  Subtypes used to define Stream_Element_Array values that map
+   --  into the elementary types, using unchecked conversion.
+
+   Thin_Pointer_Size : constant := System.Address'Size;
+   Fat_Pointer_Size  : constant := System.Address'Size * 2;
+
+   subtype S_AD   is SEA (1 .. (Fat_Pointer_Size  + SU - 1) / SU);
+   subtype S_AS   is SEA (1 .. (Thin_Pointer_Size + SU - 1) 

[COMMITTED] ada: Fix missing call to Finalize_Protection for simple protected objects

2023-09-26 Thread Marc Poulhiès
From: Eric Botcazou 

There is a glitch in Exp_Ch7.Build_Finalizer causing the finalizer to do
nothing for simple protected objects.

The change also removes redundant calls to the Is_Simple_Protected_Type
predicate and fixes a minor inconsistency between Requires_Cleanup_Actions
and Build_Finalizer for this case.

gcc/ada/

* exp_ch7.adb (Build_Finalizer.Process_Declarations): Remove call
to Is_Simple_Protected_Type as redundant.
(Build_Finalizer.Process_Object_Declaration): Do not retrieve the
corresponding record type for simple protected objects. Make the
flow of control more explicit in their specific processing.
* exp_util.adb (Requires_Cleanup_Actions): Return false for simple
protected objects present in library-level package bodies for the
sake of consistency with Build_Finalizer and remove call to
Is_Simple_Protected_Type as redundant.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_ch7.adb  | 19 ++-
 gcc/ada/exp_util.adb | 32 ++--
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/gcc/ada/exp_ch7.adb b/gcc/ada/exp_ch7.adb
index 585acd8b428..5049de54dd7 100644
--- a/gcc/ada/exp_ch7.adb
+++ b/gcc/ada/exp_ch7.adb
@@ -2356,8 +2356,7 @@ package body Exp_Ch7 is
 
elsif Ekind (Obj_Id) = E_Variable
  and then not In_Library_Level_Package_Body (Obj_Id)
- and then (Is_Simple_Protected_Type (Obj_Typ)
-or else Has_Simple_Protected_Object (Obj_Typ))
+ and then Has_Simple_Protected_Object (Obj_Typ)
then
   Processing_Actions (Is_Protected => True);
end if;
@@ -3006,7 +3005,9 @@ package body Exp_Ch7 is
   --  Start of processing for Process_Object_Declaration
 
   begin
- --  Handle the object type and the reference to the object
+ --  Handle the object type and the reference to the object. Note
+ --  that objects having simple protected components must retain
+ --  their original form for the processing below to work.
 
  Obj_Ref := New_Occurrence_Of (Obj_Id, Loc);
  Obj_Typ := Base_Type (Etype (Obj_Id));
@@ -3018,6 +3019,7 @@ package body Exp_Ch7 is
 
 elsif Is_Concurrent_Type (Obj_Typ)
   and then Present (Corresponding_Record_Type (Obj_Typ))
+  and then not Is_Protected
 then
Obj_Typ := Corresponding_Record_Type (Obj_Typ);
Obj_Ref := Unchecked_Convert_To (Obj_Typ, Obj_Ref);
@@ -3180,12 +3182,11 @@ package body Exp_Ch7 is
   Fin_Stmts := New_List (Fin_Call);
end if;
 
-elsif Has_Simple_Protected_Object (Obj_Typ) then
-   if Is_Record_Type (Obj_Typ) then
-  Fin_Stmts := Cleanup_Record (Decl, Obj_Ref, Obj_Typ);
-   elsif Is_Array_Type (Obj_Typ) then
-  Fin_Stmts := Cleanup_Array (Decl, Obj_Ref, Obj_Typ);
-   end if;
+elsif Is_Array_Type (Obj_Typ) then
+   Fin_Stmts := Cleanup_Array (Decl, Obj_Ref, Obj_Typ);
+
+else
+   Fin_Stmts := Cleanup_Record (Decl, Obj_Ref, Obj_Typ);
 end if;
 
 --  Generate:
diff --git a/gcc/ada/exp_util.adb b/gcc/ada/exp_util.adb
index 9ac64fe9381..1aff5a062ce 100644
--- a/gcc/ada/exp_util.adb
+++ b/gcc/ada/exp_util.adb
@@ -13100,10 +13100,38 @@ package body Exp_Util is
 --  Simple protected objects which use type System.Tasking.
 --  Protected_Objects.Protection to manage their locks should be
 --  treated as controlled since they require manual cleanup.
+--  The only exception is illustrated in the following example:
+
+-- package Pkg is
+--type Ctrl is new Controlled ...
+--procedure Finalize (Obj : in out Ctrl);
+--Lib_Obj : Ctrl;
+-- end Pkg;
+
+-- package body Pkg is
+--protected Prot is
+--   procedure Do_Something (Obj : in out Ctrl);
+--end Prot;
+
+--protected body Prot is
+--   procedure Do_Something (Obj : in out Ctrl) is ...
+--end Prot;
+
+--procedure Finalize (Obj : in out Ctrl) is
+--begin
+--   Prot.Do_Something (Obj);
+--end Finalize;
+-- end Pkg;
+
+--  Since for the most part entities in package bodies depend on
+--  those in package specs, Prot's lock should be cleaned up
+--  first. The subsequent cleanup of the spec finalizes Lib_Obj.
+--  This act however attempts to invoke Do_Something and fails
+--  because the lock has disappeared.
 

[COMMITTED] ada: Update personality function for CHERI purecap

2023-09-26 Thread Marc Poulhiès
From: Daniel King 

This makes two changes to the GNAT personality function to reflect
differences for pure capability CHERI/Morello. The first is to use
__builtin_code_address_from_pointer to drop the LSB from Morello
code pointers when searching through call-site tables (without this
we would never find the right landing pad when unwinding).

The second change is to reflect the change in the exception table
format for pure-capability Morello where the landing pad is a capability
indirected by an offset in the call-site table.

gcc/ada/

* raise-gcc.c (get_ip_from_context): Adapt for CHERI purecap
(get_call_site_action_for): Adapt for CHERI purecap

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/raise-gcc.c | 39 ---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/gcc/ada/raise-gcc.c b/gcc/ada/raise-gcc.c
index 56ddfc5a6cf..bdf1c26e612 100644
--- a/gcc/ada/raise-gcc.c
+++ b/gcc/ada/raise-gcc.c
@@ -50,10 +50,12 @@
 
 #ifdef __cplusplus
 # include 
+# include 
 # include 
 #else
 # include 
 # include 
+# include 
 # include 
 #endif
 
@@ -592,6 +594,11 @@ get_ip_from_context (_Unwind_Context *uw_context)
 #else
   _Unwind_Ptr ip = _Unwind_GetIP (uw_context);
 #endif
+
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && defined(__CHERI__)
+  ip = __builtin_code_address_from_pointer ((void *)ip);
+#endif
+
   /* Subtract 1 if necessary because GetIPInfo yields a call return address
  in this case, while we are interested in information for the call point.
  This does not always yield the exact call instruction address but always
@@ -850,7 +857,27 @@ get_call_site_action_for (_Unwind_Ptr ip,
   /* Note that all call-site encodings are "absolute" displacements.  */
   p = read_encoded_value (0, region->call_site_encoding, p, &cs_start);
   p = read_encoded_value (0, region->call_site_encoding, p, &cs_len);
+#ifdef __CHERI_PURE_CAPABILITY__
+  // Single uleb128 value as the capability marker.
+  _Unwind_Ptr marker = 0;
+  p = read_encoded_value (0, DW_EH_PE_uleb128, p, &marker);
+  if (marker == 0xd)
+   {
+ /* 8-byte offset to the (indirected) capability. */
+ p = read_encoded_value (0, DW_EH_PE_pcrel | DW_EH_PE_udata8, p,
+ &cs_lp);
+   }
+  else if (marker)
+   {
+ /* Unsupported landing pad marker value. */
+ abort ();
+   }
+  else
+   cs_lp = 0; // No landing pad.
+#else
   p = read_encoded_value (0, region->call_site_encoding, p, &cs_lp);
+#endif
+
   p = read_uleb128 (p, &cs_action);
 
   db (DB_CSITE,
@@ -859,18 +886,24 @@ get_call_site_action_for (_Unwind_Ptr ip,
  (char *)region->lp_base + cs_lp, (void *)cs_lp);
 
   /* The table is sorted, so if we've passed the IP, stop.  */
-  if (ip < region->base + cs_start)
+  if (ip < region->base + (size_t)cs_start)
break;
 
   /* If we have a match, fill the ACTION fields accordingly.  */
-  else if (ip < region->base + cs_start + cs_len)
+  else if (ip < region->base + (size_t)cs_start + (size_t)cs_len)
{
  /* Let the caller know there may be an action to take, but let it
 determine the kind.  */
  action->kind = unknown;
 
  if (cs_lp)
-   action->landing_pad = region->lp_base + cs_lp;
+   {
+#ifdef __CHERI_PURE_CAPABILITY__
+ action->landing_pad = *(_Unwind_Ptr *)cs_lp;
+#else
+ action->landing_pad = region->lp_base + cs_lp;
+#endif
+   }
  else
action->landing_pad = 0;
 
-- 
2.42.0



[COMMITTED] ada: Crash processing the accessibility level of an actual parameter

2023-09-26 Thread Marc Poulhiès
From: Javier Miranda 

gcc/ada/

* exp_ch6.adb (Expand_Call_Helper): When computing the
accessibility level of an actual parameter based on the
expression of a constant declaration, add missing support for
deferred constants

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_ch6.adb | 24 +---
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/gcc/ada/exp_ch6.adb b/gcc/ada/exp_ch6.adb
index beb2e2f90f0..c1d5fa3c08b 100644
--- a/gcc/ada/exp_ch6.adb
+++ b/gcc/ada/exp_ch6.adb
@@ -4352,13 +4352,23 @@ package body Exp_Ch6 is
--  Generate the accessibility level based on the expression in
--  the constant's declaration.
 
-   Add_Extra_Actual
- (Expr => Accessibility_Level
-(Expr=> Expression
-  (Parent (Entity (Prev))),
- Level   => Dynamic_Level,
- Allow_Alt_Model => False),
-  EF   => Extra_Accessibility (Formal));
+   declare
+  Ent : Entity_Id := Entity (Prev);
+
+   begin
+  --  Handle deferred constants
+
+  if Present (Full_View (Ent)) then
+ Ent := Full_View (Ent);
+  end if;
+
+  Add_Extra_Actual
+(Expr => Accessibility_Level
+   (Expr=> Expression (Parent (Ent)),
+Level   => Dynamic_Level,
+Allow_Alt_Model => False),
+ EF   => Extra_Accessibility (Formal));
+   end;
 
 --  Normal case
 
-- 
2.42.0



[COMMITTED] ada: Fix deferred constant wrongly rejected

2023-09-26 Thread Marc Poulhiès
From: Eric Botcazou 

This recent regression occurs when the nominal subtype of the constant is a
discriminated record type with default discriminants.

gcc/ada/
PR ada/110488
* sem_ch3.adb (Analyze_Object_Declaration): Do not build a default
subtype for a deferred constant in the definite case too.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/sem_ch3.adb | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gcc/ada/sem_ch3.adb b/gcc/ada/sem_ch3.adb
index 92902a7debb..c79d323395f 100644
--- a/gcc/ada/sem_ch3.adb
+++ b/gcc/ada/sem_ch3.adb
@@ -5048,9 +5048,11 @@ package body Sem_Ch3 is
 Apply_Length_Check (E, T);
  end if;
 
-  --  When possible, build the default subtype
+  --  When possible, and not a deferred constant, build the default subtype
 
-  elsif Build_Default_Subtype_OK (T) then
+  elsif Build_Default_Subtype_OK (T)
+and then (not Constant_Present (N) or else Present (E))
+  then
  if No (E) then
 Act_T := Build_Default_Subtype (T, N);
  else
-- 
2.42.0



[COMMITTED] ada: Fix unnesting generated loops with nested finalization procedure

2023-09-26 Thread Marc Poulhiès
The compiler can generate loops for creating array aggregates, for
example used during the initialization of a variable. If the component
type of the array element requires finalization, the compiler also
creates a block and a nested procedure that need to be correctly
unnested if unnesting is enabled. During the unnesting transformation,
the scopes for these inner blocks need to be fixed and set to the
enclosing loop entity.

gcc/ada/

* exp_ch7.adb (Contains_Subprogram): Recursively search for subp
in loop's statements.
(Unnest_Loop): New.
(Unnest_Loop): Rename local variable for more clarity.
* exp_unst.ads: Refresh comment.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_ch7.adb  | 88 +---
 gcc/ada/exp_unst.ads |  7 +---
 2 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/gcc/ada/exp_ch7.adb b/gcc/ada/exp_ch7.adb
index 271dfd22618..585acd8b428 100644
--- a/gcc/ada/exp_ch7.adb
+++ b/gcc/ada/exp_ch7.adb
@@ -4378,6 +4378,32 @@ package body Exp_Ch7 is
begin
   E := First_Entity (Blk);
 
+  --  The compiler may generate loops with a declare block containing
+  --  nested procedures used for finalization. Recursively search for
+  --  subprograms in such constructs.
+
+  if Ekind (Blk) = E_Loop
+and then Parent_Kind (Blk) = N_Loop_Statement
+  then
+ declare
+Stmt : Node_Id := First (Statements (Parent (Blk)));
+ begin
+while Present (Stmt) loop
+   if Nkind (Stmt) = N_Block_Statement then
+  declare
+ Id : constant Entity_Id :=
+  Entity (Identifier (Stmt));
+  begin
+ if Contains_Subprogram (Id) then
+return True;
+ end if;
+  end;
+   end if;
+   Next (Stmt);
+end loop;
+ end;
+  end if;
+
   while Present (E) loop
  if Is_Subprogram (E) then
 return True;
@@ -9350,17 +9376,67 @@ package body Exp_Ch7 is
-
 
procedure Unnest_Loop (Loop_Stmt : Node_Id) is
+
+  procedure Fixup_Inner_Scopes (Loop_Stmt : Node_Id);
+  --  The loops created by the compiler for array aggregates can have
+  --  nested finalization procedure when the type of the array components
+  --  needs finalization. It has the following form:
+
+  --  for J4b in 10 .. 12 loop
+  -- declare
+  --procedure __finalizer;
+  -- begin
+  --procedure __finalizer is
+  --  ...
+  --end;
+  --...
+  --obj (J4b) := ...;
+
+  --  When the compiler creates the N_Block_Statement, it sets its scope to
+  --  the upper scope (the one containing the loop).
+
+  --  The Unnest_Loop procedure moves the N_Loop_Statement inside a new
+  --  procedure and correctly sets the scopes for both the new procedure
+  --  and the loop entity. The inner block scope is not modified and this
+  --  leaves the Tree in an incoherent state (i.e. the inner procedure must
+  --  have its enclosing procedure in its scope ancestries).
+
+  --  This procedure fixes the scope links.
+
+  --  Another (better) fix would be to have the block scope set to be the
+  --  loop entity earlier (when the block is created or when the loop gets
+  --  an actual entity set). But unfortunately this proved harder to
+  --  implement ???
+
+  procedure Fixup_Inner_Scopes (Loop_Stmt : Node_Id) is
+ Stmt  : Node_Id:= First (Statements (Loop_Stmt));
+ Loop_Stmt_Ent : constant Entity_Id := Entity (Identifier (Loop_Stmt));
+ Ent_To_Fix: Entity_Id;
+  begin
+ while Present (Stmt) loop
+if Nkind (Stmt) = N_Block_Statement
+  and then Is_Abort_Block (Stmt)
+then
+   Ent_To_Fix := Entity (Identifier (Stmt));
+   Set_Scope (Ent_To_Fix, Loop_Stmt_Ent);
+elsif Nkind (Stmt) = N_Loop_Statement then
+   Fixup_Inner_Scopes (Stmt);
+end if;
+Next (Stmt);
+ end loop;
+  end Fixup_Inner_Scopes;
+
   Loc: constant Source_Ptr := Sloc (Loop_Stmt);
   Ent: Entity_Id;
   Local_Body : Node_Id;
   Local_Call : Node_Id;
+  Loop_Ent   : Entity_Id;
   Local_Proc : Entity_Id;
-  Local_Scop : Entity_Id;
   Loop_Copy  : constant Node_Id :=
  Relocate_Node (Loop_Stmt);
begin
-  Local_Scop := Entity (Identifier (Loop_Stmt));
-  Ent := First_Entity (Local_Scop);
+  Loop_Ent := Entity (Identifier (Loop_Stmt));
+  Ent := First_Entity (Loop_Ent);
 
   Local_Proc := Make_Temporary (Loc, 'P');
 
@@ -9389,8 +9465,10 @@ package body Exp_Ch7 is
   --  New procedu

Re: [PATCH] MATCH: Simplify `(A ==/!= B) &/| (((cast)A) CMP C)`

2023-09-26 Thread Richard Biener
On Thu, Sep 21, 2023 at 4:43 AM Andrew Pinski  wrote:
>
> This patch adds support to the pattern for `(A == B) &/| (A CMP C)`
> where the second A could be casted to a different type.
> Some were handled correctly if using seperate `if` statements
> but not if combined with BIT_AND/BIT_IOR.
> In the case of pr111456-1.c, the testcase would pass if
> `--param=logical-op-non-short-circuit=0` was used but now
> can be optimized always.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu.

OK.

> PR tree-optimization/106164
> PR tree-optimization/111456
>
> gcc/ChangeLog:
>
> * match.pd (`(A ==/!= B) & (A CMP C)`):
> Support an optional cast on the second A.
> (`(A ==/!= B) | (A CMP C)`): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/cmpbit-6.c: New test.
> * gcc.dg/tree-ssa/cmpbit-7.c: New test.
> * gcc.dg/tree-ssa/pr111456-1.c: New test.
> ---
>  gcc/match.pd   | 76 +-
>  gcc/testsuite/gcc.dg/tree-ssa/cmpbit-6.c   | 22 +++
>  gcc/testsuite/gcc.dg/tree-ssa/cmpbit-7.c   | 28 
>  gcc/testsuite/gcc.dg/tree-ssa/pr111456-1.c | 43 
>  4 files changed, 139 insertions(+), 30 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/cmpbit-6.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/cmpbit-7.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr111456-1.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a37af05f873..0bf91bde486 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -2973,7 +2973,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> && TYPE_OVERFLOW_WRAPS (TREE_TYPE (@1)))
>(gt @0 (minus @1 { build_int_cst (TREE_TYPE (@1), 1); }
>
> -/* Convert (X == CST1) && (X OP2 CST2) to a known value
> +/* Convert (X == CST1) && ((other)X OP2 CST2) to a known value
> based on CST1 OP2 CST2.  Similarly for (X != CST1).  */
>  /* Convert (X == Y) && (X OP2 Y) to a known value if X is an integral type.
> Similarly for (X != Y).  */
> @@ -2981,26 +2981,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  (for code1 (eq ne)
>   (for code2 (eq ne lt gt le ge)
>(simplify
> -   (bit_and:c (code1@3 @0 @1) (code2@4 @0 @2))
> +   (bit_and:c (code1:c@3 @0 @1) (code2:c@4 (convert?@c0 @0) @2))
> (if ((TREE_CODE (@1) == INTEGER_CST
>  && TREE_CODE (@2) == INTEGER_CST)
> || ((INTEGRAL_TYPE_P (TREE_TYPE (@1))
>  || POINTER_TYPE_P (TREE_TYPE (@1)))
> -   && operand_equal_p (@1, @2)))
> +   && bitwise_equal_p (@1, @2)))
>  (with
>   {
>bool one_before = false;
>bool one_after = false;
>int cmp = 0;
> +  bool allbits = true;
>if (TREE_CODE (@1) == INTEGER_CST
>   && TREE_CODE (@2) == INTEGER_CST)
> {
> - cmp = tree_int_cst_compare (@1, @2);
> + allbits = TYPE_PRECISION (TREE_TYPE (@1)) <= TYPE_PRECISION 
> (TREE_TYPE (@2));
> + auto t1 = wi::to_wide (fold_convert (TREE_TYPE (@2), @1));
> + auto t2 = wi::to_wide (@2);
> + cmp = wi::cmp (t1, t2, TYPE_SIGN (TREE_TYPE (@2)));
>   if (cmp < 0
> - && wi::to_wide (@1) == wi::to_wide (@2) - 1)
> + && t1 == t2 - 1)
> one_before = true;
>   if (cmp > 0
> - && wi::to_wide (@1) == wi::to_wide (@2) + 1)
> + && t1 == t2 + 1)
> one_after = true;
> }
>bool val;
> @@ -3018,25 +3022,29 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (switch
>(if (code1 == EQ_EXPR && val) @3)
>(if (code1 == EQ_EXPR && !val) { constant_boolean_node (false, type); 
> })
> -  (if (code1 == NE_EXPR && !val) @4)
> +  (if (code1 == NE_EXPR && !val && allbits) @4)
>(if (code1 == NE_EXPR
> && code2 == GE_EXPR
> -  && cmp == 0)
> -   (gt @0 @1))
> +  && cmp == 0
> +  && allbits)
> +   (gt @c0 (convert @1)))
>(if (code1 == NE_EXPR
> && code2 == LE_EXPR
> -  && cmp == 0)
> -   (lt @0 @1))
> +  && cmp == 0
> +  && allbits)
> +   (lt @c0 (convert @1)))
>/* (a != (b+1)) & (a > b) -> a > (b+1) */
>(if (code1 == NE_EXPR
> && code2 == GT_EXPR
> -  && one_after)
> -   (gt @0 @1))
> +  && one_after
> +  && allbits)
> +   (gt @c0 (convert @1)))
>/* (a != (b-1)) & (a < b) -> a < (b-1) */
>(if (code1 == NE_EXPR
> && code2 == LT_EXPR
> -  && one_before)
> -   (lt @0 @1))
> +  && one_before
> +  && allbits)
> +   (lt @c0 (convert @1)))
>   )
>  )
> )
> @@ -3100,26 +3108,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  (for code1 (eq ne)
>   (for code2 (eq ne lt gt le ge)
>(simplify
> -   (bit_ior:c (code1@3 @0 @1) (code2@4 @0 @2))
> +   (bit_ior:c (code1:c@3 @0 @1) (code2:c@4 (convert?@c0 @0) @2))
> (if ((TREE_CODE (@1) == INTEGER_CST
>  && TREE

[PATCH v2 1/2] Enable vect.exp for LoongArch.

2023-09-26 Thread Chenghui Pan
gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Enable vect.exp for LoongArch.
---
 gcc/testsuite/lib/target-supports.exp | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 2de41cef2f6..17863288ff0 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -11174,6 +11174,13 @@ proc check_vect_support_and_set_flags { } {
lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
set dg-do-what-default compile
}
+} elseif [istarget loongarch*-*-*] {
+  lappend DEFAULT_VECTCFLAGS "-mdouble-float" "-mlasx"
+  if [check_effective_target_loongarch_asx_hw] {
+ set dg-do-what-default run
+  } else {
+ set dg-do-what-default compile
+  }
 } else {
 return 0
 }
@@ -12656,6 +12663,30 @@ proc 
check_effective_target_const_volatile_readonly_section { } {
   return 1
 }
 
+proc check_effective_target_loongarch_sx_hw { } {
+return [check_runtime loongarch_sx_hw {
+   #include 
+   int main (void)
+   {
+ __m128i a, b, c;
+ c = __lsx_vand_v (a, b);
+ return 0;
+   }
+} "-mlsx"]
+}
+
+proc check_effective_target_loongarch_asx_hw { } {
+return [check_runtime loongarch_asx_hw {
+   #include 
+   int main (void)
+   {
+ __m256i a, b, c;
+ c = __lasx_xvand_v (a, b);
+ return 0;
+   }
+} "-mlasx"]
+}
+
 # Appends necessary Python flags to extra-tool-flags if Python.h is supported.
 # Otherwise, modifies dg-do-what.
 proc dg-require-python-h { args } {
-- 
2.36.0



[PATCH v2 2/2] Add LoongArch in check_effective_target_vect_int_mod according to ISA capabilities.

2023-09-26 Thread Chenghui Pan
gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Add LoongArch in
check_effective_target_vect_int_mod according to ISA capabilities.
---
 gcc/testsuite/lib/target-supports.exp | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 17863288ff0..4a84dee430b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -8586,6 +8586,8 @@ proc check_effective_target_vect_int_mod { } {
 return [check_cached_effective_target_indexed vect_int_mod {
   expr { ([istarget powerpc*-*-*]
  && [check_effective_target_has_arch_pwr10])
+ || ([istarget loongarch*-*-*]
+ && [check_effective_target_loongarch_sx])
  || [istarget amdgcn-*-*] }}]
 }
 
@@ -12663,6 +12665,14 @@ proc 
check_effective_target_const_volatile_readonly_section { } {
   return 1
 }
 
+proc check_effective_target_loongarch_sx { } {
+return [check_no_compiler_messages loongarch_lsx assembly {
+   #if !defined(__loongarch_sx)
+   #error "LSX not defined"
+   #endif
+}]
+}
+
 proc check_effective_target_loongarch_sx_hw { } {
 return [check_runtime loongarch_sx_hw {
#include 
@@ -12675,6 +12685,14 @@ proc check_effective_target_loongarch_sx_hw { } {
 } "-mlsx"]
 }
 
+proc check_effective_target_loongarch_asx { } {
+return [check_no_compiler_messages loongarch_asx assembly {
+   #if !defined(__loongarch_asx)
+   #error "LASX not defined"
+   #endif
+}]
+}
+
 proc check_effective_target_loongarch_asx_hw { } {
 return [check_runtime loongarch_asx_hw {
#include 
-- 
2.36.0



[PATCH v2 0/2] Update target-supports.exp for LoongArch SX/ASX.

2023-09-26 Thread Chenghui Pan
This is an update of:
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/630953.html

This version of the patch set contains code that enables vect.exp for the
LoongArch target when the target environment is capable of running LASX
instructions.

After some attempts, we still need "check_effective_target_loongarch_sx"
in "proc check_effective_target_vect_int_mod {}" to choose correct dg-final
directives for LoongArch, because DEFAULT_VECTCFLAGS cannot affect pr104992.c
which is invoked by gcc.dg/dg.exp (not vect.exp).

Chenghui Pan (2):
  Enable vect.exp for LoongArch.
  Add LoongArch in check_effective_target_vect_int_mod according to ISA
capabilities.

 gcc/testsuite/lib/target-supports.exp | 49 +++
 1 file changed, 49 insertions(+)

-- 
2.36.0



Re: [PATCH v2 0/2] Update target-supports.exp for LoongArch SX/ASX.

2023-09-26 Thread Chenghui Pan
Correction: vect.exp will be set to run when the target is capable of
running LASX instructions; otherwise the tests will only be compiled.

On Tue, 2023-09-26 at 19:56 +0800, Chenghui Pan wrote:
> This is an update of:
> https://gcc.gnu.org/pipermail/gcc-patches/2023-September/630953.html
> 
> This version of patch set contains code that enable vect.exp for
> LoongArch
> target when target environment is capable of running LASX
> instructions.
> 
> After some attemptions, we still need
> "check_effective_target_loongarch_sx" 
> in "proc check_effective_target_vect_int_mod {}" to choose correct
> dg-final
> directives for LoongArch, because DEFAULT_VECTCFLAGS cannot affect
> pr104992.c
> which is invoked by gcc.dg/dg.exp (not vect.exp).
> 
> Chenghui Pan (2):
>   Enable vect.exp for LoongArch.
>   Add LoongArch in check_effective_target_vect_int_mod according to
> ISA
>     capabilities.
> 
>  gcc/testsuite/lib/target-supports.exp | 49
> +++
>  1 file changed, 49 insertions(+)
> 



RE: [PATCH V2] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread Li, Pan2
Committed as passed x86 bootstrap and regression test, thanks Richard.

Pan

-Original Message-
From: Richard Biener  
Sent: Tuesday, September 26, 2023 7:35 PM
To: Juzhe-Zhong 
Cc: gcc-patches@gcc.gnu.org; richard.sandif...@arm.com
Subject: Re: [PATCH V2] MATCH: Optimize COND_ADD_LEN reduction pattern

On Tue, 26 Sep 2023, Juzhe-Zhong wrote:

> Current COND_ADD reduction pattern can't optimize floating-point vector.
> As Richard suggested: 
> https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631336.html
> Allow COND_ADD reduction pattern to optimize floating-point vector.
> 
> Bootstrap and Regression is running.
> 
> Ok for trunk if tests pass ?

OK.

> gcc/ChangeLog:
> 
>   * match.pd: Optimize COND_ADD reduction pattern.
> 
> ---
>  gcc/match.pd | 7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 3ce90cb..790d956fe69 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8863,8 +8863,11 @@ and,
>  
> c = mask1 && mask2 ? d + b : d.  */
>  (simplify
> -  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> -   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
> +  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 zerop@4) @1)
> +   (if (ANY_INTEGRAL_TYPE_P (type)
> + || (FLOAT_TYPE_P (type)
> + && fold_real_zero_addition_p (type, NULL_TREE, @4, 0)))
> +   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1)))
>  
>  /* Detect simplication for a conditional length reduction where
>  
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


RE: [PATCH V3] MATCH: Optimize COND_ADD_LEN reduction pattern

2023-09-26 Thread Li, Pan2
Committed as passed x86 bootstrap and regression test, thanks Richard.

Pan

-Original Message-
From: Richard Biener  
Sent: Tuesday, September 26, 2023 7:35 PM
To: Juzhe-Zhong 
Cc: gcc-patches@gcc.gnu.org; richard.sandif...@arm.com
Subject: Re: [PATCH V3] MATCH: Optimize COND_ADD_LEN reduction pattern

On Tue, 26 Sep 2023, Juzhe-Zhong wrote:

> This patch leverage this commit: 
> https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
> to optimize COND_LEN_ADD reduction pattern.
> 
> We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.
> 
> Consider thsi following case:
> 
> #include 
> 
> void
> pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> {
>   uint64_t result = 0;
> 
>   for (int i = 0; i < loop_size; i++)
> {
>   if (b[i] <= a[i])
>   {
> result += a[i];
>   }
> }
> 
>   a[0] = result;
> }
> 
> Before this patch:
> vsetvli a7,zero,e64,m1,ta,ma
> vmv.v.i v2,0
> vmv1r.v v3,v2--- redundant
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v1,0(a3)
> vle64.v v0,0(a1)
> sllia6,a5,3
> vsetvli a7,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv   v0,v0,v1
> add a1,a1,a6
> vmerge.vvm  v1,v3,v1,v0  redundant.
> add a3,a3,a6
> vsetvli zero,a5,e64,m1,tu,ma
> vadd.vv v2,v2,v1
> bne a2,zero,.L3
> li  a5,0
> vsetvli a4,zero,e64,m1,ta,ma
> vmv.s.x v1,a5
> vredsum.vs  v2,v2,v1
> vmv.x.s a5,v2
> sd  a5,0(a0)
> ret
> 
> After this patch:
> 
>   vsetvli a6,zero,e64,m1,ta,ma
>   vmv.v.i v1,0
> .L3:
>   vsetvli a5,a2,e64,m1,ta,ma
>   vle64.v v2,0(a4)
>   vle64.v v0,0(a1)
>   sllia3,a5,3
>   vsetvli a6,zero,e64,m1,ta,ma
>   sub a2,a2,a5
>   vmsleu.vv   v0,v0,v2
>   add a1,a1,a3
>   vsetvli zero,a5,e64,m1,tu,mu
>   add a4,a4,a3
>   vadd.vv v1,v1,v2,v0.t
>   bne a2,zero,.L3
>   li  a5,0
>   vsetivlizero,1,e64,m1,ta,ma
>   vmv.s.x v2,a5
>   vsetvli a5,zero,e64,m1,ta,ma
>   vredsum.vs  v1,v1,v2
>   vmv.x.s a5,v1
>   sd  a5,0(a0)
>   ret
> 
> Bootstrap && Regression is running.
> 
> Ok for trunk when testing passes ?

OK

>   PR tree-optimization/111594
> PR tree-optimization/110660
> 
> gcc/ChangeLog:
> 
>   * match.pd: Optimize COND_LEN_ADD reduction.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
>   * gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.
> 
> ---
>  gcc/match.pd  | 15 ++
>  .../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++
>  .../riscv/rvv/autovec/cond/pr111594.c | 22 ++
>  3 files changed, 66 insertions(+)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a17778fbaa6..3ce90cb 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8866,6 +8866,21 @@ and,
>(IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
>  
> +/* Detect simplication for a conditional length reduction where
> +
> +   a = mask ? b : 0
> +   c = i < len + bias ? d + a : d
> +
> +   is turned into
> +
> +   c = mask && i < len + bias ? d + b : d.  */
> +(simplify
> +  (IFN_COND_LEN_ADD integer_truep @0 (vec_cond @1 @2 zerop@5) @0 @3 @4)
> +   (if (ANY_INTEGRAL_TYPE_P (type)
> + || (FLOAT_TYPE_P (type)
> + && fold_real_zero_addition_p (type, NULL_TREE, @5, 0)))
> +(IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
> +
>  /* For pointers @0 and @2 and nonnegative constant offset @1, look for
> expressions like:
>  
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> new file mode 100644
> index 000..db6f9d1ec6c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d 
> -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
> +
> +#include 
> +
> +#define COND_REDUCTION(TYPE) 
>   \
> +  TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size) 
>   \
> +  {  
>   \
> +TYPE result = 0; 
>   \
> +for (int i = 0; i < loop_size; i++)  
>   \
> +  if (b[i] <= a[i])  
>

Re: RISC-V: Added support for CRC.

2023-09-26 Thread Oleg Endo
On Sun, 2023-09-24 at 00:05 +0100, Joern Rennecke wrote:
> 
> Although maybe Oleg Endo's library, as mentioned in
> https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591748.html ,
> might be suitable?  What is the license for that?
> 
> 

I haven't published the library, but I think I could do that.

It's a C++-14 header-only thing and uses templates + constexpr to generate
the .rodata lookup tables.  It's convenient for an application project, as
it doesn't require any generator tool in the build.  This might not be a big
advantage in the context of GCC.

Since the tables are computed during compile-time, there is no particular
optimization implemented.  The run-time function is also nothing fancy:

static constexpr uint8_t table_index (value_type rem, uint8_t x)
{
  if (ReflectInput)
    return x ^ rem;
  else
    return x ^ (BitCount > 8 ? (rem >> (BitCount - 8))
                             : (rem << (8 - BitCount)));
}

static constexpr value_type shift (value_type rem)
{
  return ReflectInput ? rem >> 8 : rem << 8;
}

static value_type
default_process_bytes (value_type rem, const uint8_t* in, const uint8_t* in_end)
{
  for (; in != in_end; ++in)
  {
    auto i = table_index (rem, *in);
    rem = table[i] ^ shift (rem);
  }
  return rem;
}
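
Roughly, the compile-time table generation side can be sketched like this
(a simplified sketch, not the actual library code: the struct and function
names are made up here and it is hard-wired to the reflected CRC-32
polynomial):

#include <stdint.h>

struct crc32_table_t { uint32_t v[256]; };

// Evaluated entirely at compile time (C++14 relaxed constexpr), so the
// resulting table object can land in .rodata without a generator tool.
constexpr crc32_table_t
make_crc32_table (uint32_t poly)
{
  crc32_table_t t {};
  for (unsigned i = 0; i < 256; ++i)
    {
      uint32_t rem = i;
      for (int b = 0; b < 8; ++b)
        rem = (rem & 1) ? (rem >> 1) ^ poly : rem >> 1;
      t.v[i] = rem;
    }
  return t;
}

constexpr crc32_table_t crc32_table = make_crc32_table (0xEDB88320u);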

Anyway, let me know if anyone is interested.

Cheers,
Oleg


[Committed] RISC-V: Fix mem-to-mem VLS move pattern[PR111566]

2023-09-26 Thread Juzhe-Zhong
The mem-to-mem insn pattern being split out from the reg-to-mem/mem-to-reg/
reg-to-reg pattern causes an ICE in RA, since RA prefers that they stay
together.

Now we split mem-to-mem as a pure pre-RA split pattern and only allow the
define_insn to match mem-to-mem VLS moves in the pre-RA stage (forbidding
mem-to-mem moves after RA).

Tested no difference. Committed.

PR target/111566

gcc/ChangeLog:

* config/riscv/vector.md (*mov_mem_to_mem):

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/fortran/pr111566.f90: New test.

---
 gcc/config/riscv/vector.md| 19 +---
 .../gcc.target/riscv/rvv/fortran/pr111566.f90 | 31 +++
 2 files changed, 45 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/fortran/pr111566.f90

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index d5300a33946..a98242f2fd8 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1222,12 +1222,14 @@
 DONE;
 })
 
-(define_insn_and_split "*mov_mem_to_mem"
+;; Some VLS modes (like V2SImode) have size <= a general purpose
+;; register width, we optimize such mem-to-mem move into mem-to-mem
+;; scalar move.  Otherwise, we always force operands[1] into register
+;; so that we will never get mem-to-mem move after RA.
+(define_split
   [(set (match_operand:VLS_AVL_IMM 0 "memory_operand")
(match_operand:VLS_AVL_IMM 1 "memory_operand"))]
   "TARGET_VECTOR && can_create_pseudo_p ()"
-  "#"
-  "&& 1"
   [(const_int 0)]
   {
 if (GET_MODE_BITSIZE (mode).to_constant () <= MAX_BITS_PER_WORD)
@@ -1256,14 +1258,21 @@
   }
 DONE;
   }
-  [(set_attr "type" "vmov")]
 )
 
+;; We recognize mem-to-mem move in pre-RA stage so that we won't have
+;; ICE (unrecognizable insn: (set (mem) (mem))).  Then, the previous
+;; mem-to-mem split pattern will force operands[1] into a register so
+;; that mem-to-mem move will never happen after RA.
+;;
+;; We don't allow mem-to-mem move in post-RA stage since we
+;; don't have an instruction to split mem-to-mem move after RA.
 (define_insn_and_split "*mov"
   [(set (match_operand:VLS_AVL_IMM 0 "reg_or_mem_operand" "=vr, m, vr")
(match_operand:VLS_AVL_IMM 1 "reg_or_mem_operand" "  m,vr, vr"))]
   "TARGET_VECTOR
-   && (register_operand (operands[0], mode)
+   && (can_create_pseudo_p ()
+   || register_operand (operands[0], mode)
|| register_operand (operands[1], mode))"
   "@
#
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/fortran/pr111566.f90 
b/gcc/testsuite/gcc.target/riscv/rvv/fortran/pr111566.f90
new file mode 100644
index 000..2e30dc9bfaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/fortran/pr111566.f90
@@ -0,0 +1,31 @@
+! { dg-do compile }
+! { dg-options "-march=rv64gcv -mabi=lp64d -Ofast -fallow-argument-mismatch 
-fmax-stack-var-size=65536 -S  -std=legacy -w" }
+
+module a
+  integer,parameter :: SHR_KIND_R8 = selected_real_kind(12)
+end module a
+module b
+  use a,  c => shr_kind_r8
+contains
+  subroutine d(cg , km, i1, i2)
+real (c) ch(i2,km)
+real (c) cg(4,i1:i2,km)
+real  dc(i2,km)
+real(c) ci(i2,km)
+real(c) cj(i2,km)
+do k=2,ck
+   do i=i1,0
+  cl = ci(i,k) *ci(i,1) /  cj(i,k)+ch(i,1)
+  cm = cg(1,i,k) - min(e,cg(1,i,co))
+  dc(i,k) = sign(cm, cl)
+   enddo
+enddo
+if ( cq == 0 ) then
+   do i=i1,i2
+  if( cr <=  cs ) then
+ cg= sign( min(ct,   cg),  cg)
+  endif
+   enddo
+endif
+  end subroutine d
+end module b
-- 
2.36.3



Re: RISC-V: Added support for CRC.

2023-09-26 Thread Jeff Law




On 9/23/23 17:05, Joern Rennecke wrote:

Mariam Harutyunyan:
+++ b/gcc/ChangeLog
@@ -1,3 +1,45 @@
+2023-08-03  Mariam Arutunian  
+

It is common courtesy to include all authors in the list of authors
for the ChangeLog; also,
this may help people in the future understand the history of the code better.
While much of your patch is new, it still contains non-trivial parts of mine
( https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591744.html ).
And stripping out the comment why, currently, we can't use linkonce
for crc tables on the RISC-V target is
not helpful to someone who wants to understand the code.
Thanks for pointing this out Joern.  Neither Mariam nor I were aware 
that some of your code was used to seed this work.  We are happy to 
include you as a co-author.



Mariam -- the way to do this is to add a "Co-Author:" line to your 
commit message.  If you look at upstream commit:


5ff4431675c0d0c800d4a983254e94a6b401c14d

Shows the right format.




See also the discussion to put this into loop distribution:
https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591821.html
https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591866.html
Yes, this was the gist of the discussion I had with Richi.  CRC 
optimization has a lot in common with distribution, reduction and final 
value replacement.


For early testing and development we focused on detecting a CRC function 
and replacing it at a function level.  That allowed Mariam to avoid a 
class of problems for a while and focus on an end-to-end solution.  Once 
that was working reasonably well Mariam went back and enhanced the first 
pass filter and validation so that it would work if the CRC loop was 
embedded inside a larger function either at the source level or due to 
inlining.


The placement we chose was just after loop distribution.  I think 
there's some wiggle room in terms of pass placement.


The one thing I'm hesitant to do is make this part of an existing pass. 
I don't see many benefits of integrating inside loop dist and I very much 
like its clean separation.





Mariam Harutyunyan:

It adds internal
functions and built-ins specifically designed to handle CRC computations
efficiently.


This sounds like this is a finished work, although defining built-in
functions to calculate the CRC of single data elements and recognizers
for some C idioms that do these calculations,
is just a starting point.
More precisely it was carved out of a larger piece of work in the hopes 
that it would be useful independently and allow upstreaming of the 
backend work.  Alexander was fairly dismissive of the utility of doing 
that.  There was some interest from Cauldron attendees in potentially 
reviving that idea.





Alexander Monakov :


Jeff, as I understand this all is happening only because Coremark contains
use of bitwise CRC that affects benchmark scores. In another universe where
- Coremark was careful to checksum outputs outside of timed sections, or
- implemented CRC in a manner that is not transparent to the compiler, or
- did not use CRC at all
we would not be spending effort on this, correct?


It is a stated goal of coremark to test performance for CRC.  They do
not use a library call
to implement CRC, but a specific bit-banging algorithm they have
chosen.  That algorithm is,
for the vast majority of processors, not representative of the target's
performance potential in calculating CRCs, thus if a compiler fails to
translate this into the CRC implementation that
would be used for performance code, the compiler frustrates this goal
of coremark to give a measure of CRC calculation performance.

All true.  But the Coremark code is what it is.  This isn't a whole lot 
different than the work in the 90s which rewrote loops and compromised 
some of the spec benchmarks, or the eqntott hack to simplify that one 
key loop in eqntott.


What ultimately pushed us to keep moving forward on this effort was 
discovering numerous CRC loop implementations out in the wild, including 
4 implementations (IIRC) in the kernel itself.
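
For concreteness, those loops look roughly like the following (an
illustrative sketch only; not lifted from Coremark or the kernel, and the
0xA001 polynomial is just an example):

#include <stdint.h>

// Typical reflected CRC-16 bit-banging update of the kind the detection
// targets: xor in one byte, then do eight conditional shift/xor steps.
uint16_t
crc16_update (uint16_t crc, uint8_t data)
{
  crc ^= data;
  for (int i = 0; i < 8; i++)
    crc = (crc & 1) ? (crc >> 1) ^ 0xA001 : (crc >> 1);
  return crc;
}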






At best we might have
a discussion on providing a __builtin_clmul for carry-less multiplication
(which _is_ a fundamental primitive, unlike __builtin_crc), and move on.


Some processors have specialized instructions for CRC computations.
They do.  But I think the discussion about providing intrinsic access to 
a clmul instruction is orthogonal to the discussion about whether or not 
to provide a builtin for crc.


I can easily see creating a clmul RTL opcode for targets which support 
it and hoisting the clmul vs lookup table selection into generic code. 
I'm still pondering if we're likely to ever see cases where we want a 
vector clmul intrinsic or support in the autovectorizer for clmul. 
We've certainly got targets with vector clmul in the ISA, the question 
is using it.






Instead, efficient CRC loops have the following structure:
- they carry an unreduced remainder in the loop, performing final reduction
  m

Re: [Committed] RISC-V: Fix mem-to-mem VLS move pattern[PR111566]

2023-09-26 Thread Jeff Law




On 9/26/23 07:15, Juzhe-Zhong wrote:

The mem-to-mem insn pattern is splitted from reg-to-mem/mem-to-reg/reg-to-reg
causes ICE in RA since RA prefer they stay together.

Now, we split mem-to-mem as a pure pre-RA split pattern and only allow
define_insn match mem-to-mem VLS move in pre-RA stage (Forbid mem-to-mem move 
after RA).

Tested no difference. Committed.

PR target/111566

gcc/ChangeLog:

* config/riscv/vector.md (*mov_mem_to_mem):

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/fortran/pr111566.f90: New test.

ChangeLog for the vector.md is missing.

In general we shouldn't be allowing mem->mem in most patterns since the 
hardware doesn't actually implement such instructions.  I suspect that's 
the real problem here and that ultimately you're just papering over it.




Jeff



[PATCH] AArch64: Remove BTI from outline atomics

2023-09-26 Thread Wilco Dijkstra

The outline atomic functions have hidden visibility and can only be called
directly.  Therefore we can remove the BTI at function entry.  This improves
security by reducing the number of indirect entry points in a binary.
The BTI markings on the objects are still emitted.

Passes regress, OK for commit?

libgcc/ChangeLog:
    * config/aarch64/lse.S (BTI_C): Remove define.

---

diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index 
ba05047ff02b6fc5752235bffa924fc4a2f48c04..dbfb83fb09083641bf06c50b631a5f27bdf61b80
 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -163,8 +163,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #define tmp3    14
 #define tmp4    13
 
-#define BTI_C  hint    34
-
 /* Start and end a function.  */
 .macro  STARTFN name
 .text
@@ -174,7 +172,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 .type   \name, %function
 .cfi_startproc
 \name:
-   BTI_C
 .endm
 
 .macro  ENDFN name


Re: [PATCH] RISC-V: Add opaque integer modes to fix ICE on DSE[PR111590]

2023-09-26 Thread Richard Sandiford
Juzhe-Zhong  writes:
> When doing fortran test with 'V' extension enabled on RISC-V port.
> I saw multiple ICE: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111590
>
> The root cause is on DSE:
>
> internal compiler error: in smallest_mode_for_size, at stor-layout.cc:356
> 0x1918f70 smallest_mode_for_size(poly_int<2u, unsigned long>, mode_class)
> ../../../../gcc/gcc/stor-layout.cc:356
> 0x11f75bb smallest_int_mode_for_size(poly_int<2u, unsigned long>)
> ../../../../gcc/gcc/machmode.h:916
> 0x3304141 find_shift_sequence
> ../../../../gcc/gcc/dse.cc:1738
> 0x3304f1a get_stored_val
> ../../../../gcc/gcc/dse.cc:1906
> 0x3305377 replace_read
> ../../../../gcc/gcc/dse.cc:2010
> 0x3306226 check_mem_read_rtx
> ../../../../gcc/gcc/dse.cc:2310
> 0x330667b check_mem_read_use
> ../../../../gcc/gcc/dse.cc:2415
>
> After investigations, DSE is trying to do optimization like this following 
> codes:
>
> (insn 86 85 87 9 (set (reg:V4DI 168)
> (mem/u/c:V4DI (reg/f:DI 171) [0  S32 A128])) "bug.f90":6:18 discrim 6 
> 1167 {*movv4di}
>  (expr_list:REG_EQUAL (const_vector:V4DI [
> (const_int 4 [0x4])
> (const_int 1 [0x1]) repeated x2
> (const_int 3 [0x3])
> ])
> (nil)))
>
> (set (mem) (reg:V4DI 168))
>
> Then it ICE on: auto new_mode = smallest_int_mode_for_size (access_size * 
> BITS_PER_UNIT);
>
> The access_size may be 24 or 32. We don't have such integer modes with these 
> size so it ICE.
>
> I saw both aarch64 and ARM has EI/OI/CI/XI opaque modes. 
>
> So I add it to walk around ICE on DCE, it works as all ICE are resolved.
>
> CC Richard to review to make sure I am doing the right thing to fix the bug.
>
> Hi, Richard, could you help me with this issue ? Thanks.
>
> gcc/ChangeLog:
>
>   * config/riscv/riscv-modes.def (INT_MODE): Add opaque modes

I think it's a bug in dse.  The contract is:

/* Find the narrowest integer mode that contains at least SIZE bits.
   Such a mode must exist.  */

(emphasis on the last line).

The easy fix would be to add:

  && known_le (access_size, GET_MODE_SIZE (MAX_MODE_INT))

The better but more complex fix would be to make dse use native_encode_rtx/
native_decode_rtx (which IIRC didn't exist when the dse code was written).

Thanks,
Richard


>
> ---
>  gcc/config/riscv/riscv-modes.def | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/gcc/config/riscv/riscv-modes.def 
> b/gcc/config/riscv/riscv-modes.def
> index e3c6ccb2809..ab86032c914 100644
> --- a/gcc/config/riscv/riscv-modes.def
> +++ b/gcc/config/riscv/riscv-modes.def
> @@ -393,6 +393,12 @@ VLS_MODES (1024); /* V1024QI  V512HI  V256SI V128DI  
> V512HF  V256SF V128DF */
>  VLS_MODES (2048); /* V2048QI V1024HI  V512SI V256DI V1024HF  V512SF V256DF */
>  VLS_MODES (4096); /* V4096QI V2048HI V1024SI V512DI V2048HF V1024SF V512DF */
>  
> +/* Opaque integer modes 3, 4, 6 or 8 general double registers.  */
> +INT_MODE (EI, 24);
> +INT_MODE (OI, 32);
> +INT_MODE (CI, 48);
> +INT_MODE (XI, 64);
> +
>  /* TODO: According to RISC-V 'V' ISA spec, the maximun vector length can
> be 65536 for a single vector register which means the vector mode in
> GCC can be maximum = 65536 * 8 bits (LMUL=8).


[PATCH v4] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-26 Thread Manos Anagnostakis
This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
to provide the requested behaviour for handling ldp and stp:

  /* Allow the tuning structure to disable LDP instruction formation
 from combining instructions (e.g., in peephole2).
 TODO: Implement fine-grained tuning control for LDP and STP:
   1. control policies for load and store separately;
   2. support the following policies:
  - default (use what is in the tuning structure)
  - always
  - never
  - aligned (only if the compiler can prove that the
load will be aligned to 2 * element_size)  */

It provides two new and concrete target-specific command-line parameters
-param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
to give the ability to control load and store policies seperately as
stated in part 1 of the TODO.

The accepted values for both parameters are:
- default: Use the policy of the tuning structure (default).
- always: Emit ldp/stp regardless of alignment.
- never: Do not emit ldp/stp.
- aligned: In order to emit ldp/stp, first check if the load/store will
  be aligned to 2 * element_size.
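
As a small illustration (not part of the patch), the two adjacent 64-bit
loads below are typical LDP candidates; the aarch64-ldp-policy values above
decide whether the peephole may fuse them:

// Illustrative only: under "always" or (given suitable alignment) "aligned"
// these two loads may become a single ldp; under "never" they stay separate.
long
sum_pair (const long *p)
{
  return p[0] + p[1];
}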

Bootstrapped and regtested aarch64-linux.

gcc/ChangeLog:
* config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy): New
enum type.
(enum aarch64_stp_policy): New enum type.
* config/aarch64/aarch64-protos.h (struct tune_params): Add
appropriate enums for the policies.
(aarch64_mem_ok_with_ldpstp_policy_model): New declaration.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
options.
* config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
function to parse ldp-policy parameter.
(aarch64_parse_stp_policy): New function to parse stp-policy parameter.
(aarch64_override_options_internal): Call parsing functions.
(aarch64_mem_ok_with_ldpstp_policy_model): New function.
(aarch64_operands_ok_for_ldpstp): Add call to
aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
check and alignment check and remove superseded ones.
(aarch64_operands_adjust_ok_for_ldpstp): Add call to
aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
check and alignment check and remove superseded ones.
* config/aarch64/aarch64.opt: Add parameters.
* doc/invoke.texi: Document the parameters accordingly.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
* gcc.target/aarch64/ldp_aligned.c: New test.
* gcc.target/aarch64/ldp_always.c: New test.
* gcc.target/aarch64/ldp_never.c: New test.
* gcc.target/aarch64/stp_aligned.c: New test.
* gcc.target/aarch64/stp_always.c: New test.
* gcc.target/aarch64/stp_never.c: New test.

Signed-off-by: Manos Anagnostakis 
---
Changes in v4:
- Changed the parameters to accept enum instead of an
  integer and updated documentation in doc/invoke.texi.
- Packed all the new checks in aarch64_operands_ok_for_ldpstp/
  aarch64_operands_adjust_ok_for_ldpstp in a new function
  called aarch64_mem_ok_with_ldpstp_policy_model.

 gcc/config/aarch64/aarch64-opts.h |  16 ++
 gcc/config/aarch64/aarch64-protos.h   |  25 +++
 gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
 gcc/config/aarch64/aarch64.cc | 212 +-
 gcc/config/aarch64/aarch64.opt|  38 
 gcc/doc/invoke.texi   |  20 ++
 .../aarch64/ampere1-no_ldp_combine.c  |  11 -
 .../gcc.target/aarch64/ldp_aligned.c  |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 ++
 .../gcc.target/aarch64/stp_aligned.c  |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
 13 files changed, 632 insertions(+), 76 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_never.c

diff --git a/gcc/config/aarch64/aarch64-opts.h 
b/gcc/config/aarch64/aarch64-opts.h
index 7e8f1babed8..db8348507a3 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -108,4 +108,20 @@ enum aarch64_key_type {
   AARCH64_KEY_B
 };
 
+/* Load pair policy type.  */
+enum aarch64_ldp_policy {
+  LDP_POLICY_DEFAULT,
+  LDP_POLICY_ALWAYS,
+  LDP_POLICY_NEVER,
+ 

Re: [PATCH] AArch64: Remove BTI from outline atomics

2023-09-26 Thread Richard Earnshaw (lists)
On 26/09/2023 14:46, Wilco Dijkstra wrote:
> 
> The outline atomic functions have hidden visibility and can only be called
> directly.  Therefore we can remove the BTI at function entry.  This improves
> security by reducing the number of indirect entry points in a binary.
> The BTI markings on the objects are still emitted.

Please can you add a comment to that effect in the source code.  OK with that 
change.

R.

> 
> Passes regress, OK for commit?
> 
> libgcc/ChangeLog:
>     * config/aarch64/lse.S (BTI_C): Remove define.
> 
> ---
> 
> diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
> index 
> ba05047ff02b6fc5752235bffa924fc4a2f48c04..dbfb83fb09083641bf06c50b631a5f27bdf61b80
>  100644
> --- a/libgcc/config/aarch64/lse.S
> +++ b/libgcc/config/aarch64/lse.S
> @@ -163,8 +163,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
> If not, see
>  #define tmp3    14
>  #define tmp4    13
>  
> -#define BTI_C  hint    34
> -
>  /* Start and end a function.  */
>  .macro  STARTFN name
>  .text
> @@ -174,7 +172,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
> If not, see
>  .type   \name, %function
>  .cfi_startproc
>  \name:
> -   BTI_C
>  .endm
>  
>  .macro  ENDFN name



[PATCH] c++: non-static memfn call dependence cleanup

2023-09-26 Thread Patrick Palka
Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for trunk?

-- >8 --

In cp_parser_postfix_expression, we essentially repeat the
type-dependent and COMPONENT_REF callee cases of finish_call_expr.
This patch deduplicates this logic.

gcc/cp/ChangeLog:

* parser.cc (cp_parser_postfix_expression): Consolidate three
calls to finish_call_expr, one to build_new_method_call and
one to build_min_nt_call_vec into one call to finish_call_expr.
* pt.cc (type_dependent_expression_p): Use t_d_object_e_p
instead of t_d_e_p for COMPONENT_REF and OFFSET_REF.

gcc/testsuite/ChangeLog:

* g++.dg/template/crash127.C: Expect additional error due to
being able to check the member access expression ahead of time.
Strengthen the test by not instantiating the class template.
---
 gcc/cp/parser.cc | 60 ++--
 gcc/cp/pt.cc |  2 +-
 gcc/testsuite/g++.dg/template/crash127.C |  3 +-
 3 files changed, 16 insertions(+), 49 deletions(-)

diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index f3abae716fe..78082ee7284 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -8047,54 +8047,12 @@ cp_parser_postfix_expression (cp_parser *parser, bool 
address_p, bool cast_p,
close_paren_loc);
iloc_sentinel ils (combined_loc);
 
-   if (TREE_CODE (postfix_expression) == COMPONENT_REF)
- {
-   tree instance = TREE_OPERAND (postfix_expression, 0);
-   tree fn = TREE_OPERAND (postfix_expression, 1);
-
-   if (processing_template_decl
-   && (type_dependent_object_expression_p (instance)
-   || (!BASELINK_P (fn)
-   && TREE_CODE (fn) != FIELD_DECL)
-   || type_dependent_expression_p (fn)
-   || any_type_dependent_arguments_p (args)))
- {
-   maybe_generic_this_capture (instance, fn);
-   postfix_expression
- = build_min_nt_call_vec (postfix_expression, args);
- }
-   else if (BASELINK_P (fn))
- {
- postfix_expression
-   = (build_new_method_call
-  (instance, fn, &args, NULL_TREE,
-   (idk == CP_ID_KIND_QUALIFIED
-? LOOKUP_NORMAL|LOOKUP_NONVIRTUAL
-: LOOKUP_NORMAL),
-   /*fn_p=*/NULL,
-   complain));
- }
-   else
- postfix_expression
-   = finish_call_expr (postfix_expression, &args,
-   /*disallow_virtual=*/false,
-   /*koenig_p=*/false,
-   complain);
- }
-   else if (TREE_CODE (postfix_expression) == OFFSET_REF
-|| TREE_CODE (postfix_expression) == MEMBER_REF
-|| TREE_CODE (postfix_expression) == DOTSTAR_EXPR)
+   if (TREE_CODE (postfix_expression) == OFFSET_REF
+   || TREE_CODE (postfix_expression) == MEMBER_REF
+   || TREE_CODE (postfix_expression) == DOTSTAR_EXPR)
  postfix_expression = (build_offset_ref_call_from_tree
(postfix_expression, &args,
 complain));
-   else if (idk == CP_ID_KIND_QUALIFIED)
- /* A call to a static class member, or a namespace-scope
-function.  */
- postfix_expression
-   = finish_call_expr (postfix_expression, &args,
-   /*disallow_virtual=*/true,
-   koenig_p,
-   complain);
else
  /* All other function calls.  */
  {
@@ -8107,12 +8065,22 @@ cp_parser_postfix_expression (cp_parser *parser, bool 
address_p, bool cast_p,
   "not permitted in intervening code");
parser->omp_for_parse_state->fail = true;
  }
+   bool disallow_virtual = (idk == CP_ID_KIND_QUALIFIED);
postfix_expression
  = finish_call_expr (postfix_expression, &args,
- /*disallow_virtual=*/false,
+ disallow_virtual,
  koenig_p,
  complain);
+
+   if (type_dependent_expression_p (postfix_expression))
+ {
+   tree fn = CALL_EXPR_FN (postfix_expression);
+   if (TREE_CODE (fn) == COMPONENT_REF)
+ maybe_generic_this_capture (TREE_OPERAND (fn, 0),
+   

Re: [PING] [PATCH] Harmonize headers between both dg-extract-results scripts

2023-09-26 Thread Jeff Law




On 9/25/23 03:55, Paul Iannetta wrote:

On Mon, Sep 18, 2023 at 08:39:34AM +0200, Paul Iannetta wrote:

On Thu, Sep 14, 2023 at 04:24:33PM +0200, Paul Iannetta wrote:

Hi,

This is a small patch so that both dg-extract-results.py and
dg-extract-results.sh share the same header.  In particular, it fixes
the fact that the regexp r'^Test Run By (\S+) on (.*)$' was never
matched in the python file.


By the way, the bash script dg-extract-results.sh checks whether
python is available by invoking python.  However, it seems that the
policy on newer machines is to not provide python as a symlink (at
least on Ubuntu 22.04 and above; and RHEL 8).  Therefore, we might
want to also check against python3 so that the bash script does not
fail to find python even though it is available.

Thanks,
Paul



Author: Paul Iannetta 
Date:   Thu Sep 14 15:43:58 2023 +0200

 Harmonize headers between both dg-extract-results scripts

 The header of the python version looked like:
 Target is ...
 Host   is ...
 The header of the bash version looked like:
 Test run by ... on ...
 Target is ...

 After this change both headers look like:
 Test run by ... on ...
 Target is ...
 Host   is ...

 The order of the tests is not the same, but since the files are compared
 with dg-cmp-results.sh it does not matter much.

 contrib/ChangeLog:

 2023-09-14  Paul Iannetta  

 * dg-extract-results.py: Print the "Test run" line.
 * dg-extract-results.sh: Print the "Host" line.

OK
jeff


[PATCH] __atomic_test_and_set: Fall back to library, not non-atomic code

2023-09-26 Thread Hans-Peter Nilsson
Tested cris-elf, native x86_64-pc-linux-gnu and arm-eabi.

For arm-eabi, notably lacking any atomic support for the
default multilib, with --target_board=arm-sim it regressed
29_atomics/atomic_flag/cons/value_init.cc with the expected
linker failure due to lack of __atomic_test_and_set - which
is a good thing.  With this one, there are 44 unexpected
FAILs for libstdc++ at r14-4210-g94982a6b9cf4.  This number
was 206 as late as r14-3470-g721f7e2c4e5e, but mitigated by
r14-3980-g62b29347c38394, deliberately.  To fix the
regression, I'll do the same and follow up with adding
dg-require-thread-fence on
29_atomics/atomic_flag/cons/value_init.cc (and if approved,
commit it before this one).

Incidentally, the fortran test-results for arm-eabi are
riddled with missing-__sync_synchronize linker errors
causing some 18134 unexpected failures, where cris-elf has
121.

Ok to commit?

-- >8 --
Make __atomic_test_and_set consistent with other __atomic_ and __sync_
builtins: call a matching library function instead of emitting
non-atomic code when the target has no direct insn support.
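
As a minimal sketch of the user-visible effect (an illustration, not part
of the patch): on a target with no native atomic support, code like the
following now ends up calling the libatomic routine __atomic_test_and_set
instead of silently getting non-atomic code, so the link fails unless
libatomic (or an equivalent) provides the routine.

  static char lock_byte;

  int
  try_lock (void)
  {
    /* Previously this could expand to a plain load/store pair on such
       targets; now it becomes a library call.  */
    return !__atomic_test_and_set (&lock_byte, __ATOMIC_ACQUIRE);
  }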

There's special-case code handling targetm.atomic_test_and_set_trueval
!= 1 trying a modified maybe_emit_sync_lock_test_and_set.  Previously,
if that worked but its matching emit_store_flag_force returned NULL,
we'd segfault later on.  Now that the caller handles NULL, gcc_assert
here instead.

While the referenced PR:s are ARM-specific, the issue is general.

PR target/107567
PR target/109166
* builtins.cc (expand_builtin) :
Handle failure from expand_builtin_atomic_test_and_set.
* optabs.cc (expand_atomic_test_and_set): When all attempts fail to
generate atomic code through target support, return NULL
instead of emitting non-atomic code.  Also, for code handling
targetm.atomic_test_and_set_trueval != 1, gcc_assert result
from calling emit_store_flag_force instead of returning NULL.
---
 gcc/builtins.cc |  5 -
 gcc/optabs.cc   | 22 +++---
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 6e4274bb2a4e..40dfd36a3197 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -8387,7 +8387,10 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
machine_mode mode,
   break;
 
 case BUILT_IN_ATOMIC_TEST_AND_SET:
-  return expand_builtin_atomic_test_and_set (exp, target);
+  target = expand_builtin_atomic_test_and_set (exp, target);
+  if (target)
+   return target;
+  break;
 
 case BUILT_IN_ATOMIC_CLEAR:
   return expand_builtin_atomic_clear (exp);
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 8b96f23aec05..e1898da22808 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -7080,25 +7080,17 @@ expand_atomic_test_and_set (rtx target, rtx mem, enum 
memmodel model)
   /* Recall that the legacy lock_test_and_set optab was allowed to do magic
  things with the value 1.  Thus we try again without trueval.  */
   if (!ret && targetm.atomic_test_and_set_trueval != 1)
-ret = maybe_emit_sync_lock_test_and_set (subtarget, mem, const1_rtx, 
model);
-
-  /* Failing all else, assume a single threaded environment and simply
- perform the operation.  */
-  if (!ret)
 {
-  /* If the result is ignored skip the move to target.  */
-  if (subtarget != const0_rtx)
-emit_move_insn (subtarget, mem);
+  ret = maybe_emit_sync_lock_test_and_set (subtarget, mem, const1_rtx, 
model);
 
-  emit_move_insn (mem, trueval);
-  ret = subtarget;
+  if (ret)
+   {
+ /* Rectify the not-one trueval.  */
+ ret = emit_store_flag_force (target, NE, ret, const0_rtx, mode, 0, 1);
+ gcc_assert (ret);
+   }
 }
 
-  /* Recall that have to return a boolean value; rectify if trueval
- is not exactly one.  */
-  if (targetm.atomic_test_and_set_trueval != 1)
-ret = emit_store_flag_force (target, NE, ret, const0_rtx, mode, 0, 1);
-  
   return ret;
 }
 
-- 
2.30.2



[PATCH] testsuite: Require thread-fence for 29_atomics/atomic_flag/cons/value_init.cc

2023-09-26 Thread Hans-Peter Nilsson
Ok to commit?
-- >8 --
A recent patch made __atomic_test_and_set no longer fall
back to emitting non-atomic code, but instead will then emit
a call to __atomic_test_and_set, thereby exposing the need
to gate also this test on support for atomics, similar to
r14-3980-g62b29347c38394.

libstdc++-v3:
* testsuite/29_atomics/atomic_flag/cons/value_init.cc: Add
dg-require-thread-fence.
---
 libstdc++-v3/testsuite/29_atomics/atomic_flag/cons/value_init.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libstdc++-v3/testsuite/29_atomics/atomic_flag/cons/value_init.cc 
b/libstdc++-v3/testsuite/29_atomics/atomic_flag/cons/value_init.cc
index 084e2930f7e2..f3f38b54dbcd 100644
--- a/libstdc++-v3/testsuite/29_atomics/atomic_flag/cons/value_init.cc
+++ b/libstdc++-v3/testsuite/29_atomics/atomic_flag/cons/value_init.cc
@@ -16,6 +16,7 @@
 // .
 
 // { dg-do run { target c++20 } }
+// { dg-require-thread-fence "" }
 
 #include 
 #include 
-- 
2.30.2



[PATCH V2] RISC-V: Fix mem-to-mem VLS move pattern[PR111566]

2023-09-26 Thread Juzhe-Zhong
Splitting the mem-to-mem insn pattern away from the reg-to-mem/mem-to-reg/reg-to-reg
pattern causes an ICE in RA, since RA prefers that they stay together.

Now we handle mem-to-mem as a pure pre-RA split pattern and only let the
define_insn match mem-to-mem VLS moves in the pre-RA stage (mem-to-mem moves
are forbidden after RA).

Tested no difference. Committed.

PR target/111566

gcc/ChangeLog:

* config/riscv/vector.md (*mov_mem_to_mem): Only allow mem-to-mem 
move for VLS modes size <= MAX_BITS_PER_WORD

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/fortran/pr111566.f90: New test.

---
 gcc/config/riscv/vector.md| 60 ++-
 .../gcc.target/riscv/rvv/fortran/pr111566.f90 | 31 ++
 2 files changed, 62 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/fortran/pr111566.f90

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index d5300a33946..a6dbaa74a10 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1222,48 +1222,50 @@
 DONE;
 })
 
-(define_insn_and_split "*mov_mem_to_mem"
+;; Some VLS modes (like V2SImode) have size <= a general purpose
+;; register width, we optimize such mem-to-mem move into mem-to-mem
+;; scalar move.  Otherwise, we always force operands[1] into register
+;; so that we will never get mem-to-mem move after RA.
+(define_split
   [(set (match_operand:VLS_AVL_IMM 0 "memory_operand")
(match_operand:VLS_AVL_IMM 1 "memory_operand"))]
-  "TARGET_VECTOR && can_create_pseudo_p ()"
-  "#"
-  "&& 1"
+  "TARGET_VECTOR && can_create_pseudo_p ()
+   && GET_MODE_BITSIZE (mode).to_constant () <= MAX_BITS_PER_WORD"
   [(const_int 0)]
   {
-if (GET_MODE_BITSIZE (mode).to_constant () <= MAX_BITS_PER_WORD)
-  {
-/* Opitmize the following case:
-
-   typedef int8_t v2qi __attribute__ ((vector_size (2)));
-   v2qi v = *(v2qi*)in;
-   *(v2qi*)out = v;
-
-   We prefer scalar load/store instead of vle.v/vse.v when
-   the VLS modes size is smaller scalar mode.  */
-machine_mode mode;
-unsigned size = GET_MODE_BITSIZE (mode).to_constant ();
-if (FLOAT_MODE_P (mode))
- mode = mode_for_size (size, MODE_FLOAT, 0).require ();
-else
- mode = mode_for_size (size, MODE_INT, 0).require ();
-emit_move_insn (gen_lowpart (mode, operands[0]),
-   gen_lowpart (mode, operands[1]));
-  }
+/* Opitmize the following case:
+
+   typedef int8_t v2qi __attribute__ ((vector_size (2)));
+   v2qi v = *(v2qi*)in;
+   *(v2qi*)out = v;
+
+   We prefer scalar load/store instead of vle.v/vse.v when
+   the VLS modes size is smaller scalar mode.  */
+machine_mode mode;
+unsigned size = GET_MODE_BITSIZE (mode).to_constant ();
+if (FLOAT_MODE_P (mode))
+  mode = mode_for_size (size, MODE_FLOAT, 0).require ();
 else
-  {
-   operands[1] = force_reg (mode, operands[1]);
-   emit_move_insn (operands[0], operands[1]);
-  }
+  mode = mode_for_size (size, MODE_INT, 0).require ();
+emit_move_insn (gen_lowpart (mode, operands[0]),
+   gen_lowpart (mode, operands[1]));
 DONE;
   }
-  [(set_attr "type" "vmov")]
 )
 
+;; We recognize mem-to-mem move in pre-RA stage so that we won't have
+;; ICE (unrecognizable insn: (set (mem) (mem))).  Then, the previous
+;; mem-to-mem split pattern will force operands[1] into a register so
+;; that mem-to-mem move will never happen after RA.
+;;
+;; We don't allow mem-to-mem move in post-RA stage since we
+;; don't have an instruction to split mem-to-mem move after RA.
 (define_insn_and_split "*mov"
   [(set (match_operand:VLS_AVL_IMM 0 "reg_or_mem_operand" "=vr, m, vr")
(match_operand:VLS_AVL_IMM 1 "reg_or_mem_operand" "  m,vr, vr"))]
   "TARGET_VECTOR
-   && (register_operand (operands[0], mode)
+   && (can_create_pseudo_p ()
+   || register_operand (operands[0], mode)
|| register_operand (operands[1], mode))"
   "@
#
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/fortran/pr111566.f90 
b/gcc/testsuite/gcc.target/riscv/rvv/fortran/pr111566.f90
new file mode 100644
index 000..2e30dc9bfaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/fortran/pr111566.f90
@@ -0,0 +1,31 @@
+! { dg-do compile }
+! { dg-options "-march=rv64gcv -mabi=lp64d -Ofast -fallow-argument-mismatch 
-fmax-stack-var-size=65536 -S  -std=legacy -w" }
+
+module a
+  integer,parameter :: SHR_KIND_R8 = selected_real_kind(12)
+end module a
+module b
+  use a,  c => shr_kind_r8
+contains
+  subroutine d(cg , km, i1, i2)
+real (c) ch(i2,km)
+real (c) cg(4,i1:i2,km)
+real  dc(i2,km)
+real(c) ci(i2,km)
+real(c) cj(i2,km)
+do k=2,ck
+   do i=i1,0
+  cl = ci(i,k) *ci(i,1) /  cj(i,k)+ch(i,1)
+  cm = cg(1,i,k) - min(e,cg(1,i,co))
+  dc(i,k) = sign(cm, cl)
+   enddo
+enddo
+if ( cq == 0 ) then
+   do i=i1,i2
+  

Re: Re: [Committed] RISC-V: Fix mem-to-mem VLS move pattern[PR111566]

2023-09-26 Thread 钟居哲
Thanks Jeff.

Address comments:
[PATCH V2] RISC-V: Fix mem-to-mem VLS move pattern[PR111566] (gnu.org)

Actually, we only allow mem-to-mem move for VLS modes size <= MAX_BITS_PER_WORD.
Since we want to optimize this case:
-   typedef int8_t v2qi __attribute__ ((vector_size (2)));
-   v2qi v = *(v2qi*)in;
-   *(v2qi*)out = v;
using scalar load/store.

Does it look more reasonable ?


juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2023-09-26 21:35
To: Juzhe-Zhong; gcc-patches
CC: kito.cheng; kito.cheng; rdapp.gcc
Subject: Re: [Committed] RISC-V: Fix mem-to-mem VLS move pattern[PR111566]
 
 
On 9/26/23 07:15, Juzhe-Zhong wrote:
> The mem-to-mem insn pattern is splitted from reg-to-mem/mem-to-reg/reg-to-reg
> causes ICE in RA since RA prefer they stay together.
> 
> Now, we split mem-to-mem as a pure pre-RA split pattern and only allow
> define_insn match mem-to-mem VLS move in pre-RA stage (Forbid mem-to-mem move 
> after RA).
> 
> Tested no difference. Committed.
> 
> PR target/111566
> 
> gcc/ChangeLog:
> 
> * config/riscv/vector.md (*mov_mem_to_mem):
> 
> gcc/testsuite/ChangeLog:
> 
> * gcc.target/riscv/rvv/fortran/pr111566.f90: New test.
ChangeLog for the vector.md is missing.
 
In general we shouldn't be allowing mem->mem in most patterns since the 
hardware doesn't actually implement such instructions.  I suspect that's 
the real problem here and that ultimately you're just papering over it.
 
 
 
Jeff
 
 


Re: Re: [PATCH] RISC-V: Add opaque integer modes to fix ICE on DSE[PR111590]

2023-09-26 Thread 钟居哲
Thanks Richard.

Is it correct as follows ?

diff --git a/gcc/dse.cc b/gcc/dse.cc
index 8b07be17674..c58d3bf4e1b 100644
--- a/gcc/dse.cc
+++ b/gcc/dse.cc
@@ -1733,7 +1733,7 @@ find_shift_sequence (poly_int64 access_size,
   /* If a constant was stored into memory, try to simplify it here,
  otherwise the cost of the shift might preclude this optimization
  e.g. at -Os, even when no actual shift will be needed.  */
-  if (store_info->const_rhs)
+  if (store_info->const_rhs && known_le (access_size, GET_MODE_SIZE 
(MAX_MODE_INT)))

I failed to find native_encode_rtx and native_decode_rtx.



juzhe.zh...@rivai.ai
 
From: Richard Sandiford
Date: 2023-09-26 21:50
To: Juzhe-Zhong
CC: gcc-patches; kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Add opaque integer modes to fix ICE on 
DSE[PR111590]
Juzhe-Zhong  writes:
> When doing fortran test with 'V' extension enabled on RISC-V port.
> I saw multiple ICE: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111590
>
> The root cause is on DSE:
>
> internal compiler error: in smallest_mode_for_size, at stor-layout.cc:356
> 0x1918f70 smallest_mode_for_size(poly_int<2u, unsigned long>, mode_class)
> ../../../../gcc/gcc/stor-layout.cc:356
> 0x11f75bb smallest_int_mode_for_size(poly_int<2u, unsigned long>)
> ../../../../gcc/gcc/machmode.h:916
> 0x3304141 find_shift_sequence
> ../../../../gcc/gcc/dse.cc:1738
> 0x3304f1a get_stored_val
> ../../../../gcc/gcc/dse.cc:1906
> 0x3305377 replace_read
> ../../../../gcc/gcc/dse.cc:2010
> 0x3306226 check_mem_read_rtx
> ../../../../gcc/gcc/dse.cc:2310
> 0x330667b check_mem_read_use
> ../../../../gcc/gcc/dse.cc:2415
>
> After investigations, DSE is trying to do optimization like this following 
> codes:
>
> (insn 86 85 87 9 (set (reg:V4DI 168)
> (mem/u/c:V4DI (reg/f:DI 171) [0  S32 A128])) "bug.f90":6:18 discrim 6 
> 1167 {*movv4di}
>  (expr_list:REG_EQUAL (const_vector:V4DI [
> (const_int 4 [0x4])
> (const_int 1 [0x1]) repeated x2
> (const_int 3 [0x3])
> ])
> (nil)))
>
> (set (mem) (reg:V4DI 168))
>
> Then it ICE on: auto new_mode = smallest_int_mode_for_size (access_size * 
> BITS_PER_UNIT);
>
> The access_size may be 24 or 32. We don't have such integer modes with these 
> size so it ICE.
>
> I saw both aarch64 and ARM has EI/OI/CI/XI opaque modes. 
>
> So I add it to walk around ICE on DCE, it works as all ICE are resolved.
>
> CC Richard to review to make sure I am doing the right thing to fix the bug.
>
> Hi, Richard, could you help me with this issue ? Thanks.
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-modes.def (INT_MODE): Add opaque modes
 
I think it's a bug in dse.  The contract is:
 
/* Find the narrowest integer mode that contains at least SIZE bits.
   Such a mode must exist.  */
 
(emphasis on the last line).
 
The easy fix would be to add:
 
  && known_le (access_size, GET_MODE_SIZE (MAX_MODE_INT))
 
The better but more complex fix would be to make dse use native_encode_rtx/
native_decode_rtx (which IIRC didn't exist when the dse code was written).
 
Thanks,
Richard
 
 
>
> ---
>  gcc/config/riscv/riscv-modes.def | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/gcc/config/riscv/riscv-modes.def 
> b/gcc/config/riscv/riscv-modes.def
> index e3c6ccb2809..ab86032c914 100644
> --- a/gcc/config/riscv/riscv-modes.def
> +++ b/gcc/config/riscv/riscv-modes.def
> @@ -393,6 +393,12 @@ VLS_MODES (1024); /* V1024QI  V512HI  V256SI V128DI  
> V512HF  V256SF V128DF */
>  VLS_MODES (2048); /* V2048QI V1024HI  V512SI V256DI V1024HF  V512SF V256DF */
>  VLS_MODES (4096); /* V4096QI V2048HI V1024SI V512DI V2048HF V1024SF V512DF */
>  
> +/* Opaque integer modes 3, 4, 6 or 8 general double registers.  */
> +INT_MODE (EI, 24);
> +INT_MODE (OI, 32);
> +INT_MODE (CI, 48);
> +INT_MODE (XI, 64);
> +
>  /* TODO: According to RISC-V 'V' ISA spec, the maximun vector length can
> be 65536 for a single vector register which means the vector mode in
> GCC can be maximum = 65536 * 8 bits (LMUL=8).
 


Re: [PATCH] c++: non-static memfn call dependence cleanup

2023-09-26 Thread Krishna Narayanan
On Tue, Sep 26, 2023, 19:52 Patrick Palka  wrote:

> Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> trunk?
>
> -- >8 --
>
> In cp_parser_postfix_expression, we essentially repeat the
> type-dependent and COMPONENT_REF callee cases of finish_call_expr.
> This patch deduplicates this logic.
>
> gcc/cp/ChangeLog:
>
> * parser.cc (cp_parser_postfix_expression): Consolidate three
> calls to finish_call_expr, one to build_new_method_call and
> one to build_min_nt_call_vec into one call to finish_call_expr.
> * pt.cc (type_dependent_expression_p): Use t_d_object_e_p
> instead of t_d_e_p for COMPONENT_REF and OFFSET_REF.
>
> gcc/testsuite/ChangeLog:
>
> * g++.dg/template/crash127.C: Expect additional error due to
> being able to check the member access expression ahead of time.
> Strengthen the test by not instantiating the class template.
> ---
>  gcc/cp/parser.cc | 60 ++--
>  gcc/cp/pt.cc |  2 +-
>  gcc/testsuite/g++.dg/template/crash127.C |  3 +-
>  3 files changed, 16 insertions(+), 49 deletions(-)
>
> diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
> index f3abae716fe..78082ee7284 100644
> --- a/gcc/cp/parser.cc
> +++ b/gcc/cp/parser.cc
> @@ -8047,54 +8047,12 @@ cp_parser_postfix_expression (cp_parser *parser,
> bool address_p, bool cast_p,
> close_paren_loc);
> iloc_sentinel ils (combined_loc);
>
> -   if (TREE_CODE (postfix_expression) == COMPONENT_REF)
> - {
> -   tree instance = TREE_OPERAND (postfix_expression, 0);
> -   tree fn = TREE_OPERAND (postfix_expression, 1);
> -
> -   if (processing_template_decl
> -   && (type_dependent_object_expression_p (instance)
> -   || (!BASELINK_P (fn)
> -   && TREE_CODE (fn) != FIELD_DECL)
> -   || type_dependent_expression_p (fn)
> -   || any_type_dependent_arguments_p (args)))
> - {
> -   maybe_generic_this_capture (instance, fn);
> -   postfix_expression
> - = build_min_nt_call_vec (postfix_expression, args);
> - }
> -   else if (BASELINK_P (fn))
> - {
> - postfix_expression
> -   = (build_new_method_call
> -  (instance, fn, &args, NULL_TREE,
> -   (idk == CP_ID_KIND_QUALIFIED
> -? LOOKUP_NORMAL|LOOKUP_NONVIRTUAL
> -: LOOKUP_NORMAL),
> -   /*fn_p=*/NULL,
> -   complain));
> - }
> -   else
> - postfix_expression
> -   = finish_call_expr (postfix_expression, &args,
> -   /*disallow_virtual=*/false,
> -   /*koenig_p=*/false,
> -   complain);
> - }
> -   else if (TREE_CODE (postfix_expression) == OFFSET_REF
> -|| TREE_CODE (postfix_expression) == MEMBER_REF
> -|| TREE_CODE (postfix_expression) == DOTSTAR_EXPR)
> +   if (TREE_CODE (postfix_expression) == OFFSET_REF
> +   || TREE_CODE (postfix_expression) == MEMBER_REF
> +   || TREE_CODE (postfix_expression) == DOTSTAR_EXPR)
>   postfix_expression = (build_offset_ref_call_from_tree
> (postfix_expression, &args,
>  complain));
> -   else if (idk == CP_ID_KIND_QUALIFIED)
> - /* A call to a static class member, or a namespace-scope
> -function.  */
> - postfix_expression
> -   = finish_call_expr (postfix_expression, &args,
> -   /*disallow_virtual=*/true,
> -   koenig_p,
> -   complain);
> else
>   /* All other function calls.  */
>   {
> @@ -8107,12 +8065,22 @@ cp_parser_postfix_expression (cp_parser *parser,
> bool address_p, bool cast_p,
>"not permitted in intervening code");
> parser->omp_for_parse_state->fail = true;
>   }
> +   bool disallow_virtual = (idk == CP_ID_KIND_QUALIFIED);
> postfix_expression
>   = finish_call_expr (postfix_expression, &args,
> - /*disallow_virtual=*/false,
> + disallow_virtual,
>   koenig_p,
>   complain);
> +
> +   if (type_dependent_expression_p (postfix_expre

Re: [Committed] RISC-V: Fix mem-to-mem VLS move pattern[PR111566]

2023-09-26 Thread Jeff Law




On 9/26/23 08:51, 钟居哲 wrote:

Thanks Jeff.

Address comments:
[PATCH V2] RISC-V: Fix mem-to-mem VLS move pattern[PR111566] (gnu.org) 



Actually, we only allow mem-to-mem move for VLS modes size <= 
MAX_BITS_PER_WORD.

Since we want to optimize this case:

-   typedef int8_t v2qi __attribute__ ((vector_size (2)));
-   v2qi v = *(v2qi*)in;
-   *(v2qi*)out = v;

using scalar load/store.
That should be do-able without resorting to a pattern that allowed 
mem->mem moves.


The thing you have to be careful about is that in the effort to optimize this 
case, you can end up confusing the register allocator into making poor 
choices elsewhere.  ie, once you expose a small vector move as 
implementable in GPRs you run the very real risk of pessimizing other code.


But even with that caveat, the better way to go here is to disallow the 
mem->mem case.


jeff


Re: [PATCH] testsuite: Require thread-fence for 29_atomics/atomic_flag/cons/value_init.cc

2023-09-26 Thread Jeff Law




On 9/26/23 08:40, Hans-Peter Nilsson wrote:

Ok to commit?
-- >8 --
A recent patch made __atomic_test_and_set no longer fall
back to emitting non-atomic code, but instead will then emit
a call to __atomic_test_and_set, thereby exposing the need
to gate also this test on support for atomics, similar to
r14-3980-g62b29347c38394.

libstdc++-v3:
* testsuite/29_atomics/atomic_flag/cons/value_init.cc: Add
dg-require-thread-fence.

OK.

Jeff

ps.  Missed you at the Cauldron



Re: [PATCH] __atomic_test_and_set: Fall back to library, not non-atomic code

2023-09-26 Thread Jeff Law




On 9/26/23 08:34, Hans-Peter Nilsson wrote:

Tested cris-elf, native x86_64-pc-linux-gnu and arm-eabi.

For arm-eabi, notably lacking any atomic support for the
default multilib, with --target_board=arm-sim it regressed
29_atomics/atomic_flag/cons/value_init.cc with the expected
linker failure due to lack of __atomic_test_and_set - which
is a good thing.  With this one, there are 44 unexpected
FAILs for libstdc++ at r14-4210-g94982a6b9cf4.  This number
was 206 as late as r14-3470-g721f7e2c4e5e, but mitigated by
r14-3980-g62b29347c38394, deliberately.  To fix the
regression, I'll do the same and follow up with adding
dg-require-thread-fence on
29_atomics/atomic_flag/cons/value_init.cc (and if approved,
commit it before this one).

Incidentally, the fortran test-results for arm-eabi are
riddled with missing-__sync_synchronize linker errors
causing some 18134 unexpected failures, where cris-elf has
121.

Ok to commit?

-- >8 --
Make __atomic_test_and_set consistent with other __atomic_ and __sync_
builtins: call a matching library function instead of emitting
non-atomic code when the target has no direct insn support.

There's special-case code handling targetm.atomic_test_and_set_trueval
!= 1 trying a modified maybe_emit_sync_lock_test_and_set.  Previously,
if that worked but its matching emit_store_flag_force returned NULL,
we'd segfault later on.  Now that the caller handles NULL, gcc_assert
here instead.

While the referenced PR:s are ARM-specific, the issue is general.

PR target/107567
PR target/109166
* builtins.cc (expand_builtin) :
Handle failure from expand_builtin_atomic_test_and_set.
* optabs.cc (expand_atomic_test_and_set): When all attempts fail to
generate atomic code through target support, return NULL
instead of emitting non-atomic code.  Also, for code handling
targetm.atomic_test_and_set_trueval != 1, gcc_assert result
from calling emit_store_flag_force instead of returning NULL.

OK
jeff


[PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]

2023-09-26 Thread Juzhe-Zhong


PR target/111566

gcc/ChangeLog:

* config/riscv/vector.md (*mov_mem_to_mem): Remove.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/mov-1.c: Adapt test.
* gcc.target/riscv/rvv/autovec/vls/mov-10.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/mov-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/mov-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/mov-7.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/mov-8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/mov-9.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/mov-2.c: Removed.
* gcc.target/riscv/rvv/autovec/vls/mov-4.c: Removed.
* gcc.target/riscv/rvv/autovec/vls/mov-6.c: Removed.
* gcc.target/riscv/rvv/fortran/pr111566.f90: New test.

---
 gcc/config/riscv/vector.md| 40 +---
 .../gcc.target/riscv/rvv/autovec/vls/mov-1.c  | 48 ---
 .../gcc.target/riscv/rvv/autovec/vls/mov-10.c | 12 -
 .../gcc.target/riscv/rvv/autovec/vls/mov-2.c  | 19 
 .../gcc.target/riscv/rvv/autovec/vls/mov-3.c  | 36 --
 .../gcc.target/riscv/rvv/autovec/vls/mov-4.c  | 19 
 .../gcc.target/riscv/rvv/autovec/vls/mov-5.c  | 24 --
 .../gcc.target/riscv/rvv/autovec/vls/mov-6.c  | 19 
 .../gcc.target/riscv/rvv/autovec/vls/mov-7.c  | 12 -
 .../gcc.target/riscv/rvv/autovec/vls/mov-8.c  | 36 --
 .../gcc.target/riscv/rvv/autovec/vls/mov-9.c  | 24 --
 .../gcc.target/riscv/rvv/fortran/pr111566.f90 | 31 
 12 files changed, 33 insertions(+), 287 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c
 delete mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-4.c
 delete mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-6.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/fortran/pr111566.f90

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index d5300a33946..57205025ff8 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1222,48 +1222,12 @@
 DONE;
 })
 
-(define_insn_and_split "*mov_mem_to_mem"
-  [(set (match_operand:VLS_AVL_IMM 0 "memory_operand")
-   (match_operand:VLS_AVL_IMM 1 "memory_operand"))]
-  "TARGET_VECTOR && can_create_pseudo_p ()"
-  "#"
-  "&& 1"
-  [(const_int 0)]
-  {
-if (GET_MODE_BITSIZE (mode).to_constant () <= MAX_BITS_PER_WORD)
-  {
-/* Opitmize the following case:
-
-   typedef int8_t v2qi __attribute__ ((vector_size (2)));
-   v2qi v = *(v2qi*)in;
-   *(v2qi*)out = v;
-
-   We prefer scalar load/store instead of vle.v/vse.v when
-   the VLS modes size is smaller scalar mode.  */
-machine_mode mode;
-unsigned size = GET_MODE_BITSIZE (mode).to_constant ();
-if (FLOAT_MODE_P (mode))
- mode = mode_for_size (size, MODE_FLOAT, 0).require ();
-else
- mode = mode_for_size (size, MODE_INT, 0).require ();
-emit_move_insn (gen_lowpart (mode, operands[0]),
-   gen_lowpart (mode, operands[1]));
-  }
-else
-  {
-   operands[1] = force_reg (mode, operands[1]);
-   emit_move_insn (operands[0], operands[1]);
-  }
-DONE;
-  }
-  [(set_attr "type" "vmov")]
-)
-
 (define_insn_and_split "*mov"
   [(set (match_operand:VLS_AVL_IMM 0 "reg_or_mem_operand" "=vr, m, vr")
(match_operand:VLS_AVL_IMM 1 "reg_or_mem_operand" "  m,vr, vr"))]
   "TARGET_VECTOR
-   && (register_operand (operands[0], mode)
+   && (can_create_pseudo_p ()
+   || register_operand (operands[0], mode)
|| register_operand (operands[1], mode))"
   "@
#
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
index aedf98819bb..24bb7240db8 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
@@ -4,54 +4,6 @@
 
 #include "def.h"
 
-/*
-** mov0:
-** lbu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
-** sb\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
-**  ret
-*/
-void mov0 (int8_t *in, int8_t *out)
-{
-  v1qi v = *(v1qi*)in;
-  *(v1qi*)out = v;
-}
-
-/*
-** mov1:
-** lhu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
-** sh\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
-**  ret
-*/
-void mov1 (int8_t *in, int8_t *out)
-{
-  v2qi v = *(v2qi*)in;
-  *(v2qi*)out = v;
-}
-
-/*
-** mov2:
-** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
-** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
-**  ret
-*/
-void mov2 (int8_t *in, int8_t *out)
-{
-  v4qi v = *(v4qi*)in;
-  *(v4qi*)out = v;
-}
-
-/*
-** mov3:
-** ld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
-** sd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
-**  ret
-*/
-void mov3 (int8_t *in, int8_t *out)
-{
-  v8qi v = *(v8qi*)in;
-  *(v8qi*)out = v;
-}
-
 /*
 ** mov4:
 ** vsetivli\s+zero,\s*16,\s*e8,\s*mf8,\s*t[au],\s*m[au]
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c 
b/gcc/testsuite/gc

Re: Re: [Committed] RISC-V: Fix mem-to-mem VLS move pattern[PR111566]

2023-09-26 Thread 钟居哲
OK。

Remove mem-to-mem pattern:
[PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566] (gnu.org)




juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2023-09-26 23:15
To: 钟居哲; gcc-patches
CC: kito.cheng; kito.cheng; rdapp.gcc
Subject: Re: [Committed] RISC-V: Fix mem-to-mem VLS move pattern[PR111566]
 
 
On 9/26/23 08:51, 钟居哲 wrote:
> Thanks Jeff.
> 
> Address comments:
> [PATCH V2] RISC-V: Fix mem-to-mem VLS move pattern[PR111566] (gnu.org) 
> 
> 
> Actually, we only allow mem-to-mem move for VLS modes size <= 
> MAX_BITS_PER_WORD.
> Since we want to optimize this case:
> 
> - typedef int8_t v2qi __attribute__ ((vector_size (2)));
> - v2qi v = *(v2qi*)in;
> - *(v2qi*)out = v;
> 
> using scalar load/store.
That should be do-able without resorting to a pattern that allowed 
mem->mem moves.
 
The thing you have to be careful about is that in the effort to optimize this 
case, you can end up confusing the register allocator into making poor 
choices elsewhere.  ie, once you expose a small vector move as 
implementable in GPRs you run the very real risk of pessimizing other code.
 
But even with that caveat, the better way to go here is to disallow the 
mem->mem case.
 
jeff
 


[PATCH 2/1] c++: more non-static memfn call dependence cleanup [PR106086]

2023-09-26 Thread Patrick Palka
Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK
for trunk?

-- >8 --

This follow-up patch removes some more repetition of the type-dependent
case of finish_call_expr, this time in tsubst_copy_and_build.  This
allows us to easily fix PR106086 -- where we neglect to capture 'this'
when a use of a non-static member function of the current instantiation
is resolved only at lambda regeneration time -- by moving the call to
maybe_generic_this_capture from the parser to finish_call_expr so that
we attempt the capture at regeneration time as well.
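
As a sketch of the kind of code this is about (an illustration, not the
new lambda-generic-this5.C testcase): the call to memfn below is only
resolved when the generic lambda is regenerated for a concrete T, and
'this' still has to be captured implicitly at that point.

  template <class T>
  struct A
  {
    void memfn (T) {}

    void f ()
    {
      auto l = [=] (auto x) { memfn (x); };  /* needs an implicit 'this' capture */
      l (T ());
    }
  };

  int main ()
  {
    A<int> a;
    a.f ();
  }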

PR c++/106086

gcc/cp/ChangeLog:

* parser.cc (cp_parser_postfix_expression): Don't call
maybe_generic_this_capture here.
* pt.cc (tsubst_copy_and_build) : Remove
COMPONENT_REF callee handling.
* semantics.cc (finish_call_expr): In the type-dependent case,
call maybe_generic_this_capture here instead.

gcc/testsuite/ChangeLog:

* g++.dg/cpp1y/lambda-generic-this5.C: New test.
---
 gcc/cp/parser.cc  |  8 --
 gcc/cp/pt.cc  | 25 ---
 gcc/cp/semantics.cc   | 12 ++---
 .../g++.dg/cpp1y/lambda-generic-this5.C   | 22 
 4 files changed, 30 insertions(+), 37 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp1y/lambda-generic-this5.C

diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index 78082ee7284..b00ef36b831 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -8071,14 +8071,6 @@ cp_parser_postfix_expression (cp_parser *parser, bool 
address_p, bool cast_p,
  disallow_virtual,
  koenig_p,
  complain);
-
-   if (type_dependent_expression_p (postfix_expression))
- {
-   tree fn = CALL_EXPR_FN (postfix_expression);
-   if (TREE_CODE (fn) == COMPONENT_REF)
- maybe_generic_this_capture (TREE_OPERAND (fn, 0),
- TREE_OPERAND (fn, 1));
- }
  }
 
if (close_paren_loc != UNKNOWN_LOCATION)
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index b19b634690a..4400d429b6f 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -21364,31 +21364,6 @@ tsubst_copy_and_build (tree t,
 || TREE_CODE (function) == MEMBER_REF)
  ret = build_offset_ref_call_from_tree (function, &call_args,
 complain);
-   else if (TREE_CODE (function) == COMPONENT_REF)
- {
-   tree instance = TREE_OPERAND (function, 0);
-   tree fn = TREE_OPERAND (function, 1);
-
-   if (processing_template_decl
-   && (type_dependent_expression_p (instance)
-   || (!BASELINK_P (fn)
-   && TREE_CODE (fn) != FIELD_DECL)
-   || type_dependent_expression_p (fn)
-   || any_type_dependent_arguments_p (call_args)))
- ret = build_min_nt_call_vec (function, call_args);
-   else if (!BASELINK_P (fn))
- ret = finish_call_expr (function, &call_args,
-  /*disallow_virtual=*/false,
-  /*koenig_p=*/false,
-  complain);
-   else
- ret = (build_new_method_call
- (instance, fn,
-  &call_args, NULL_TREE,
-  qualified_p ? LOOKUP_NONVIRTUAL : LOOKUP_NORMAL,
-  /*fn_p=*/NULL,
-  complain));
- }
else if (concept_check_p (function))
  {
/* FUNCTION is a template-id referring to a concept definition.  */
diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
index 1d478f0781f..412eaa12851 100644
--- a/gcc/cp/semantics.cc
+++ b/gcc/cp/semantics.cc
@@ -2793,18 +2793,19 @@ finish_call_expr (tree fn, vec **args, 
bool disallow_virtual,
 (c++/89780, c++/107363).  This also suppresses the
 -Wredundant-move warning.  */
  suppress_warning (result, OPT_Wpessimizing_move);
- if (is_overloaded_fn (fn))
-   fn = get_fns (fn);
 
  if (cfun)
{
  bool abnormal = true;
- for (lkp_iterator iter (fn); abnormal && iter; ++iter)
+ for (lkp_iterator iter (maybe_get_fns (fn)); iter; ++iter)
{
  tree fndecl = STRIP_TEMPLATE (*iter);
  if (TREE_CODE (fndecl) != FUNCTION_DECL
  || !TREE_THIS_VOLATILE (fndecl))
-   abnormal = false;
+   {
+ abnormal = false;
+ break;
+   }
}
  /* FIXME: Stop warning abou

Re: [PATCH 01/12] [contrib] validate_failures.py: Avoid testsuite aliasing

2023-09-26 Thread Bernhard Reutner-Fischer
Hi Maxim!

On Mon, 5 Jun 2023 18:06:25 +0400
Maxim Kuvyrkov via Gcc-patches  wrote:

> > On Jun 3, 2023, at 19:17, Jeff Law  wrote:
> > 
> > On 6/2/23 09:20, Maxim Kuvyrkov via Gcc-patches wrote:  
> >> This patch adds tracking of current testsuite "tool" and "exp"
> >> to the processing of .sum files.  This avoids aliasing between
> >> tests from different testsuites with same name+description.
> >> E.g., this is necessary for testsuite/c-c++-common, which is ran
> >> for both gcc and g++ "tools".
> >> This patch changes manifest format from ...
> >> 
> >> FAIL: gcc_test
> >> FAIL: g++_test
> >> 
> >> ... to ...
> >> 
> >> === gcc tests ===
> >> Running gcc/foo.exp ...
> >> FAIL: gcc_test
> >> === gcc Summary ==
> >> === g++ tests ===
> >> Running g++/bar.exp ...
> >> FAIL: g++_test
> >> === g++ Summary ==
> >> .
> >> The new format uses same formatting as DejaGnu's .sum files
> >> to specify which "tool" and "exp" the test belongs to.  
> > I think the series is fine.  You're not likely to hear from Diego or Doug I 
> > suspect, I don't think either are involved in GNU stuff anymore.
> >   
> 
> Thanks, Jeff.  I'll wait for a couple of days and will merge if there are no 
> new comments.

Maxim, may I ask you to have a look at the following problem, please?

ISTM that your exp code does not work as expected for go; maybe you
forgot to test the changes with go enabled?

Ever since your changes in summer i see the following:

gcc-14.mine$ 
/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py 
--clean_build ../gcc-14.orig/
Getting actual results from build directory .
./gcc/testsuite/go/go.sum
./gcc/testsuite/gcc/gcc.sum
./gcc/testsuite/objc/objc.sum
./gcc/testsuite/jit/jit.sum
./gcc/testsuite/gdc/gdc.sum
./gcc/testsuite/gnat/gnat.sum
./gcc/testsuite/ada/acats/acats.sum
./gcc/testsuite/g++/g++.sum
./gcc/testsuite/obj-c++/obj-c++.sum
./gcc/testsuite/rust/rust.sum
./gcc/testsuite/gfortran/gfortran.sum
./x86_64-pc-linux-gnu/libgomp/testsuite/libgomp.sum
./x86_64-pc-linux-gnu/libphobos/testsuite/libphobos.sum
./x86_64-pc-linux-gnu/libstdc++-v3/testsuite/libstdc++.sum
./x86_64-pc-linux-gnu/libffi/testsuite/libffi.sum
./x86_64-pc-linux-gnu/libitm/testsuite/libitm.sum
./x86_64-pc-linux-gnu/libgo/libgo.sum
./x86_64-pc-linux-gnu/libatomic/testsuite/libatomic.sum
./gotools/gotools.sum
.sum file seems to be broken: tool="gotools", exp="None", summary_line="FAIL: 
TestScript"
Traceback (most recent call last):
  File 
"/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
line 732, in 
retval = Main(sys.argv)
  File 
"/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
line 721, in Main
retval = CompareBuilds()
  File 
"/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
line 622, in CompareBuilds
actual = GetResults(sum_files)
  File 
"/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
line 466, in GetResults
build_results.update(ParseSummary(sum_fname))
  File 
"/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
line 405, in ParseSummary
result = result_set.MakeTestResult(line, ordinal)
  File 
"/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
line 239, in MakeTestResult
return TestResult(summary_line, ordinal,
  File 
"/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
line 151, in __init__
raise
RuntimeError: No active exception to reraise


The problem seems to be that gotools.sum does not mention any ".exp"
files.

$ grep "Running " gotools/gotools.sum 
Running cmd/go
Running runtime
Running cgo
Running carchive
Running cmd/vet
Running embed
$ grep -c "\.exp" gotools/gotools.sum 
0

The .sum files looks like this:
---8<---
Test Run By foo on Tue Sep 26 14:46:48 CEST 2023
Native configuration is x86_64-foo-linux-gnu

=== gotools tests ===

Running cmd/go
UNTESTED: TestAccidentalGitCheckout
PASS: TestAlwaysLinkSysoFiles
...
UNTESTED: TestParallelTest
FAIL: TestScript
...
---8<---

May I ask you to have a look, please?

TIA,


[PATCH] RFC: Add late-combine pass [PR106594]

2023-09-26 Thread Richard Sandiford
This patch adds a combine pass that runs late in the pipeline.
There are two instances: one between combine and split1, and one
after postreload.

The pass currently has a single objective: remove definitions by
substituting into all uses.  The pre-RA version tries to restrict
itself to cases that are likely to have a neutral or beneficial
effect on register pressure.
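
As a rough source-level analogy (the pass itself works on RTL
instructions, not C code), removing a definition by substituting it into
all uses turns something like the first function below into the second,
with the separate definition of t disappearing into its uses:

  int before (int a, int b, int c)
  {
    int t = -a;                  /* single definition of t */
    return (b + t) * (c + t);    /* two uses of t */
  }

  int after (int a, int b, int c)
  {
    return (b - a) * (c - a);    /* t folded into each use, definition removed */
  }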

The patch fixes PR106594.  It also fixes a few FAILs and XFAILs
in the aarch64 test results, mostly due to making proper use of
MOVPRFX in cases where we didn't previously.  I hope it would
also help with Robin's vec_duplicate testcase, although the
pressure heuristic might need tweaking for that case.

This is just a first step.  I'm hoping that the pass could be
used for other combine-related optimisations in future.  In particular,
the post-RA version doesn't need to restrict itself to cases where all
uses are substitutable, since it doesn't have to worry about register
pressure.  If we did that, and if we extended it to handle multi-register
REGs, the pass might be a viable replacement for regcprop, which in
turn might reduce the cost of having a post-RA instance of the new pass.

I've run an assembly comparison with one target per CPU directory,
and it seems to be a win for all targets except nvptx (which is hard
to measure, being a higher-level asm).  The biggest winner seemed
to be AVR.

However, if a version of the pass does go in, it might be better
to enable it by default only on targets where the extra compile
time seems to be worth it.  IMO, fixing PR106594 and the MOVPRFX
issues makes it worthwhile for AArch64.

The patch contains various bug fixes and new helper routines.
I'd submit those separately in the final version.  Because of
that, there's no GNU changelog yet.

Bootstrapped & regression tested on aarch64-linux-gnu so far.

Thanks,
Richard


---
 gcc/Makefile.in   |   2 +
 gcc/config/aarch64/aarch64.cc |  25 +
 gcc/config/aarch64/atomics.md |   2 +-
 gcc/late-combine.cc   | 706 ++
 gcc/passes.def|   2 +
 gcc/recog.cc  |  42 +-
 gcc/reload.cc |   6 -
 gcc/rtl-ssa.h |   1 +
 gcc/rtl-ssa/access-utils.h|  65 +-
 gcc/rtl-ssa/accesses.cc   |  91 +++
 gcc/rtl-ssa/blocks.cc |   4 +
 gcc/rtl-ssa/changes.cc|  74 +-
 gcc/rtl-ssa/functions.cc  |   2 +-
 gcc/rtl-ssa/functions.h   |  15 +
 gcc/rtl-ssa/insns.cc  |   2 +
 gcc/rtl-ssa/member-fns.inl|  11 +-
 gcc/rtl-ssa/movement.cc   |  40 +
 gcc/rtl-ssa/movement.h|   7 +-
 gcc/testsuite/gcc.dg/ira-shrinkwrap-prep-1.c  |   2 +-
 gcc/testsuite/gcc.dg/stack-check-4.c  |   2 +-
 .../gcc.target/aarch64/sve/cond_asrd_3.c  |  10 +-
 .../gcc.target/aarch64/sve/cond_convert_3.c   |   8 +-
 .../gcc.target/aarch64/sve/cond_convert_6.c   |   8 +-
 .../gcc.target/aarch64/sve/cond_fabd_5.c  |  11 +-
 .../gcc.target/aarch64/sve/cond_unary_4.c |  13 +-
 gcc/tree-pass.h   |   1 +
 26 files changed, 1059 insertions(+), 93 deletions(-)
 create mode 100644 gcc/late-combine.cc
 create mode 100644 gcc/rtl-ssa/movement.cc

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 6d608db4dd2..b5dc3b9ed47 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1542,6 +1542,7 @@ OBJS = \
ira-lives.o \
jump.o \
langhooks.o \
+   late-combine.o \
lcm.o \
lists.o \
loop-doloop.o \
@@ -1623,6 +1624,7 @@ OBJS = \
rtl-ssa/changes.o \
rtl-ssa/functions.o \
rtl-ssa/insns.o \
+   rtl-ssa/movement.o \
rtl-tests.o \
rtl.o \
rtlhash.o \
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 219c4ee6d4c..d2ac657e017 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15495,6 +15495,28 @@ aarch64_memory_move_cost (machine_mode mode, 
reg_class_t rclass_i, bool in)
  : aarch64_tune_params.memmov_cost.store_int);
 }
 
+/* Implement TARGET_INSN_COST.  We have the opportunity to do something
+   much more productive here, such as using insn attributes to cost things.
+   But we don't, not yet.
+
+   The main point of this current definition is to make calling insn_cost
+   on one instruction equivalent to calling seq_cost on a sequence that
+   contains only that instruction.  The default definition would instead
+   only look at SET_SRCs, ignoring SET_DESTs.
+
+   This ensures that, for example, storing a 128-bit zero vector is more
+   expensive than storing a 128-bit vector register.  A move of zero
+   into a 128-bit vector register followed by multiple stores of that
+   registe

Re: RISC-V: Added support for CRC.

2023-09-26 Thread Alexander Monakov


On Tue, 26 Sep 2023, Jeff Law wrote:

> What ultimately pushed us to keep moving forward on this effort was
> discovering numerous CRC loop implementations out in the wild, including 4
> implementations (IIRC) in the kernel itself.

The kernel employs bitwise CRC only in look-up table generators.
Which run at build time. You are quite literally slowing down the compiler
in order to speed up generators that don't account for even one millisecond
of kernel build time, and have no relation to its run-time performance.

(incidentally you can't detect the actual CRC impls using those tables)
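
For illustration (a generic sketch, not code taken from the kernel),
those generators are loops of the following shape; they run once at
build time, so speeding them up does not affect the run-time,
table-driven CRC that actually processes data:

  #include <stdint.h>

  static uint32_t crc32_table[256];

  static void
  crc32_init_table (void)
  {
    for (uint32_t i = 0; i < 256; i++)
      {
        uint32_t crc = i;
        for (int k = 0; k < 8; k++)
          crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
        crc32_table[i] = crc;
      }
  }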

> And as I've stated before, the latency of clmuls is dropping.   I wouldn't be
> terribly surprised to see single cycle clmul implmementations showing up
> within the next 18-24 months.  It's really just a matter of gate budget vs
> expected value.

In a commercial implementation? I'll take that bet. You spend gates budget
like that after better avenues for raising ILP are exhausted (like adding
more ALUs that can do clmul at a reasonable 3c-4c latency).

> To reiterate the real goal here is to take code as-is and make it
> significantly faster.

Which code? Table generators in the kernel and xz-utils? 

> While the original target was Coremark, we've found similar bitwise
> implementations of CRCs all over the place. There's no good reason that code
> should have to change.

But did you look at them? There's no point to optimize table generators either.

Alexander


[PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andre Vieira (lists)
The const attribute is ignored when simdclones are used inbranch.  This 
is due to the fact that when analyzing a MASK_CALL we were not looking 
at the targeted function for flags, but instead only at the internal 
function call itself.
This patch adds code to make sure we look at the target function to 
check for the const attribute and enables the autovectorization of 
inbranch const simdclones without needing the loop to be adorned with 
the 'openmp simd' pragma.


Not sure about how to add new includes to the ChangeLog. Which brings me 
to another point, I contemplated changing gimple_call_flags to do the 
checking of flags of the first argument of IFN_MASK_CALL itself rather 
than only calling internal_fn_flags on gimple_call_internal_fn (stmt), 
but that might be a bit too intrusive, opinions welcome :)


Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


Is this OK for trunk?

gcc/ChangeLog:

* tree-data-ref.cc (include calls.h): Add new include.
(get_references_in_stmt): Correctly handle IFN_MASK_CALL.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-simd-clone-19.c: New test.diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
new file mode 100644
index 
..09127b8cb6f2e3699b6073591f58be7047330273
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+/* { dg-additional-options "-fopenmp-simd" } */
+
+int __attribute__ ((__simd__, const)) fn (int);
+
+void test (int * __restrict__ a, int * __restrict__ b, int n)
+{
+  for (int i = 0; i < n; ++i)
+{
+  int a_;
+  if (b[i] > 0)
+a_ = fn (b[i]);
+  else
+a_ = b[i] + 5;
+  a[i] = a_;
+}
+}
+
+/* { dg-final { scan-tree-dump-not {loop contains function calls or data 
references} "vect" } } */
+
+/* The LTO test produces two dump files and we scan the wrong one.  */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 
6d3b7c2290e4db9c1168a4c763facb481157c97c..2926c3925ee7897fef53c16cfd1d19d23dbf05f3
 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "vr-values.h"
 #include "range-op.h"
 #include "tree-ssa-loop-ivopts.h"
+#include "calls.h"
 
 static struct datadep_stats
 {
@@ -5816,6 +5817,18 @@ get_references_in_stmt (gimple *stmt, vec *references)
}
  case IFN_MASK_LOAD:
  case IFN_MASK_STORE:
+ case IFN_MASK_CALL:
+   {
+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl)
+   {
+ clobbers_memory = true;
+ break;
+   }
+ if ((flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }
break;
  default:
clobbers_memory = true;
@@ -5852,7 +5865,7 @@ get_references_in_stmt (gimple *stmt, vec *references)
 }
   else if (stmt_code == GIMPLE_CALL)
 {
-  unsigned i, n;
+  unsigned i  = 0, n;
   tree ptr, type;
   unsigned int align;
 
@@ -5879,13 +5892,15 @@ get_references_in_stmt (gimple *stmt, vec *references)
   ptr);
references->safe_push (ref);
return false;
+ case IFN_MASK_CALL:
+   i = 1;
  default:
break;
  }
 
   op0 = gimple_call_lhs (stmt);
   n = gimple_call_num_args (stmt);
-  for (i = 0; i < n; i++)
+  for (; i < n; i++)
{
  op1 = gimple_call_arg (stmt, i);
 


Re: [PATCH] fwprop: Allow UNARY_P and check register pressure.

2023-09-26 Thread Richard Sandiford
Robin Dapp via Gcc-patches  writes:
> Thanks for looking at it in detail.
>
>> Yeah, I think this is potentially a blocker for propagating A into B
>> when A is used elsewhere.  Combine is able to combine A and B while
>> keeping A in parallel with the result.  I think either fwprop would
>> need to try that too, or it would need to be restricted to cases where A
>> is only used in B.
>
> That seems a rather severe limitation and my original use case would
> not get optimized considerably anymore.  The intention was to replace
> all uses (if register pressure allows).  Of course the example is simple
> enough that a propagation is always useful if the costs allow it, so
> it might not be representative.
>
> I'm wondering if we could (my original misunderstanding) tentatively
> try to propagate into all uses of a definition and, when reaching
> a certain ratio, decide that it might be worth it, otherwise revert.
> Would be very crude though, and not driven by the actual problem we're
> trying to avoid. 
>
>> I think the summary is:
>> 
>> IMO, we have to be mindful that combine is still to run.  We need to
>> avoid making equal-cost changes if the new form is more complex, or
>> otherwise likely to interfere with combine.
>
> I guess we don't have a good measure for complexity or "combinability"
> and even lower-cost changes could result in worse options later.
> Would it make sense to have a strict less-than cost policy for those
> more complex propagations?  Or do you consider the approach in its
> current shape "hopeless", given the complications we discussed?
>
>> Alternatively, we could delay the optimisation until after combine
>> and have freer rein, since we're then just mopping up opportunities
>> that other passes left behind.
>> 
>> A while back I was experimenting with a second combine pass.  That was
>> the original motiviation for rtl-ssa.  I never got chance to finish it
>> off though.
>
> This doesn't sound like something that would still materialize before
> the end of stage 1 :)
> Do you see any way of restricting the current approach to make it less
> intrusive and still worthwhile?  Limiting to vec_duplicate might be
> much too arbitrary but would still help for my original example.

FWIW, I sent an RFC for a late-combine pass that might help:

  https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631406.html

I think it'll need some tweaking for your use case, but hopefully
it's "just" a case of expanding the register pressure tests.

Thanks,
Richard



Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andrew Stubbs

I don't have authority to approve anything, but here's a review anyway.

Thanks for working on this.

On 26/09/2023 17:24, Andre Vieira (lists) wrote:
The const attribute is ignored when simdclones are used inbranch.  This 
is due to the fact that when analyzing a MASK_CALL we were not looking 
at the targeted function for flags, but instead only at the internal 
function call itself.
This patch adds code to make sure we look at the target function to 
check for the const attribute and enables the autovectorization of 
inbranch const simdclones without needing the loop to be adorned with 
the 'openmp simd' pragma.


Not sure about how to add new includes to the ChangeLog. Which brings me 
to another point, I contemplated changing gimple_call_flags to do the 
checking of flags of the first argument of IFN_MASK_CALL itself rather 
than only calling internal_fn_flags on gimple_call_internal_fn (stmt), 
but that might be a bit too intrusive, opinions welcome :)


Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


Is this OK for trunk?

gcc/ChangeLog:

     * tree-vect-data-ref.cc (include calls.h): Add new include.
     (get_references_in_stmt): Correctly handle IFN_MASK_CALL.

gcc/testsuite/ChangeLog:

     * gcc.dg/vect/vect-simd-clone-19.c: New test.



diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
new file mode 100644
index 
..09127b8cb6f2e3699b6073591f58be7047330273
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+/* { dg-additional-options "-fopenmp-simd" } */
+


Do you need -fopenmp-simd for this?


+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl)
+   {
+ clobbers_memory = true;
+ break;
+   }
+ if ((flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }
break;
 


Can be simplified:

  if (!orig_fndecl
  || (flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
clobbers_memory = true;
  break;


@@ -5852,7 +5865,7 @@ get_references_in_stmt (gimple *stmt, vec *references)
 }
   else if (stmt_code == GIMPLE_CALL)
 {
-  unsigned i, n;
+  unsigned i  = 0, n;
   tree ptr, type;
   unsigned int align;
 


Rogue space.


@@ -5879,13 +5892,15 @@ get_references_in_stmt (gimple *stmt, vec *references)
   ptr);
references->safe_push (ref);
return false;
+ case IFN_MASK_CALL:
+   i = 1;
  default:
break;
  }
 


If the fall-through is deliberate please add a /* FALLTHROUGH */ comment 
(or whatever spelling disables the warning).


Andrew


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Tobias Burnus

On 26.09.23 18:37, Andrew Stubbs wrote:

If the fall-through is deliberate please add a /* FALLTHROUGH */
comment (or whatever spelling disables the warning).


It's: gcc_fallthrough ();

Which gets converted to "__attribute__((fallthrough))"; it could also
expand to "[[fallthrough]]" but that's C++17 (and, also, an C23 feature
- albeit so far unimplemented in gcc).
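
A minimal illustration of where the marker goes (a sketch, not the patch
itself); in GCC's own sources it is spelled gcc_fallthrough (), which
expands to the attribute used below:

  int
  first_arg_index (int is_mask_call)
  {
    int i = 0;
    switch (is_mask_call)
      {
      case 1:
        i = 1;
        __attribute__ ((fallthrough));  /* gcc_fallthrough () in GCC sources */
      default:
        i += 2;
        break;
      }
    return i;
  }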

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Jakub Jelinek
On Tue, Sep 26, 2023 at 05:24:26PM +0100, Andre Vieira (lists) wrote:
> @@ -5816,6 +5817,18 @@ get_references_in_stmt (gimple *stmt, 
> vec *references)
>   }
> case IFN_MASK_LOAD:
> case IFN_MASK_STORE:
> +   case IFN_MASK_CALL:
> + {
> +   tree orig_fndecl
> + = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
> +   if (!orig_fndecl)
> + {
> +   clobbers_memory = true;
> +   break;
> + }
> +   if ((flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
> + clobbers_memory = true;
> + }

Should IFN_MASK_LOAD/STORE really go through this?  I thought those have
first argument address of the memory being conditionally loaded/stored, not
function address.

Jakub



[PATCH] RISC-V: Use stdint-gcc.h in rvv testsuite

2023-09-26 Thread Patrick O'Neill
stdint.h can be replaced with stdint-gcc.h to avoid errors about missing
system headers in non-multilib installations.

Tested using glibc rv32gcv and rv64gcv on r14-4258-gc9837443075.

gcc/ChangeLog:

* config/riscv/riscv_vector.h (__RISCV_VECTOR_H): Replace
stdint.h with stdint-gcc.h.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-1.h:
Replace stdint.h with stdint-gcc.h.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-2.h:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2int-1.h:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2int-2.h:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-1.h:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-2.h:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2int-1.h:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2int-2.h:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_sqrt-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_sqrt-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_unary-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_unary-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_unary-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_unary-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_unary-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_unary-6.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_unary-7.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_unary-8.c: Ditto.
* gcc.target/riscv/rvv/autovec/partial/slp-8.c: Ditto.
* gcc.target/riscv/rvv/autovec/partial/slp-9.c: Ditto.
* gcc.target/riscv/rvv/autovec/pr111232.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/perm.h: Ditto.
* gcc.target/riscv/rvv/base/abi-call-args-4-run.c: Ditto.
* gcc.target/riscv/rvv/base/pr110119-2.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/pr111255.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/wredsum_vlmax.c: Ditto.

Signed-off-by: Patrick O'Neill 
---
Failures looked like this:
In file included from 
/riscv-gnu-toolchain/build/sysroot/usr/include/features.h:515,
  from 
/riscv-gnu-toolchain/build/sysroot/usr/include/bits/libc-header-start.h:33,
  from /riscv-gnu-toolchain/build/sysroot/usr/include/stdint.h:26,
  from 
/riscv-gnu-toolchain/build/lib/gcc/riscv32-unknown-linux-gnu/14.0.0/include/stdint.h:9,
  from 
/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/include/stdint.h:9,
  from 
/riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/include/riscv_vector.h:28,
  from 
/riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-7.c:4:
/riscv-gnu-toolchain/build/sysroot/usr/include/gnu/stubs.h:8:11: fatal error: 
gnu/stubs-ilp32.h: No such file or directory

Resolves these failures on rv64gcv (non-multilib):
FAIL: gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-7.c -O3 -ftree-vectorize 
--param riscv-autovec-lmul=dynamic (test for excess errors)
FAIL: gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-4.c -O3 -ftree-vectorize 
--param riscv-autovec-lmul=dynamic (test for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c (test 
for excess errors)
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c (test 
for excess errors)

[COMMITTED][GCC13] PR tree-optimization/110315 - Reduce the initial size of int_range_max.

2023-09-26 Thread Andrew MacLeod
This patch adds the ability to resize ranges as needed, defaulting to no 
resizing.  int_range_max now defaults to 3 sub-ranges (instead of 255) 
and grows to 255 when the range being calculated does not fit.
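
As a generic illustration only (plain C, not the GCC irange/int_range classes
in the patch below; all names here are made up), the underlying idea is a
small embedded buffer that grows on demand up to a hard cap:

#include <stdlib.h>
#include <string.h>

#define INLINE_PAIRS 3      /* analogous to the new int_range_max default */
#define HARD_MAX_PAIRS 255  /* analogous to HARD_MAX_RANGES */

struct pair_set
{
  int inline_buf[INLINE_PAIRS * 2];
  int *base;          /* points at inline_buf or at heap storage */
  unsigned max_pairs; /* current capacity in pairs */
  unsigned num_pairs; /* pairs currently in use */
};

static void
pair_set_init (struct pair_set *s)
{
  s->base = s->inline_buf;
  s->max_pairs = INLINE_PAIRS;
  s->num_pairs = 0;
}

static void
pair_set_maybe_resize (struct pair_set *s, unsigned needed)
{
  if (needed <= s->max_pairs || s->max_pairs >= HARD_MAX_PAIRS)
    return;
  /* Once the embedded buffer is too small, switch to heap storage.  */
  int *mem = malloc (HARD_MAX_PAIRS * 2 * sizeof (int));
  if (!mem)
    return;
  memcpy (mem, s->base, s->num_pairs * 2 * sizeof (int));
  if (s->base != s->inline_buf)
    free (s->base);
  s->base = mem;
  s->max_pairs = HARD_MAX_PAIRS;
}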


Bootstraps on x86_64-pc-linux-gnu with no regressions.   Pushed.

Andrew
From 70639014a69cf50fe11dc1adbfe1db4c7760ce69 Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Tue, 26 Sep 2023 09:44:39 -0400
Subject: [PATCH] Reduce the initial size of int_range_max.

This patch adds the ability to resize ranges as needed, defaulting to no
resizing.  int_range_max now defaults to 3 sub-ranges (instead of 255)
and grows to 255 when the range being calculated does not fit.

	PR tree-optimization/110315
	* value-range-storage.h (vrange_allocator::alloc_irange): Adjust
	new params.
	* value-range.cc (irange::operator=): Resize range.
	(irange::irange_union): Same.
	(irange::irange_intersect): Same.
	(irange::invert): Same.
	* value-range.h (irange::maybe_resize): New.
	(~int_range): New.
	(int_range_max): Default to 3 sub-ranges and resize as needed.
	(int_range::int_range): Adjust for resizing.
	(int_range::operator=): Same.
---
 gcc/value-range-storage.h |  2 +-
 gcc/value-range.cc| 15 ++
 gcc/value-range.h | 96 +++
 3 files changed, 83 insertions(+), 30 deletions(-)

diff --git a/gcc/value-range-storage.h b/gcc/value-range-storage.h
index 6da377ebd2e..1ed6f1ccd61 100644
--- a/gcc/value-range-storage.h
+++ b/gcc/value-range-storage.h
@@ -184,7 +184,7 @@ vrange_allocator::alloc_irange (unsigned num_pairs)
   // Allocate the irange and required memory for the vector.
   void *r = alloc (sizeof (irange));
   tree *mem = static_cast  (alloc (nbytes));
-  return new (r) irange (mem, num_pairs);
+  return new (r) irange (mem, num_pairs, /*resizable=*/false);
 }
 
 inline frange *
diff --git a/gcc/value-range.cc b/gcc/value-range.cc
index ec826c2fe1b..753f5e8cc76 100644
--- a/gcc/value-range.cc
+++ b/gcc/value-range.cc
@@ -831,6 +831,10 @@ irange::operator= (const irange &src)
   copy_to_legacy (src);
   return *this;
 }
+
+  int needed = src.num_pairs ();
+  maybe_resize (needed);
+
   if (src.legacy_mode_p ())
 {
   copy_legacy_to_multi_range (src);
@@ -2506,6 +2510,7 @@ irange::irange_union (const irange &r)
   // Now it simply needs to be copied, and if there are too many
   // ranges, merge some.  We wont do any analysis as to what the
   // "best" merges are, simply combine the final ranges into one.
+  maybe_resize (i / 2);
   if (i > m_max_ranges * 2)
 {
   res[m_max_ranges * 2 - 1] = res[i - 1];
@@ -2605,6 +2610,11 @@ irange::irange_intersect (const irange &r)
   if (r.irange_contains_p (*this))
 return intersect_nonzero_bits (r);
 
+  // ?? We could probably come up with something smarter than the
+  // worst case scenario here.
+  int needed = num_pairs () + r.num_pairs ();
+  maybe_resize (needed);
+
   signop sign = TYPE_SIGN (TREE_TYPE(m_base[0]));
   unsigned bld_pair = 0;
   unsigned bld_lim = m_max_ranges;
@@ -2831,6 +2841,11 @@ irange::invert ()
   m_num_ranges = 1;
   return;
 }
+
+  // At this point, we need one extra sub-range to represent the
+  // inverse.
+  maybe_resize (m_num_ranges + 1);
+
   // The algorithm is as follows.  To calculate INVERT ([a,b][c,d]), we
   // generate [-MIN, a-1][b+1, c-1][d+1, MAX].
   //
diff --git a/gcc/value-range.h b/gcc/value-range.h
index 969b2b68418..96e59ecfa72 100644
--- a/gcc/value-range.h
+++ b/gcc/value-range.h
@@ -172,7 +172,8 @@ public:
   bool legacy_verbose_intersect (const irange *);	// DEPRECATED
 
 protected:
-  irange (tree *, unsigned);
+  void maybe_resize (int needed);
+  irange (tree *, unsigned nranges, bool resizable);
   // potential promotion to public?
   tree tree_lower_bound (unsigned = 0) const;
   tree tree_upper_bound (unsigned) const;
@@ -200,6 +201,8 @@ protected:
   void copy_to_legacy (const irange &);
   void copy_legacy_to_multi_range (const irange &);
 
+  // Hard limit on max ranges allowed.
+  static const int HARD_MAX_RANGES = 255;
 private:
   friend void gt_ggc_mx (irange *);
   friend void gt_pch_nx (irange *);
@@ -214,15 +217,21 @@ private:
 
   bool intersect (const wide_int& lb, const wide_int& ub);
   unsigned char m_num_ranges;
+  bool m_resizable;
   unsigned char m_max_ranges;
   tree m_nonzero_mask;
+protected:
   tree *m_base;
 };
 
 // Here we describe an irange with N pairs of ranges.  The storage for
 // the pairs is embedded in the class as an array.
+//
+// If RESIZABLE is true, the storage will be resized on the heap when
+// the number of ranges needed goes past N up to a max of
+// HARD_MAX_RANGES.  This new storage is freed upon destruction.
 
-template
+template
 class GTY((user)) int_range : public irange
 {
 public:
@@ -233,7 +242,7 @@ public:
   int_range (tree type);
   int_range (const int_range &);
   int_range (const irange &);
-  virtual ~int_range () = default;
+  virtual ~int_range ();
  

Re: [PATCH] RISC-V: Use stdint-gcc.h in rvv testsuite

2023-09-26 Thread Andrew Pinski
On Tue, Sep 26, 2023 at 10:59 AM Patrick O'Neill  wrote:
>
> stdint.h can be replaced with stdint-gcc.h to resolve some missing
> system headers in non-multilib installations.
>
> Tested using glibc rv32gcv and rv64gcv on r14-4258-gc9837443075.
>
> gcc/ChangeLog:
>
> * config/riscv/riscv_vector.h (__RISCV_VECTOR_H): Replace
> stdint.h with stdint-gcc.h

I don't think this will work when testing an installed compiler which I do.

Thanks,
Andrew

>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-1.h:
> Replace stdint.h with stdint-gcc.h.
> * gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-2.h:
> Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_convert_float2int-1.h:
> Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_convert_float2int-2.h:
> Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-1.h:
> Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-2.h:
> Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_convert_int2int-1.h:
> Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_convert_int2int-2.h:
> Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_sqrt-1.c: Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_sqrt-2.c: Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_unary-1.c: Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_unary-2.c: Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_unary-3.c: Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_unary-4.c: Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_unary-5.c: Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_unary-6.c: Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_unary-7.c: Ditto.
> * gcc.target/riscv/rvv/autovec/cond/cond_unary-8.c: Ditto.
> * gcc.target/riscv/rvv/autovec/partial/slp-8.c: Ditto.
> * gcc.target/riscv/rvv/autovec/partial/slp-9.c: Ditto.
> * gcc.target/riscv/rvv/autovec/pr111232.c: Ditto.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/perm.h: Ditto.
> * gcc.target/riscv/rvv/base/abi-call-args-4-run.c: Ditto.
> * gcc.target/riscv/rvv/base/pr110119-2.c: Ditto.
> * gcc.target/riscv/rvv/vsetvl/pr111255.c: Ditto.
> * gcc.target/riscv/rvv/vsetvl/wredsum_vlmax.c: Ditto.
>
> Signed-off-by: Patrick O'Neill 
> ---
> Failures looked like this:
> In file included from 
> /riscv-gnu-toolchain/build/sysroot/usr/include/features.h:515,
>   from 
> /riscv-gnu-toolchain/build/sysroot/usr/include/bits/libc-header-start.h:33,
>   from /riscv-gnu-toolchain/build/sysroot/usr/include/stdint.h:26,
>   from 
> /riscv-gnu-toolchain/build/lib/gcc/riscv32-unknown-linux-gnu/14.0.0/include/stdint.h:9,
>   from 
> /riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/include/stdint.h:9,
>   from 
> /riscv-gnu-toolchain/build/build-gcc-linux-stage2/gcc/include/riscv_vector.h:28,
>   from 
> /riscv-gnu-toolchain/gcc/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-7.c:4:
> /riscv-gnu-toolchain/build/sysroot/usr/include/gnu/stubs.h:8:11: fatal error: 
> gnu/stubs-ilp32.h: No such file or directory
>
> Resolves these failures on rv64gcv (non-multilib):
> FAIL: gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-7.c -O3 -ftree-vectorize 
> --param riscv-autovec-lmul=dynamic (test for excess errors)
> FAIL: gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-4.c -O3 -ftree-vectorize 
> --param riscv-autovec-lmul=dynamic (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c 
> (test for excess errors)
> FAIL: gcc.target/riscv/rvv/autovec/cond/cond_convert_float2flo

[COMMITTED] PR tree-optimization/111599 - Ensure ssa_name is still valid.

2023-09-26 Thread Andrew MacLeod
When processing an equivalence list, I neglected to make sure the
ssa-name is still valid.  This patch simply checks to make sure it is
non-null and not in the free list.


Bootstraps on x86_64-pc-linux-gnu with no regressions.   Pushed.

Andrew
From 9df0f6bd582ceee53bfed8769cf156329ae33bd0 Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Tue, 26 Sep 2023 09:27:52 -0400
Subject: [PATCH] Ensure ssa_name is still valid.

When the IL changes, an equivalence set may contain ssa_names that no
longer exist.  Ensure names are still valid and not in the free list.

	PR tree-optimization/111599
	gcc/
	* value-relation.cc (relation_oracle::valid_equivs): Ensure
	ssa_name is valid.

	gcc/testsuite/
	* gcc.dg/pr111599.c: New.
---
 gcc/testsuite/gcc.dg/pr111599.c | 16 
 gcc/value-relation.cc   |  9 ++---
 2 files changed, 22 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr111599.c

diff --git a/gcc/testsuite/gcc.dg/pr111599.c b/gcc/testsuite/gcc.dg/pr111599.c
new file mode 100644
index 000..25880b759f7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr111599.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline-functions-called-once -fno-inline-small-functions -fno-tree-dce -fno-tree-forwprop -fno-tree-fre" } */
+
+int h(void);
+void l(int);
+void func_56(int p_57, unsigned p_58) {
+ // p_57 = 0x101BC642L;
+  if (p_57 || h()) {
+int *l_105[2];
+l_105[0] = &p_57;
+l(p_57);
+  }
+}
+void func_31(int p_33) {
+  func_56(0x101BC642L, (p_33));
+}
diff --git a/gcc/value-relation.cc b/gcc/value-relation.cc
index f2c668a0193..8fea4aad345 100644
--- a/gcc/value-relation.cc
+++ b/gcc/value-relation.cc
@@ -274,9 +274,12 @@ relation_oracle::valid_equivs (bitmap b, const_bitmap equivs, basic_block bb)
   EXECUTE_IF_SET_IN_BITMAP (equivs, 0, i, bi)
 {
   tree ssa = ssa_name (i);
-  const_bitmap ssa_equiv = equiv_set (ssa, bb);
-  if (ssa_equiv == equivs)
-	bitmap_set_bit (b, i);
+  if (ssa && !SSA_NAME_IN_FREE_LIST (ssa))
+	{
+	  const_bitmap ssa_equiv = equiv_set (ssa, bb);
+	  if (ssa_equiv == equivs)
+	bitmap_set_bit (b, i);
+	}
 }
 }
 
-- 
2.41.0



Re: [PATCH] RISC-V: Use stdint-gcc.h in rvv testsuite

2023-09-26 Thread Patrick O'Neill

On 9/26/23 11:13, Andrew Pinski wrote:

On Tue, Sep 26, 2023 at 10:59 AM Patrick O'Neill  wrote:

stdint.h can be replaced with stdint-gcc.h to resolve some missing
system headers in non-multilib installations.

Tested using glibc rv32gcv and rv64gcv on r14-4258-gc9837443075.

gcc/ChangeLog:

 * config/riscv/riscv_vector.h (__RISCV_VECTOR_H): Replace
 stdint.h with stdint-gcc.h

I don't think this will work when testing an installed compiler which I do.

Thanks,
Andrew

In the riscv target testsuite (gcc.target/riscv) all occurrences of
#include  are currently constrained to the rvv/ subdirectory.
All non-vector tests use #include  rather than
#include . Have you encountered any issues when testing
installations with non-vector tests?

Thanks,
Patrick


Re: [PATCH v3] [RISC-V] Generate Zicond instruction for select pattern with condition eq or neq to 0

2023-09-26 Thread Jeff Law




On 8/1/23 19:38, Xiao Zeng wrote:

This patch recognizes Zicond patterns when the select pattern
with condition eq or neq to 0 (using eq as an example), namely:

1 rd = (rs2 == 0) ? non-imm : 0
2 rd = (rs2 == 0) ? non-imm : non-imm
3 rd = (rs2 == 0) ? reg : non-imm
4 rd = (rs2 == 0) ? reg : reg
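
For orientation, C sources of this general shape look like the hedged sketch
below (illustration only, not taken from the patch's testsuite; whether Zicond
instructions are emitted depends on the patterns added here and on -march
including the Zicond extension):

long
sel_zero (long rs2, long a)
{
  return rs2 == 0 ? a : 0;   /* select between a value and zero */
}

long
sel_reg_reg (long rs2, long a, long b)
{
  return rs2 == 0 ? a : b;   /* select between two registers */
}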

gcc/ChangeLog:

 * config/riscv/riscv.cc (riscv_expand_conditional_move): Recognize
 Zicond patterns
 * config/riscv/riscv.md: Recognize Zicond patterns through movcc

gcc/testsuite/ChangeLog:

 * gcc.target/riscv/zicond-primitiveSemantics_return_0_imm.c: New test.
 * gcc.target/riscv/zicond-primitiveSemantics_return_imm_imm.c: New 
test.
 * gcc.target/riscv/zicond-primitiveSemantics_return_imm_reg.c: New 
test.
 * gcc.target/riscv/zicond-primitiveSemantics_return_reg_reg.c: New 
test.
I've added -Oz and -Og to the skip list and pushed the tests up to the 
trunk.


jeff


Re: RISC-V: Added support for CRC.

2023-09-26 Thread Joern Rennecke
On Tue, 26 Sept 2023 at 14:18, Jeff Law  wrote:

>  But the Coremark code is what it is.  This isn't a whole lot
> different than the work in the 90s which rewrote loops and compromised
> some of the spec benchmarks, or the eqntott hack to simplify that one
> key loop in eqntott.

I think the stated purpose of the benchmark matters.  If dhrystone had been
pushed as an abstraction-penalty benchmark, it would have been fine to
present results with WPA, inlining and dead code elimination as ordinary
dhrystone results.  But since it's supposed to exercise specific hardware
features, and not have the tests for these optimized away, that's not
appropriate.

So, first, we make the compiled program perform the work that the benchmark
was supposed to include in the measurement, just more efficiently.
Second, we not only optimize the benchmark, but also make the target-optimized
code generation available for other programs.  For new programs targeted at
GNU C, that is minimally achieved by providing a built-in function, and in
general for new code, by being able to replicate the idiom from coremark that
is recognized by GCC.  The mere existence of a C idiom in a popular benchmark
also makes this idiom a common idiom, if it hasn't already been that before.
As we find new patterns that are used to implement CRC which would
be better replaced with a target-specific implementation, we can add these.
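
For context, the bit-at-a-time CRC idiom under discussion commonly looks like
the following sketch (one of many spellings; the Coremark and kernel variants
differ in polynomial and loop shape, so treat this as an assumption-laden
illustration):

#include <stdint.h>

uint16_t
crc16_update (uint16_t crc, uint8_t data)
{
  crc ^= data;
  for (int i = 0; i < 8; i++)
    {
      if (crc & 1)
        crc = (crc >> 1) ^ 0x8408;  /* reflected CRC-16/CCITT polynomial */
      else
        crc >>= 1;
    }
  return crc;
}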

This is similar to rotate operations, which are supported directly by
some processors, and even for other targets, there are generally preferred
ways to expand the code, but there are a couple of different variants
depending on the available instruction set, registers, and the
microarchitecture (pipeline, latency etc).  We started out with one pattern
that was recognized, and as new patterns were identified in C code, we
improved GCC to recognize these other patterns.
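
The rotate analogy refers to source idioms like the following sketch
(illustrative only); GCC learned to recognize such shapes one by one and map
them to a rotate instruction where the target has one:

#include <stdint.h>

uint32_t
rotl32 (uint32_t x, unsigned n)
{
  /* A shift-count-safe rotate-left idiom that GCC can turn into a single
     rotate instruction on targets that provide one.  */
  return (x << (n & 31)) | (x >> (-n & 31));
}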

> What ultimately pushed us to keep moving forward on this effort was
> discovering numerous CRC loop implementations out in the wild, including
> 4 implementations (IIRC) in the kernel itself.

I have always assumed that such must exist (CRCs are useful for a number
of problems, and besides, they wouldn't have been included in coremark as
a part of the workload if they were not relevant), but it is good to have
confirmation, and even better to have code that can detect and analyse an
entire class of idioms that is in such widespread use.

This still leaves room for further improvements, like detecting fully-unrolled
code, table lookup, or slice-by-N, and replacing them with better
target-optimized code where this is indicated by the optimization flags to
save execution time or code/rodata size.  Not something we have to tackle
now, but just because we don't do it now, doesn't mean we couldn't address
these in the future if that appears worthwhile.

> I can easily see creating a clmul RTL opcode for targets which support
> it and hoisting the clmul vs lookup table selection into generic code.
> I'm still pondering if we're likely to ever see cases where we want a
> vector clmul intrinsic or support in the autovectorizer for clmul.
> We've certainly got targets with vector clmul in the ISA, the question
> is using it.

If we aim for optimal code, I think it more likely that we want to detect a
block CRC computation, and have a target expander decide to do that
with inline code or a library call that uses vectorized clmul.  At the time
we add such block-CRC expansion code, it also makes sense to add a
builtin for block CRC so that new GNU C programs can directly request
that functionality without having to go through the cargo cult of matching
a supported idiom.

Now, the library might be written in GNU C, and for that it might be useful
to have a vector clmul intrinsic so that we can express this algorithm more
easily.

> Probably the biggest task in that space right now is to see if we can
> avoid the symbolic execution engine by re-using ranger.

I'll be interested to see what you'll come up with, but if reverting to the
symbolic execution engine, the compile time cost isn't much if you only
use it for a proper match.  So whatever heuristics are used before deciding
to use the engine matter.  Can all the cases detected by the engine be
recognized as a loop with a reduction?  We might use different heuristics
for different optimization levels, i.e. allow more false negatives at -O1,
and more false positives at -O2 / -fexpensive-optimizations.

> To reiterate the real goal here is to take code as-is and make it
> significantly faster.  While the original target was Coremark, we've
> found similar bitwise implementations of CRCs all over the place.
> There's no good reason that code should have to change.
>
> The idea of exposing a CRC builtin was an intermediate step that would
> allow those willing to change their code or writing new code to write
> their CRC in a trivial way and let the compiler figure out a sensible
> implementation while we clean up the CR

Re: [PATCH] RISC-V: Add opaque integer modes to fix ICE on DSE[PR111590]

2023-09-26 Thread Richard Sandiford
钟居哲  writes:
> Thanks Richard.
>
> Is it correct as follows ?
>
> diff --git a/gcc/dse.cc b/gcc/dse.cc
> index 8b07be17674..c58d3bf4e1b 100644
> --- a/gcc/dse.cc
> +++ b/gcc/dse.cc
> @@ -1733,7 +1733,7 @@ find_shift_sequence (poly_int64 access_size,
>/* If a constant was stored into memory, try to simplify it here,
>   otherwise the cost of the shift might preclude this optimization
>   e.g. at -Os, even when no actual shift will be needed.  */
> -  if (store_info->const_rhs)
> +  if (store_info->const_rhs && known_le (access_size, GET_MODE_SIZE 
> (MAX_MODE_INT)))

Yes, but the "&& ..." needs to be on a new line because of the 80-character
limit.

> I failed to find native_encode_rtx and native_decode_rtx.

See simplify-rtx.cc.

Richard


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Bernhard Reutner-Fischer
On 26 September 2023 18:46:11 CEST, Tobias Burnus  
wrote:
>On 26.09.23 18:37, Andrew Stubbs wrote:
>> If the fall-through is deliberate please add a /* FALLTHROUGH */
>> comment (or whatever spelling disables the warning).
>
>It's: gcc_fallthrough ();
>
>Which gets converted to "__attribute__((fallthrough))"; it could also
>expand to "[[fallthrough]]" but that's C++17 (and, also, an C23 feature
>- albeit so far unimplemented in gcc).

OT
IIRC we do parse comments for a number of spellings of the hint by the user 
that the fallthrough is deliberate:

https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html

See the numerous levels of -Wimplicit-fallthrough=n, the default being 3.

---8<---
-Wimplicit-fallthrough=3 case sensitively matches one of the following regular 
expressions:
-fallthrough
@fallthrough@
lint -fallthrough[ \t]*
[ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
[ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
[ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
---8<---

Just FWIW.
thanks,


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andre Vieira (lists)




On 26/09/2023 17:48, Jakub Jelinek wrote:

On Tue, Sep 26, 2023 at 05:24:26PM +0100, Andre Vieira (lists) wrote:

@@ -5816,6 +5817,18 @@ get_references_in_stmt (gimple *stmt, vec *references)
}
  case IFN_MASK_LOAD:
  case IFN_MASK_STORE:
+ case IFN_MASK_CALL:
+   {
+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl)
+   {
+ clobbers_memory = true;
+ break;
+   }
+ if ((flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }


Should IFN_MASK_LOAD/STORE really go through this?  I thought those have
the address of the memory being conditionally loaded/stored as their first
argument, not a function address.


No it shouldn't, my bad...
Surprisingly, testing didn't catch it though; I'm guessing
gimple_call_addr_fndecl just returned null every time for those.  I'll
clean it up.


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andre Vieira (lists)




On 26/09/2023 21:26, Bernhard Reutner-Fischer wrote:

On 26 September 2023 18:46:11 CEST, Tobias Burnus  
wrote:

On 26.09.23 18:37, Andrew Stubbs wrote:

If the fall-through is deliberate please add a /* FALLTHROUGH */
comment (or whatever spelling disables the warning).


It's: gcc_fallthrough ();

Which gets converted to "__attribute__((fallthrough))"; it could also
expand to "[[fallthrough]]" but that's C++17 (and, also, an C23 feature
- albeit so far unimplemented in gcc).


OT
IIRC we do parse comments for a number of spellings of the hint by the user 
that the fallthrough is deliberate:

https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html

See the numerous levels of -Wimplicit-fallthrough=n, the default being 3.

---8<---
-Wimplicit-fallthrough=3 case sensitively matches one of the following regular 
expressions:
-fallthrough
@fallthrough@
lint -fallthrough[ \t]*
[ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
[ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
[ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
---8<---

Just FWIW.
thanks,


I was surprised my bootstrap didn't catch this; I thought we generated
warnings in such cases, and bootstrap builds with -Werror, does it not?


Re: [PATCH 0/2] RISC-V: Define not broken prefetch builtins

2023-09-26 Thread Jeff Law




On 9/22/23 01:11, Tsukasa OI wrote:

Hello,

As I explained earlier:
,
the builtin function for RISC-V "__builtin_riscv_zicbop_cbo_prefetchi" is
completely broken.  Instead, this patch set (in PATCH 1/2) creates three
new, working builtin intrinsics.

void __builtin_riscv_prefetch_i(void *addr, [intptr_t offset,] ...);
void __builtin_riscv_prefetch_r(void *addr, [intptr_t offset,] ...);
void __builtin_riscv_prefetch_w(void *addr, [intptr_t offset,] ...);


For consistency with "prefetch.i" and the reason I describe later (which
requires native instructions for "prefetch.r" and "prefetch.w"), I decided
to make builtin functions for "prefetch.[rw]" as well.

Optional second argument (named "offset" here) defaults to zero and must be
a compile-time integral constant.  Also, it must be a valid offset for a
"prefetch.[irw]" HINT instruction (x % 32 == 0 && x >= -2048 && x < 2048).

They are defined if the 'Zicbop' extension is supported and expands to:


prefetch.i offset(addr_reg)  ; __builtin_riscv_prefetch_i
prefetch.r offset(addr_reg)  ; __builtin_riscv_prefetch_r
prefetch.w offset(addr_reg)  ; __builtin_riscv_prefetch_w



The hardest part of this patch set was to support builtin functions with a
variable argument (making "offset" optional).  It required:

1.  Support for variable argument function prototype for RISC-V builtins
 (corresponding "..." on C-based languages)
2.  Support for (non-vector) RISC-V builtins with custom expansion
 (on RVV intrinsics, custom expansion is already implemented)


... and PATCH 2/2 fixes an ICE I found while investigating the regular
prefetch builtin (__builtin_prefetch).  If the 'Zicbop' extension is enabled,
__builtin_prefetch with the first argument NULL or (not all but) some
fixed addresses (like ((void*)0x20)) can cause an ICE.  This is because
the "r" constraint is not checked and a constant can end up as the first
argument of the target-specific "prefetch" RTL instruction.
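
A minimal reproducer of the kind described (hedged: the exact command line is
an assumption; before the PATCH 2/2 fix and only with a Zicbop-enabled
-march such as -march=rv64gc_zicbop at -O2, this could trip the ICE):

void
touch_fixed_addresses (void)
{
  __builtin_prefetch ((void *) 0x20, 0, 3);
  __builtin_prefetch (0);
}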

PATCH 2/2 fixes this issue by:

1.  Making "prefetch" not an instruction but instead an expansion
 (this is not rare; e.g. on i386) and
2.  Coercing the address argument into a register in the expansion

It requires separate instructions for "prefetch.[rw]" and I decided to make
those prefetch instructions very similar to "prefetch.i".  That's one of the
reasons I created builtins corresponding to those.
What I still don't understand is why we're dealing with a decomposed 
address in the builtin, define_expand and/or define_insn.


Have the builtin accept an address, any address.  Then use force_reg to 
force the address into a register in the expander.  My understanding is 
register indirect is always valid.


Create an operand predicate that accepts reg and reg+d for the limited 
displacements allowed.  Use that for the address operand in the 
associated define_insn.



It seems like you're making this more complex than it needs to be.  Or 
I'm missing something critically important.


jeff


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Bernhard Reutner-Fischer
On 26 September 2023 23:02:10 CEST, "Andre Vieira (lists)" 
 wrote:
>
>
>On 26/09/2023 21:26, Bernhard Reutner-Fischer wrote:
>> On 26 September 2023 18:46:11 CEST, Tobias Burnus  
>> wrote:
>>> On 26.09.23 18:37, Andrew Stubbs wrote:
 If the fall-through is deliberate please add a /* FALLTHROUGH */
 comment (or whatever spelling disables the warning).
>>> 

>
>I was surprised my bootstrap didn't catch this; I thought we generated
>warnings in such cases, and bootstrap builds with -Werror, does it not?

Well, I wouldn't see much benefit to warn in this case, no?
You're falling through to a break, not other "active" code AFAICS?

You had:

references->safe_push (ref);
return false;
+ case IFN_MASK_CALL:
+   i = 1;
  default:
break;
  }

I would not have expected a warning here, TBH :-)
thanks,


[COMMITTED] Fix pr111456-1.c for targets that use unsigned char by default

2023-09-26 Thread Andrew Pinski
This fixes the testcase to use an explicit `signed char` instead of plain 
`char`.

Committed as obvious after a test with a cross to powerpc64-linux-gnu and 
x86_64-linux-gnu.
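
For background, a tiny standalone example (illustration only, not part of the
commit) of why plain char differs between those two targets:

#include <stdio.h>

int
main (void)
{
  char c = -1;
  /* Prints -1 where plain char is signed (e.g. x86_64-linux-gnu) and 255
     where it is unsigned (e.g. powerpc64-linux-gnu), which is why the test
     must spell out `signed char`.  */
  printf ("%d\n", (int) c);
  return 0;
}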

gcc/testsuite/ChangeLog:

PR testsuite/111603
* gcc.dg/tree-ssa/pr111456-1.c: Use `signed char` instead of plain 
`char`.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr111456-1.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr111456-1.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr111456-1.c
index 8a2f730b387..664a1af 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr111456-1.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr111456-1.c
@@ -5,14 +5,14 @@
 void foo(void);
 static int i;
 static int *j = &i;
-static char l;
-static void(a)(char) {}
+static signed char l;
+static void(a)(signed char) {}
 static short(b)(short c, short d) { return c - d; }
 static short(e)(short f, int g) {
 return f < 0 || g < 0 || g >= 32 ? f : f >> g;
 }
 static short(h)(short f, int g) { return g >= 2 ?: f >> g; }
-static char k(char m, short n) {
+static signed char k(signed char m, short n) {
 short o;
 int *p = &i;
 if (!(((m) >= 1) && ((m) <= 1))) {
-- 
2.39.3



Re: [PATCH] AArch64: Fix __sync_val_compare_and_swap [PR111404]

2023-09-26 Thread Ramana Radhakrishnan
Hi Wilco,

Thanks for your email.

On Tue, Sep 26, 2023 at 12:07 AM Wilco Dijkstra  wrote:
>
> Hi Ramana,
>
> >> __sync_val_compare_and_swap may be used on 128-bit types and either calls 
> >> the
> >> outline atomic code or uses an inline loop.  On AArch64 LDXP is only 
> >> atomic if
> >> the value is stored successfully using STXP, but the current 
> >> implementations
> >> do not perform the store if the comparison fails.  In this case the value 
> >> returned
> >> is not read atomically.
> >
> > IIRC, the previous discussions in this space revolved around the
> > difficulty with the store writing to readonly memory which is why I
> > think we went with LDXP in this form.
>
> That's not related to this patch - this fixes a serious atomicity bug that may
> affect the Linux kernel since it uses the older sync primitives. Given that 
> LDXP
> is not atomic on its own, you have to execute the STXP even in the failure 
> case.
> Note that you can't rely on compare&swap not to write memory: load-exclusive
> loops may either always write or avoid writes in the failure case if the load 
> is
> atomic. CAS instructions always write.
>

I am aware of the capabilities of the architecture.

> > Has something changed from then ?
>
> Yes, we now know that using locking atomics was a bad decision. Developers
> actually require efficient and lock-free atomics. Since we didn't support 
> them,
> many applications were forced to add their own atomic implementations using
> hacky inline assembler. It also resulted in a nasty ABI incompatibility 
> between
> GCC and LLVM. Yes - atomics are part of the ABI!

I agree that atomics are part of the ABI.

>
> All that is much worse than worrying about a theoretical corner case that
> can't happen in real applications - atomics only work on writeable memory
> since their purpose is to synchronize reads with writes.


I remember this to be the previous discussions and common understanding.

https://gcc.gnu.org/legacy-ml/gcc/2016-06/msg00017.html

and here

https://gcc.gnu.org/legacy-ml/gcc-patches/2017-02/msg00168.html

Can you point to any recent discussion that shows this has changed, and
point me at that discussion if there is one anywhere?  I can't find it in
my searches.  Perhaps you've had the discussion some place that shows it
has changed.


regards
Ramana



>
> Cheers,
> Wilco


Re: [PATCH 2/3] build: Add libgrust as compilation modules

2023-09-26 Thread Thomas Schwinge
Hi!

On 2023-09-20T13:59:53+0200, Arthur Cohen  wrote:
> From: Pierre-Emmanuel Patry 
>
> Define the libgrust directory as a host compilation module as well as
> for targets.

> --- a/Makefile.def
> +++ b/Makefile.def
> @@ -149,6 +149,7 @@ host_modules= { module= libcc1; 
> extra_configure_flags=--enable-shared; };
>  host_modules= { module= gotools; };
>  host_modules= { module= libctf; bootstrap=true; };
>  host_modules= { module= libsframe; bootstrap=true; };
> +host_modules= { module= libgrust; };
>
>  target_modules = { module= libstdc++-v3;
>  bootstrap=true;
> @@ -192,6 +193,7 @@ target_modules = { module= libgm2; lib_path=.libs; };
>  target_modules = { module= libgomp; bootstrap= true; lib_path=.libs; };
>  target_modules = { module= libitm; lib_path=.libs; };
>  target_modules = { module= libatomic; bootstrap=true; lib_path=.libs; };
> +target_modules = { module= libgrust; };
>
>  // These are (some of) the make targets to be done in each subdirectory.
>  // Not all; these are the ones which don't have special options.

Maybe I am just confused, but to make sure that the build doesn't break
for different GCC configurations, don't we also directly need to
incorporate here a few GCC/Rust master branch follow-on commits, like:

  - commit 171ea4e2b3e202067c50f9c206974fbe1da691c0 "fixup: Fix bootstrap build"
  - commit 61cbe201029658c32e5c360823b9a1a17d21b03c "fixup: Fix missing build 
dependency"
  - commit 6a8b207b9ef7f9038e0cae7766117428783825d8 "libgrust: Add dependency 
to libstdc++"

(Not sure if all of these are necessary and/or if that's the complete
list; haven't looked up the corresponding GCC/Rust GitHub PRs.)

> --- a/gcc/rust/config-lang.in
> +++ b/gcc/rust/config-lang.in

> +target_libs="target-libffi target-libbacktrace target-libgrust"

Please don't add back 'target-libffi' and 'target-libbacktrace' here;
just 'target-libgrust'.  (As is present in GCC/Rust master branch, and
per commit 7411eca498beb13729cc2acec77e68250940aa81
"Rust: Don't depend on unused 'target-libffi', 'target-libbacktrace'".)


Grüße
 Thomas


Re: [PATCH 1/3] librust: Add libproc_macro and build system

2023-09-26 Thread Thomas Schwinge
Hi!

On 2023-09-26T13:40:40+0200, Richard Biener  wrote:
> On Wed, Sep 20, 2023 at 2:04 PM Arthur Cohen  
> wrote:
>>
>> From: Pierre-Emmanuel Patry 
>>
>> This patch series adds the build system changes to allow the Rust
>> frontend to develop and distribute its own libraries. The first library
>> we have been working on is the `proc_macro` library, comprised of a C++
>> library as well as a user-facing Rust library.
>>
>> Follow up commits containing the actual library code will be committed.
>> Should I submit patches to the MAINTAINERS file to allow Philip and I to
>> commit to this folder without first approval?
>
> I think the Rust language frontend maintainership implicitly includes
> the rust runtime libraries.

Would seem reasonable -- but to make things explicit, may also add in
'MAINTAINERS', 'Various Maintainers' an entry like:

libgrust        All Rust front end maintainers

..., or similar?  Or, explicitly duplicate for all Rust front end
maintainers.  (We don't seem to be consistent with respect to that.)

>> This first commit adds a simple `libgrust` folder with on top of which the
>> full library will be built.
>
> OK.

Before pushing this, don't you first have to set up an empty
'libgrust/ChangeLog', like
commit 24ff0b3e0c41e3997fb4c11736b8a412afbaadf3
"Add stub 'gcc/rust/ChangeLog'", for example?

..., and adjust
'contrib/gcc-changelog/git_commit.py:default_changelog_locations', like
commit 325529e21e81fbc3561d2568cb7e8a26296e5b2f
"Prepare 'contrib/gcc-changelog/git_commit.py' for GCC/Rust", for
example?

..., and then replace your 'libgrust/[...]' Git commit log entries:

>> Add some dummy files in libproc_macro along with it's build system.
>>
>> ChangeLog:
>>
>> * libgrust/Makefile.am: New file.
>> * libgrust/configure.ac: New file.
>> * libgrust/libproc_macro/Makefile.am: New file.
>> * libgrust/libproc_macro/proc_macro.cc: New file.
>> * libgrust/libproc_macro/proc_macro.h: New file.
>>
>> Signed-off-by: Pierre-Emmanuel Patry 

... with:

libgrust/
* Makefile.am: New file.
[...]

(Or similar syntax.)  That way, the "nightly" auto-updater will use
'libgrust/ChangeLog' instead of the top-level 'ChangeLog'.

(I hope I got all that right.)


Please also update 'contrib/gcc_update:files_and_dependencies' for all
'libgrust/' generated files' dependencies.


Grüße
 Thomas


>> ---
>>  libgrust/Makefile.am |  68 
>>  libgrust/configure.ac| 113 +++
>>  libgrust/libproc_macro/Makefile.am   |  58 ++
>>  libgrust/libproc_macro/proc_macro.cc |   7 ++
>>  libgrust/libproc_macro/proc_macro.h  |   7 ++
>>  5 files changed, 253 insertions(+)
>>  create mode 100644 libgrust/Makefile.am
>>  create mode 100644 libgrust/configure.ac
>>  create mode 100644 libgrust/libproc_macro/Makefile.am
>>  create mode 100644 libgrust/libproc_macro/proc_macro.cc
>>  create mode 100644 libgrust/libproc_macro/proc_macro.h
>>
>> diff --git a/libgrust/Makefile.am b/libgrust/Makefile.am
>> new file mode 100644
>> index 000..8e5274922c5
>> --- /dev/null
>> +++ b/libgrust/Makefile.am
>> @@ -0,0 +1,68 @@
>> +AUTOMAKE_OPTIONS = 1.8 foreign
>> +
>> +SUFFIXES = .c .rs .def .o .lo .a
>> +
>> +ACLOCAL_AMFLAGS = -I . -I .. -I ../config
>> +
>> +AM_CFLAGS = -I $(srcdir)/../libgcc -I $(MULTIBUILDTOP)../../gcc/include
>> +
>> +TOP_GCCDIR := $(shell cd $(top_srcdir) && cd .. && pwd)
>> +
>> +GCC_DIR = $(TOP_GCCDIR)/gcc
>> +RUST_SRC = $(GCC_DIR)/rust
>> +
>> +toolexeclibdir=@toolexeclibdir@
>> +toolexecdir=@toolexecdir@
>> +
>> +SUBDIRS = libproc_macro
>> +
>> +RUST_BUILDDIR := $(shell pwd)
>> +
>> +# Work around what appears to be a GNU make bug handling MAKEFLAGS
>> +# values defined in terms of make variables, as is the case for CC and
>> +# friends when we are called from the top level Makefile.
>> +AM_MAKEFLAGS = \
>> +"GCC_DIR=$(GCC_DIR)" \
>> +"RUST_SRC=$(RUST_SRC)" \
>> +   "AR_FLAGS=$(AR_FLAGS)" \
>> +   "CC_FOR_BUILD=$(CC_FOR_BUILD)" \
>> +   "CC_FOR_TARGET=$(CC_FOR_TARGET)" \
>> +   "RUST_FOR_TARGET=$(RUST_FOR_TARGET)" \
>> +   "CFLAGS=$(CFLAGS)" \
>> +   "CXXFLAGS=$(CXXFLAGS)" \
>> +   "CFLAGS_FOR_BUILD=$(CFLAGS_FOR_BUILD)" \
>> +   "CFLAGS_FOR_TARGET=$(CFLAGS_FOR_TARGET)" \
>> +   "INSTALL=$(INSTALL)" \
>> +   "INSTALL_DATA=$(INSTALL_DATA)" \
>> +   "INSTALL_PROGRAM=$(INSTALL_PROGRAM)" \
>> +   "INSTALL_SCRIPT=$(INSTALL_SCRIPT)" \
>> +   "LDFLAGS=$(LDFLAGS)" \
>> +   "LIBCFLAGS=$(LIBCFLAGS)" \
>> +   "LIBCFLAGS_FOR_TARGET=$(LIBCFLAGS_FOR_TARGET)" \
>> +   "MAKE=$(MAKE)" \
>> +   "MAKEINFO=$(MAKEINFO) $(MAKEINFOFLAGS)" \
>> +   "PICFLAG=$(PICFLAG)" \
>> +   "PICFLAG_FOR_TARGET=$(PICFLAG_FOR_TARGET)" \
>> +   "SHELL=$(SHELL)" \
>> +   "RUNTESTFLAGS=$(RUNTESTFLAGS)" \
>> +   "exec_prefix=$(exec_prefix)" \
>> +   "infodir=$(infodir)" \
>> +   "libdir=$(libdir)" \
>> +

[PATCH] DSE: Fix ICE when the mode with access_size don't exist on the target[PR111590]

2023-09-26 Thread Juzhe-Zhong
When doing Fortran tests with the 'V' extension enabled on the RISC-V port,
I saw multiple ICEs: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111590

The root cause is on DSE:

internal compiler error: in smallest_mode_for_size, at stor-layout.cc:356
0x1918f70 smallest_mode_for_size(poly_int<2u, unsigned long>, mode_class)
../../../../gcc/gcc/stor-layout.cc:356
0x11f75bb smallest_int_mode_for_size(poly_int<2u, unsigned long>)
../../../../gcc/gcc/machmode.h:916
0x3304141 find_shift_sequence
../../../../gcc/gcc/dse.cc:1738
0x3304f1a get_stored_val
../../../../gcc/gcc/dse.cc:1906
0x3305377 replace_read
../../../../gcc/gcc/dse.cc:2010
0x3306226 check_mem_read_rtx
../../../../gcc/gcc/dse.cc:2310
0x330667b check_mem_read_use
../../../../gcc/gcc/dse.cc:2415

After investigation, DSE is trying to do an optimization like the
following:

(insn 86 85 87 9 (set (reg:V4DI 168)
(mem/u/c:V4DI (reg/f:DI 171) [0  S32 A128])) "bug.f90":6:18 discrim 6 
1167 {*movv4di}
 (expr_list:REG_EQUAL (const_vector:V4DI [
(const_int 4 [0x4])
(const_int 1 [0x1]) repeated x2
(const_int 3 [0x3])
])
(nil)))

(set (mem) (reg:V4DI 168))

Then it ICEs on: auto new_mode = smallest_int_mode_for_size (access_size *
BITS_PER_UNIT);

The access_size may be 24 or 32.  We don't have integer modes of these
sizes, so it ICEs.

TODO: A better way may be to make DSE use native_encode_rtx/native_decode_rtx,
  but I don't know how to do that.  So let's quickly fix this issue; we
  can improve the fix later.

gcc/ChangeLog:

* dse.cc (find_shift_sequence): Check that a mode with access_size
exists on the target.

Authored-By: Richard Sandiford 

---
 gcc/dse.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/dse.cc b/gcc/dse.cc
index 8b07be17674..1a85dae1f8c 100644
--- a/gcc/dse.cc
+++ b/gcc/dse.cc
@@ -1733,7 +1733,8 @@ find_shift_sequence (poly_int64 access_size,
   /* If a constant was stored into memory, try to simplify it here,
  otherwise the cost of the shift might preclude this optimization
  e.g. at -Os, even when no actual shift will be needed.  */
-  if (store_info->const_rhs)
+  if (store_info->const_rhs
+  && known_le (access_size, GET_MODE_SIZE (MAX_MODE_INT)))
 {
   auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
   auto byte = subreg_lowpart_offset (new_mode, store_mode);
-- 
2.36.3



Re: [PATCH] ARM: Block predication on atomics [PR111235]

2023-09-26 Thread Ramana Radhakrishnan
Reviewed-by: Ramana Radhakrishnan 

A very initial review here.  I think it largely looks ok based on the
description but I've spotted a few obvious nits and things that come
to mind on reviewing this. I've not done a very deep review but hope
it helps you move forward. I'm happy to work with you on landing this
if that helps. I'll try and find some time tomorrow to look at this
again.

Hope this helps.


On Thu, Sep 7, 2023 at 3:07 PM Wilco Dijkstra via Gcc-patches
 wrote:
>
> The v7 memory ordering model allows reordering of conditional atomic 
> instructions.
> To avoid this, make all atomic patterns unconditional.  Expand atomic loads 
> and
> stores for all architectures so the memory access can be wrapped into an 
> UNSPEC.

>
> Passes regress/bootstrap, OK for commit?

Target ? armhf ? --with-arch , -with-fpu , -with-float parameters ?
Please be specific.


Since these patterns touch armv8m.baseline, can you find all the
testcases in the testsuite and ensure there is no change in code for
armv8m.baseline, as that's unpredicated already and this patch brings
this in line with the same?  Does the testsuite already cover these
arch variants, and are you satisfied that the tests in the testsuite
can catch / don't make any additional code changes to the other
architectures affected by this?


>
> gcc/ChangeLog/
> PR target/111235
> * config/arm/constraints.md: Remove Pf constraint.
> * onfig/arm/sync.md (arm_atomic_load): Add new pattern.

Nit: s/onfig/config

> (arm_atomic_load_acquire): Likewise.
> (arm_atomic_store): Likewise.
> (arm_atomic_store_release): Likewise.

Ok.

> (atomic_load): Always expand atomic loads explicitly.
> (atomic_store): Always expand atomic stores explicitly.

Nit: Change message to :

Switch patterns to define_expand.

> (arm_atomic_loaddi2_ldrd): Remove predication.
> (arm_load_exclusive): Likewise.
> (arm_load_acquire_exclusive): Likewise.
> (arm_load_exclusivesi): Likewise.
> (arm_load_acquire_exclusivesi: Likewise.
> (arm_load_exclusivedi): Likewise.
> (arm_load_acquire_exclusivedi): Likewise.
> (arm_store_exclusive): Likewise.
> (arm_store_release_exclusivedi): Likewise.
> (arm_store_release_exclusive): Likewise.
> * gcc/config/arm/unspecs.md: Add VUNSPEC_LDR and VUNSPEC_STR.
>
> gcc/testsuite/ChangeLog/
> PR target/111235
> * gcc.target/arm/pr111235.c: Add new test.
>

Largely looks ok though I cannot work out tonight if we need more v8-a
or v8m-baseline specific tests for scan-assembler patterns.

Clearly our testsuite doesn't catch it, so perhaps the OP could help
validate this patch with their formal models to see if this fixes
this set of issues and creates no new regressions?  Is that feasible
to do?

> ---
>
> diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
> index 
> 05a4ebbdd67601d7b92aa44a619d17634cc69f17..d7c4a1b0cd785f276862048005e6cfa57cdcb20d
>  100644
> --- a/gcc/config/arm/constraints.md
> +++ b/gcc/config/arm/constraints.md
> @@ -36,7 +36,7 @@
>  ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe
>  ;; in Thumb-2 state: Ha, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Rd, Rf, Rb, 
> Ra,
>  ;;  Rg, Ri
> -;; in all states: Pf, Pg
> +;; in all states: Pg
>
>  ;; The following memory constraints have been used:
>  ;; in ARM/Thumb-2 state: Uh, Ut, Uv, Uy, Un, Um, Us, Up, Uf, Ux, Ul
> @@ -239,13 +239,6 @@ (define_constraint "Pe"
>(and (match_code "const_int")
> (match_test "TARGET_THUMB1 && ival >= 256 && ival <= 510")))
>
> -(define_constraint "Pf"
> -  "Memory models except relaxed, consume or release ones."
> -  (and (match_code "const_int")
> -   (match_test "!is_mm_relaxed (memmodel_from_int (ival))
> -   && !is_mm_consume (memmodel_from_int (ival))
> -   && !is_mm_release (memmodel_from_int (ival))")))
> -
>  (define_constraint "Pg"
>"@internal In Thumb-2 state a constant in range 1 to 32"
>(and (match_code "const_int")
> diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md
> index 
> 7626bf3c443285dc63b4c4367b11a879a99c93c6..2210810f67f37ce043b8fdc73b4f21b54c5b1912
>  100644
> --- a/gcc/config/arm/sync.md
> +++ b/gcc/config/arm/sync.md
> @@ -62,68 +62,110 @@ (define_insn "*memory_barrier"
> (set_attr "conds" "unconditional")
> (set_attr "predicable" "no")])
>
> -(define_insn "atomic_load"
> -  [(set (match_operand:QHSI 0 "register_operand" "=r,r,l")
> +(define_insn "arm_atomic_load"
> +  [(set (match_operand:QHSI 0 "register_operand" "=r,l")
>  (unspec_volatile:QHSI
> -  [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q,Q,Q")
> -   (match_operand:SI 2 "const_int_operand" "n,Pf,n")]  ;; model
> +  [(match_operand:QHSI 1 "memory_operand" "m,m")]
]


Remind me again why it is safe to go from the Q constraint to the m
constraint here and everywhere else you've done this?

> +  ""
