Re: [PATCH 1/7] RISC-V: Add intrinsic functions for crypto vector Zvbb extension

2023-12-04 Thread Kito Cheng
Hi Feng:

Thanks for the patch! A few inline comments below. Also, please don't
include all the test files from the doc generator; including only a few
in the patch is fine, e.g. pick one for each group, so that it doesn't
bloat the GCC source tree too much.

> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 935eeb7fd8e..2a3777e168c 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -428,6 +428,15 @@
> ;; vcompress vector compress instruction
> ;; vmov whole vector register move
> ;; vector unknown vector instruction
> +;; vandn crypto vector bitwise and-not instructions
> +;; vbrev crypto vector reverse bits in elements instructions
> +;; vbrev8 crypto vector reverse bits in bytes instructions
> +;; vrev8 crypto vector reverse bytes instructions
> +;; vclz crypto vector count leading Zeros instructions
> +;; vctz crypto vector count trailing Zeros instructions
> +;; vrol crypto vector rotate left instructions
> +;; vror crypto vector rotate right instructions

Use vialu for above operations, no new type for those instructions.

> +;; vwsll crypto vector widening shift left logical instructions

Rename to vwshift to make it consistent with vnshift.

> diff --git a/gcc/config/riscv/vector-crypto.md 
> b/gcc/config/riscv/vector-crypto.md
> new file mode 100755
> index 000..0373cf6f48a
> --- /dev/null
> +++ b/gcc/config/riscv/vector-crypto.md
> @@ -0,0 +1,207 @@
> +(define_c_enum "unspec" [
> + ;; Zvbb unspecs
> + UNSPEC_VANDN
> + UNSPEC_VBREV
> + UNSPEC_VBREV8
> + UNSPEC_VREV8
> + UNSPEC_VCLZ
> + UNSPEC_VCTZ
> + UNSPEC_VROL
> + UNSPEC_VROR
> + UNSPEC_VWSLL
> +])

Could you use generic RTL code for andn, clz, ctz, rol, ror and wsll
rather than unspec?


[PATCH v1] RISC-V: Add test case for bug PR112813

2023-12-04 Thread pan2 . li
From: Pan Li 

Bugzilla PR 112813 has been fixed recently; add the test case below
for the bug.

PR target/112813

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/vsetvl/pr112813-1.c: New test.

Signed-off-by: Pan Li 
---
 .../gcc.target/riscv/rvv/vsetvl/pr112813-1.c  | 32 +++
 1 file changed, 32 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c
new file mode 100644
index 000..5aab9c2bf09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c
@@ -0,0 +1,32 @@
+/* Test that we do not have ice when compile */
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d -O3" } */
+
+int a, c, d, f, j;
+int b[7];
+long e;
+char *g;
+int *h;
+long long *i;
+
+void k() {
+  int l[][1] = {{}, {1}, {1}};
+  int *m = &d, *n = &l[0][0];
+
+  for (; e;)
+{
+  f = 3;
+
+  for (; f >= 0; f--)
+   {
+ *m &= b[f] >= 0;
+ j = a >= 2 ? 0 : 1 >> a;
+ *i |= j;
+}
+
+   for (; c;)
+ *g = 0;
+ }
+
+  h = n;
+}
-- 
2.34.1



Re: [PATCH v1] RISC-V: Add test case for bug PR112813

2023-12-04 Thread juzhe.zh...@rivai.ai
LGTM Thanks.



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-12-04 16:09
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Add test case for bug PR112813
From: Pan Li 
 
The bugzilla 112813 has been fixed recently, add below test
case for the bug.
 
PR target/112813
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/vsetvl/pr112813-1.c: New test.
 
Signed-off-by: Pan Li 
---
.../gcc.target/riscv/rvv/vsetvl/pr112813-1.c  | 32 +++
1 file changed, 32 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c
new file mode 100644
index 000..5aab9c2bf09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c
@@ -0,0 +1,32 @@
+/* Test that we do not have ice when compile */
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d -O3" } */
+
+int a, c, d, f, j;
+int b[7];
+long e;
+char *g;
+int *h;
+long long *i;
+
+void k() {
+  int l[][1] = {{}, {1}, {1}};
+  int *m = &d, *n = &l[0][0];
+
+  for (; e;)
+{
+  f = 3;
+
+  for (; f >= 0; f--)
+ {
+   *m &= b[f] >= 0;
+   j = a >= 2 ? 0 : 1 >> a;
+   *i |= j;
+}
+
+ for (; c;)
+   *g = 0;
+ }
+
+  h = n;
+}
-- 
2.34.1
 
 


RE: [PATCH v1] RISC-V: Add test case for bug PR112813

2023-12-04 Thread Li, Pan2
Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Monday, December 4, 2023 4:10 PM
To: Li, Pan2 ; gcc-patches 
Cc: Li, Pan2 ; Wang, Yanzhang ; 
kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Add test case for bug PR112813

LGTM Thanks.


juzhe.zh...@rivai.ai

From: pan2.li
Date: 2023-12-04 16:09
To: gcc-patches
CC: juzhe.zhong; 
pan2.li; 
yanzhang.wang; 
kito.cheng
Subject: [PATCH v1] RISC-V: Add test case for bug PR112813
From: Pan Li <pan2...@intel.com>

The bugzilla 112813 has been fixed recently, add below test
case for the bug.

PR target/112813

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/vsetvl/pr112813-1.c: New test.

Signed-off-by: Pan Li <pan2...@intel.com>
---
.../gcc.target/riscv/rvv/vsetvl/pr112813-1.c  | 32 +++
1 file changed, 32 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c
new file mode 100644
index 000..5aab9c2bf09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112813-1.c
@@ -0,0 +1,32 @@
+/* Test that we do not have ice when compile */
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv_zvl256b -mabi=ilp32d -O3" } */
+
+int a, c, d, f, j;
+int b[7];
+long e;
+char *g;
+int *h;
+long long *i;
+
+void k() {
+  int l[][1] = {{}, {1}, {1}};
+  int *m = &d, *n = &l[0][0];
+
+  for (; e;)
+{
+  f = 3;
+
+  for (; f >= 0; f--)
+ {
+   *m &= b[f] >= 0;
+   j = a >= 2 ? 0 : 1 >> a;
+   *i |= j;
+}
+
+ for (; c;)
+   *g = 0;
+ }
+
+  h = n;
+}
--
2.34.1




Patch ping: [PATCH] rs6000: Canonicalize copysign (x, -1) back to -abs (x) in the backend [PR112606]

2023-12-04 Thread Jakub Jelinek
Hi!

I'd like to ping this patch.

Thanks

On Sat, Nov 25, 2023 at 11:17:48AM +0100, Jakub Jelinek wrote:
> The middle-end has been changed quite recently to canonicalize
> -abs (x) to copysign (x, -1) rather than the other way around.
> While I agree with that at GIMPLE level, since it matches the GIMPLE
> goal of as few operations as possible for a canonical form (-abs (x)
> is 2 GIMPLE statements, copysign (x, -1) is just one), I must say
> I don't really like that being done on RTL as well (or at least
> not canonicalizing (COPYSIGN x, negative) back to (NEG (ABS x))),
> because on most targets most of floating point constants need to be loaded
> from memory, there are a few exceptions but -1 is often not one of them.
> 
> Anyway, the following patch fixes the rs6000 regression caused by the
> change in GIMPLE canonicalization (i.e. the desirable one).  As rs6000
> clearly prefers -abs (x) form because it has a single instruction to do
> that while it also has copysign instruction, but that requires loading the
> -1 from memory, the following patch just ensures the copysign expander
> can actually see the floating point constant and in that case emits the
> -abs (x) code (or in the hypothetical case of copysign with non-negative
> constant abs (x) - but there copysign (x, 1) in GIMPLE is canonicalized
> to abs (x)), otherwise forces the operand to be the expected gpc_reg_operand
> and does what it did before.
> 
> Bootstrapped/regtested on powerpc64le-linux, ok for trunk?
> 
> 2023-11-25  Jakub Jelinek  
> 
>   PR target/112606
>   * config/rs6000/rs6000.md (copysign3): Change predicate
>   of the last argument from gpc_reg_operand to any_operand.  If
>   operands[2] is CONST_DOUBLE, emit abs or neg abs depending on
>   its sign, otherwise if it doesn't satisfy gpc_reg_operand,
>   force it to REG using copy_to_mode_reg.
> 
> --- gcc/config/rs6000/rs6000.md.jj2023-10-13 19:34:43.927834877 +0200
> +++ gcc/config/rs6000/rs6000.md   2023-11-24 18:54:13.587876170 +0100
> @@ -5358,7 +5358,7 @@ (define_expand "copysign3"
> (set (match_dup 4)
>   (neg:SFDF (abs:SFDF (match_dup 1
> (set (match_operand:SFDF 0 "gpc_reg_operand")
> -(if_then_else:SFDF (ge (match_operand:SFDF 2 "gpc_reg_operand")
> + (if_then_else:SFDF (ge (match_operand:SFDF 2 "any_operand")
>  (match_dup 5))
>(match_dup 3)
>(match_dup 4)))]
> @@ -5369,6 +5369,24 @@ (define_expand "copysign3"
> || TARGET_CMPB
> || VECTOR_UNIT_VSX_P (mode))"
>  {
> +  /* Middle-end canonicalizes -fabs (x) to copysign (x, -1),
> + but PowerPC prefers -fabs (x).  */
> +  if (CONST_DOUBLE_AS_FLOAT_P (operands[2]))
> +{
> +  if (real_isneg (CONST_DOUBLE_REAL_VALUE (operands[2])))
> + {
> +   operands[3] = gen_reg_rtx (mode);
> +   emit_insn (gen_abs2 (operands[3], operands[1]));
> +   emit_insn (gen_neg2 (operands[0], operands[3]));
> + }
> +  else
> + emit_insn (gen_abs2 (operands[0], operands[1]));
> +  DONE;
> +}
> +
> +  if (!gpc_reg_operand (operands[2], mode))
> +operands[2] = copy_to_mode_reg (mode, operands[2]);
> +
>if (TARGET_CMPB || VECTOR_UNIT_VSX_P (mode))
>  {
>emit_insn (gen_copysign3_fcpsgn (operands[0], operands[1],

Jakub



Re: Re: [PATCH 1/7] RISC-V: Add intrinsic functions for crypto vector Zvbb extension

2023-12-04 Thread Feng Wang
2023-12-04 16:01 Kito Cheng  wrote:



>Hi Feng:
>
>Thanks for the patch! a few inline comments below, also don't include
>all test files from doc generator, only include a few within the patch
>is fine, e.g. pick one for each group, so that it won't make GCC
>source tree bloat too much.
>

OK. All the test cases are indeed too large, will be reduced.


>> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
>> index 935eeb7fd8e..2a3777e168c 100644
>> --- a/gcc/config/riscv/riscv.md
>> +++ b/gcc/config/riscv/riscv.md
>> @@ -428,6 +428,15 @@
>> ;; vcompress vector compress instruction
>> ;; vmov whole vector register move
>> ;; vector unknown vector instruction
>> +;; vandn crypto vector bitwise and-not instructions
>> +;; vbrev crypto vector reverse bits in elements instructions
>> +;; vbrev8 crypto vector reverse bits in bytes instructions
>> +;; vrev8 crypto vector reverse bytes instructions
>> +;; vclz crypto vector count leading Zeros instructions
>> +;; vctz crypto vector count lrailing Zeros instructions
>> +;; vrol crypto vector rotate left instructions
>> +;; vror crypto vector rotate right instructions
>
>Use vialu for above operations, no new type for those instructions.
>
>> +;; vwsll crypto vector widening shift left logical instructions
>
>Rename to vwshift to make it consistent with vnshift.
>
>> diff --git a/gcc/config/riscv/vector-crypto.md 
>> b/gcc/config/riscv/vector-crypto.md
>> new file mode 100755
>> index 000..0373cf6f48a
>> --- /dev/null
>> +++ b/gcc/config/riscv/vector-crypto.md
>> @@ -0,0 +1,207 @@
>> +(define_c_enum "unspec" [
>> + ;; Zvbb unspecs
>> + UNSPEC_VANDN
>> + UNSPEC_VBREV
>> + UNSPEC_VBREV8
>> + UNSPEC_VREV8
>> + UNSPEC_VCLZ
>> + UNSPEC_VCTZ
>> + UNSPEC_VROL
>> + UNSPEC_VROR
>> + UNSPEC_VWSLL
>> +])
>
>Could you use generic RTL code for andn, clz, ctl, rol, ror and wsll
>rather than unspec?


Got it! Will optimize it, thanks!
Feng Wang

[PATCH] RISC-V: Remove earlyclobber from widen reduction

2023-12-04 Thread Juzhe-Zhong
Since the destination of a reduction is not a vector register group, there
is no need to apply the overlap constraint.

This was also confirmed against Clang:

The mir in LLVM has early clobber:
early-clobber %49:vrm2 = PseudoVWADD_VX_M1 $noreg(tied-def 0), killed %17:vr, 
%48:gpr, %0:gprnox0, 3, 0; example.c:59:24

The mir in LLVM doesn't have early clobber:
%48:vr = PseudoVWREDSUM_VS_M2_E8 $noreg(tied-def 0), %17:vrm2, killed %33:vr, 
%0:gprnox0, 3, 1; example.c:60:26

It was also confirmed that both

vwredsum.vs v24, v8, v24 and vwredsum.vs v8, v8, v24 are legal on LLVM.

Align with LLVM and honor RISC-V V spec, remove earlyclobber.

Before this patch:

vwredsum.vs v8,v24,v8
vwredsum.vs v7,v22,v7
vwredsum.vs v6,v20,v6
vwredsum.vs v5,v18,v5
vwredsum.vs v4,v16,v4
vwredsum.vs v3,v14,v3
vwredsum.vs v2,v12,v2
vwredsum.vs v1,v10,v1
vmv1r.v v9,v8
vwredsum.vs v9,v24,v9
vmv1r.v v24,v7
vwredsum.vs v24,v22,v24
vmv1r.v v22,v6
vwredsum.vs v22,v20,v22
vmv1r.v v20,v5
vwredsum.vs v20,v18,v20
vmv1r.v v18,v4
vwredsum.vs v18,v16,v18
vmv1r.v v16,v3
vwredsum.vs v16,v14,v16
vmv1r.v v14,v2
vwredsum.vs v14,v12,v14
vmv1r.v v12,v1
vwredsum.vs v12,v10,v12

After this patch:

vfwredusum.vs   v17,v12,v17
vfwredusum.vs   v18,v10,v18
vfwredusum.vs   v15,v26,v15
vfwredusum.vs   v16,v24,v16
vfwredusum.vs   v12,v12,v17
vfwredusum.vs   v10,v10,v18
vfwredusum.vs   v13,v6,v20
vfwredusum.vs   v11,v8,v19
vfwredusum.vs   v6,v6,v13
vfwredusum.vs   v8,v8,v11
vfwredusum.vs   v7,v4,v21
vfwredusum.vs   v9,v2,v22
vfwredusum.vs   v14,v26,v15
vfwredusum.vs   v1,v24,v16
vfwredusum.vs   v4,v4,v7
vfwredusum.vs   v2,v2,v9

Same behavior as LLVM, and honor RISC-V V spec.

PR 112431

gcc/ChangeLog:

* config/riscv/vector.md: Remove earlyclobber from widen reduction.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/pr112431-35.c: New test.
* gcc.target/riscv/rvv/base/pr112431-36.c: New test.

---
 gcc/config/riscv/vector.md|   8 +-
 .../gcc.target/riscv/rvv/base/pr112431-35.c   | 107 ++
 .../gcc.target/riscv/rvv/base/pr112431-36.c   | 107 ++
 3 files changed, 218 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-35.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-36.c

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 731057416cd..72cf3553e45 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -7861,7 +7861,7 @@
 
 ;; Integer Widen Reduction Sum (vwredsum[u].vs)
 (define_insn "@pred_"
-  [(set (match_operand:   0 "register_operand"  
"=&vr,&vr")
+  [(set (match_operand:   0 "register_operand""=vr,   
vr")
(unspec:
  [(unspec:
[(match_operand:   1 "vector_mask_operand"   
"vmWc1,vmWc1")
@@ -7872,7 +7872,7 @@
 (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(unspec: [
 (match_operand:VI_QHS 3 "register_operand"  "   vr,   
vr")
-(match_operand:  4 "register_operand"  "  vr0,  
vr0")
+(match_operand:  4 "register_operand"  "   vr,   
vr")
] ANY_WREDUC)
   (match_operand:2 "vector_merge_operand"  "   vu,
0")] UNSPEC_REDUC))]
   "TARGET_VECTOR"
@@ -7928,7 +7928,7 @@
 
 ;; Float Widen Reduction Sum (vfwred[ou]sum.vs)
 (define_insn "@pred_"
-  [(set (match_operand: 0 "register_operand"  "=&vr, 
&vr")
+  [(set (match_operand: 0 "register_operand"  "=vr,   
vr")
(unspec:
  [(unspec:
[(match_operand:   1 "vector_mask_operand"   
"vmWc1,vmWc1")
@@ -7941,7 +7941,7 @@
 (reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE)
(unspec: [
 (match_operand:VF_HS  3 "register_operand"  "   vr,   
vr")
-(match_operand:  4 "register_operand"  "  vr0,  
vr0")
+(match_operand:  4 "register_operand"  "   vr,   
vr")
] ANY_FWREDUC_SUM)
   (match_operand:2 "vector_merge_operand"  "   vu,
0")] UNSPEC_REDUC))]
   "TARGET_VECTOR"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-35.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-35.c
new file mode 100644
index 000..6f72e93aa38
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-35.c
@@ -0,0 +1,107 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
+ size_

Re: [r14-5930 Regression] FAIL: gcc.c-torture/compile/libcall-2.c -Os (test for excess errors) on Linux/x86_64

2023-12-04 Thread Jose E. Marchesi


>> mcmodel=large is not supported (yet) on any Darwin arch [PR90698], so
> the test needs skipping or xfailing, I think (either way with a
> reference to the PR).
>
> Pushed as 
> https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff;h=b74981b5cf32ebf4bfffd25e7174b5c80243447a

Thanks for fixing this.



Re: [patch-2, rs6000] guard fctid on PPC64 and powerpc 476 [PR112707]

2023-12-04 Thread Kewen.Lin
Hi Haochen,

on 2023/12/1 10:42, HAO CHEN GUI wrote:
> Hi,
>   The "fctid" instruction is supported on 64-bit Power processors and
> powerpc 476. It needs a guard to check this. The patch fixes the issue.
> 
>   Bootstrapped and tested on x86 and powerpc64-linux BE and LE with
> no regressions. Is this OK for trunk?
> 
> Thanks
> Gui Haochen
> 
> ChangeLog
> rs6000: guard fctid on PPC64 and powerpc 476
> 
> fctid is supported on 64-bit Power processors and powerpc 476. It should
> be guarded by this condition. The patch fixes the issue.
> 
> gcc/
>   PR target/112707
>   * config/rs6000/rs6000.h (TARGET_FCTID): Define.
>   * config/rs6000/rs6000.md (lrintdi2): Add guard TARGET_FCTID.
> 
> gcc/testsuite/
>   PR target/112707
>   * gcc.target/powerpc/pr112707.h: New.
>   * gcc.target/powerpc/pr112707-2.c: New.
>   * gcc.target/powerpc/pr112707-3.c: New.
> 
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
> index 22595f6..497ae3d 100644
> --- a/gcc/config/rs6000/rs6000.h
> +++ b/gcc/config/rs6000/rs6000.h
> @@ -467,6 +467,8 @@ extern int rs6000_vector_align[];
>  #define TARGET_FCFIDUS   TARGET_POPCNTD
>  #define TARGET_FCTIDUZ   TARGET_POPCNTD
>  #define TARGET_FCTIWUZ   TARGET_POPCNTD
> +/* Enable fctid on ppc64 and powerpc476.  */
> +#define TARGET_FCTID (TARGET_POWERPC64 | TARGET_FPRND)

Like some existing macros which are checking rs6000_cpu, I think it's
more specific to check with PROCESSOR_PPC476, that is: rs6000_cpu ==
PROCESSOR_PPC476.  And say something like: "Only powerpc64 and powerpc476
support fctid." instead?

>  #define TARGET_CTZ   TARGET_MODULO
>  #define TARGET_EXTSWSLI  (TARGET_MODULO && TARGET_POWERPC64)
>  #define TARGET_MADDLDTARGET_MODULO
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index d4337ce..4a5e63c 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -6718,7 +6718,7 @@ (define_insn "lrintdi2"
>[(set (match_operand:DI 0 "gpc_reg_operand" "=d")
>   (unspec:DI [(match_operand:SFDF 1 "gpc_reg_operand" "")]
>  UNSPEC_FCTID))]
> -  "TARGET_HARD_FLOAT"
> +  "TARGET_HARD_FLOAT && TARGET_FCTID"
>"fctid %0,%1"
>[(set_attr "type" "fp")])
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr112707-2.c 
> b/gcc/testsuite/gcc.target/powerpc/pr112707-2.c
> new file mode 100644
> index 000..ae91913
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr112707-2.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile { target { powerpc*-*-* && be } } } */

Nit: powerpc*-*-* && be checks are useless as ilp32 excludes le.

> +/* { dg-options "-O2 -mdejagnu-cpu=7450 -m32 -fno-math-errno" } */

Nit: -m32 isn't required.

> +/* { dg-require-effective-target ilp32 } */
> +/* { dg-final { scan-assembler-not {\mfctid\M} } }  */
> +
> +#include "pr112707.h"
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr112707-3.c 
> b/gcc/testsuite/gcc.target/powerpc/pr112707-3.c
> new file mode 100644
> index 000..e47ce20
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr112707-3.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile { target { powerpc*-*-* && be } } } */
> +/* { dg-options "-O2 -m32 -fno-math-errno -mdejagnu-cpu=476fp" } */

Likewise.

The others look good to me, thanks!

BR,
Kewen

> +/* { dg-require-effective-target ilp32 } */
> +
> +/* powerpc 476fp has hard float enabled which is required by fctid */
> +
> +#include "pr112707.h"
> +
> +/* { dg-final { scan-assembler-times {\mfctid\M} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr112707.h 
> b/gcc/testsuite/gcc.target/powerpc/pr112707.h
> new file mode 100644
> index 000..e427dc6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr112707.h
> @@ -0,0 +1,10 @@
> +long long test1 (double a)
> +{
> +  return __builtin_llrint (a);
> +}
> +
> +long long test2 (float a)
> +{
> +  return __builtin_llrint (a);
> +}
> +




Re: [aarch64] PR111702 - ICE in insert_regs after interleave+zip1 vector initialization patch

2023-12-04 Thread Prathamesh Kulkarni
On Thu, 23 Nov 2023 at 17:06, Prathamesh Kulkarni
 wrote:
>
> Hi Richard,
> For the test-case mentioned in PR111702, compiling with -O2
> -frounding-math -fstack-protector-all results in following ICE during
> cse2 pass:
>
> test.c: In function 'foo':
> test.c:119:1: internal compiler error: in insert_regs, at cse.cc:1120
>   119 | }
>   | ^
> 0xb7ebb0 insert_regs
> ../../gcc/gcc/cse.cc:1120
> 0x1f95134 merge_equiv_classes
> ../../gcc/gcc/cse.cc:1764
> 0x1f9b9ab cse_insn
> ../../gcc/gcc/cse.cc:4793
> 0x1f9fe30 cse_extended_basic_block
> ../../gcc/gcc/cse.cc:6577
> 0x1f9fe30 cse_main
> ../../gcc/gcc/cse.cc:6722
> 0x1fa0984 rest_of_handle_cse2
> ../../gcc/gcc/cse.cc:7620
> 0x1fa0984 execute
> ../../gcc/gcc/cse.cc:7675
>
> This happens only with interleave+zip1 vector initialization with
> -frounding-math -fstack-protector-all, while it compiles OK without
> -fstack-protector-all. Also, it compiles OK with fallback sequence
> code-gen (with or without -fstack-protector-all). Unfortunately, I
> haven't been able to reduce the test-case further :/
>
> From the test-case, it seems only the vector initializer for type J
> uses interleave+zip1 approach, while rest of the vector initializers
> use fallback sequence.
>
> J is defined as:
> typedef _Float16 __attribute__((__vector_size__ (16))) J;
>
> and the initializer is:
> (J) { 11654, 4801, 5535, 9743, 61680}
>
> interleave+zip1 sequence for above initializer J:
> mode = V8HF
>
> vals: (parallel:V8HF [
> (reg:HF 642)
> (reg:HF 645)
> (reg:HF 648)
> (reg:HF 651)
> (reg:HF 654)
> (const_double:HF 0.0 [0x0.0p+0]) repeated x3
> ])
>
> target: (reg:V8HF 641)
> seq:
> (insn 1058 0 1059 (set (reg:V4HF 657)
> (const_vector:V4HF [
> (const_double:HF 0.0 [0x0.0p+0]) repeated x4
> ])) "test.c":81:8 -1
>  (nil))
> (insn 1059 1058 1060 (set (reg:V4HF 657)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 642))
> (reg:V4HF 657)
> (const_int 1 [0x1]))) "test.c":81:8 -1
>  (nil))
> (insn 1060 1059 1061 (set (reg:V4HF 657)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 648))
> (reg:V4HF 657)
> (const_int 2 [0x2]))) "test.c":81:8 -1
>  (nil))
> (insn 1061 1060 1062 (set (reg:V4HF 657)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 654))
> (reg:V4HF 657)
> (const_int 4 [0x4]))) "test.c":81:8 -1
>  (nil))
> (insn 1062 1061 1063 (set (reg:V4HF 658)
> (const_vector:V4HF [
> (const_double:HF 0.0 [0x0.0p+0]) repeated x4
> ])) "test.c":81:8 -1
>  (nil))
> (insn 1063 1062 1064 (set (reg:V4HF 658)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 645))
> (reg:V4HF 658)
> (const_int 1 [0x1]))) "test.c":81:8 -1
>  (nil))
> (insn 1064 1063 1065 (set (reg:V4HF 658)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 651))
> (reg:V4HF 658)
> (const_int 2 [0x2]))) "test.c":81:8 -1
>  (nil))
> (insn 1065 1064 0 (set (reg:V8HF 641)
> (unspec:V8HF [
> (subreg:V8HF (reg:V4HF 657) 0)
> (subreg:V8HF (reg:V4HF 658) 0)
> ] UNSPEC_ZIP1)) "test.c":81:8 -1
>  (nil))
>
> It seems to me that the above sequence correctly initializes the
> vector into r641 ?
> insns 1058-1061 construct r657 = { r642, r648, r654, 0 }
> insns 1062-1064 construct r658 = { r645, r651, 0, 0 }
> and zip1 will create r641 = { r642, r645, r648, r651, r654, 0, 0, 0 }
>
> For the above test, it seems that with interleave+zip1 approach and
> -fstack-protector-all,
> in cse pass, there are two separate equivalence classes created for
> (const_int 1), that need
> to be merged in cse_insn:
>
>if (elt->first_same_value != src_eqv_elt->first_same_value)
> {
>   /* The REG_EQUAL is indicating that two formerly distinct
>  classes are now equivalent.  So merge them.  */
>   merge_equiv_classes (elt, src_eqv_elt);
>
> elt equivalence chain:
> Equivalence chain for (subreg:QI (reg:V16QI 671) 0):
> (subreg:QI (reg:V16QI 671) 0)
> (const_int 1 [0x1])
>
> src_eqv_elt equivalence chain:
> Equivalence chain for (const_int 1 [0x1]):
> (reg:QI 34 v2)
> (reg:QI 32 v0)
> (reg:QI 34 v2)
> (const_int 1 [0x1])
> (vec_select:QI (reg:V16QI 671)
> (parallel [
> (const_int 1 [0x1])
> ]))
> (vec_select:QI (reg:V16QI 32 v0)
> (parallel [
> (const_int 1 [0x1])
> ]))
> (vec_select:QI (reg:V16QI 33 v1)
> (parallel [
> (const_int 2 [0x2])
> ]))
> (vec_select:QI (reg:V16QI 33 v1)
> (parallel [
> (const_int 1 [0x1])
> ]))
>
> The issue is that merge_equiv_classes doesn't seem to deal correctly with
> multiple occurrences of the same register in class2 (src_eqv_elt), which
> has two occurrences of
> (reg:QI 34 v2)
>
>

Re: [PATCH] RISC-V: Check if zcd conflicts with zcmt and zcmp

2023-12-04 Thread Christoph Müllner
On Mon, Dec 4, 2023 at 8:48 AM Kito Cheng  wrote:

LGTM

I've double-checked this in the Zc-1.0.4-3.pdf:
* Zcmp is incompatible with Zcd
* Zcmp depends on Zca
* Zcmt is incompatible with Zcd
* Zcmt depends on Zca and Zicsr

The implies-relations are already implemented.
This patch enforces the incompatibility-relations.

>
> gcc/ChangeLog:
>
> * common/config/riscv/riscv-common.cc
> (riscv_subset_list::check_conflict_ext): Check whether zcd conflicts
> with zcmt and zcmp.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/arch-29.c: New test.
> * gcc.target/riscv/arch-30.c: New test.
> ---
>  gcc/common/config/riscv/riscv-common.cc  | 8 
>  gcc/testsuite/gcc.target/riscv/arch-29.c | 7 +++
>  gcc/testsuite/gcc.target/riscv/arch-30.c | 7 +++
>  3 files changed, 22 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-29.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-30.c
>
> diff --git a/gcc/common/config/riscv/riscv-common.cc 
> b/gcc/common/config/riscv/riscv-common.cc
> index aecb342b164..bfb41827f7a 100644
> --- a/gcc/common/config/riscv/riscv-common.cc
> +++ b/gcc/common/config/riscv/riscv-common.cc
> @@ -1230,6 +1230,14 @@ riscv_subset_list::check_conflict_ext ()
>/* 'H' hypervisor extension requires base ISA with 32 registers.  */
>if (lookup ("e") && lookup ("h"))
>  error_at (m_loc, "%<-march=%s%>: h extension requires i extension", 
> m_arch);
> +
> +  if (lookup ("zcd"))
> +{
> +  if (lookup ("zcmt"))
> +   error_at (m_loc, "%<-march=%s%>: zcd conflicts with zcmt", m_arch);
> +  if (lookup ("zcmp"))
> +   error_at (m_loc, "%<-march=%s%>: zcd conflicts with zcmp", m_arch);
> +}
>  }
>
>  /* Parsing function for multi-letter extensions.
> diff --git a/gcc/testsuite/gcc.target/riscv/arch-29.c 
> b/gcc/testsuite/gcc.target/riscv/arch-29.c
> new file mode 100644
> index 000..f8281275878
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/arch-29.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64id_zcd_zcmt -mabi=lp64d" } */
> +int foo()
> +{
> +}
> +
> +/* { dg-error "zcd conflicts with zcmt" "" { target *-*-* } 0 } */
> diff --git a/gcc/testsuite/gcc.target/riscv/arch-30.c 
> b/gcc/testsuite/gcc.target/riscv/arch-30.c
> new file mode 100644
> index 000..3e67ea0bb06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/arch-30.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64id_zcd_zcmp -mabi=lp64d" } */
> +int foo()
> +{
> +}
> +
> +/* { dg-error "zcd conflicts with zcmp" "" { target *-*-* } 0 } */
> --
> 2.40.1
>


Re: [PATCH] rs6000: Canonicalize copysign (x, -1) back to -abs (x) in the backend [PR112606]

2023-12-04 Thread Kewen.Lin
Hi Jakub,

on 2023/11/25 18:17, Jakub Jelinek wrote:
> Hi!
> 
> The middle-end has been changed quite recently to canonicalize
> -abs (x) to copysign (x, -1) rather than the other way around.
> While I agree with that at GIMPLE level, since it matches the GIMPLE
> goal of as few operations as possible for a canonical form (-abs (x)
> is 2 GIMPLE statements, copysign (x, -1) is just one), I must say
> I don't really like that being done on RTL as well (or at least
> not canonicalizing (COPYSIGN x, negative) back to (NEG (ABS x))),
> because on most targets most of floating point constants need to be loaded
> from memory, there are a few exceptions but -1 is often not one of them.
> 
> Anyway, the following patch fixes the rs6000 regression caused by the
> change in GIMPLE canonicalization (i.e. the desirable one).  As rs6000
> clearly prefers -abs (x) form because it has a single instruction to do
> that while it also has copysign instruction, but that requires loading the
> -1 from memory, the following patch just ensures the copysign expander
> can actually see the floating point constant and in that case emits the
> -abs (x) code (or in the hypothetical case of copysign with non-negative
> constant abs (x) - but there copysign (x, 1) in GIMPLE is canonicalized
> to abs (x)), otherwise forces the operand to be the expected gpc_reg_operand
> and does what it did before.
> 
> Bootstrapped/regtested on powerpc64le-linux, ok for trunk?

Thanks for fixing this!  IIUC even with Tamar's further improvement proposal,
we still need some rs6000 specific work, then updating this copysign expansion
looks more straightforward.  So okay for trunk, thanks!

BR,
Kewen

> 
> 2023-11-25  Jakub Jelinek  
> 
>   PR target/112606
>   * config/rs6000/rs6000.md (copysign3): Change predicate
>   of the last argument from gpc_reg_operand to any_operand.  If
>   operands[2] is CONST_DOUBLE, emit abs or neg abs depending on
>   its sign, otherwise if it doesn't satisfy gpc_reg_operand,
>   force it to REG using copy_to_mode_reg.
> 
> --- gcc/config/rs6000/rs6000.md.jj2023-10-13 19:34:43.927834877 +0200
> +++ gcc/config/rs6000/rs6000.md   2023-11-24 18:54:13.587876170 +0100
> @@ -5358,7 +5358,7 @@ (define_expand "copysign3"
> (set (match_dup 4)
>   (neg:SFDF (abs:SFDF (match_dup 1
> (set (match_operand:SFDF 0 "gpc_reg_operand")
> -(if_then_else:SFDF (ge (match_operand:SFDF 2 "gpc_reg_operand")
> + (if_then_else:SFDF (ge (match_operand:SFDF 2 "any_operand")
>  (match_dup 5))
>(match_dup 3)
>(match_dup 4)))]
> @@ -5369,6 +5369,24 @@ (define_expand "copysign3"
> || TARGET_CMPB
> || VECTOR_UNIT_VSX_P (mode))"
>  {
> +  /* Middle-end canonicalizes -fabs (x) to copysign (x, -1),
> + but PowerPC prefers -fabs (x).  */
> +  if (CONST_DOUBLE_AS_FLOAT_P (operands[2]))
> +{
> +  if (real_isneg (CONST_DOUBLE_REAL_VALUE (operands[2])))
> + {
> +   operands[3] = gen_reg_rtx (mode);
> +   emit_insn (gen_abs2 (operands[3], operands[1]));
> +   emit_insn (gen_neg2 (operands[0], operands[3]));
> + }
> +  else
> + emit_insn (gen_abs2 (operands[0], operands[1]));
> +  DONE;
> +}
> +
> +  if (!gpc_reg_operand (operands[2], mode))
> +operands[2] = copy_to_mode_reg (mode, operands[2]);
> +
>if (TARGET_CMPB || VECTOR_UNIT_VSX_P (mode))
>  {
>emit_insn (gen_copysign3_fcpsgn (operands[0], operands[1],
> 
>   Jakub
>


Re: [PATCH] combine: Fix ICE in try_combine on pr112494.c [PR112560]

2023-12-04 Thread Uros Bizjak
On Wed, Nov 29, 2023 at 1:25 PM Richard Biener
 wrote:
>
> On Wed, Nov 29, 2023 at 10:35 AM Uros Bizjak  wrote:
> >
> > The compiler, configured with --enable-checking=yes,rtl,extra ICEs with:
> >
> > internal compiler error: RTL check: expected elt 0 type 'e' or 'u',
> > have 'E' (rtx unspec) in try_combine, at combine.cc:3237
> >
> > This is
> >
> > 3236  /* Just replace the CC reg with a new mode.  */
> > 3237  SUBST (XEXP (*cc_use_loc, 0), newpat_dest);
> > 3238  undobuf.other_insn = cc_use_insn;
> >
> > in combine.cc, where *cc_use_loc is
> >
> > (unspec:DI [
> > (reg:CC 17 flags)
> > ] UNSPEC_PUSHFL)
> >
> > combine assumes CC must be used inside of a comparison and uses XEXP (..., 
> > 0)
> > without checking on the RTX type of the argument.
> >
> > Skip the modification of CC-using operation if *cc_use_loc is not 
> > COMPARISON_P.
> >
> > PR middle-end/112560
> >
> > gcc/ChangeLog:
> >
> > * combine.cc (try_combine): Skip the modification of CC-using
> > operation if *cc_use_loc is not COMPARISON_P.
> >
> > Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
> >
> > OK for master?
>
> Don't we need to stop the attempt to combine when we cannot handle a use?
> Simply not adjusting another use doesn't look correct, does it?

I have analysed [1] all targets that define SELECT_CC_MODE, and almost
all use CC_REG exclusively in comparison. Besides i386 that defines:

(define_insn "@pushfl2"
  [(set (match_operand:W 0 "push_operand" "=<")
(unspec:W [(match_operand:CC 1 "flags_reg_operand")]
  UNSPEC_PUSHFL))]

other non-comparison pre-reload uses of CC_REG are:

arm:

(define_insn "get_fpscr_nzcvqc"
 [(set (match_operand:SI 0 "register_operand" "=r")
   (unspec_volatile:SI [(reg:SI VFPCC_REGNUM)] UNSPEC_GET_FPSCR_NZCVQC))]

(define_insn "mve_vadcq_v4si"
  [(set (match_operand:V4SI 0 "s_register_operand" "=w")
(unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w")
   (match_operand:V4SI 2 "s_register_operand" "w")]
 VADCQ))
   (set (reg:SI VFPCC_REGNUM)
(unspec:SI [(reg:SI VFPCC_REGNUM)]
 VADCQ))
  ]

rs6000:

(define_insn "prologue_movesi_from_cr"
  [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
(unspec:SI [(reg:CC CR2_REGNO) (reg:CC CR3_REGNO)
(reg:CC CR4_REGNO)]
   UNSPEC_MOVESI_FROM_CR))]

and just for reference s390:

(define_insn_and_split "*ccraw_to_int"
  [(set (pc)
(if_then_else
 (match_operator 0 "s390_eqne_operator"
 [(reg:CCRAW CC_REGNUM)
  (match_operand 1 "const_int_operand" "")])
 (label_ref (match_operand 2 "" ""))
 (pc)))
   (set (match_operand:SI 3 "register_operand" "=d")
(unspec:SI [(reg:CCRAW CC_REGNUM)] UNSPEC_CC_TO_INT))]

The above is not single_use, so the issue does not apply here.

These uses can all break with checking enabled at the mentioned spot
in combine.cc in the same way as x86. Actually, it is undesirable to
change the mode in the "other instruction" - the machine instruction
doesn't care about mode at all, but the insn pattern may fail
recognition due to CC mode change.

Based on the above analysis, I propose we proceed with my original patch.

[1] Starting with 'egrep -v -w "set|clobber" *.md | grep ' and
analysing all hits

Uros.


[PATCH 1/2][RFC] middle-end/112830 - memcpy expansion drops address-spaces

2023-12-04 Thread Richard Biener
The following makes sure we are not losing address-space info
when expanding a __builtin_memcpy (synthesized by gimplification,
which _might_ be the other actual problem).  The issue is with
get_memory_rtx which is also used by other builtin expansions
but is not aware of address-spaces.  The following fixes that
which resolves an ICE with AVR and properly dispatches to
inline expansion then in emit_block_move_hints.

The alternative, to not introduce memcpy by gimplification, is
possible, but IL verification rejects WITH_SIZE_EXPR as part
of GIMPLE assignments; removing that restriction also "works"
(for the testcase).

Alternative is in [2/2].

So far untested; will test on x86_64 - in principle I'd prefer 2/2.

Richard.

PR middle-end/112830
* builtins.cc (get_memory_rtx): Make address-space aware.

* gcc.target/avr/pr112830.c: New testcase.
---
 gcc/builtins.cc | 10 +++---
 gcc/testsuite/gcc.target/avr/pr112830.c | 11 +++
 2 files changed, 18 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/avr/pr112830.c

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 4fc58a0bda9..05c50b782d7 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -1353,7 +1353,8 @@ get_memory_rtx (tree exp, tree len)
 exp = TREE_OPERAND (exp, 0);
 
   addr = expand_expr (orig_exp, NULL_RTX, ptr_mode, EXPAND_NORMAL);
-  mem = gen_rtx_MEM (BLKmode, memory_address (BLKmode, addr));
+  addr_space_t as = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (exp)));
+  mem = gen_rtx_MEM (BLKmode, memory_address_addr_space (BLKmode, addr, as));
 
   /* Get an expression we can use to find the attributes to assign to MEM.
  First remove any nops.  */
@@ -1363,8 +1364,11 @@ get_memory_rtx (tree exp, tree len)
 
   /* Build a MEM_REF representing the whole accessed area as a byte blob,
  (as builtin stringops may alias with anything).  */
+  tree ctype = char_type_node;
+  if (!ADDR_SPACE_GENERIC_P (as))
+ctype = build_qualified_type (ctype, ENCODE_QUAL_ADDR_SPACE (as));
   exp = fold_build2 (MEM_REF,
-build_array_type (char_type_node,
+build_array_type (ctype,
   build_range_type (sizetype,
 size_one_node, len)),
 exp, build_int_cst (ptr_type_node, 0));
@@ -1381,7 +1385,7 @@ get_memory_rtx (tree exp, tree len)
   unsigned int align = get_pointer_alignment (TREE_OPERAND (exp, 0));
   exp = build_fold_addr_expr (base);
   exp = fold_build2 (MEM_REF,
-build_array_type (char_type_node,
+build_array_type (ctype,
   build_range_type (sizetype,
 size_zero_node,
 NULL)),
diff --git a/gcc/testsuite/gcc.target/avr/pr112830.c 
b/gcc/testsuite/gcc.target/avr/pr112830.c
new file mode 100644
index 000..576107b9909
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/pr112830.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+
+typedef __SIZE_TYPE__ size_t;
+
+void copy_n (void *vdst, const __memx void *vsrc, size_t n)
+{
+typedef struct { char a[n]; } T;
+T *dst = (T*) vdst;
+const __memx T *src = (const __memx T*) vsrc;
+*dst = *src;
+}
-- 
2.35.3



[PATCH 2/2] middle-end/112830 - avoid gimplifying non-default addr-space assign to memcpy

2023-12-04 Thread Richard Biener
The following avoids turning aggregate copy or initialization involving
non-default address-spaces to memcpy or memset since they are not
prepared for that.

GIMPLE verification no longer(?) accepts WITH_SIZE_EXPR in aggregate
copies, the following re-allows that.

So far untested, will test on x86_64-unknown-linux-gnu.  This is
the variant I prefer.

Richard.

PR middle-end/112830
* gimplify.cc (gimplify_modify_expr): Avoid  turning aggregate
copy or initialization non-default address-spaces to memcpy or
memset.
* tree-cfg.cc (verify_gimple_assign_single): Allow
WITH_SIZE_EXPR as part of the RHS of an assignment.
---
 gcc/gimplify.cc | 11 +++
 gcc/tree-cfg.cc | 16 ++--
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 02f85e7109b..a1d5ee28cbe 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -6331,7 +6331,8 @@ gimplify_modify_expr (tree *expr_p, gimple_seq *pre_p, 
gimple_seq *post_p,
   && TYPE_SIZE_UNIT (TREE_TYPE (*from_p))
   && !poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (*from_p)))
   && TREE_CODE (*from_p) == CONSTRUCTOR
-  && CONSTRUCTOR_NELTS (*from_p) == 0)
+  && CONSTRUCTOR_NELTS (*from_p) == 0
+  && ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (TREE_TYPE (*to_p))))
 {
   maybe_with_size_expr (from_p);
   gcc_assert (TREE_CODE (*from_p) == WITH_SIZE_EXPR);
@@ -6464,10 +6465,12 @@ gimplify_modify_expr (tree *expr_p, gimple_seq *pre_p, 
gimple_seq *post_p,
   tree from = TREE_OPERAND (*from_p, 0);
   tree size = TREE_OPERAND (*from_p, 1);
 
-  if (TREE_CODE (from) == CONSTRUCTOR)
+  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (TREE_TYPE (*to_p)))
+ || !ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (TREE_TYPE (from))))
+   ;
+  else if (TREE_CODE (from) == CONSTRUCTOR)
return gimplify_modify_expr_to_memset (expr_p, size, want_value, pre_p);
-
-  if (is_gimple_addressable (from))
+  else if (is_gimple_addressable (from))
{
  *from_p = from;
  return gimplify_modify_expr_to_memcpy (expr_p, size, want_value,
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index a30a2de33a1..3917bee5a92 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -4673,6 +4673,16 @@ verify_gimple_assign_single (gassign *stmt)
   error ("%qs in gimple IL", code_name);
   return true;
 
+case WITH_SIZE_EXPR:
+  if (!is_gimple_val (TREE_OPERAND (rhs1, 1)))
+   {
+ error ("invalid WITH_SIZE_EXPR size argument in load");
+ debug_generic_stmt (lhs);
+ debug_generic_stmt (rhs1);
+ return true;
+   }
+  rhs1 = TREE_OPERAND (rhs1, 0);
+  /* Fallthru.  */
 case COMPONENT_REF:
 case BIT_FIELD_REF:
 case ARRAY_REF:
@@ -4810,12 +4820,6 @@ verify_gimple_assign_single (gassign *stmt)
}
   return res;
 
-case WITH_SIZE_EXPR:
-  error ("%qs RHS in assignment statement",
-get_tree_code_name (rhs_code));
-  debug_generic_expr (rhs1);
-  return true;
-
 case OBJ_TYPE_REF:
   /* FIXME.  */
   return res;
-- 
2.35.3


[PATCH] range: Workaround different type precision issue between _Float128 and long double [PR112788]

2023-12-04 Thread Kewen.Lin
Hi,

As PR112788 shows, on rs6000 with -mabi=ieeelongdouble, type _Float128
has a different type precision (128) from that of type long double
(127), but they actually have the same underlying mode, so they have
the same precision, as the mode indicates the same real type format
ieee_quad_format.

It's not sensible to have two such types which have the same mode but
different type precisions; a fix attempt was posted at [1].
As discussed there, there are some historical reasons and
practical issues.  Considering we passed stage 1 and it also affected
the build as reported, this patch is trying to temporarily workaround
it.  I thought to introduce a hookpod but that seems a bit overkill,
assuming scalar float type with the same mode should have the same
precision looks sensible.

Bootstrapped and regtested on powerpc64-linux-gnu P7/P8/P9 and
powerpc64le-linux-gnu P9/P10.

Is it ok for trunk?

[1] 
https://inbox.sourceware.org/gcc-patches/718677e7-614d-7977-312d-05a75e1fd...@linux.ibm.com/

BR,
Kewen

PR tree-optimization/112788

gcc/ChangeLog:

* value-range.h (range_compatible_p): Workaround same type mode but
different type precision issue for rs6000 scalar float types
_Float128 and long double.
---
 gcc/value-range.h | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/gcc/value-range.h b/gcc/value-range.h
index 33f204a7171..d0a84754a10 100644
--- a/gcc/value-range.h
+++ b/gcc/value-range.h
@@ -1558,7 +1558,13 @@ range_compatible_p (tree type1, tree type2)
   // types_compatible_p requires conversion in both directions to be useless.
   // GIMPLE only requires a cast one way in order to be compatible.
   // Ranges really only need the sign and precision to be the same.
-  return (TYPE_PRECISION (type1) == TYPE_PRECISION (type2)
- && TYPE_SIGN (type1) == TYPE_SIGN (type2));
+  return TYPE_SIGN (type1) == TYPE_SIGN (type2)
+&& (TYPE_PRECISION (type1) == TYPE_PRECISION (type2)
+// FIXME: As PR112788 shows, for now on rs6000 _Float128 has
+// type precision 128 while long double has type precision 127
+// but both have the same mode so their precision is actually
+// the same, workaround it temporarily.
+|| (SCALAR_FLOAT_TYPE_P (type1)
+&& TYPE_MODE (type1) == TYPE_MODE (type2)));
 }
 #endif // GCC_VALUE_RANGE_H
--
2.42.0



PING^7 [PATCH 0/9] rs6000: Rework rs6000_emit_vector_compare

2023-12-04 Thread Kewen.Lin
Hi,

Gentle ping this series:

https://gcc.gnu.org/pipermail/gcc-patches/2022-November/607146.html

BR,
Kewen

> 
>> on 2022/11/24 17:15, Kewen Lin wrote:
>>> Hi,
>>>
>>> Following Segher's suggestion, this patch series is to rework
>>> function rs6000_emit_vector_compare for vector float and int
>>> in multiple steps, it's based on the previous attempts [1][2].
>>> As mentioned in [1], the need to rework this for float is to
>>> make a centralized place for vector float comparison handlings
>>> instead of supporting with swapping ops and reversing code etc.
>>> dispersedly.  It's also for a subsequent patch to handle
>>> comparison operators with or without trapping math (PR105480).
>>> With the handling on vector float reworked, we can further make
>>> the handling on vector int simplified as shown.
>>>
>>> For Segher's concern about whether this rework causes any
>>> assembly change, I constructed two testcases for vector float[3]
>>> and int[4] respectively before, it showed the most are fine
>>> excepting for the difference on LE and UNGT, it's demonstrated
>>> as improvement since it uses GE instead of GT ior EQ.  The
>>> associated test case in patch 3/9 is a good example.
>>>
>>> Besides, w/ and w/o the whole patch series, I built the whole
>>> SPEC2017 at options -O3 and -Ofast separately, checked the
>>> differences on object assembly.  The result showed that the
>>> most are unchanged, except for:
>>>
>>>   * at -O3, 521.wrf_r has 9 object files and 526.blender_r has
>>> 9 object files with differences.
>>>
>>>   * at -Ofast, 521.wrf_r has 12 object files, 526.blender_r has
>>> one and 527.cam4_r has 4 object files with differences.
>>>
>>> By looking into these differences, all significant differences
>>> are caused by the known improvement mentioned above transforming
>>> GT ior EQ to GE, which can also affect unrolling decision due
>>> to insn count.  Some other trivial differences are branch
>>> target offset difference, nop difference for alignment, vsx
>>> register number differences etc.
>>>
>>> I also evaluated the runtime performance for these changed
>>> benchmarks, the result is neutral.
>>>
>>> These patches are bootstrapped and regress-tested
>>> incrementally on powerpc64-linux-gnu P7 & P8, and
>>> powerpc64le-linux-gnu P9 & P10.
>>>
>>> Is it ok for trunk?
>>>
>>> BR,
>>> Kewen
>>> -
>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606375.html
>>> [2] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606376.html
>>> [3] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606504.html
>>> [4] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606506.html
>>>
>>> Kewen Lin (9):
>>>   rs6000: Rework vector float comparison in rs6000_emit_vector_compare 
>>> - p1
>>>   rs6000: Rework vector float comparison in rs6000_emit_vector_compare 
>>> - p2
>>>   rs6000: Rework vector float comparison in rs6000_emit_vector_compare 
>>> - p3
>>>   rs6000: Rework vector float comparison in rs6000_emit_vector_compare 
>>> - p4
>>>   rs6000: Rework vector integer comparison in 
>>> rs6000_emit_vector_compare - p1
>>>   rs6000: Rework vector integer comparison in 
>>> rs6000_emit_vector_compare - p2
>>>   rs6000: Rework vector integer comparison in 
>>> rs6000_emit_vector_compare - p3
>>>   rs6000: Rework vector integer comparison in 
>>> rs6000_emit_vector_compare - p4
>>>   rs6000: Rework vector integer comparison in 
>>> rs6000_emit_vector_compare - p5
>>>
>>>  gcc/config/rs6000/rs6000.cc | 180 ++--
>>>  gcc/testsuite/gcc.target/powerpc/vcond-fp.c |  25 +++
>>>  2 files changed, 74 insertions(+), 131 deletions(-)
>>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/vcond-fp.c
>>>
>>


[PATCH v2] RISC-V: Document optimization parameter riscv-strcmp-inline-limit

2023-12-04 Thread Christoph Müllner
This patch documents the optimization parameter
riscv-strcmp-inline-limit, which can be used to tweak the behaviour
of -minline-strcmp and -minline-strncmp.

gcc/ChangeLog:

PR target/112650
* doc/invoke.texi: Document riscv-strcmp-inline-limit.

Signed-off-by: Christoph Müllner 
---
 gcc/doc/invoke.texi | 8 
 1 file changed, 8 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 6fe63b5f999..2b51ff304f6 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -29846,6 +29846,10 @@ Inlining will only be done if the strings are properly 
aligned
 and instructions for accelerated processing are available.
 The default is to not inline strcmp calls.
 
+The @option{--param riscv-strcmp-inline-limit=@var{n}} parameter controls
+the maximum number of bytes compared by the inlined code.
+The default value is 64.
+
 @opindex minline-strncmp
 @item -minline-strncmp
 @itemx -mno-inline-strncmp
@@ -29854,6 +29858,10 @@ Inlining will only be done if the strings are properly 
aligned
 and instructions for accelerated processing are available.
 The default is to not inline strncmp calls.
 
+The @option{--param riscv-strcmp-inline-limit=@var{n}} parameter controls
+the maximum number of bytes compared by the inlined code.
+The default value is 64.
+
 @opindex mshorten-memrefs
 @item -mshorten-memrefs
 @itemx -mno-shorten-memrefs
-- 
2.43.0



Re: [PATCH] RISC-V: Document optimization parameter riscv-strcmp-inline-limit

2023-12-04 Thread Christoph Müllner
On Mon, Dec 4, 2023 at 4:46 AM Kito Cheng  wrote:
>
> Wait, I got this on my machine?
>
> ../../../../riscv-gnu-toolchain-trunk/gcc/gcc/doc/invoke.texi:29774: 
> misplaced }
> ../../../../riscv-gnu-toolchain-trunk/gcc/gcc/doc/invoke.texi:29786: 
> misplaced }

@{n} should be @var{n}.
I was too optimistic and sent the patch before the build finished (or
in this case failed).
Sorry for that.

I have sent a v2 that builds fine:
  https://gcc.gnu.org/pipermail/gcc-patches/2023-December/639142.html

>
>
> On Mon, Dec 4, 2023 at 10:43 AM Kito Cheng  wrote:
> >
> > LGTM
> >
> > On Sun, Dec 3, 2023 at 5:16 AM Christoph Müllner 
> >  wrote:
> >>
> >> This patch documents the optimization parameter
> >> riscv-strcmp-inline-limit, which can be used to tweak the behaviour
> >> of -minline-strcmp and -minline-strncmp.
> >>
> >> gcc/ChangeLog:
> >>
> >> PR target/112650
> >> * doc/invoke.texi: Document riscv-strcmp-inline-limit.
> >>
> >> Signed-off-by: Christoph Müllner 
> >> ---
> >>  gcc/doc/invoke.texi | 8 
> >>  1 file changed, 8 insertions(+)
> >>
> >> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> >> index 2fab4c5d71f..ba2d843b484 100644
> >> --- a/gcc/doc/invoke.texi
> >> +++ b/gcc/doc/invoke.texi
> >> @@ -29846,6 +29846,10 @@ Inlining will only be done if the strings are 
> >> properly aligned
> >>  and instructions for accelerated processing are available.
> >>  The default is to not inline strcmp calls.
> >>
> >> +The @option{--param riscv-strcmp-inline-limit=@{n}} parameter controls
> >> +the maximum number of bytes compared by the inlined code.
> >> +The default value is 64.
> >> +
> >>  @opindex minline-strncmp
> >>  @item -minline-strncmp
> >>  @itemx -mno-inline-strncmp
> >> @@ -29854,6 +29858,10 @@ Inlining will only be done if the strings are 
> >> properly aligned
> >>  and instructions for accelerated processing are available.
> >>  The default is to not inline strncmp calls.
> >>
> >> +The @option{--param riscv-strcmp-inline-limit=@{n}} parameter controls
> >> +the maximum number of bytes compared by the inlined code.
> >> +The default value is 64.
> >> +
> >>  @opindex mshorten-memrefs
> >>  @item -mshorten-memrefs
> >>  @itemx -mno-shorten-memrefs
> >> --
> >> 2.41.0
> >>


PING^5 [PATCH v2] rs6000: Don't use optimize_function_for_speed_p too early [PR108184]

2023-12-04 Thread Kewen.Lin
Hi,

Gentle ping this:

https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609993.html

BR,
Kewen

> 
 on 2023/1/16 17:08, Kewen.Lin via Gcc-patches wrote:
> Hi,
>
> As Honza pointed out in [1], the current uses of function
> optimize_function_for_speed_p in rs6000_option_override_internal
> are too early, since the query results from the functions
> optimize_function_for_{speed,size}_p could be changed later due
> to profile feedback and some function attributes handlings etc.
>
> This patch is to move optimize_function_for_speed_p to all the
> use places of the corresponding flags, which follows the existing
> practices.  Maybe we can cache it somewhere at an appropriate
> timing, but that's another thing.
>
> Comparing with v1[2], this version added one test case for
> SAVE_TOC_INDIRECT as Segher questioned and suggested, and it
> also considered the possibility of explicit option (see test
> cases pr108184-2.c and pr108184-4.c).  I believe that excepting
> for the intentional change on optimize_function_for_{speed,
> size}_p, there is no other function change.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/607527.html
> [2] https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609379.html
>
> Bootstrapped and regtested on powerpc64-linux-gnu P8,
> powerpc64le-linux-gnu P{9,10} and powerpc-ibm-aix.
>
> Is it ok for trunk?
>
> BR,
> Kewen
> -
> gcc/ChangeLog:
>
>   * config/rs6000/rs6000.cc (rs6000_option_override_internal): Remove
>   all optimize_function_for_speed_p uses.
>   (fusion_gpr_load_p): Call optimize_function_for_speed_p along
>   with TARGET_P8_FUSION_SIGN.
>   (expand_fusion_gpr_load): Likewise.
>   (rs6000_call_aix): Call optimize_function_for_speed_p along with
>   TARGET_SAVE_TOC_INDIRECT.
>   * config/rs6000/predicates.md (fusion_gpr_mem_load): Call
>   optimize_function_for_speed_p along with TARGET_P8_FUSION_SIGN.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/powerpc/pr108184-1.c: New test.
>   * gcc.target/powerpc/pr108184-2.c: New test.
>   * gcc.target/powerpc/pr108184-3.c: New test.
>   * gcc.target/powerpc/pr108184-4.c: New test.
> ---
>  gcc/config/rs6000/predicates.md   |  5 +++-
>  gcc/config/rs6000/rs6000.cc   | 19 +-
>  gcc/testsuite/gcc.target/powerpc/pr108184-1.c | 16 
>  gcc/testsuite/gcc.target/powerpc/pr108184-2.c | 15 +++
>  gcc/testsuite/gcc.target/powerpc/pr108184-3.c | 25 +++
>  gcc/testsuite/gcc.target/powerpc/pr108184-4.c | 24 ++
>  6 files changed, 97 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-1.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-2.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-3.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108184-4.c
>
> diff --git a/gcc/config/rs6000/predicates.md 
> b/gcc/config/rs6000/predicates.md
> index a1764018545..9f84468db84 100644
> --- a/gcc/config/rs6000/predicates.md
> +++ b/gcc/config/rs6000/predicates.md
> @@ -1878,7 +1878,10 @@ (define_predicate "fusion_gpr_mem_load"
>
>/* Handle sign/zero extend.  */
>if (GET_CODE (op) == ZERO_EXTEND
> -  || (TARGET_P8_FUSION_SIGN && GET_CODE (op) == SIGN_EXTEND))
> +  || (TARGET_P8_FUSION_SIGN
> +   && GET_CODE (op) == SIGN_EXTEND
> +   && (rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN
> +   || optimize_function_for_speed_p (cfun
>  {
>op = XEXP (op, 0);
>mode = GET_MODE (op);
> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
> index 6ac3adcec6b..f47d21980a9 100644
> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -3997,8 +3997,7 @@ rs6000_option_override_internal (bool global_init_p)
>/* If we can shrink-wrap the TOC register save separately, then use
>   -msave-toc-indirect unless explicitly disabled.  */
>if ((rs6000_isa_flags_explicit & OPTION_MASK_SAVE_TOC_INDIRECT) == 0
> -  && flag_shrink_wrap_separate
> -  && optimize_function_for_speed_p (cfun))
> +  && flag_shrink_wrap_separate)
>  rs6000_isa_flags |= OPTION_MASK_SAVE_TOC_INDIRECT;
>
>/* Enable power8 fusion if we are tuning for power8, even if we aren't
> @@ -4032,7 +4031,6 @@ rs6000_option_override_internal (bool global_init_p)
>   zero extending load, and an explicit sign extension.  */
>if (TARGET_P8_FUSION
>&& !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN)
> -  && optimize_function_for_speed_p (cfun)
>&& optimize >= 3)
>  rs600

[PATCH] tree-optimization/PR112774 - SCEV: extend the chrec tree with a nonwrapping flag

2023-12-04 Thread Hao Liu OS
Loop vectorization cannot optimize the following case because SCEV analysis
fails with "not affine" (i + offset may overflow):

int A[1024 * 2];

int foo (unsigned offset, unsigned N) 
{
  int sum = 0;
  for (unsigned i = 0; i < N; i++)
sum += A[i + offset];
  return sum;
}

Actually, niter pass can find nonwrapping induction variables (i.e., i + offset
can not overflow) by inferring from undefined behaviors like array access (see
record_nonwrapping_iv). But this information is not shared to SCEV yet. This
patch adds a nonwrapping flag to the chrec tree, which allows SCEV to re-use it 
to pass the nonwrapping checks like scev_probably_wraps_p, so that loop
vectorization can finally succeed.

The new flag is defined as CHREC_NOWRAP(tree), and the dump info is changed from
"{offset, +, 1}_1" -> "{offset, +, 1}_1<nw>" (nw is short for nonwrapping). Two
SCEV interfaces record_nonwrapping_chrec and nonwrapping_chrec_p are added to 
set and check the flag respectively.

However, an extra problem is caused by resetting SCEV cache (i.e., scev_reset or
reset_scev_htab), which may not be synchronized with the calling of
free_numbers_of_iterations_estimates, which set the loop->estimate_state to
EST_NOT_COMPUTED and make sure the above inferring from array access is called.
In other words, the nonwrapping flag could be cleared and lost by resetting SCEV
cache if the loop->estimate_state is not reset.
E.g., gimple_duplicate_loop_body_to_header_edge/flush_ssaname_freelist,
which calls scev_reset/scev_reset_htab to clear the SCEV cache, but the 
estimate_state is still kept as EST_AVAILABLE and the flag will not be set in
loop vectorization.

This patch uses a simple fix by calling free_numbers_of_iterations_estimates in
vect_analyze_loop, which will make sure the flag is always set appropriately in
loop vectorization. This fix is a bit ad-hoc (works for loop vectorization
only), if there is more reasonable method, I will revert the simple fix and try
that.

This patch is bootstrapped and tested on aarch64-linux-gnu with no regressions. 
OK for trunk?

---
The patch is as following:

[PATCH] SCEV: extend the chrec tree with a nonwrapping flag
 [PR112774]

The flag is defined as CHREC_NOWRAP(tree), and will be dumped from
"{offset, +, 1}_1" to "{offset, +, 1}_1<nw>" (nw is short for nonwrapping).
Two SCEV interfaces record_nonwrapping_chrec and nonwrapping_chrec_p are
added to set and check the flag respectively.

As resetting the SCEV cache (i.e., the chrec trees) may not reset the
loop->estimate_state, free_numbers_of_iterations_estimates is called
explicitly in loop vectorization to make sure the flag can be
calculated appropriately by niter.

gcc/ChangeLog:

PR tree-optimization/112774
* tree-pretty-print.cc: if nonwrapping flag is set, chrec will be
printed with additional <nw> info.
* tree-scalar-evolution.cc: add record_nonwrapping_chrec and
nonwrapping_chrec_p to set and check the new flag respectively.
* tree-scalar-evolution.h: Likewise.
* tree-ssa-loop-niter.cc (idx_infer_loop_bounds,
infer_loop_bounds_from_pointer_arith, infer_loop_bounds_from_signedness,
scev_probably_wraps_p): call record_nonwrapping_chrec before
record_nonwrapping_iv, call nonwrapping_chrec_p to check the flag is 
set and
return false from scev_probably_wraps_p.
* tree-vect-loop.cc (vect_analyze_loop): call
free_numbers_of_iterations_estimates explicitly.
* gcc/tree.h: add CHREC_NOWRAP(NODE), base.nothrow_flag is used
to represent the nonwrapping info.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/scev-16.c: New test.
---
 gcc/testsuite/gcc.dg/tree-ssa/scev-16.c | 17 +
 gcc/tree-pretty-print.cc|  2 +-
 gcc/tree-scalar-evolution.cc| 24 
 gcc/tree-scalar-evolution.h |  2 ++
 gcc/tree-ssa-loop-niter.cc  | 21 -
 gcc/tree-vect-loop.cc   |  4 
 gcc/tree.h  |  8 +---
 7 files changed, 69 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/scev-16.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/scev-16.c 
b/gcc/testsuite/gcc.dg/tree-ssa/scev-16.c
new file mode 100644
index 000..96ea36e4c65
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/scev-16.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-vect-scev" } */
+
+int A[1024 * 2];
+
+int foo (unsigned offset, unsigned N)
+{
+  int sum = 0;
+
+  for (unsigned i = 0; i < N; i++)
+sum += A[i + offset];
+
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "vec_transform_loop" "vect" } } */
+/* { dg-final { scan-tree-dump-not "missed:  failed: evolution of offset is 
not affine" "vect" } } */
diff --git a/gcc/tree-pretty-print.cc b/gcc/tree-pretty-print.cc
index 1fadd752d05..0dabb6d1580 100644
--- a/gcc/tree-pretty-print.cc
+++ b/gcc/tree-pretty

[PATCH] RISC-V: Support highest-number regno overlap for widen ternary vx instructions

2023-12-04 Thread Juzhe-Zhong
Consider this example:

#include "riscv_vector.h"
void
foo6 (void *in, void *out)
{
  vfloat64m8_t accum = __riscv_vle64_v_f64m8 (in, 4);
  vfloat64m4_t high_eew64 = __riscv_vget_v_f64m8_f64m4 (accum, 1);
  vint64m4_t high_eew64_i = __riscv_vreinterpret_v_f64m4_i64m4 (high_eew64);
  vint32m4_t high_eew32_i = __riscv_vreinterpret_v_i64m4_i32m4 (high_eew64_i);
  vfloat32m4_t high_eew32 = __riscv_vreinterpret_v_i32m4_f32m4 (high_eew32_i);
  vfloat64m8_t result = __riscv_vfwnmsac_vf_f64m8 (accum, 64, high_eew32, 4);
  __riscv_vse64_v_f64m8 (out, result, 4);
}

Before this patch:

foo6:   # @foo6
vsetivli zero, 4, e32, m4, ta, ma
vle64.v v8, (a0)
lui a0, 272384
fmv.w.x fa5, a0
vmv8r.v v16, v8
vfwnmsac.vf v16, fa5, v12
vse64.v v16, (a1)
ret

After this patch:

foo6:
.LFB5:
.cfi_startproc
lui a5,%hi(.LC0)
flw fa5,%lo(.LC0)(a5)
vsetivli zero,4,e32,m4,ta,ma
vle64.v v8,0(a0)
vfwnmsac.vf v8,fa5,v12
vse64.v v8,0(a1)
ret

PR target/112431

gcc/ChangeLog:

* config/riscv/vector-iterators.md: New attributes.
* config/riscv/vector.md: Support highest-number overlap for widen 
ternary vx.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/pr112431-37.c: New test.
* gcc.target/riscv/rvv/base/pr112431-38.c: New test.

---
 gcc/config/riscv/vector-iterators.md  | 1071 +
 gcc/config/riscv/vector.md|  115 +-
 .../gcc.target/riscv/rvv/base/pr112431-37.c   |  103 ++
 .../gcc.target/riscv/rvv/base/pr112431-38.c   |   82 ++
 4 files changed, 1316 insertions(+), 55 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-37.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-38.c

diff --git a/gcc/config/riscv/vector-iterators.md 
b/gcc/config/riscv/vector-iterators.md
index f97f33f98ee..97a83358c4b 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -4993,3 +4993,1074 @@
   (V256DF "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")
   (V512DF "W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")
 ])
+
+(define_mode_attr widen_ternop_dest_constraint [
+  (RVVM8QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM4QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM2QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM1QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVMF2QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVMF4QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVMF8QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM8HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM4HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM2HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM1HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVMF2HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVMF4HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM8HF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM4HF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM2HF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM1HF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVMF2HF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVMF4HF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM8SI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM4SI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM2SI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM1SI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVMF2SI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM8SF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM4SF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM2SF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM1SF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVMF2SF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM8DI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM4DI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM2DI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM1DI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM8DF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM4DF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM2DF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (RVVM1DF "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V1QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V2QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V4QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V8QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V16QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V32QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V64QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V128QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V256QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V512QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V1024QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V2048QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V4096QI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V1HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V2HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V4HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V8HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V16HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V32HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V64HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V128HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V256HI "=vd, vr, vd, vr, vd, vr, ?&vr")
+  (V512HI "=vd, vr, vd, vr, v

[PATCH] s390: Fix expansion of vec_step

2023-12-04 Thread Stefan Schulze Frielinghaus
Add missing "s390" while expanding vec_step to __builtin_s390_vec_step.

gcc/ChangeLog:

* config/s390/vecintrin.h (vec_step): Expand vec_step to
__builtin_s390_vec_step.
---
 gcc/config/s390/vecintrin.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/s390/vecintrin.h b/gcc/config/s390/vecintrin.h
index 133492c5b19..7cd1db57aec 100644
--- a/gcc/config/s390/vecintrin.h
+++ b/gcc/config/s390/vecintrin.h
@@ -59,8 +59,8 @@ along with GCC; see the file COPYING3.  If not see
| __VEC_CLASS_FP_INFINITY)
 
 /* This also accepts a type for its parameter, so it is not enough
-   to #define vec_step to __builtin_vec_step.  */
-#define vec_step(x) __builtin_vec_step (* (__typeof__ (x) *) 0)
+   to #define vec_step to __builtin_s390_vec_step.  */
+#define vec_step(x) __builtin_s390_vec_step (* (__typeof__ (x) *) 0)
 
 static inline int
 __lcbb(const void *ptr, int bndry)
-- 
2.43.0



[PATCH] libiberty: Fix pex_unix_wait return type

2023-12-04 Thread Rainer Orth
The recent warning patches broke Solaris bootstrap:

/vol/gcc/src/hg/master/local/libiberty/pex-unix.c:326:3: error: initialization 
of 'pid_t (*)(struct pex_obj *, pid_t,  int *, struct pex_time *, int,  const 
char **, int *)' {aka 'long int (*)(struct pex_obj *, long int,  int *, struct 
pex_time *, int,  const char **, int *)'} from incompatible pointer type 'int 
(*)(struct pex_obj *, pid_t,  int *, struct pex_time *, int,  const char **, 
int *)' {aka 'int (*)(struct pex_obj *, long int,  int *, struct pex_time *, 
int,  const char **, int *)'} [-Wincompatible-pointer-types]
  326 |   pex_unix_wait,
  |   ^
/vol/gcc/src/hg/master/local/libiberty/pex-unix.c:326:3: note: (near 
initialization for 'funcs.wait')

While pex_funcs.wait expects a function returning pid_t, pex_unix_wait
currently returns int.  However, on Solaris pid_t is long for 32-bit,
but int for 64-bit.

This patch fixes this by having pex_unix_wait return pid_t as
expected, and like every other variant already does.

Bootstrapped without regressions on i386-pc-solaris2.11,
sparc-sun-solaris2.11, x86_64-pc-linux-gnu, and
x86_64-apple-darwin23.1.0.

Ok for trunk?

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


2023-12-03  Rainer Orth  

libiberty:
* pex-unix.c (pex_unix_wait): Change return type to pid_t.

# HG changeset patch
# Parent  2e3efea7d8370094e0472added4d944ecf1e0270
libiberty: Fix pex_unix_wait signature

diff --git a/libiberty/pex-unix.c b/libiberty/pex-unix.c
--- a/libiberty/pex-unix.c
+++ b/libiberty/pex-unix.c
@@ -308,8 +308,8 @@ static pid_t pex_unix_exec_child (struct
  int, int, int, int,
  const char **, int *);
 static int pex_unix_close (struct pex_obj *, int);
-static int pex_unix_wait (struct pex_obj *, pid_t, int *, struct pex_time *,
-			  int, const char **, int *);
+static pid_t pex_unix_wait (struct pex_obj *, pid_t, int *, struct pex_time *,
+			   int, const char **, int *);
 static int pex_unix_pipe (struct pex_obj *, int *, int);
 static FILE *pex_unix_fdopenr (struct pex_obj *, int, int);
 static FILE *pex_unix_fdopenw (struct pex_obj *, int, int);
@@ -934,7 +934,7 @@ pex_unix_exec_child (struct pex_obj *obj
 
 /* Wait for a child process to complete.  */
 
-static int
+static pid_t
 pex_unix_wait (struct pex_obj *obj, pid_t pid, int *status,
 	   struct pex_time *time, int done, const char **errmsg,
 	   int *err)


[PATCH] gm2: Fix mc/mc.flex compilation on Solaris

2023-12-04 Thread Rainer Orth
The recent warning changes broke gm2 bootstrap on Solaris:

/vol/gcc/src/hg/master/local/gcc/m2/mc/mc.flex: In function 'handleFile':
/vol/gcc/src/hg/master/local/gcc/m2/mc/mc.flex:297:21: error: implicit 
declaration of function 'alloca' [-Wimplicit-function-declaration]
  297 |   char *s = (char *)alloca (strlen (filename) + 2 + 1);
  | ^~

alloca needs <alloca.h> on Solaris, which isn't universally available.
Since mc.flex doesn't include any config header, I chose to switch to
__builtin_alloca instead.

/vol/gcc/src/hg/master/local/gcc/m2/mc/mc.flex:332:19: error: implicit 
declaration of function 'index' [-Wimplicit-function-declaration]
  332 |   char   *p = index(sdate, '\n');
  |   ^

index is declared in <strings.h> on Solaris, again not a standard
header.  I simply switched to using strchr to avoid that issue.

Bootstrapped without regressions on i386-pc-solaris2.11,
sparc-sun-solaris2.11, x86_64-pc-linux-gnu, and
x86_64-apple-darwin23.1.0.

Ok for trunk?

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


2023-12-03  Rainer Orth  

gcc/m2:
* mc/mc.flex [__GNUC__]: Define alloca as __builtin_alloca.
(handleDate): Use strchr instead of index.

# HG changeset patch
# Parent  76246607bf26a4639355410e36af4cbf08c04f99
gm2: Fix mc/mc.flex compilation on Solaris

diff --git a/gcc/m2/mc/mc.flex b/gcc/m2/mc/mc.flex
--- a/gcc/m2/mc/mc.flex
+++ b/gcc/m2/mc/mc.flex
@@ -28,6 +28,10 @@ along with GNU Modula-2; see the file CO
 #include 
 #include 
 
+#ifdef __GNUC__
+#define alloca __builtin_alloca
+#endif
+
 #if !defined(TRUE)
 #  define TRUE (1==1)
 #endif
@@ -329,7 +333,7 @@ handleDate (void)
   time_t  clock = time ((long *)0);
   char   *sdate = ctime (&clock);
   char   *s = (char *)alloca (strlen (sdate)+2+1);
-  char   *p = index(sdate, '\n');
+  char   *p = strchr(sdate, '\n');
 
   if (p != NULL) {
 *p = (char) 0;


Re: [PATCH v2 1/5] aarch64: Add cpu feature detection to libgcc

2023-12-04 Thread Andrew Carlotti
On Mon, Nov 20, 2023 at 03:46:06PM +, Richard Sandiford wrote:
> Andrew Carlotti  writes:
> > This is added to enable function multiversioning, but can also be used
> > directly.  The interface is chosen to match that used in LLVM's
> > compiler-rt, to facilitate cross-compiler compatibility.
> >
> > The content of the patch is derived almost entirely from Pavel's prior
> > contributions to compiler-rt/lib/builtins/cpu_model.c. I have made minor
> > changes to align more closely with GCC coding style, and to exclude any code
> > from other LLVM contributors, and am adding this to GCC with Pavel's 
> > approval.
> >
> > libgcc/ChangeLog:
> >
> > * config/aarch64/t-aarch64: Include cpuinfo.c
> > * config/aarch64/cpuinfo.c: New file
> > (__init_cpu_features_constructor) New.
> > (__init_cpu_features_resolver) New.
> > (__init_cpu_features) New.
> 
> OK on the basis that you mentioned in the covering note: we can deal
> with fixes incrementally.  One question though...
> >
> > Co-authored-by: Pavel Iliin 
> >
> >
> > diff --git a/libgcc/config/aarch64/cpuinfo.c 
> > b/libgcc/config/aarch64/cpuinfo.c
> > new file mode 100644
> > index 
> > ..0888ca4ed058430f524b99cb0e204bd996fa0e55
> > --- /dev/null
> > +++ b/libgcc/config/aarch64/cpuinfo.c
> > @@ -0,0 +1,502 @@
> > +/* CPU feature detection for AArch64 architecture.
> > +   Copyright (C) 2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of GCC.
> > +
> > +   This file is free software; you can redistribute it and/or modify it
> > +   under the terms of the GNU General Public License as published by the
> > +   Free Software Foundation; either version 3, or (at your option) any
> > +   later version.
> > +
> > +   This file is distributed in the hope that it will be useful, but
> > +   WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   General Public License for more details.
> > +
> > +   Under Section 7 of GPL version 3, you are granted additional
> > +   permissions described in the GCC Runtime Library Exception, version
> > +   3.1, as published by the Free Software Foundation.
> > +  
> > +   You should have received a copy of the GNU General Public License and
> > +   a copy of the GCC Runtime Library Exception along with this program;
> > +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +#if defined(__has_include)
> 
> Is this protecting against a known condition?  libgcc has to be built
> with the associated version of GCC, so it might be better to drop the
> #if and get a noisy failure if something unexpected happens.  That can
> be part of 5/5 though.
> 
> Thanks,
> Richard

I don't know that this is required, so I'll assume it isn't.  I'll drop it in
the next version of this patch.

> > +#if __has_include(<sys/auxv.h>)
> > +#include <sys/auxv.h>
> > +
> > +#if __has_include(<sys/ifunc.h>)
> > +#include <sys/ifunc.h>
> > +#else
> > +typedef struct __ifunc_arg_t {
> > +  unsigned long _size;
> > +  unsigned long _hwcap;
> > +  unsigned long _hwcap2;
> > +} __ifunc_arg_t;
> > +#endif
> > +
> > +#if __has_include(<asm/hwcap.h>)
> > +#include <asm/hwcap.h>
> > +
> > +/* CPUFeatures must correspond to the same AArch64 features in aarch64.cc  
> > */
> > +enum CPUFeatures {
> > +  FEAT_RNG,
> > +  FEAT_FLAGM,
> > +  FEAT_FLAGM2,
> > +  FEAT_FP16FML,
> > +  FEAT_DOTPROD,
> > +  FEAT_SM4,
> > +  FEAT_RDM,
> > +  FEAT_LSE,
> > +  FEAT_FP,
> > +  FEAT_SIMD,
> > +  FEAT_CRC,
> > +  FEAT_SHA1,
> > +  FEAT_SHA2,
> > +  FEAT_SHA3,
> > +  FEAT_AES,
> > +  FEAT_PMULL,
> > +  FEAT_FP16,
> > +  FEAT_DIT,
> > +  FEAT_DPB,
> > +  FEAT_DPB2,
> > +  FEAT_JSCVT,
> > +  FEAT_FCMA,
> > +  FEAT_RCPC,
> > +  FEAT_RCPC2,
> > +  FEAT_FRINTTS,
> > +  FEAT_DGH,
> > +  FEAT_I8MM,
> > +  FEAT_BF16,
> > +  FEAT_EBF16,
> > +  FEAT_RPRES,
> > +  FEAT_SVE,
> > +  FEAT_SVE_BF16,
> > +  FEAT_SVE_EBF16,
> > +  FEAT_SVE_I8MM,
> > +  FEAT_SVE_F32MM,
> > +  FEAT_SVE_F64MM,
> > +  FEAT_SVE2,
> > +  FEAT_SVE_AES,
> > +  FEAT_SVE_PMULL128,
> > +  FEAT_SVE_BITPERM,
> > +  FEAT_SVE_SHA3,
> > +  FEAT_SVE_SM4,
> > +  FEAT_SME,
> > +  FEAT_MEMTAG,
> > +  FEAT_MEMTAG2,
> > +  FEAT_MEMTAG3,
> > +  FEAT_SB,
> > +  FEAT_PREDRES,
> > +  FEAT_SSBS,
> > +  FEAT_SSBS2,
> > +  FEAT_BTI,
> > +  FEAT_LS64,
> > +  FEAT_LS64_V,
> > +  FEAT_LS64_ACCDATA,
> > +  FEAT_WFXT,
> > +  FEAT_SME_F64,
> > +  FEAT_SME_I64,
> > +  FEAT_SME2,
> > +  FEAT_RCPC3,
> > +  FEAT_MAX,
> > +  FEAT_EXT = 62, /* Reserved to indicate presence of additional features 
> > field
> > +   in __aarch64_cpu_features.  */
> > +  FEAT_INIT  /* Used as flag of features initialization completion.  */
> > +};
> > +
> > +/* Architecture features used in Function Multi Versioning.  */
> > +struct {
> > +  unsigned long long features;
> > +  /* As features grows new fields could be added.  */
> > +} __aarch64_cpu_features __attribute__((visibility("hidden"), nocommo

[PATCH] ada: Fix Ada bootstrap on Solaris

2023-12-04 Thread Rainer Orth
The recent warning patches broke Ada bootstrap on Solaris:

adaint.c: In function '__gnat_kill':
adaint.c:3597:3: error: implicit declaration of function 'kill' 
[-Wimplicit-function-declaration]
 3597 |   kill (pid, sig);
  |   ^~~~

expect.c: In function '__gnat_expect_poll':
expect.c:409:5: error: implicit declaration of function 'memset' 
[-Wimplicit-function-declaration]
  409 | FD_ZERO (&rset);
  | ^~~
expect.c:55:1: note: include '<string.h>' or provide a declaration of 'memset'
   54 | #include 
  +++ |+#include <string.h>
   55 | #endif

I'm now including the necessary headers: <signal.h> for kill and
<string.h> for memset.  However, I found the GNAT C sources to be an
incredible and hard to untangle maze: instead of using autoconf, they
are generously sprinkled with deeply nested platform macros, so I may
well have included the headers in an inappropriate place.

Bootstrapped without regressions on i386-pc-solaris2.11,
sparc-sun-solaris2.11, x86_64-pc-linux-gnu, and
x86_64-apple-darwin23.1.0.

Ok for trunk?

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


2023-12-03  Rainer Orth  

gcc/ada:
* adaint.c: Include <signal.h>.
* expect.c: Include <string.h>.

# HG changeset patch
# Parent  5cf4f08be31b5169b377de570ac0ab0f159161af
ada: Fix Ada bootstrap on Solaris

diff --git a/gcc/ada/adaint.c b/gcc/ada/adaint.c
--- a/gcc/ada/adaint.c
+++ b/gcc/ada/adaint.c
@@ -243,6 +243,7 @@ UINT __gnat_current_ccs_encoding;
 #define DIR_SEPARATOR '\\'
 
 #else
+#include <signal.h>
 #include 
 #endif
 
diff --git a/gcc/ada/expect.c b/gcc/ada/expect.c
--- a/gcc/ada/expect.c
+++ b/gcc/ada/expect.c
@@ -41,6 +41,7 @@
 
 #include "adaint.h"
 #include 
+#include <string.h>
 
 #if defined (__vxworks) && defined (__RTP__)
 # include 


Re: [PATCH] RISC-V: Remove earlyclobber from widen reduction

2023-12-04 Thread Robin Dapp
LGTM.

Regards
 Robin



Re: [PATCH] ada: Fix Ada bootstrap on Solaris

2023-12-04 Thread Arnaud Charlet
> The recent warning patches broke Ada bootstrap on Solaris:
> 
> adaint.c: In function '__gnat_kill':
> adaint.c:3597:3: error: implicit declaration of function 'kill' 
> [-Wimplicit-function-declaration]
>  3597 |   kill (pid, sig);
>   |   ^~~~
> 
> expect.c: In function '__gnat_expect_poll':
> expect.c:409:5: error: implicit declaration of function 'memset' 
> [-Wimplicit-function-declaration]
>   409 | FD_ZERO (&rset);
>   | ^~~
> expect.c:55:1: note: include '<string.h>' or provide a declaration of 'memset'
>54 | #include 
>   +++ |+#include <string.h>
>55 | #endif
> 
> I'm now including the necessary headers: <signal.h> for kill and
> <string.h> for memset.  However, I found the GNAT C sources to be an
> incredible and hard to untangle maze: instead of using autoconf, they
> are generously sprinkled with deeply nested platform macros, so I may
> well have included the headers in an inappropriate place.
> 
> Bootstrapped without regressions on i386-pc-solaris2.11,
> sparc-sun-solaris2.11, x86_64-pc-linux-gnu, and
> x86_64-apple-darwin23.1.0.
> 
> Ok for trunk?

OK, thanks.


Re: [PATCH] Workaround array_slice constructor portability issues (with older g++).

2023-12-04 Thread Richard Sandiford
"Roger Sayle"  writes:
> The recent change to represent language and target attribute tables using
> vec.h's array_slice template class triggers an issue/bug in older g++
> compilers, specifically the g++ 4.8.5 system compiler of older RedHat
> distributions.  This exhibits as the following compilation errors during
> bootstrap:
>
> ../../gcc/gcc/c/c-lang.cc:55:2661: error: could not convert '(const
> scoped_attribute_specs* const*)(& c_objc_attribute_table)' from 'const
> scoped_attribute_specs* const*' to 'array_slice<const scoped_attribute_specs* const>'
>  struct lang_hooks lang_hooks = LANG_HOOKS_INITIALIZER;
>
> ../../gcc/gcc/c/c-decl.cc:4657:1: error: could not convert '(const
> attribute_spec*)(& std_attributes)' from 'const attribute_spec*' to
> 'array_slice<const attribute_spec>'
>
> Here the issue is with constructors of the form:
>
> static const int table[] = { 1, 2, 3 };
> array_slice<int> t = table;

It's array_slice<const int> rather than array_slice<int>.  The above
would be invalid even with functioning compilers.

> Perhaps there's a fix possible in vec.h (an additional constructor?), but
> the patch below fixes this issue by using one of array_slice's constructors
> (that takes a size) explicitly, rather than rely on template resolution.
> In the example above this looks like:
>
> array_slice<int> t (table, 3);
>
> or equivalently
>
> array_slice<int> t = array_slice<int>(table, 3);
>
> or equivalently
>
> array_slice<int> t = array_slice<int>(table, ARRAY_SIZE (table));

Taking c-decl.cc as an arbitrary example, it seems to be enough to change:

const scoped_attribute_specs std_attribute_table =
{
  nullptr, std_attributes
};

to:

const scoped_attribute_specs std_attribute_table =
{
  nullptr, { std_attributes }
};

which seems less ugly than the explicit constructors.

But if we're going to do this, we should do it across the board,
not just for x86.

I think it's getting a bit ridiculous though.  Let's just accept
that 4.8.5 is not a fully functioning C++11 compiler and move on.
People who are still using that as their host compiler will need
to upgrade soon anyway, so we're just putting off the inevitable.
It's unlikely that these workarounds that we keep adding will ever
fully be removed.

Thanks,
Richard

> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap,
> where these changes allow the bootstrap to complete.  Ok for mainline?
> This fix might not be ideal, but it both draws attention to the problem
> and restores bootstrap whilst better approaches are investigated.  For
> example, an ARRAY_SLICE(table) macro might be appropriate if there isn't
> an easy/portable template resolution solution.  Thoughts?
>
>
> 2023-12-03  Roger Sayle  
>
> gcc/c-family/ChangeLog
> * c-attribs.cc (c_common_gnu_attribute_table): Use an explicit
> array_slice constructor with an explicit size argument.
> (c_common_format_attribute_table): Likewise.
>
> gcc/c/ChangeLog
> * c-decl.cc (std_attribute_table): Use an explicit
> array_slice constructor with an explicit size argument.
> * c-objc-common.h (LANG_HOOKS_ATTRIBUTE_TABLE): Likewise.
>
> gcc/ChangeLog
> * config/i386/i386-options.cc (ix86_gnu_attribute_table): Use an
> explicit array_slice constructor with an explicit size argument.
> * config/i386/i386.cc (TARGET_ATTRIBUTE_TABLE): Likewise.
>
> gcc/cp/ChangeLog
> * cp-objcp-common.h (LANG_HOOKS_ATTRIBUTE_TABLE): Use an
> explicit array_slice constructor with an explicit size argument.
> * tree.cc (cxx_gnu_attribute_table): Likewise.
> (std_attribute_table): Likewise.
>
> gcc/lto/ChangeLog
> * lto-lang.cc (lto_gnu_attribute_table): Use an explicit
> array_slice constructor with an explicit size argument.
> (lto_format_attribute_table): Likewise.
> (LANG_HOOKS_ATTRIBUTE_TABLE): Likewise.
>
>
> Thanks in advance,
> Roger
> --


[PATCH] libssp: Fix gets-chk.c compilation on Solaris

2023-12-04 Thread Rainer Orth
The recent warning patches broke the libssp build on Solaris:

/vol/gcc/src/hg/master/local/libssp/gets-chk.c: In function '__gets_chk':
/vol/gcc/src/hg/master/local/libssp/gets-chk.c:67:12: error: implicit 
declaration of function 'gets'; did you mean 'getw'? 
[-Wimplicit-function-declaration]
   67 | return gets (s);
  |^~~~
  |getw 
/vol/gcc/src/hg/master/local/libssp/gets-chk.c:67:12: error: returning 'int' 
from a function with return type 'char *' makes pointer from integer without a 
cast [-Wint-conversion]   
   67 | return gets (s);
  |^~~~
/vol/gcc/src/hg/master/local/libssp/gets-chk.c:74:12: error: returning 'int' 
from a function with return type 'char *' makes pointer from integer without a 
cast [-Wint-conversion]
   74 | return gets (s);
  |^~~~

The guard around the gets declaration in gets-chk.c is

#if !(!defined __USE_ISOC11 \
  || (defined __cplusplus && __cplusplus <= 201103L))
extern char *gets (char *);
#endif

__USE_ISOC11 is a glibc-only thing, while Solaris <iso/stdio_iso.h>
declares gets like

#if __STDC_VERSION__ < 201112L && __cplusplus < 201402L
extern char *gets(char *) __ATTR_DEPRECATED;
#endif

If one needs to check __USE_ISOC11 at all, one certainly needs to check
__STDC_VERSION__ to avoid breaking every non-glibc target.  Besides, I
don't see what's the use of checking __cplusplus when compiling a C-only
source file.  On top of all that, the double negation makes the guard
unnecessarily hard to understand.

I really don't know if it's useful/appropriate to check __USE_ISOC11 and
__cplusplus here at all; still I've left both for now.

Here's what I've used to complete the Solaris bootstrap.

Tested on i386-pc-solaris2.11, sparc-sun-solaris2.11,
x86_64-pc-linux-gnu, and x86_64-apple-darwin23.1.0.

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


2023-12-03  Rainer Orth  

libssp:
* gets-chk.c (gets): Avoid double negation.
Also check __STDC_VERSION__ >= 201112L.

# HG changeset patch
# Parent  334015ab01f6c0e5af821c1e9bc83b8677cc0bfb
libssp: Fix gets-chk.c compilation on Solaris

diff --git a/libssp/gets-chk.c b/libssp/gets-chk.c
--- a/libssp/gets-chk.c
+++ b/libssp/gets-chk.c
@@ -51,8 +51,9 @@ see the files COPYING3 and COPYING.RUNTI
 # include 
 #endif
 
-#if !(!defined __USE_ISOC11\
-  || (defined __cplusplus && __cplusplus <= 201103L))
+#if (defined __STDC_VERSION__ && __STDC_VERSION__ >= 201112L)	\
+ || !defined __USE_ISOC11	\
+ || (defined __cplusplus && __cplusplus >= 201402L)
 extern char *gets (char *);
 #endif
 


[PATCH] expmed: Perform mask extraction via QImode [PR112773].

2023-12-04 Thread Robin Dapp
Hi,

this changes the vec_extract path of extract_bit_field to use QImode
instead of BImode when extracting from mask vectors and changes
GET_MODE_BITSIZE to GET_MODE_PRECISION.  This fixes an ICE on riscv
where we did not find a vec_extract optab and continued with the generic
code that requires 1-byte alignment that riscv mask modes do not
provide.  Using QImode extraction makes this piece of code
behave similarly as vectorizable_live_operation where we create
a VEC_EXTRACT whose extraction mode expand_convert_optab_fn converts
from <signed-boolean:1> to QImode by TYPE_MODE.

Apart from that it adds poly_int support to riscv's vec_extract
expander and makes the RVV..BImode -> QImode expander call
emit_vec_extract in order to not duplicate code.

Bootstrapped and regtested on aarch64 and x86.  Regtested on
riscv64, still running on riscv32.

Regards
 Robin

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/partial/pr112773.c: New test.
---
 gcc/config/riscv/autovec.md   | 35 ++-
 gcc/config/riscv/riscv-protos.h   |  3 +-
 gcc/config/riscv/riscv-v.cc   | 14 
 gcc/config/riscv/riscv.cc |  6 ++--
 gcc/expmed.cc | 24 -
 .../riscv/rvv/autovec/partial/pr112773.c  | 20 +++
 6 files changed, 68 insertions(+), 34 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/pr112773.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 2d727c2609b..3c4d68367f0 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1380,12 +1380,23 @@ (define_expand "vec_extract"
   rtx tmp = NULL_RTX;
   if (operands[2] != const0_rtx)
 {
-  /* Emit the slide down to index 0 in a new vector.  */
-  tmp = gen_reg_rtx (mode);
-  operands[2] = gen_lowpart (Pmode, operands[2]);
-  rtx ops[] = {tmp, operands[1], operands[2]};
-  riscv_vector::emit_vlmax_insn
-   (code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode), 
riscv_vector::BINARY_OP, ops);
+  /* Properly convert a poly_int value and put the result into a
+register.  */
+  if (CONST_POLY_INT_P (operands[2]))
+   {
+ rtx pos = gen_reg_rtx (Pmode);
+ riscv_legitimize_poly_move (Pmode, pos, gen_reg_rtx (Pmode),
+ operands[2]);
+ operands[2] = pos;
+   }
+
+/* Emit the slide down to index 0 in a new vector.  */
+tmp = gen_reg_rtx (mode);
+operands[2] = gen_lowpart (Pmode, operands[2]);
+rtx ops[] = {tmp, operands[1], operands[2]};
+riscv_vector::emit_vlmax_insn
+  (code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode),
+   riscv_vector::BINARY_OP, ops);
 }
 
   /* Emit v(f)mv.[xf].s.  */
@@ -1417,16 +1428,8 @@ (define_expand "vec_extractqi"
   riscv_vector::emit_vlmax_insn (code_for_pred_merge (qimode),
 riscv_vector::MERGE_OP, ops1);
 
-  /* Slide down the requested byte element.  */
-  rtx tmp2 = gen_reg_rtx (qimode);
-
-  rtx ops2[] = {tmp2, tmp1, operands[2]};
-  riscv_vector::emit_vlmax_insn
-(code_for_pred_slide (UNSPEC_VSLIDEDOWN, qimode),
- riscv_vector::BINARY_OP, ops2);
-
-  /* Extract it.  */
-  emit_insn (gen_pred_extract_first (qimode, operands[0], tmp2));
+  /* Extract from it.  */
+  riscv_vector::emit_vec_extract (operands[0], tmp1, operands[2]);
   DONE;
 })
 
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 695ee24ad6f..c02de84d6ef 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -129,6 +129,7 @@ extern void riscv_asm_output_alias (FILE *, const tree, 
const tree);
 extern void riscv_asm_output_external (FILE *, const tree, const char *);
 extern bool
 riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
+extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
 
 #ifdef RTX_CODE
 extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool 
*invert_ptr = 0);
@@ -558,7 +559,7 @@ void expand_cond_binop (unsigned, rtx *);
 void expand_cond_ternop (unsigned, rtx *);
 void expand_popcount (rtx *);
 void expand_rawmemchr (machine_mode, rtx, rtx, rtx);
-void emit_vec_extract (rtx, rtx, poly_int64);
+void emit_vec_extract (rtx, rtx, rtx);
 
 /* Rounding mode bitfield for fixed point VXRM.  */
 enum fixed_point_rounding_mode
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 588c127343e..430aae3dc69 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3253,7 +3253,7 @@ shuffle_extract_and_slide1up_patterns (struct 
expand_vec_perm_d *d)
   /* Extract the last element of the first vector.  */
   scalar_mode smode = GET_MODE_INNER (d->vmode);
   rtx tmp = gen_reg_rtx (smode);
-  emit_vec_extract (tmp, d->op0, nunits - 1);
+  emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
 
   /* Insert the scalar into element 0.  */
   unsigned int un

Re: [PATCH] pro_and_epilogue: Call df_note_add_problem () if SHRINK_WRAPPING_ENABLED [PR112760]

2023-12-04 Thread Richard Sandiford
Jakub Jelinek  writes:
> On Sat, Dec 02, 2023 at 11:04:04AM +, Richard Sandiford wrote:
>> I still maintain that so much stuff relies on the lack of false-positive
>> REG_UNUSED notes that (whatever the intention might have been) we need
>> to prevent the false positive.  Like Andrew says, any use of single_set
>> is suspect if there's a REG_UNUSED note for something that is in fact used.
>
> The false positive REG_UNUSED in that case comes from
> (insn 15 14 35 2 (set (reg:CCZ 17 flags)
> (compare:CCZ (reg:DI 0 ax [111])
> (reg:DI 1 dx [112]))) "pr112760.c":11:22 12 {*cmpdi_1}
>  (expr_list:REG_UNUSED (reg:CCZ 17 flags)
> (nil)))
> (insn 35 15 36 2 (set (reg:CCZ 17 flags)
> (compare:CCZ (reg:DI 0 ax [111])
> (reg:DI 1 dx [112]))) "pr112760.c":11:22 12 {*cmpdi_1}
>  (expr_list:REG_DEAD (reg:DI 1 dx [112])
> (expr_list:REG_DEAD (reg:DI 0 ax [111])
> (nil
> ...
> use of flags
> Haven't verified what causes the redundant comparison, but postreload cse
> then does:
> 110 if (!count && cselib_redundant_set_p (body))
> 111   {
> 112 if (check_for_inc_dec (insn))
> 113   delete_insn_and_edges (insn);
> 114 /* We're done with this insn.  */
> 115 goto done;
> 116   }
> So, we'd in such cases need to look up what instruction was the earlier
> setter and if it has REG_UNUSED note, drop it.

Hmm, OK.  I guess it's not as simple as I'd imagined.  cselib does have
some code to track which instruction established which equivalence,
but it doesn't currently record what we want, and it would be difficult
to reuse that information here anyway.  Something "simple" like a map of
register numbers to instructions, populated only for REG_UNUSED sets,
would be enough, and low overhead.  But it's not very natural.

Perhaps DF should maintain a flag to say "the current pass keeps
notes up-to-date", with the assumption being that any pass that
uses the notes problem does that.  Then single_set and the
regcprop.cc uses can check that flag.

I don't think it's worth adding the note problem to shrink-wrapping
just for the regcprop code.  If we're prepared to take that compile-time
hit, we might as well run a proper (fast) DCE.

Thanks,
Richard


Re: [PATCH v2 4/5] Add support for target_version attribute

2023-12-04 Thread Andrew Carlotti
On Wed, Nov 29, 2023 at 05:53:56PM +, Richard Sandiford wrote:
> Andrew Carlotti  writes:
> > This patch adds support for the "target_version" attribute to the middle
> > end and the C++ frontend, which will be used to implement function
> > multiversioning in the aarch64 backend.
> >
> > On targets that don't use the "target" attribute for multiversioning,
> > there is no conflict between the "target" and "target_clones"
> > attributes.  This patch therefore makes the mutual exclusion in
> > C-family, D and Ada conditional upon the value of the
> > expanded_clones_attribute target hook.
> >
> > The "target_version" attribute is only added to C++ in this patch,
> > because this is currently the only frontend which supports
> > multiversioning using the "target" attribute.  Support for the
> > "target_version" attribute will be extended to C at a later date.
> >
> > Targets that currently use the "target" attribute for function
> > multiversioning (i.e. i386 and rs6000) are not affected by this patch.
> >
> > Ok for master?
> >
> > gcc/ChangeLog:
> >
> > * attribs.cc (decl_attributes): Pass attribute name to target.
> > (is_function_default_version): Update comment to specify
> > incompatibility with target_version attributes.
> > * cgraphclones.cc (cgraph_node::create_version_clone_with_body):
> > Call valid_version_attribute_p for target_version attributes.
> > * target.def (valid_version_attribute_p): New hook.
> > (expanded_clones_attribute): New hook.
> > * doc/tm.texi.in: Add new hooks.
> > * doc/tm.texi: Regenerate.
> > * multiple_target.cc (create_dispatcher_calls): Remove redundant
> > is_function_default_version check.
> > (expand_target_clones): Use target hook for attribute name.
> > * targhooks.cc (default_target_option_valid_version_attribute_p):
> > New.
> > * targhooks.h (default_target_option_valid_version_attribute_p):
> > New.
> > * tree.h (DECL_FUNCTION_VERSIONED): Update comment to include
> > target_version attributes.
> >
> > gcc/c-family/ChangeLog:
> >
> > * c-attribs.cc (CLONES_USES_TARGET): New macro.
> > (attr_target_exclusions): Use new macro.
> > (attr_target_clones_exclusions): Ditto, and add target_version.
> > (attr_target_version_exclusions): New.
> > (c_common_attribute_table): Add target_version.
> > (handle_target_version_attribute): New.
> >
> > gcc/ada/ChangeLog:
> >
> > * gcc-interface/utils.cc (CLONES_USES_TARGET): New macro.
> > (attr_target_exclusions): Use new macro.
> > (attr_target_clones_exclusions): Ditto.
> >
> > gcc/d/ChangeLog:
> >
> > * d-attribs.cc (CLONES_USES_TARGET): New macro.
> > (attr_target_exclusions): Use new macro.
> > (attr_target_clones_exclusions): Ditto.
> >
> > gcc/cp/ChangeLog:
> >
> > * decl2.cc (check_classfn): Update comment to include
> > target_version attributes.
> >
> >
> > diff --git a/gcc/ada/gcc-interface/utils.cc b/gcc/ada/gcc-interface/utils.cc
> > index 
> > e33a63948cebdeafc3abcdd539a35141969ad978..8850943cb3326568b4679a73405f50487aa1b7c6
> >  100644
> > --- a/gcc/ada/gcc-interface/utils.cc
> > +++ b/gcc/ada/gcc-interface/utils.cc
> > @@ -143,16 +143,21 @@ static const struct attribute_spec::exclusions 
> > attr_noinline_exclusions[] =
> >{ NULL, false, false, false },
> >  };
> >  
> > +#define CLONES_USES_TARGET \
> > +  (strcmp (targetm.target_option.expanded_clones_attribute, \
> > +  "target") == 0)
> > +
> 
> Sorry for the slower review on this part.  I was hoping inspiration
> would strike for a way to resolve this, but it hasn't, so:
> 
> The codebase usually avoids static variables that need dynamic
> initialisation.  So although macros are not the preferred way of
> doing things, I think one is probably appropriate here.  How about:
> 
>   TARGET_HAS_FMV_TARGET_ATTRIBUTE
> 
> with the default being true, and with AArch64 defining it to false?
> 
> This would replace the expanded_clones_attribute hook, with:
> 
>   const char *new_attr_name = targetm.target_option.expanded_clones_attribute;
> 
> becoming:
> 
>   const char *new_attr_name = (TARGET_HAS_FMV_TARGET_ATTRIBUTE
>  ? "target" : "target_version");
> 
> I realise this is anything but elegant, but I think it's probably
> the least worst option, given where we are.

I thought this could be an issue, and had deliberately not committed patches
2+3 in this series in case fixing this required reverting to specific runtime
checks within each handler.

I've changed it to use your suggestion in the next version.
 
> >  static const struct attribute_spec::exclusions attr_target_exclusions[] =
> >  {
> > -  { "target_clones", true, true, true },
> > +  { "target_clones", CLONES_USES_TARGET, CLONES_USES_TARGET,
> > +CLONES_USES_TARGET },
> >{ NULL, false, false, false },
> >  };
> >  
> >  static const struct attribute_spec::exclusions 
> > attr_target_clones_exclusions[] =
> >  {
> >{ "always_in

[PATCH] tree-optimization/112827 - corrupt SCEV cache during SCCP

2023-12-04 Thread Richard Biener
The following avoids corrupting the SCEV cache by my last change
to propagate constant final values immediately.  The easiest fix
is to keep a dead initialization around.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR tree-optimization/112827
* tree-scalar-evolution.cc (final_value_replacement_loop):
Do not release SSA name but keep a dead initialization around.

* gcc.dg/torture/pr112827-1.c: New testcase.
* gcc.dg/torture/pr112827-2.c: Likewise.
---
 gcc/testsuite/gcc.dg/torture/pr112827-1.c | 14 ++
 gcc/testsuite/gcc.dg/torture/pr112827-2.c | 18 ++
 gcc/tree-scalar-evolution.cc  |  9 +++--
 3 files changed, 35 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/torture/pr112827-1.c
 create mode 100644 gcc/testsuite/gcc.dg/torture/pr112827-2.c

diff --git a/gcc/testsuite/gcc.dg/torture/pr112827-1.c 
b/gcc/testsuite/gcc.dg/torture/pr112827-1.c
new file mode 100644
index 000..6838cbbe62f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr112827-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+
+int a, b, c, d, e;
+int main() {
+  for (; c; c++) {
+for (a = 0; a < 2; a++)
+  ;
+for (; b; b++) {
+  e = d;
+  d = a;
+}
+  }
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/torture/pr112827-2.c 
b/gcc/testsuite/gcc.dg/torture/pr112827-2.c
new file mode 100644
index 000..a7a2a70211b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr112827-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+
+short a, b[1], f;
+char c, g;
+int d, e;
+int main() {
+  for (; f; f++) {
+for (d = 0; d < 2; d++)
+  ;
+if (a)
+  for (g = 0; g < 2; g++)
+for (c = 0; c < 2; c += b[d+g])
+  ;
+for (; e; e++)
+  ;
+  }
+  return 0;
+}
diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index 065bcd0743d..7556d89e9f8 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -3847,13 +3847,10 @@ final_value_replacement_loop (class loop *loop)
   def = unshare_expr (def);
   remove_phi_node (&psi, false);
 
-  /* Propagate constants immediately.  */
+  /* Propagate constants immediately, but leave an unused initialization
+around to avoid invalidating the SCEV cache.  */
   if (CONSTANT_CLASS_P (def))
-   {
- replace_uses_by (rslt, def);
- release_ssa_name (rslt);
- continue;
-   }
+   replace_uses_by (rslt, def);
 
   /* Create the replacement statements.  */
   gimple_seq stmts;
-- 
2.35.3


Re: [PATCH] testsuite: scev: expect fail on ilp32

2023-12-04 Thread Richard Biener
On Sat, 2 Dec 2023, Hans-Peter Nilsson wrote:

> > Date: Fri, 1 Dec 2023 08:07:14 +0100 (CET)
> > From: Richard Biener 
> 
> > On Fri, 1 Dec 2023, Hans-Peter Nilsson wrote:
> > 
> > > > From: Hans-Peter Nilsson 
> > > > Date: Thu, 30 Nov 2023 18:09:10 +0100
> > > 
> > > Richard B.:
> > > > > > In the end we might need to move/duplicate the test to some
> > > > > > gcc.target/* dir and restrict it to a specific tuning.
> > > > 
> > > > I intend to post two alternative patches to get this
> > > > resolved:
> > > > 1: Move the tests to gcc.target/i386/scev-[3-5].c
> > > 
> > > Subject: [PATCH 1/2] testsuite: Fix XPASS for gcc.dg/tree-ssa/scev-3.c, 
> > > -4.c and -5.c [PR112786]
> > > 
> > > This is the first alternative, perhaps the more appropriate one.
> > > 
> > > Tested cris-elf, arm-eabi (default), x86_64-linux, ditto -m32,
> > > h8300-elf and shle-linux; xpassing, skipped and passing as
> > > applicable and intended.
> > > 
> > > Ok to commit?
> > 
> > Digging in history reveals the testcases were added by
> > Jiangning Liu , not for any
> > particular bugreport but "The problem is originally from a real benchmark,
> > and the test case only tries to detect the GIMPLE level changes."
> > 
> > I'm not sure we can infer the testcase should be moved to
> > gcc.target/arm/ because of that, but it does seem plausible.
> 
> It's been so long and so many changes since these tests were
> regression guards, that the original target has lost
> importance.  Heck, it was even xfail lp64 at one time!
> According to my git dig, it's been adjusted for pass
> changes, including reordering and dump output changes.  But
> you know that; you've been instrumental in many of those
> changes. :)
> 
> I'd say gcc.target/arm/ is the one target that's *not*
> plausible, as according to Alex result differs between
> subtargets.
> 
> > I read from your messages that the testcases pass on arm*-*-*?
> 
> Yes: they pass (currently XPASS) on arm-eabi and
> arm-unknown-linux-gnueabi, default configurations.  But,
> scev-3 and -5 fail with for example -mcpu=cortex-r5

I see.  As said, the testcases test for "cost" things, so that we
"regressed" might mean we really "regressed" here.  Even the x86 -m32
result is questionable.

Of course whether using a single IV makes sense for all archs is
unknown.

Btw, if we turn the testcases into ones that are (sub-)target
specific then we want to again use C code as input.

I think at this point we've lost track and I'm juggling between
removing the testcases or moving them to a place they succeed
(with some specific -mcpu=?)

Richard.


Re: [PATCH] RISC-V: Support highest-number regno overlap for widen ternary vx instructions

2023-12-04 Thread Robin Dapp
> +(define_mode_attr widen_ternop_dest_constraint [
> +  (RVVM8QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVM4QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVM2QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVM1QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVMF2QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVMF4QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVMF8QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVM8HI "=vd, vr, vd, vr, vd, vr, ?&vr")
[...]

I'm fine with avoiding the overlap but I'm not sure this is
easily maintainable because the constraints don't actually
depend on the mode?  I suppose this is for easy re-use across
different insns but there are only six(?) widening patterns
so we don't even save lines of code by this?

I guess I would prefer the normal approach of writing it out
explicitly in the pattern.  Maybe add a different replacement
method like define_subst in the future to simplify such
situations?


+  "vwmacc.vx\t%0,%z3,%4%p1"

Why the z here?  For canonicalization?

Regards
 Robin



[PATCH v1] LoongArch: Modify the check type of the vector builtin function.

2023-12-04 Thread chenxiaolong
On LoongArch architecture, using the latest gcc14 in regression test,
it is found that the vector test cases in vector directory appear FAIL
entries with unmatched pointer types. In order to solve this kind of
problem, the type of the variable in the check result is modified with
the parameter type defined in the vector builtin function.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/vector/simd_correctness_check.h: The variable
types in the check results are modified in conjunction with the
parameter types defined in the vector builtin function.
---
 .../gcc.target/loongarch/vector/simd_correctness_check.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h 
b/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h
index eb7fbd59cc7..f780f6586b3 100644
--- a/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h
+++ b/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h
@@ -8,7 +8,8 @@
   int fail = 0;   \
   for (size_t i = 0; i < sizeof (res) / sizeof (res[0]); ++i) \
 { \
-  long *temp_ref = &ref[i], *temp_res = &res[i];  \
+  long long *temp_ref = (long long *)&ref[i], \
+   *temp_res = (long long *)&res[i]; \
   if (abs (*temp_ref - *temp_res) > 0)\
 { \
   printf (" error: %s at line %ld , expected " #ref   \
@@ -28,7 +29,7 @@
   int fail = 0;   \
   for (size_t i = 0; i < sizeof (res) / sizeof (res[0]); ++i) \
 { \
-  int *temp_ref = &ref[i], *temp_res = &res[i];   \
+  int *temp_ref = (int *)&ref[i], *temp_res = (int *)&res[i]; \
   if (abs (*temp_ref - *temp_res) > 0)\
 { \
   printf (" error: %s at line %ld , expected " #ref   \
-- 
2.20.1



Re: [PATCH] driver: Fix memory leak.

2023-12-04 Thread Jonathan Wakely
On Sat, 2 Dec 2023 at 21:24, Costas Argyris wrote:
>
> Use std::vector instead of malloc'd pointer
> to get automatic freeing of memory.

You can't include <vector> there. Instead you need to define
INCLUDE_VECTOR before "system.h"

Shouldn't you be using resize, not reserve? Otherwise mdswitches[i] is
undefined.



Re: [PATCH] libstdc++: Simplify ranges::to closure objects

2023-12-04 Thread Jonathan Wakely
On Thu, 30 Nov 2023 at 19:23, Patrick Palka wrote:
>
> Tested on x86_64-pc-linux-gnu, does this look OK for trunk?

OK, thanks for simplifying it.

>
> -- >8 --
>
> Use the existing _Partial range adaptor closure object in the
> definition of ranges::to instead of essentially open coding it.
>
> libstdc++-v3/ChangeLog:
>
> * include/std/ranges (__detail::_ToClosure): Replace with ...
> (__detail::_To): ... this.
> (__detail::_ToClosure2): Replace with ...
> (__detail::To2): ... this.
> (to): Simplify using the existing _Partial range adaptor
> closure object.
> ---
>  libstdc++-v3/include/std/ranges | 140 
>  1 file changed, 32 insertions(+), 108 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
> index 9d4c2e01c4d..33e576e563a 100644
> --- a/libstdc++-v3/include/std/ranges
> +++ b/libstdc++-v3/include/std/ranges
> @@ -1007,6 +1007,7 @@ namespace views::__adaptor
>
>// Invoke _Adaptor with arguments __r, _M_args... according to the
>// value category of this _Partial object.
> +  // TODO: use explicit object functions ("deducing this").
>template
> requires __adaptor_invocable<_Adaptor, _Range, const _Args&...>
> constexpr auto
> @@ -1137,6 +1138,7 @@ namespace views::__adaptor
>
>// Invoke _M_rhs(_M_lhs(__r)) according to the value category of this
>// range adaptor closure object.
> +  // TODO: use explicit object functions ("deducing this").
>template
> requires __pipe_invocable
> constexpr auto
> @@ -9391,59 +9393,16 @@ namespace __detail
>  /// @cond undocumented
>  namespace __detail
>  {
> -  template
> -class _ToClosure
> -: public views::__adaptor::_RangeAdaptorClosure<_ToClosure<_Cont, 
> _Args...>>
> +  template
> +struct _To
>  {
> -  tuple...> _M_bound_args;
> -
> -public:
> -  constexpr
> -  _ToClosure(_Args&&... __args)
> -  : _M_bound_args(std::forward<_Args>(__args)...)
> -  { }
> -
> -  // TODO: use explicit object functions ("deducing this").
> -
> -  template
> -   constexpr auto
> -   operator()(_Rg&& __r) &
> -   {
> - return std::apply([&__r](_Tp&&... __args) {
> -   return ranges::to<_Cont>(std::forward<_Rg>(__r),
> -std::forward<_Tp>(__args)...);
> - }, _M_bound_args);
> -   }
> -
> -  template
> -   constexpr auto
> -   operator()(_Rg&& __r) const &
> -   {
> - return std::apply([&__r](_Tp&&... __args) {
> -   return ranges::to<_Cont>(std::forward<_Rg>(__r),
> -std::forward<_Tp>(__args)...);
> - }, _M_bound_args);
> -   }
> -
> -  template
> -   constexpr auto
> -   operator()(_Rg&& __r) &&
> -   {
> - return std::apply([&__r](_Tp&&... __args) {
> -   return ranges::to<_Cont>(std::forward<_Rg>(__r),
> -std::forward<_Tp>(__args)...);
> - }, std::move(_M_bound_args));
> -   }
> -
> -  template
> -   constexpr auto
> -   operator()(_Rg&& __r) const &&
> -   {
> - return std::apply([&__r](_Tp&&... __args) {
> -   return ranges::to<_Cont>(std::forward<_Rg>(__r),
> -std::forward<_Tp>(__args)...);
> - }, std::move(_M_bound_args));
> -   }
> +  template
> +  constexpr auto
> +  operator()(_Range&& __r, _Args&&... __args) const
> +  {
> +   return ranges::to<_Cont>(std::forward<_Range>(__r),
> +std::forward<_Args>(__args)...);
> +  }
>  };
>  } // namespace __detail
>  /// @endcond
> @@ -9465,66 +9424,27 @@ namespace __detail
> */
>template
>  requires (!view<_Cont>)
> -constexpr __detail::_ToClosure<_Cont, _Args...>
> +constexpr auto
>  to [[nodiscard]] (_Args&&... __args)
> -{ return {std::forward<_Args>(__args)...}; }
> +{
> +  using __detail::_To;
> +  using views::__adaptor::_Partial;
> +  return _Partial<_To<_Cont>, 
> decay_t<_Args>...>{std::forward<_Args>(__args)...};
> +}
>
>  /// @cond undocumented
>  namespace __detail
>  {
> -  template typename _Cont, typename... _Args>
> -class _ToClosure2
> -: public views::__adaptor::_RangeAdaptorClosure<_ToClosure2<_Cont, 
> _Args...>>
> +  template typename _Cont>
> +struct _To2
>  {
> -  tuple...> _M_bound_args;
> -
> -public:
> -  constexpr
> -  _ToClosure2(_Args&&... __args)
> -  : _M_bound_args(std::forward<_Args>(__args)...)
> -  { }
> -
> -  // TODO: use explicit object functions ("deducing this").
> -
> -  template
> -   constexpr auto
> -   operator()(_Rg&& __r) &
> -   {
> - return std::apply([&__r](_Tp&&... __args) {
> -   return ranges::to<_Cont>(std::forward<_Rg>(__r),
> -   

Re: [PATCH v1] LoongArch: Modify the check type of the vector builtin function.

2023-12-04 Thread Xi Ruoyao
On Mon, 2023-12-04 at 20:14 +0800, chenxiaolong wrote:
> On LoongArch architecture, using the latest gcc14 in regression test,
> it is found that the vector test cases in vector directory appear FAIL
> entries with unmatched pointer types. In order to solve this kind of
> problem, the type of the variable in the check result is modified with
> the parameter type defined in the vector builtin function.

IMO we should write something more readable:

static inline
void ASSERTEQ_64 (int line, const void *_ref, const void *_res)
{
  if (memcmp (_ref, _res, 16) == 0)
return;

  const char *ref = (const char *)_ref;
  const char *res = (const char *)_res;

  printf ("error %s:%d: result mismatch\n", __FILE__, line);

  printf ("ref:");
  for (int i = 0; i < 16; i++)
printf (" %02x", ref[i]);

  printf ("\nres:");
  for (int i = 0; i < 16; i++)
printf (" %02x", res[i]);

  putchar ('\n');
  abort ();
}

> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/loongarch/vector/simd_correctness_check.h:The variable
>   types in the check results are modified in conjunction with the
>   parameter types defined in the vector builtin function.
> ---
>  .../gcc.target/loongarch/vector/simd_correctness_check.h | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git 
> a/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h 
> b/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h
> index eb7fbd59cc7..f780f6586b3 100644
> --- a/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h
> +++ b/gcc/testsuite/gcc.target/loongarch/vector/simd_correctness_check.h
> @@ -8,7 +8,8 @@
>    int fail = 0;  
>  \
>    for (size_t i = 0; i < sizeof (res) / sizeof (res[0]); ++i)
>  \
>  {
>  \
> -  long *temp_ref = &ref[i], *temp_res = &res[i]; 
>  \
> +  long long *temp_ref = (long long *)&ref[i],
>  \
> + *temp_res = (long long *)&res[i];     \
>    if (abs (*temp_ref - *temp_res) > 0)   
>  \
>  {
>  \
>    printf (" error: %s at line %ld , expected " #ref  
>  \
> @@ -28,7 +29,7 @@
>    int fail = 0;  
>  \
>    for (size_t i = 0; i < sizeof (res) / sizeof (res[0]); ++i)
>  \
>  {
>  \
> -  int *temp_ref = &ref[i], *temp_res = &res[i];  
>  \
> +  int *temp_ref = (int *)&ref[i], *temp_res = (int *)&res[i];
>  \
>    if (abs (*temp_ref - *temp_res) > 0)   
>  \
>  {
>  \
>    printf (" error: %s at line %ld , expected " #ref  
>  \

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


[PATCH] c/86869 - preserve address-space info when building qualified ARRAY_TYPE

2023-12-04 Thread Richard Biener
The following adjusts the C FE specific qualified type building
to preserve address-space info also for ARRAY_TYPE.

Bootstrap / regtest running on x86_64-unknown-linux-gnu, OK?

Thanks,
Richard.

PR c/86869
gcc/c/
* c-typeck.cc (c_build_qualified_type): Preserve address-space
info for ARRAY_TYPE.

gcc/testsuite/
* gcc.target/avr/pr86869.c: New testcase.
---
 gcc/c/c-typeck.cc  | 1 +
 gcc/testsuite/gcc.target/avr/pr86869.c | 9 +
 2 files changed, 10 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/avr/pr86869.c

diff --git a/gcc/c/c-typeck.cc b/gcc/c/c-typeck.cc
index a6edbc85f10..836893905fa 100644
--- a/gcc/c/c-typeck.cc
+++ b/gcc/c/c-typeck.cc
@@ -16263,6 +16263,7 @@ c_build_qualified_type (tree type, int type_quals, tree 
orig_qual_type,
 
  t = build_variant_type_copy (type);
  TREE_TYPE (t) = element_type;
+ TYPE_ADDR_SPACE (t) = TYPE_ADDR_SPACE (element_type);
 
   if (TYPE_STRUCTURAL_EQUALITY_P (element_type)
   || (domain && TYPE_STRUCTURAL_EQUALITY_P (domain)))
diff --git a/gcc/testsuite/gcc.target/avr/pr86869.c 
b/gcc/testsuite/gcc.target/avr/pr86869.c
new file mode 100644
index 000..fbfb378e8c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/pr86869.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+
+struct S {
+  char y[2];
+};
+
+void foo(const __memx  struct S *s) {
+  const char (*p)[2] = &s->y;
+}
-- 
2.35.3


Re: [PATCH v1] LoongArch: Modify the check type of the vector builtin function.

2023-12-04 Thread Xi Ruoyao
On Mon, 2023-12-04 at 20:31 +0800, Xi Ruoyao wrote:
> On Mon, 2023-12-04 at 20:14 +0800, chenxiaolong wrote:
> > On LoongArch architecture, using the latest gcc14 in regression test,
> > it is found that the vector test cases in vector directory appear FAIL
> > entries with unmatched pointer types. In order to solve this kind of
> > problem, the type of the variable in the check result is modified with
> > the parameter type defined in the vector builtin function.
> 
> IMO we should write something more readable:
> 
> static inline
> void ASSERTEQ_64 (int line, const void *_ref, const void *_res)
> {
>   if (memcmp (_ref, _res, 16) == 0)
>     return;
> 
>   const char *ref = (const char *)_ref;
>   const char *res = (const char *)_res;
> 
>   printf ("error %s:%d: result mismatch\n", __FILE__, line);
> 
>   printf ("ref:");
>   for (int i = 0; i < 16; i++)
>     printf (" %02x", ref[i]);

Sorry, should be " %02hhx" here.

> 
>   printf ("\nres:");
>   for (int i = 0; i < 16; i++)
>     printf (" %02x", res[i]);

Likewise.

>   putchar ('\n');
>   abort ();
> }

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


Re: [PATCH] download_prerequisites: add --only-gettext

2023-12-04 Thread Richard Biener
On Sat, Dec 2, 2023 at 4:53 PM Arsen Arsenović  wrote:
>
> contrib/ChangeLog:
>
> * download_prerequisites
> : Parse --only-gettext.
> (echo_archives): Check only_gettext and stop early if true.
> (helptext): Document --only-gettext.
> ---
> Afternoon,
>
> This patch adds a --only-gettext option to download_prerequisites for
> when the only useful dependency to download is gettext (which will
> restore a gcc source tree to a similar 'intlness' as before the
> externalization of gettext-runtime).
>
> For context, see
> https://inbox.sourceware.org/CAFiYyc2-JxH358GUcZfR4iBMq5qj6Nf4W=7lyoqyw6b-u8d...@mail.gmail.com/
>
> OK for trunk?

OK

> TIA, have a lovely day!
>
>  contrib/download_prerequisites | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/contrib/download_prerequisites b/contrib/download_prerequisites
> index 9568091c0dba..30ff0cc9491a 100755
> --- a/contrib/download_prerequisites
> +++ b/contrib/download_prerequisites
> @@ -36,16 +36,18 @@ gettext='gettext-0.22.tar.gz'
>  base_url='http://gcc.gnu.org/pub/gcc/infrastructure/'
>
>  echo_archives() {
> +echo "${gettext}"
> +if "${only_gettext}"; then return; fi
>  echo "${gmp}"
>  echo "${mpfr}"
>  echo "${mpc}"
> -echo "${gettext}"
>  if [ ${graphite} -gt 0 ]; then echo "${isl}"; fi
>  }
>
>  graphite=1
>  verify=1
>  force=0
> +only_gettext=false
>  OS=$(uname)
>
>  if type wget > /dev/null ; then
> @@ -74,6 +76,7 @@ The following options are available:
>   --no-verify  don't verify package integrity
>   --sha512 use SHA512 checksum to verify package integrity (default)
>   --md5use MD5 checksum to verify package integrity
> + --only-gettext   inhibit downloading any package but gettext
>   --help   show this text and exit
>   --versionshow version information and exit
>  "
> @@ -159,6 +162,9 @@ do
>  chksum_extension='md5'
>  verify=1
>  ;;
> +--only-gettext)
> +only_gettext=true
> +;;
>  -*)
>  die "unknown option: ${arg}"
>  ;;
> --
> 2.43.0
>


Re: [PATCH] gettext: disable install, docs targets, libasprintf, threads

2023-12-04 Thread Richard Biener
On Sat, Dec 2, 2023 at 5:03 PM Arsen Arsenović  wrote:
>
> This fixes issues reported by David Edelsohn , and by
> Eric Gallager .
>
> ChangeLog:
>
> * Makefile.def (gettext): Disable (via missing)
> {install-,}{pdf,html,info,dvi} and TAGS targets.  Set no_install
> to true.  Add --disable-threads --disable-libasprintf.  Drop the
> lib_path (as there are no shared libs).
> ---
> Afternoon,
>
> This patch disables various targets and features on the gettext module
> to fix problems reported by David Edelsohn and Eric Gallager in
> https://inbox.sourceware.org/CAGWvnynmWgNjup4cAwSbsy1vw_MJLQqSULwM=kth_+lt+_s...@mail.gmail.com/
> and followups and on IRC, respectively.
>
> The gettext module does not actually require any of these to be usable
> for the purposes of the toolchain, so disabling them seems to be a
> decent workaround.
>
> This seemed to fix the respective issues for both Eric and David,
> though, I could not get GDB to build on AIX with or without this patch
> applied (I needed to disable sim, gdb and gnulib modules).
>
> It is possible I am missing something.  Due to some unfortunate
> circumstances, it's taken more time than anticipated to actually get
> this change tested, and I've had to context swap quite a few bits.  Such
> a process has quite a lot of room for error.
>
> Tested on x86_64-unknown-freebsd13.2.

OK.

>  Makefile.def |  13 +++-
>  Makefile.in  | 202 ---
>  [removed regenerated file from the patch below]
>  2 files changed, 40 insertions(+), 175 deletions(-)
>
> diff --git a/Makefile.def b/Makefile.def
> index 792f81447e1b..ba89d46b2495 100644
> --- a/Makefile.def
> +++ b/Makefile.def
> @@ -80,8 +80,17 @@ host_modules= { module= gettext; bootstrap=true; 
> no_install=true;
> // need it in some configuratons, which is determined via 
> nontrivial tests.
> // Always enabling pic seems to make sense for something tied 
> to
> // user-facing output.
> -extra_configure_flags='--disable-shared --disable-java 
> --disable-csharp --with-pic';
> -lib_path=intl/.libs; };
> +   extra_configure_flags='--disable-shared --disable-threads 
> --disable-java --disable-csharp --with-pic --disable-libasprintf';
> +   missing= pdf;
> +   missing= html;
> +   missing= info;
> +   missing= dvi;
> +   missing= install-pdf;
> +   missing= install-html;
> +   missing= install-info;
> +   missing= install-dvi;
> +   missing= TAGS;
> +   no_install= true; };
>  host_modules= { module= tcl;
>  missing=mostlyclean; };
>  host_modules= { module= itcl; };
> diff --git a/Makefile.in b/Makefile.in
> index da2344b3f3dc..3bd7d37e9605 100644
>
> --
> 2.43.0
>


Re: [PATCH v2] RISC-V: Document optimization parameter riscv-strcmp-inline-limit

2023-12-04 Thread Kito Cheng
LGTM

On Mon, Dec 4, 2023 at 5:55 PM Christoph Müllner
 wrote:
>
> This patch documents the optimization parameter
> riscv-strcmp-inline-limit, which can be used to tweak the behaviour
> of -minline-strcmp and -minline-strncmp.
>
> gcc/ChangeLog:
>
> PR target/112650
> * doc/invoke.texi: Document riscv-strcmp-inline-limit.
>
> Signed-off-by: Christoph Müllner 
> ---
>  gcc/doc/invoke.texi | 8 
>  1 file changed, 8 insertions(+)
>
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 6fe63b5f999..2b51ff304f6 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -29846,6 +29846,10 @@ Inlining will only be done if the strings are 
> properly aligned
>  and instructions for accelerated processing are available.
>  The default is to not inline strcmp calls.
>
> +The @option{--param riscv-strcmp-inline-limit=@var{n}} parameter controls
> +the maximum number of bytes compared by the inlined code.
> +The default value is 64.
> +
>  @opindex minline-strncmp
>  @item -minline-strncmp
>  @itemx -mno-inline-strncmp
> @@ -29854,6 +29858,10 @@ Inlining will only be done if the strings are 
> properly aligned
>  and instructions for accelerated processing are available.
>  The default is to not inline strncmp calls.
>
> +The @option{--param riscv-strcmp-inline-limit=@var{n}} parameter controls
> +the maximum number of bytes compared by the inlined code.
> +The default value is 64.
> +
>  @opindex mshorten-memrefs
>  @item -mshorten-memrefs
>  @itemx -mno-shorten-memrefs
> --
> 2.43.0
>


[PATCH] RISC-V: Fix two testcases related to -std changes.

2023-12-04 Thread Robin Dapp
Hi,

recent -std changes caused testsuite failures.  Fix those by adding
-std=gnu99 and -Wno-incompatible-pointer-types.

Going to commit as obvious.

Regards
 Robin

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr112552.c: Add
-Wno-incompatible-pointer-types.
* gcc.target/riscv/rvv/autovec/struct/struct_vect_run-10.c:
Add -std=gnu99.
---
 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112552.c   | 2 +-
 .../gcc.target/riscv/rvv/autovec/struct/struct_vect_run-10.c| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112552.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112552.c
index 32d221ccede..4ef76cd3506 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112552.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112552.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d 
--param=riscv-autovec-preference=fixed-vlmax -w" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d 
--param=riscv-autovec-preference=fixed-vlmax -w 
-Wno-incompatible-pointer-types" } */
 
 int a, c, d;
 void (*b)();
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/struct/struct_vect_run-10.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/struct/struct_vect_run-10.c
index 79037048f55..3beca30c361 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/struct/struct_vect_run-10.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/struct/struct_vect_run-10.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target { riscv_v && riscv_zvfh_hw } } } */
-/* { dg-additional-options "-std=c99 --param=riscv-autovec-preference=scalable 
-fno-vect-cost-model" } */
+/* { dg-additional-options "-std=gnu99 
--param=riscv-autovec-preference=scalable -fno-vect-cost-model" } */
 
 #define TYPE _Float16
 #define ITYPE int16_t
-- 
2.43.0



[PATCH] Restore build with GCC 4.8 to GCC 5 (was Re: [PATCH] Workaround array_slice constructor portability issues (with older g++).)

2023-12-04 Thread Richard Sandiford
Richard Sandiford  writes:
> "Roger Sayle"  writes:
>> The recent change to represent language and target attribute tables using
>> vec.h's array_slice template class triggers an issue/bug in older g++
>> compilers, specifically the g++ 4.8.5 system compiler of older RedHat
>> distributions.  This exhibits as the following compilation errors during
>> bootstrap:
>>
>> ../../gcc/gcc/c/c-lang.cc:55:2661: error: could not convert '(const
>> scoped_attribute_specs* const*)(& c_objc_attribute_table)' from 'const
>> scoped_attribute_specs* const*' to 'array_slice<const scoped_attribute_specs* const>'
>>  struct lang_hooks lang_hooks = LANG_HOOKS_INITIALIZER;
>>
>> ../../gcc/gcc/c/c-decl.cc:4657:1: error: could not convert '(const
>> attribute_spec*)(& std_attributes)' from 'const attribute_spec*' to
>> 'array_slice<const attribute_spec>'
>>
>> Here the issue is with constructors of the form:
>>
>> static const int table[] = { 1, 2, 3 };
>> array_slice<int> t = table;
>
> It's array_slice<const int> rather than array_slice<int>.  The above
> would be invalid even with functioning compilers.
>
>> Perhaps there's a fix possible in vec.h (an additional constructor?), but
>> the patch below fixes this issue by using one of array_slice's constructors
>> (that takes a size) explicitly, rather than rely on template resolution.
>> In the example above this looks like:
>>
>> array_slice<int> t (table, 3);
>>
>> or equivalently
>>
>> array_slice<int> t = array_slice<int>(table, 3);
>>
>> or equivalently
>>
>> array_slice<int> t = array_slice<int>(table, ARRAY_SIZE (table));
>
> Taking c-decl.cc as an arbitrary example, it seems to be enough to change:
>
> const scoped_attribute_specs std_attribute_table =
> {
>   nullptr, std_attributes
> };
>
> to:
>
> const scoped_attribute_specs std_attribute_table =
> {
>   nullptr, { std_attributes }
> };
>
> which seems less ugly than the explicit constructors.
>
> But if we're going to do this, we should do it across the board,
> not just for x86.

Here's an attempt to do that.  Tested so far by building
aarch64-linux-gnu (which has target attributes) and frv-elf
(which doesn't) with GCC 4.8.

OK if further testing passes?

Richard

---

GCC 5 and earlier applied array-to-pointer decay too early,
which affected the new attribute namespace code.  A reduced
example of the construct that the attribute code uses is:

struct S { template<__SIZE_TYPE__ N> S(int (&)[N]); };
struct T { int a; S b; };
int a[] = { 1 };
T t = { 1, a };

This was fixed by f85e1317f8ea933f5c615680353bd646f480f7d3
(PR 16333 et al).

This patch tries to add a minimally-invasive workaround.

gcc/ada/
* gcc-interface/utils.cc (gnat_internal_attribute_table): Add extra
braces to work around PR 16333 in older compilers.

gcc/
* attribs.cc (handle_ignored_attributes_option): Add extra
braces to work around PR 16333 in older compilers.
* config/arm/arm.cc (arm_gnu_attribute_table): Likewise.
* config/i386/i386-options.cc (ix86_gnu_attribute_table): Likewise.
* config/ia64/ia64.cc (ia64_gnu_attribute_table): Likewise.
* config/rs6000/rs6000.cc (rs6000_gnu_attribute_table): Likewise.
* target-def.h (TARGET_GNU_ATTRIBUTES): Likewise.
* genhooks.cc (emit_init_macros): Likewise, when emitting the
instantiation of TARGET_ATTRIBUTE_TABLE.
* langhooks-def.h (LANG_HOOKS_INITIALIZER): Likewise, when
instantiating LANG_HOOKS_ATTRIBUTE_TABLE.
(LANG_HOOKS_ATTRIBUTE_TABLE): Define to be empty by default.
* target.def (attribute_table): Likewise.

gcc/c-family/
* c-attribs.cc (c_common_gnu_attribute_table): Add extra
braces to work around PR 16333 in older compilers.

gcc/c/
* c-decl.cc (std_attribute_table): Add extra braces to work
around PR 16333 in older compilers.

gcc/cp/
* tree.cc (cxx_gnu_attribute_table): Add extra braces to work
around PR 16333 in older compilers.

gcc/d/
* d-attribs.cc (d_langhook_common_attribute_table): Add extra braces
to work around PR 16333 in older compilers.
(d_langhook_gnu_attribute_table): Likewise.

gcc/fortran/
* f95-lang.cc (gfc_gnu_attribute_table): Add extra braces to work
around PR 16333 in older compilers.

gcc/jit/
* dummy-frontend.cc (jit_gnu_attribute_table): Add extra braces
to work around PR 16333 in older compilers.
(jit_format_attribute_table): Likewise.

gcc/lto/
* lto-lang.cc (lto_gnu_attribute_table): Add extra braces to work
around PR 16333 in older compilers.
(lto_format_attribute_table): Likewise.
---
 gcc/ada/gcc-interface/utils.cc  | 2 +-
 gcc/attribs.cc  | 2 +-
 gcc/c-family/c-attribs.cc   | 4 ++--
 gcc/c/c-decl.cc | 2 +-
 gcc/config/arm/arm.cc   | 2 +-
 gcc/config/i386/i386-options.cc | 2 +-
 gcc/config/ia64/ia64.cc | 2 +-
 gcc/config/rs6000/rs6000.cc | 2 +-
 gcc/cp/tree.cc  | 7 +--
 gcc/d/d-attribs.cc   

Re: [PATCH v2 5/5] aarch64: Add function multiversioning support

2023-12-04 Thread Andrew Carlotti
On Fri, Nov 24, 2023 at 04:22:54PM +, Richard Sandiford wrote:
> Andrew Carlotti  writes:
> > This adds initial support for function multiversioning on aarch64 using
> > the target_version and target_clones attributes.  This loosely follows
> > the Beta specification in the ACLE [1], although with some differences
> > that still need to be resolved (possibly as follow-up patches).
> >
> > Existing function multiversioning implementations are broken in various
> > ways when used across translation units.  This includes placing
> > > resolvers in the wrong translation units, and using symbol mangling that
> > > allows callers to unintentionally bypass the resolver in some circumstances.
> > Fixing these issues for aarch64 will require modifications to our ACLE
> > specification.  It will also require further adjustments to existing
> > middle end code, to facilitate different mangling and resolver
> > placement while preserving existing target behaviours.
> >
> > The list of function multiversioning features specified in the ACLE is
> > also inconsistent with the list of features supported in target option
> > extensions.  I intend to resolve some or all of these inconsistencies at
> > a later stage.
> >
> > The target_version attribute is currently only supported in C++, since
> > this is the only frontend with existing support for multiversioning
> > using the target attribute.  On the other hand, this patch happens to
> > enable multiversioning with the target_clones attribute in Ada and D, as
> > well as the entire C family, using their existing frontend support.
> >
> > This patch also does not support the following aspects of the Beta
> > specification:
> >
> > - The target_clones attribute should allow an implicit unlisted
> >   "default" version.
> > - There should be an option to disable function multiversioning at
> >   compile time.
> > - Unrecognised target names in a target_clones attribute should be
> >   ignored (with an optional warning).  This current patch raises an
> >   error instead.
> >
> > [1] 
> > https://github.com/ARM-software/acle/blob/main/main/acle.md#function-multi-versioning
> >
> > ---
> >
> > I believe the support present in this patch correctly handles function
> > multiversioning within a single translation unit for all features in the 
> > ACLE
> > specification with option extension support.
> >
> > Is it ok to push this patch in its current state? I'd then continue working 
> > on
> > incremental improvements to the supported feature extensions and the ABI 
> > issues
> > > in followup patches, along with corresponding changes and improvements to
> > the ACLE specification.
> >
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-feature-deps.h (fmv_deps_):
> > > Define aarch64_feature_flags mask for each FMV feature.
> > * config/aarch64/aarch64-option-extensions.def: Use new macros
> > to define FMV feature extensions.
> > * config/aarch64/aarch64.cc (aarch64_option_valid_attribute_p):
> > Check for target_version attribute after processing target
> > attribute.
> > (aarch64_fmv_feature_data): New.
> > (aarch64_parse_fmv_features): New.
> > (aarch64_process_target_version_attr): New.
> > (aarch64_option_valid_version_attribute_p): New.
> > (get_feature_mask_for_version): New.
> > (compare_feature_masks): New.
> > (aarch64_compare_version_priority): New.
> > (build_ifunc_arg_type): New.
> > (make_resolver_func): New.
> > (add_condition_to_bb): New.
> > (compare_feature_version_info): New.
> > (dispatch_function_versions): New.
> > (aarch64_generate_version_dispatcher_body): New.
> > (aarch64_get_function_versions_dispatcher): New.
> > (aarch64_common_function_versions): New.
> > (aarch64_mangle_decl_assembler_name): New.
> > (TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P): New implementation.
> > (TARGET_OPTION_EXPANDED_CLONES_ATTRIBUTE): New implementation.
> > (TARGET_OPTION_FUNCTION_VERSIONS): New implementation.
> > (TARGET_COMPARE_VERSION_PRIORITY): New implementation.
> > (TARGET_GENERATE_VERSION_DISPATCHER_BODY): New implementation.
> > (TARGET_GET_FUNCTION_VERSIONS_DISPATCHER): New implementation.
> > (TARGET_MANGLE_DECL_ASSEMBLER_NAME): New implementation.
> > * config/arm/aarch-common.h (enum aarch_parse_opt_result): Add
> >   new value to report duplicate FMV feature.
> > * common/config/aarch64/cpuinfo.h: New file.
> >
> > libgcc/ChangeLog:
> >
> > * config/aarch64/cpuinfo.c (enum CPUFeatures): Move to shared
> >   copy in gcc/common
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/options_set_17.c: Reorder expected flags.
> > * gcc.target/aarch64/cpunative/native_cpu_0.c: Ditto.
> > * gcc.target/aarch64/cpunative/native_cpu_13.c: Ditto.
> > * gcc.target/aarch64/cpunative/native_cpu_16.c: Ditto.
> > * gcc.target/aarch64/cpunative/native_cpu_17.c: Ditto.
> > * gcc.target/aarch64/cpunative/native_c

[PATCH] c/89270 - honor registered_builtin_types in type_for_size

2023-12-04 Thread Richard Biener
The following fixes the intermediate conversions inserted by
convert_to_integer when facing address-spaces and converts
to their effective [u]intptr_t when they are registered_builtin_types
by considering those also from c_common_type_for_size and not
only from c_common_type_for_mode.

Bootstrap and regtest on x86_64-unknown-linux-gnu, OK?

Thanks,
Richard.

PR c/89270
gcc/c-family/
* c-common.cc (c_common_type_for_size): Consider
registered_builtin_types.

gcc/testsuite/
* gcc.target/avr/pr89270.c: New testcase.
---
 gcc/c-family/c-common.cc   | 9 +
 gcc/testsuite/gcc.target/avr/pr89270.c | 7 +++
 2 files changed, 16 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/avr/pr89270.c

diff --git a/gcc/c-family/c-common.cc b/gcc/c-family/c-common.cc
index b2b70c99338..d175054dddb 100644
--- a/gcc/c-family/c-common.cc
+++ b/gcc/c-family/c-common.cc
@@ -2362,6 +2362,15 @@ c_common_type_for_size (unsigned int bits, int unsignedp)
 return (unsignedp ? widest_unsigned_literal_type_node
: widest_integer_literal_type_node);
 
+  for (tree t = registered_builtin_types; t; t = TREE_CHAIN (t))
+{
+  tree type = TREE_VALUE (t);
+  if (TREE_CODE (type) == INTEGER_TYPE
+ && bits == TYPE_PRECISION (type)
+ && !!unsignedp == !!TYPE_UNSIGNED (type))
+   return type;
+}
+
   if (bits <= TYPE_PRECISION (intQI_type_node))
 return unsignedp ? unsigned_intQI_type_node : intQI_type_node;
 
diff --git a/gcc/testsuite/gcc.target/avr/pr89270.c 
b/gcc/testsuite/gcc.target/avr/pr89270.c
new file mode 100644
index 000..2b6e4a8aa5b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/pr89270.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+
+void test()
+{
+  extern const unsigned char __memx __data_load_end;
+  __uint24 top=(__uint24)&__data_load_end;
+}
-- 
2.35.3


[PATCH] tree-optimization/112818 - re-instantiate vector type size check for bswap

2023-12-04 Thread Richard Biener
For __builtin_bswap vectorization we still require an equal vector
type size.  Re-instantiate that check.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR tree-optimization/112818
* tree-vect-stmts.cc (vectorizable_bswap): Check input and
output vector types have the same size.

* gcc.dg/vect/pr112818.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/pr112818.c | 34 
 gcc/tree-vect-stmts.cc   |  9 
 2 files changed, 43 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr112818.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr112818.c 
b/gcc/testsuite/gcc.dg/vect/pr112818.c
new file mode 100644
index 000..61a30a576b7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr112818.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+
+extern char tag_data[];
+struct pppoe_tag {
+unsigned short tag_type;
+unsigned short tag_len;
+};
+
+char code;
+int *add_tag_pack;
+void *add_tag_data;
+short e;
+long c, d;
+
+static int add_tag(int type, int len) {
+short a, b;
+struct pppoe_tag *tag = (struct pppoe_tag *)add_tag_pack;
+if (e + len || len < 0)
+  return 1;
+b = __builtin_bswap16(type);
+tag->tag_type = b;
+a = __builtin_bswap16(len);
+tag->tag_len = a;
+if (add_tag_data)
+  __builtin___memcpy_chk(tag_data, add_tag_data, len, c);
+return 0;
+}
+void pppoe_serv_read() {
+switch (code)
+  case 9: {
+ add_tag(2, d);
+ add_tag(0, 2);
+ }
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 067abac3917..390c8472fd6 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2976,6 +2976,15 @@ vectorizable_bswap (vec_info *vinfo,
 
   gcc_assert (ncopies >= 1);
 
+  if (TYPE_SIZE (vectype_in) != TYPE_SIZE (vectype))
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"mismatched vector sizes %T and %T\n",
+vectype_in, vectype);
+  return false;
+}
+
   tree char_vectype = get_same_sized_vectype (char_type_node, vectype_in);
   if (! char_vectype)
 return false;
-- 
2.35.3


[PATCH V2] RISC-V: Support highest-number regno overlap for widen ternary

2023-12-04 Thread Juzhe-Zhong
Consider this example:

#include "riscv_vector.h"
void
foo6 (void *in, void *out)
{
  vfloat64m8_t accum = __riscv_vle64_v_f64m8 (in, 4);
  vfloat64m4_t high_eew64 = __riscv_vget_v_f64m8_f64m4 (accum, 1);
  vint64m4_t high_eew64_i = __riscv_vreinterpret_v_f64m4_i64m4 (high_eew64);
  vint32m4_t high_eew32_i = __riscv_vreinterpret_v_i64m4_i32m4 (high_eew64_i);
  vfloat32m4_t high_eew32 = __riscv_vreinterpret_v_i32m4_f32m4 (high_eew32_i);
  vfloat64m8_t result = __riscv_vfwnmsac_vf_f64m8 (accum, 64, high_eew32, 4);
  __riscv_vse64_v_f64m8 (out, result, 4);
}

Before this patch:

foo6:   # @foo6
vsetivli zero, 4, e32, m4, ta, ma
vle64.v v8, (a0)
lui a0, 272384
fmv.w.x fa5, a0
vmv8r.v v16, v8
vfwnmsac.vf v16, fa5, v12
vse64.v v16, (a1)
ret

After this patch:

foo6:
.LFB5:
.cfi_startproc
lui a5,%hi(.LC0)
flw fa5,%lo(.LC0)(a5)
vsetivli zero,4,e32,m4,ta,ma
vle64.v v8,0(a0)
vfwnmsac.vf v8,fa5,v12
vse64.v v8,0(a1)
ret

PR target/112431

gcc/ChangeLog:

* config/riscv/vector.md: Add highest-number overlap support.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/pr112431-37.c: New test.
* gcc.target/riscv/rvv/base/pr112431-38.c: New test.

---
 gcc/config/riscv/vector.md| 115 +-
 .../gcc.target/riscv/rvv/base/pr112431-37.c   | 103 
 .../gcc.target/riscv/rvv/base/pr112431-38.c   |  82 +
 3 files changed, 245 insertions(+), 55 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-37.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-38.c

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 72cf3553e45..ee222980bed 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -5866,29 +5866,30 @@
(set_attr "mode" "")])
 
 (define_insn "@pred_widen_mul_plus_scalar"
-  [(set (match_operand:VWEXTI 0 "register_operand""=&vr")
+  [(set (match_operand:VWEXTI 0 "register_operand"   "=vd, vr, 
vd, vr, vd, vr, ?&vr")
(if_then_else:VWEXTI
  (unspec:
-   [(match_operand: 1 "vector_mask_operand" "vmWc1")
-(match_operand 5 "vector_length_operand""   rK")
-(match_operand 6 "const_int_operand""i")
-(match_operand 7 "const_int_operand""i")
-(match_operand 8 "const_int_operand""i")
+   [(match_operand: 1 "vector_mask_operand" " vm,Wc1, 
vm,Wc1, vm,Wc1,vmWc1")
+(match_operand 5 "vector_length_operand"" rK, rK, 
rK, rK, rK, rK,   rK")
+(match_operand 6 "const_int_operand""  i,  i,  
i,  i,  i,  i,i")
+(match_operand 7 "const_int_operand""  i,  i,  
i,  i,  i,  i,i")
+(match_operand 8 "const_int_operand""  i,  i,  
i,  i,  i,  i,i")
 (reg:SI VL_REGNUM)
 (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
  (plus:VWEXTI
(mult:VWEXTI
  (any_extend:VWEXTI
(vec_duplicate:
- (match_operand: 3 "register_operand"   "r")))
+ (match_operand: 3 "reg_or_0_operand"   " rJ, rJ, 
rJ, rJ, rJ, rJ,   rJ")))
  (any_extend:VWEXTI
-   (match_operand: 4 "register_operand" "   vr")))
-   (match_operand:VWEXTI 2 "register_operand"   "0"))
+   (match_operand: 4 "register_operand" 
"W21,W21,W42,W42,W84,W84,   vr")))
+   (match_operand:VWEXTI 2 "register_operand"   "  0,  0,  
0,  0,  0,  0,0"))
  (match_dup 2)))]
   "TARGET_VECTOR"
-  "vwmacc.vx\t%0,%3,%4%p1"
+  "vwmacc.vx\t%0,%z3,%4%p1"
   [(set_attr "type" "viwmuladd")
-   (set_attr "mode" "")])
+   (set_attr "mode" "")
+   (set_attr "group_overlap" "W21,W21,W42,W42,W84,W84,none")])
 
 (define_insn "@pred_widen_mul_plussu"
   [(set (match_operand:VWEXTI 0 "register_operand""=&vr")
@@ -5915,54 +5916,56 @@
(set_attr "mode" "")])
 
 (define_insn "@pred_widen_mul_plussu_scalar"
-  [(set (match_operand:VWEXTI 0 "register_operand""=&vr")
+  [(set (match_operand:VWEXTI 0 "register_operand""=vd, 
vr, vd, vr, vd, vr, ?&vr")
(if_then_else:VWEXTI
  (unspec:
-   [(match_operand: 1 "vector_mask_operand" "vmWc1")
-(match_operand 5 "vector_length_operand""   rK")
-(match_operand 6 "const_int_operand""i")
-(match_operand 7 "const_int_operand""i")
-(match_operand 8 "const_int_operand"

Re: [PATCH V2] RISC-V: Support highest-number regno overlap for widen ternary

2023-12-04 Thread Robin Dapp
LGTM.

Regards
 Robin



Re: Re: [PATCH] RISC-V: Support highest-number regno overlap for widen ternary vx instructions

2023-12-04 Thread 钟居哲
I adapted the patch in V2 to write the constraints explicitly in the pattern:
[V2] RISC-V: Support highest-number regno overlap for widen ternary - Patchwork 
(sourceware.org)

Thanks.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-12-04 20:13
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Support highest-number regno overlap for widen 
ternary vx instructions
> +(define_mode_attr widen_ternop_dest_constraint [
> +  (RVVM8QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVM4QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVM2QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVM1QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVMF2QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVMF4QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVMF8QI "=vd, vr, vd, vr, vd, vr, ?&vr")
> +  (RVVM8HI "=vd, vr, vd, vr, vd, vr, ?&vr")
[...]
 
I'm fine with avoiding the overlap but I'm not sure this is
easily maintainable because the constraints don't actually
depend on the mode?  I suppose this is for easy re-use across
different insns but there are only six(?) widening patterns
so we don't even save lines of code by this?
 
I guess I would prefer the normal approach of writing it out
explicitly in the pattern.  Maybe add a different replacement
method like define_subst in the future to simplify such
situations?
 
 
+  "vwmacc.vx\t%0,%z3,%4%p1"
 
Why the z here?  For canonicalization?
 
Regards
Robin
 
 


Re: [PATCH] RISC-V: Fix overlap group incorrect overlap on v0

2023-12-04 Thread Robin Dapp
I'd suggest the same thing as in the other patch, i.e. not having
the large number of identical lines in the iterator.  That's just
my opinion, though.  Rest LGTM. 

Regards
 Robin


[Committed V2] RISC-V: Fix overlap group incorrect overlap on v0

2023-12-04 Thread Juzhe-Zhong
In a case with seriously high register pressure (appended in this patch):

We see vluxei8.v   v0,(s1),v1,v0.t which is not allowed.
Since according to RVV ISA:

+;; The destination vector register group for a masked vector instruction 
cannot overlap the source mask register (v0),
+;; unless the destination vector register is being written with a mask value 
(e.g., compares) or the scalar result of a reduction.

Such a case currently has no spills; however, we expect the data in such a case to be 
spilled and reloaded.

The root cause is that I made a mistake in the previous patch when matching the dest operand 
and mask operand constraints:

dest: "=vr"
mask: "vmWc1"

After this patch:

dest: "vd,vr"
mask: "vm,Wc1"

This makes the EEW widening patterns the same as the other instruction patterns.

PR target/112431

gcc/ChangeLog:

* config/riscv/vector.md: Fix incorrect overlap in v0.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/pr112431-34.c: New test.

---
 gcc/config/riscv/vector.md| 268 +-
 .../gcc.target/riscv/rvv/base/pr112431-34.c   | 101 +++
 2 files changed, 235 insertions(+), 134 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-34.c

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index cec1edc8190..ba0714a9971 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -2223,70 +2223,70 @@
 
 ;; DEST eew is greater than SOURCE eew.
 (define_insn "@pred_indexed_load_x2_greater_eew"
-  [(set (match_operand:VEEWEXT2 0 "register_operand" "=vr, 
  vr,   vr,   vr,   vr,   vr, ?&vr, ?&vr")
+  [(set (match_operand:VEEWEXT2 0 "register_operand"   "=vd, 
vr, vd, vr, vd, vr, vd, vr, vd, vr, vd, vr, ?&vr, ?&vr")
(if_then_else:VEEWEXT2
  (unspec:
-   [(match_operand: 1 "vector_mask_operand"   
"vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
-(match_operand 5 "vector_length_operand"  "   rK,  
 rK,   rK,   rK,   rK,   rK,   rK,   rK")
-(match_operand 6 "const_int_operand"  "i,  
  i,i,i,i,i,i,i")
-(match_operand 7 "const_int_operand"  "i,  
  i,i,i,i,i,i,i")
-(match_operand 8 "const_int_operand"  "i,  
  i,i,i,i,i,i,i")
+   [(match_operand: 1 "vector_mask_operand"   " 
vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1, vm,Wc1,vmWc1,vmWc1")
+(match_operand 5 "vector_length_operand"  " rK, 
rK, rK, rK, rK, rK, rK, rK, rK, rK, rK, rK,   rK,   rK")
+(match_operand 6 "const_int_operand"  "i,  i,  
i,  i,  i,  i,  i,  i,  i,  i,  i,  i,i,i")
+(match_operand 7 "const_int_operand"  "i,  i,  
i,  i,  i,  i,  i,  i,  i,  i,  i,  i,i,i")
+(match_operand 8 "const_int_operand"  "i,  i,  
i,  i,  i,  i,  i,  i,  i,  i,  i,  i,i,i")
 (reg:SI VL_REGNUM)
 (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
  (unspec:VEEWEXT2
-   [(match_operand 3 "pmode_reg_or_0_operand" "   rJ,  
 rJ,   rJ,   rJ,   rJ,   rJ,   rJ,   rJ")
+   [(match_operand 3 "pmode_reg_or_0_operand" " rJ, 
rJ, rJ, rJ, rJ, rJ, rJ, rJ, rJ, rJ, rJ, rJ,   rJ,   rJ")
 (mem:BLK (scratch))
-(match_operand: 4 "register_operand" "  W21,  
W21,  W42,  W42,  W84,  W84,   vr,   vr")] ORDER)
- (match_operand:VEEWEXT2 2 "vector_merge_operand" "   vu,  
  0,   vu,0,   vu,0,   vu,0")))]
+(match_operand: 4 "register_operand" 
"W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,   vr,   vr")] ORDER)
+ (match_operand:VEEWEXT2 2 "vector_merge_operand" " vu, 
vu,  0,  0, vu, vu,  0,  0, vu, vu,  0,  0,   vu,0")))]
   "TARGET_VECTOR"
   "vlxei.v\t%0,(%z3),%4%p1"
   [(set_attr "type" "vldx")
(set_attr "mode" "")
-   (set_attr "group_overlap" "W21,W21,W42,W42,W84,W84,none,none")])
+   (set_attr "group_overlap" 
"W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")])
 
 (define_insn "@pred_indexed_load_x4_greater_eew"
-  [(set (match_operand:VEEWEXT4 0 "register_operand""=vr,  
  vr,   vr,   vr, ?&vr, ?&vr")
+  [(set (match_operand:VEEWEXT4 0 "register_operand"   "=vd, 
vr, vd, vr, vd, vr, vd, vr, ?&vr, ?&vr")
(if_then_else:VEEWEXT4
  (unspec:
-   [(match_operand: 1 "vector_mask_operand"   
"vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
-(match_operand 5 "vector_length_operand"  "   rK,  
 rK,   rK,   rK,   rK,   rK")
-(match_operand 6 "const_int_operand"  "i,  
  i,i,i,i,i")
-(match_operand 7 "const_int_operand"  

Re: [PATCH] Don't vectorize when vector stmts are only vec_contruct and stores

2023-12-04 Thread Richard Biener
On Mon, Dec 4, 2023 at 6:32 AM liuhongt  wrote:
>
> .i.e. for below cases.
>a[0] = b1;
>a[1] = b2;
>..
>a[n] = bn;
>
> There are extra dependences when constructing the vector, but not for
> scalar stores. According to experiments, it's generally worse.
>
> The patch adds a cut-off heuristic when vec_stmt is just
> vec_construct and vector store. It improves SPEC2017 a little bit.
>
> BenchMarks  Ratio
> 500.perlbench_r 2.60%
> 502.gcc_r   0.30%
> 505.mcf_r   0.40%
> 520.omnetpp_r   -1.00%
> 523.xalancbmk_r 0.90%
> 525.x264_r  0.00%
> 531.deepsjeng_r 0.30%
> 541.leela_r 0.90%
> 548.exchange2_r 3.20%
> 557.xz_r1.40%
> 503.bwaves_r0.00%
> 507.cactuBSSN_r 0.00%
> 508.namd_r  0.30%
> 510.parest_r0.00%
> 511.povray_r0.20%
> 519.lbm_r   SAME BIN
> 521.wrf_r   -0.30%
> 526.blender_r   -1.20%
> 527.cam4_r  -0.20%
> 538.imagick_r   4.00%
> 544.nab_r   0.40%
> 549.fotonik3d_r 0.00%
> 554.roms_r  0.00%
> Geomean-int 0.90%
> Geomean-fp  0.30%
> Geomean-all 0.50%
>
> And
> Regressed testcases:
>
> gcc.target/i386/part-vect-absneghf.c
> gcc.target/i386/part-vect-copysignhf.c
> gcc.target/i386/part-vect-xorsignhf.c
>
> Regressed under -m32 since it generates 2 vector
> .ABS/NEG/XORSIGN/COPYSIGN vs original 1 64-bit vec_construct. The
> original testcases are used to test vectorization capability for
> .ABS/NEG/XORSIGN/COPYSIGN, so just restrict testcase to TARGET_64BIT.
>
> gcc.target/i386/pr111023-2.c
> gcc.target/i386/pr111023.c
> Regressed under -m32
>
> testcase as below
>
> void
> v8hi_v8qi (v8hi *dst, v16qi src)
> {
>   short tem[8];
>   tem[0] = src[0];
>   tem[1] = src[1];
>   tem[2] = src[2];
>   tem[3] = src[3];
>   tem[4] = src[4];
>   tem[5] = src[5];
>   tem[6] = src[6];
>   tem[7] = src[7];
>   dst[0] = *(v8hi *) tem;
> }
>
> under 64-bit target, the vectorizer realizes it's just a permutation of
> the original src vector, but under -m32, the vectorizer relies on
> vec_construct for vectorization. I think optimization for this case
> under 32-bit target may not impact much, so just add
> -fno-vect-cost-model.
>
> gcc.target/i386/pr91446.c: This testcase is guard for cost model of
> vector store, not vectorization capability, so just adjust testcase.
>
> gcc.target/i386/pr108938-3.c: This testcase relies on vec_construct to
> optimize for bswap, like other optimization vectorizer can't realize
> optimization after it. So the current solution is to add
> -fno-vect-cost-model to the testcase.
>
> costmodel-pr104582-1.c
> costmodel-pr104582-2.c
> costmodel-pr104582-4.c
>
> Failed since it's now not vectorized, looked at the PR, it's exactly
> what's wanted, so adjust testcase to scan-tree-dump-not.
>
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

So the original motivation to not more aggressively prune
store-from-CTOR vectorization in the vectorizer itself is that
the vector store is possibly better for STLF (larger stores are
good, larger loads eventually problematic).

I'd also expect the costs to play out to not make those profitable.

OTOH, if you have a series of 'double' stores you can convert to
a series of V2DF stores you _may_ be faster if this reduces
pressure on the store unit.  Esp. V2DF is cheap to construct
with one movhpd.

So I don't think we want to try to pattern match it this way?

In fact the SLP vectorization cases could all arrive with an
SLP node specified (vectorizable_store would have to be
changed here), which means you could check for an
vect_external_def child instead?

But as said, I would hope that we can arrive at a better way
assessing the CONSTRUCTOR cost.  IMHO one big issue
is that load and store cost are comparatively high compared
to simple stmt ops so it's very hard to offset saving many
stores with "ops".  That's because we generally think of
'cost' to model latency but as you say stores don't really
have latency - we only have store bandwidth of the store
unit and of course issue width (but that's true for other ops
as well).  I wonder what happens if we set both scalar and
vector store cost to zero?  Or maybe one (to count one
issue slot)?

Richard.


> gcc/ChangeLog:
>
> PR target/99881
> PR target/104582
> * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
> Check if kind is vec_construct or vector store.
> (ix86_vector_costs::finish_cost): Don't do vectorization when
> vector stmts are only vec_construct and stores.
> (ix86_vector_costs::ix86_vect_construct_store_only_p): New
> function.
> (ix86_vector_costs::ix86_vect_cut_off): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/part-vect-absneghf.c: Restrict testcase to
> TARGET_64BIT.
> * gcc.targ

Re: [PATCH] expmed: Perform mask extraction via QImode [PR112773].

2023-12-04 Thread Richard Biener
On Mon, 4 Dec 2023, Robin Dapp wrote:

> Hi,
> 
> this changes the vec_extract path of extract_bit_field to use QImode
> instead of BImode when extracting from mask vectors and changes
> GET_MODE_BITSIZE to GET_MODE_PRECISION.  This fixes an ICE on riscv
> where we did not find a vec_extract optab and continued with the generic
> code that requires 1-byte alignment that riscv mask modes do not
> provide.  Using QImode extraction makes this piece of code
> behave similarly as vectorizable_live_operation where we create
> a VEC_EXTRACT whose extraction mode expand_convert_optab_fn converts
> from  to QImode by TYPE_MODE.
> 
> Apart from that it adds poly_int support to riscv's vec_extract
> expander and makes the RVV..BImode -> QImode expander call
> emit_vec_extract in order to not duplicate code.
> 
> Bootstrapped and regtested on aarch64 and x86.  Regtested on
> riscv64, still running on riscv32.
> 
> Regards
>  Robin
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/riscv/rvv/autovec/partial/pr112773.c: New test.
> ---
>  gcc/config/riscv/autovec.md   | 35 ++-
>  gcc/config/riscv/riscv-protos.h   |  3 +-
>  gcc/config/riscv/riscv-v.cc   | 14 
>  gcc/config/riscv/riscv.cc |  6 ++--
>  gcc/expmed.cc | 24 -
>  .../riscv/rvv/autovec/partial/pr112773.c  | 20 +++
>  6 files changed, 68 insertions(+), 34 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/pr112773.c
> 
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index 2d727c2609b..3c4d68367f0 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -1380,12 +1380,23 @@ (define_expand "vec_extract"
>rtx tmp = NULL_RTX;
>if (operands[2] != const0_rtx)
>  {
> -  /* Emit the slide down to index 0 in a new vector.  */
> -  tmp = gen_reg_rtx (mode);
> -  operands[2] = gen_lowpart (Pmode, operands[2]);
> -  rtx ops[] = {tmp, operands[1], operands[2]};
> -  riscv_vector::emit_vlmax_insn
> - (code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode), 
> riscv_vector::BINARY_OP, ops);
> +  /* Properly convert a poly_int value and put the result into a
> +  register.  */
> +  if (CONST_POLY_INT_P (operands[2]))
> + {
> +   rtx pos = gen_reg_rtx (Pmode);
> +   riscv_legitimize_poly_move (Pmode, pos, gen_reg_rtx (Pmode),
> +   operands[2]);
> +   operands[2] = pos;
> + }
> +
> +/* Emit the slide down to index 0 in a new vector.  */
> +tmp = gen_reg_rtx (mode);
> +operands[2] = gen_lowpart (Pmode, operands[2]);
> +rtx ops[] = {tmp, operands[1], operands[2]};
> +riscv_vector::emit_vlmax_insn
> +  (code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode),
> +   riscv_vector::BINARY_OP, ops);
>  }
>  
>/* Emit v(f)mv.[xf].s.  */
> @@ -1417,16 +1428,8 @@ (define_expand "vec_extractqi"
>riscv_vector::emit_vlmax_insn (code_for_pred_merge (qimode),
>riscv_vector::MERGE_OP, ops1);
>  
> -  /* Slide down the requested byte element.  */
> -  rtx tmp2 = gen_reg_rtx (qimode);
> -
> -  rtx ops2[] = {tmp2, tmp1, operands[2]};
> -  riscv_vector::emit_vlmax_insn
> -(code_for_pred_slide (UNSPEC_VSLIDEDOWN, qimode),
> - riscv_vector::BINARY_OP, ops2);
> -
> -  /* Extract it.  */
> -  emit_insn (gen_pred_extract_first (qimode, operands[0], tmp2));
> +  /* Extract from it.  */
> +  riscv_vector::emit_vec_extract (operands[0], tmp1, operands[2]);
>DONE;
>  })
>  
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 695ee24ad6f..c02de84d6ef 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -129,6 +129,7 @@ extern void riscv_asm_output_alias (FILE *, const tree, 
> const tree);
>  extern void riscv_asm_output_external (FILE *, const tree, const char *);
>  extern bool
>  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> +extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
>  
>  #ifdef RTX_CODE
>  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool 
> *invert_ptr = 0);
> @@ -558,7 +559,7 @@ void expand_cond_binop (unsigned, rtx *);
>  void expand_cond_ternop (unsigned, rtx *);
>  void expand_popcount (rtx *);
>  void expand_rawmemchr (machine_mode, rtx, rtx, rtx);
> -void emit_vec_extract (rtx, rtx, poly_int64);
> +void emit_vec_extract (rtx, rtx, rtx);
>  
>  /* Rounding mode bitfield for fixed point VXRM.  */
>  enum fixed_point_rounding_mode
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index 588c127343e..430aae3dc69 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -3253,7 +3253,7 @@ shuffle_extract_and_slide1up_patterns (struct 
> expand_vec_perm_d *d)
>/* Extract the last element of the first vector.  */
>

Re: [PATCH 3/3] MATCH: (convert)(zero_one !=/== 0/1) for outer type and zero_one type are the same

2023-12-04 Thread Richard Biener
On Sat, Dec 2, 2023 at 7:38 AM Andrew Pinski  wrote:
>
> When I moved two_value to match.pd, I removed the check for the {0,+-1}
> as I had placed it after the {0,+-1} case for cond in match.pd.
> In the case of {0,+-1} and non boolean, before we would optimize those
> cases to just `(convert)a` but after we would get `(convert)(a != 0)`
> which was not handled anyways to just `(convert)a`.
> So this adds a pattern to match `(convert)(zeroone != 0)` and simplify
> to `(convert)zeroone`.
>
> Also this optimizes (convert)(zeroone == 0) into (zeroone^1) if the
> type match. This can only be done on the gimple level as if zeroone
> was defined by (a&1), fold will convert (a&1)^1 back into
> `(convert)(zeroone == 0)` and an infinite loop will happen.

So fold converts (a&1)^1 to (convert)(a&1 == 0)?  Can we fix (remove)
this instead or do we rely on that?

> Note the testcase pr69270.c needed a slight update due to not matching
> exactly a scan pattern, this update makes it more robust and will match
> before and afterwards and if there are other changes in this area too.
>
> Note the testcase gcc.target/i386/pr110790-2.c needs a slight update
> for better code generation in LP64 bit mode.
>
> Bootstrapped and tested on x86_64-linux-gnu with no regressions.

Otherwise OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
> PR tree-optimization/111972
> PR tree-optimization/110637
> * match.pd (`(convert)(zeroone !=/== CST)`): Match
> and simplify to ((convert)zeroone){,^1}.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/pr110637-1.c: New test.
> * gcc.dg/tree-ssa/pr110637-2.c: New test.
> * gcc.dg/tree-ssa/pr110637-3.c: New test.
> * gcc.dg/tree-ssa/pr111972-1.c: New test.
> * gcc.dg/tree-ssa/pr69270.c: Update testcase.
> * gcc.target/i386/pr110790-2.c: Update testcase.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/match.pd   | 21 +
>  gcc/testsuite/gcc.dg/tree-ssa/pr110637-1.c | 10 +++
>  gcc/testsuite/gcc.dg/tree-ssa/pr110637-2.c | 13 +
>  gcc/testsuite/gcc.dg/tree-ssa/pr110637-3.c | 14 +
>  gcc/testsuite/gcc.dg/tree-ssa/pr111972-1.c | 34 ++
>  gcc/testsuite/gcc.dg/tree-ssa/pr69270.c|  4 +--
>  gcc/testsuite/gcc.target/i386/pr110790-2.c | 16 --
>  7 files changed, 108 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr110637-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr110637-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr110637-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr111972-1.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 4d554ba4721..656b2c9edda 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3332,6 +3332,27 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>(if (INTEGRAL_TYPE_P (TREE_TYPE (@0)) || POINTER_TYPE_P (TREE_TYPE (@0)))
>  (rcmp @0 @1
>
> +/* (type)([0,1]@a != 0) -> (type)a
> +   (type)([0,1]@a == 1) -> (type)a
> +   (type)([0,1]@a == 0) -> a ^ 1
> +   (type)([0,1]@a != 1) -> a ^ 1.  */
> +(for eqne (eq ne)
> + (simplify
> +  (convert (eqne zero_one_valued_p@0 INTEGER_CST@1))
> +  (if ((integer_zerop (@1) || integer_onep (@1)))
> +   (if ((eqne == EQ_EXPR) ^ integer_zerop (@1))
> +(convert @0)
> +   /* a^1 can only be produced for gimple as
> +  fold has the exact opposite transformation
> +  for `(X & 1) ^ 1`.
> +  See `Fold ~X & 1 as (X & 1) == 0.`
> +  and `Fold (X ^ 1) & 1 as (X & 1) == 0.` in fold-const.cc.
> +  Only do this if the types match as (type)(a == 0) is
> +  canonical form normally, while `a ^ 1` is canonical when
> +  there is no type change. */
> +   (if (GIMPLE && types_match (type, TREE_TYPE (@0)))
> +(bit_xor @0 { build_one_cst (type); } ))
> +
>  /* We can't reassociate at all for saturating types.  */
>  (if (!TYPE_SATURATING (type))
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr110637-1.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr110637-1.c
> new file mode 100644
> index 000..3d03b0992a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr110637-1.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fdump-tree-optimized" } */
> +int f(int a)
> +{
> +int b = (a & 1)!=0;
> +return b;
> +}
> +
> +/* This should be optimized to just return (a & 1); */
> +/* { dg-final { scan-tree-dump-not " == " "optimized"} } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr110637-2.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr110637-2.c
> new file mode 100644
> index 000..f1c5b90353a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr110637-2.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fdump-tree-optimized" } */
> +int f(int a)
> +{
> +int b = a & 1;
> +int c = b == 0;
> +return c;
> +}
> +
> +/* This should be optimized to just return `(a&1) ^ 1` or `(~a) &

Re: [PATCH 1/3] MATCH: Fix zero_one_valued_p's convert pattern

2023-12-04 Thread Richard Biener
On Sat, Dec 2, 2023 at 7:38 AM Andrew Pinski  wrote:
>
> While working on PR 111972, I was getting a regression
> due to zero_one_valued_p matching a signed 1 bit integer
> when it came to convert. This patch fixes that by checking
> the outer type too.
>
> Bootstrapped and tested on x86_64-linux-gnu with no regressions.

OK

> gcc/ChangeLog:
>
> * match.pd (zero_one_valued_p): For convert
> make sure type is not a signed 1-bit integer.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/match.pd | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 26383e55767..4d554ba4721 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -2247,6 +2247,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
>&& (TYPE_UNSIGNED (TREE_TYPE (@1))
>   || TYPE_PRECISION (TREE_TYPE (@1)) > 1)
> +  && INTEGRAL_TYPE_P (type)
> +  && (TYPE_UNSIGNED (type)
> + || TYPE_PRECISION (type) > 1)
>&& wi::leu_p (tree_nonzero_bits (@1), 1
>
>  /* Transform { 0 or 1 } * { 0 or 1 } into { 0 or 1 } & { 0 or 1 }.  */
> --
> 2.39.3
>


[PATCH] middle-end/112785 - guard against last_clique overflow

2023-12-04 Thread Richard Biener
The PR shows that we'll ICE eventually when last_clique wraps.  The
following avoids this by refusing to hand out new cliques after
exhausting them.  We then use zero (no clique) as conservative
fallback.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR middle-end/112785
* function.h (get_new_clique): New inline function handling
last_clique overflow.
* cfgrtl.cc (duplicate_insn_chain): Use it.
* tree-cfg.cc (gimple_duplicate_bb): Likewise.
* tree-inline.cc (remap_dependence_clique): Likewise.
---
 gcc/cfgrtl.cc  |  2 +-
 gcc/function.h | 11 +++
 gcc/tree-cfg.cc|  2 +-
 gcc/tree-inline.cc |  2 +-
 4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/gcc/cfgrtl.cc b/gcc/cfgrtl.cc
index abcb472e2a2..2a3f853eed5 100644
--- a/gcc/cfgrtl.cc
+++ b/gcc/cfgrtl.cc
@@ -4385,7 +4385,7 @@ duplicate_insn_chain (rtx_insn *from, rtx_insn *to,
  {
gcc_assert
  (MR_DEPENDENCE_CLIQUE (op) <= cfun->last_clique);
-   newc = ++cfun->last_clique;
+   newc = get_new_clique (cfun);
  }
/* We cannot adjust MR_DEPENDENCE_CLIQUE in-place
   since MEM_EXPR is shared so make a copy and
diff --git a/gcc/function.h b/gcc/function.h
index 29846564bc6..833c35e3da6 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -518,6 +518,17 @@ set_loops_for_fn (struct function *fn, struct loops *loops)
   fn->x_current_loops = loops;
 }
 
+/* Get a new unique dependence clique or zero if none is left.  */
+
+inline unsigned short
+get_new_clique (function *fn)
+{
+  unsigned short clique = fn->last_clique + 1;
+  if (clique != 0)
+fn->last_clique = clique;
+  return clique;
+}
+
 /* For backward compatibility... eventually these should all go away.  */
 #define current_function_funcdef_no (cfun->funcdef_no)
 
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index a30a2de33a1..475ea5d99ef 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -6595,7 +6595,7 @@ gimple_duplicate_bb (basic_block bb, copy_bb_data *id)
if (!existed)
  {
gcc_assert (MR_DEPENDENCE_CLIQUE (op) <= cfun->last_clique);
-   newc = ++cfun->last_clique;
+   newc = get_new_clique (cfun);
  }
MR_DEPENDENCE_CLIQUE (op) = newc;
  }
diff --git a/gcc/tree-inline.cc b/gcc/tree-inline.cc
index e6d553059e3..a4fc839a22d 100644
--- a/gcc/tree-inline.cc
+++ b/gcc/tree-inline.cc
@@ -1002,7 +1002,7 @@ remap_dependence_clique (copy_body_data *id, unsigned 
short clique)
   /* Clique 1 is reserved for local ones set by PTA.  */
   if (cfun->last_clique == 0)
cfun->last_clique = 1;
-  newc = ++cfun->last_clique;
+  newc = get_new_clique (cfun);
 }
   return newc;
 }
-- 
2.35.3


[PATCH] aarch64: fix eh_return-3.c test

2023-12-04 Thread Szabolcs Nagy
gcc/testsuite/ChangeLog:

* gcc.target/aarch64/eh_return-3.c: Fix when retaa is available.
---
 gcc/testsuite/gcc.target/aarch64/eh_return-3.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/testsuite/gcc.target/aarch64/eh_return-3.c 
b/gcc/testsuite/gcc.target/aarch64/eh_return-3.c
index a17baa86501..d180fa7c455 100644
--- a/gcc/testsuite/gcc.target/aarch64/eh_return-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/eh_return-3.c
@@ -12,8 +12,12 @@
 ** cbz x4, .*
 ** add sp, sp, x5
 ** br  x6
+** (
 ** hint29 // autiasp
 ** ret
+** |
+** retaa
+** )
 ** mov x5, x0
 ** mov x4, 1
 ** mov x6, x1
-- 
2.25.1



Re: [PATCH] RISC-V: Fix two testscases related to -std changes.

2023-12-04 Thread Jeff Law




On 12/4/23 06:17, Robin Dapp wrote:

Hi,

recent -std changes caused testsuite failures.  Fix those by adding
-std=gnu99 and -Wno-incompatible-pointer-types.

Going to commit as obvious.

Regards
  Robin

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr112552.c: Add
-Wno-incompatible-pointer-types.
* gcc.target/riscv/rvv/autovec/struct/struct_vect_run-10.c:
Add -std=gnu99.

Thanks.  Seeing light fallout from this stuff is totally expected.

My tester has churned through its supported *-elf targets and all those 
have been fixed.  Now it's just a matter of waiting for the native 
emulated targets which only fire once a week.


Jeff


Re: [PATCH v3 0/3] libgomp: OpenMP low-latency omp_alloc

2023-12-04 Thread Tobias Burnus

Hi Andrew,

On 03.12.23 01:32, Andrew Stubbs wrote:

This patch series is a rework of the patch series posted in August.
https://patchwork.sourceware.org/project/gcc/list/?series=23045&state=%2A&archive=both

The series implements device-specific allocators and adds a low-latency
allocator for both GPU architectures.


As mentioned, can you also update libgomp/libgomp.texi?

I don't have a strong preference where in that file nor how it is
documented, but it seems to make sense to document as follows:

(A) Document the GCN/NVPTX specifics on the respective pages below
https://gcc.gnu.org/onlinedocs/libgomp/Offload-Target-Specifics.html

(B) To make it possible to find it, @ref'er to that page from:

And https://gcc.gnu.org/onlinedocs/libgomp/Memory-allocation.html

(May be just 'See also:' or 'For offload-device specifics to memory
allocation, see' or something like that.)

(C) Maybe, some wording should be added to OMP_ALLOCATOR that the
cgroup/pteam/thread pre-defined allocators use (implementation choice)
the low-latency memory space; for instance, add a sentence under the
first table – or use the first table 'omp_low_lat_mem_space
(implementation choice)' or some other wordings which make clear what
GCC does but that the spec does not specify this.

https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fALLOCATOR.html

Maybe we should add to OMP_ALLOCATOR also a @ref to "Offload Target
Specifics"?

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH v3 0/3] libgomp: OpenMP low-latency omp_alloc

2023-12-04 Thread Tobias Burnus

I cannot "grep" – all three patches do contain .texi changes. I have a
comment to them, but I will comment individually on them.

Hence, scratch:

On 04.12.23 16:34, Tobias Burnus wrote:

On 03.12.23 01:32, Andrew Stubbs wrote:

This patch series is a rework of the patch series posted in August.
https://patchwork.sourceware.org/project/gcc/list/?series=23045&state=%2A&archive=both


The series implements device-specific allocators and adds a low-latency
allocator for both GPU architectures.


As mentioned, can you also update libgomp/libgomp.texi?


Sorry for missing those changes.

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH v3 1/3] libgomp, nvptx: low-latency memory allocator

2023-12-04 Thread Tobias Burnus

On 03.12.23 01:32, Andrew Stubbs wrote:

This patch adds support for allocating low-latency ".shared" memory on
NVPTX GPU device, via the omp_low_lat_mem_space and omp_alloc.  The memory
can be allocated, reallocated, and freed using a basic but fast algorithm,
is thread safe and the size of the low-latency heap can be configured using
the GOMP_NVPTX_LOWLAT_POOL environment variable.

The use of the PTX dynamic_smem_size feature means that low-latency allocator
will not work with the PTX 3.1 multilib.

For now, the omp_low_lat_mem_alloc allocator also works, but that will change
when I implement the access traits.


...

LGTM, however, I wonder about the following:


diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index e5fe7af76af..39d0749e7b3 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -3012,11 +3012,14 @@ value.
  @item omp_const_mem_alloc   @tab omp_const_mem_space
  @item omp_high_bw_mem_alloc @tab omp_high_bw_mem_space
  @item omp_low_lat_mem_alloc @tab omp_low_lat_mem_space
-@item omp_cgroup_mem_alloc  @tab --
-@item omp_pteam_mem_alloc   @tab --
-@item omp_thread_mem_alloc  @tab --
+@item omp_cgroup_mem_alloc  @tab omp_low_lat_mem_space (implementation 
defined)
+@item omp_pteam_mem_alloc   @tab omp_low_lat_mem_space (implementation 
defined)
+@item omp_thread_mem_alloc  @tab omp_low_lat_mem_space (implementation 
defined)
  @end multitable

+The @code{omp_low_lat_mem_space} is only available on supported devices.
+See @ref{Offload-Target Specifics}.
+


Whether it would be clearer to have this wording not here for the OMP_ALLOCATOR 
env, i.e.
https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fALLOCATOR.html
but just a simple crossref like:

--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -3061,5 +3061,5 @@ 
OMP_ALLOCATOR=omp_low_lat_mem_space:pinned=true,partition=nearest
 @item @emph{See also}:
 @ref{Memory allocation}, @ref{omp_get_default_allocator},
-@ref{omp_set_default_allocator}
+@ref{omp_set_default_allocator}, @ref{Offload-Target Specifics}

 @item @emph{Reference}:


And add your wording to:
  https://gcc.gnu.org/onlinedocs/libgomp/Memory-allocation.html

As this section mentions that "omp_low_lat_mem_space maps to 
omp_default_mem_space" in general.
Hence, mentioning in this section in addition that  omp_low_lat_mem_space  is 
honored on devices
seems to be the better location.

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH] gcc: Disallow trampolines when -fhardened

2023-12-04 Thread Siddhesh Poyarekar

On 2023-12-02 04:42, Martin Uecker wrote:



Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?

-- >8 --
It came up that a good hardening strategy is to disable trampolines
which may require executable stack.  Therefore the following patch
adds -Werror=trampolines to -fhardened.


This would add a warning about specific code (where it is then
unclear whether rewriting it is feasible or even an improvement),
which seems different to all the other flags -fhardening has
now.


It's actually -Werror=trampolines, not just -Wtrampolines; the aim is to 
hard fail on producing trampolines and consequently, an executable 
stack.  In general the goal of -fhardened is to produce hardened code 
and the nested function trampolines do the exact reverse of that, so 
-Werror=trampolines seems to align perfectly with that goal, doesn't it?



GCC now has an option to allocate trampolines on the heap,
which would seem to be a better fit.  On the other hand,
it does not work with longjmp which may be a limitation.


For hardened code in C, I think we really should look to step away from 
nested functions instead of adding ways to continue supporting it. 
There's probably a larger conversation to be had about the utility of 
nested functions in general for C (and whether this GCC extension should 
be deprecated altogether in future), but I feel like the -fhardened 
subset gives us the opportunity to enforce at least a safe subset for 
now, possibly extending it in future.


Thanks,
Sid


Re: [PATCH] gcc: Disallow trampolines when -fhardened

2023-12-04 Thread Andreas Schwab
On Dez 04 2023, Siddhesh Poyarekar wrote:

> For hardened code in C, I think we really should look to step away from
> nested functions instead of adding ways to continue supporting it. There's
> probably a larger conversation to be had about the utility of nested
> functions in general for C (and whether this GCC extension should be
> deprecated altogether in future), but I feel like the -fhardened subset
> gives us the opportunity to enforce at least a safe subset for now,
> possibly extending it in future.

Nested functions by themselves don't need a trampoline, only if the address
of one is passed outside the containing function's scope (as a callback,
for example).

-- 
Andreas Schwab, SUSE Labs, sch...@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."


[PATCH] libstdc++: Add test for LWG Issue 3897

2023-12-04 Thread Will Hawkins
Hello!

Thank you, as always, for the great work that you do on libstdc++. The
inout_ptr implementation properly handles the issue raised in LWG 3897
but it seems like having an explicit test might be a good idea.

I hope that this helps!
Will

-- >8 --

Add a test to verify that the implementation of inout_ptr is not
vulnerable to LWG Issue 3897.

libstdc++-v3/ChangeLog:

* testsuite/20_util/smartptr.adapt/inout_ptr/3.cc: New test
for LWG Issue 3897.

Signed-off-by: Will Hawkins 
---
 .../20_util/smartptr.adapt/inout_ptr/3.cc   | 17 +
 1 file changed, 17 insertions(+)
 create mode 100644 libstdc++-v3/testsuite/20_util/smartptr.adapt/inout_ptr/3.cc

diff --git a/libstdc++-v3/testsuite/20_util/smartptr.adapt/inout_ptr/3.cc 
b/libstdc++-v3/testsuite/20_util/smartptr.adapt/inout_ptr/3.cc
new file mode 100644
index 000..f9114dc57b5
--- /dev/null
+++ b/libstdc++-v3/testsuite/20_util/smartptr.adapt/inout_ptr/3.cc
@@ -0,0 +1,17 @@
+// { dg-do run { target c++23 } }
+
+#include 
+#include 
+
+// C++23 [inout.ptr.t] Class template inout_ptr_t
+// Verify that implementation handles LWG Issue 3897
+void nuller(int **p) {
+  *p = nullptr;
+}
+
+int main(int, char **) {
+  int *i = new int{5};
+  nuller(std::inout_ptr(i));
+
+  VERIFY(i == nullptr);
+}
-- 
2.41.0



Re: [PATCH] gcc: Disallow trampolines when -fhardened

2023-12-04 Thread Jakub Jelinek
On Mon, Dec 04, 2023 at 05:39:04PM +0100, Andreas Schwab wrote:
> On Dez 04 2023, Siddhesh Poyarekar wrote:
> 
> > For hardened code in C, I think we really should look to step away from
> > nested functions instead of adding ways to continue supporting it. There's
> > probably a larger conversation to be had about the utility of nested
> > functions in general for C (and whether this GCC extension should be
> > deprecated altogether in future), but I feel like the -fhardened subset
> > gives us the opportunity to enforce at least a safe subset for now,
> > possibly extending it in future.
> 
> Nested functions by itself don't need a trampoline, only if the address
> of it is passed outside the containing function's scope (as a callback,
> for example).

And only if the code to which it is passed can't be inlined back.

I'm afraid contained functions in Fortran or in Ada (whatever it is called
there) aren't going away any time soon and having the possibility to test it
also in C and not just Fortran/Ada is very useful at least from compiler
testing POV.

Jakub



Re: [PATCH] gcc: Disallow trampolines when -fhardened

2023-12-04 Thread Siddhesh Poyarekar

On 2023-12-04 11:39, Andreas Schwab wrote:

On Dez 04 2023, Siddhesh Poyarekar wrote:


For hardened code in C, I think we really should look to step away from
nested functions instead of adding ways to continue supporting it. There's
probably a larger conversation to be had about the utility of nested
functions in general for C (and whether this GCC extension should be
deprecated altogether in future), but I feel like the -fhardened subset
gives us the opportunity to enforce at least a safe subset for now,
possibly extending it in future.


Nested functions by itself don't need a trampoline, only if the address
of it is passed outside the containing function's scope (as a callback,
for example).


Yes, that's why I said that the conversation about deprecating the C 
nested functions extension is a broader one (and hence for gcc 15) that 
will likely involve the question of whether dropping the extension 
altogether gives any benefit or if dropping support for on-stack 
trampolines is sufficient.  On-heap trampolines are maybe slightly 
better in that they don't need an executable stack, but defaulting to 
on-heap trampolines for -fhardened seems like a lost opportunity to 
enforce better user code.


Thanks,
Sid


Re: [PATCH] libiberty: Fix pex_unix_wait return type

2023-12-04 Thread Richard Sandiford
Rainer Orth  writes:
> The recent warning patches broke Solaris bootstrap:
>
> /vol/gcc/src/hg/master/local/libiberty/pex-unix.c:326:3: error: 
> initialization of 'pid_t (*)(struct pex_obj *, pid_t,  int *, struct pex_time 
> *, int,  const char **, int *)' {aka 'long int (*)(struct pex_obj *, long 
> int,  int *, struct pex_time *, int,  const char **, int *)'} from 
> incompatible pointer type 'int (*)(struct pex_obj *, pid_t,  int *, struct 
> pex_time *, int,  const char **, int *)' {aka 'int (*)(struct pex_obj *, long 
> int,  int *, struct pex_time *, int,  const char **, int *)'} 
> [-Wincompatible-pointer-types]
>   326 |   pex_unix_wait,
>   |   ^
> /vol/gcc/src/hg/master/local/libiberty/pex-unix.c:326:3: note: (near 
> initialization for 'funcs.wait')
>
> While pex_funcs.wait expects a function returning pid_t, pex_unix_wait
> currently returns int.  However, on Solaris pid_t is long for 32-bit,
> but int for 64-bit.
>
> This patch fixes this by having pex_unix_wait return pid_t as
> expected, and like every other variant already does.
>
> Bootstrapped without regressions on i386-pc-solaris2.11,
> sparc-sun-solaris2.11, x86_64-pc-linux-gnu, and
> x86_64-apple-darwin23.1.0.
>
> Ok for trunk?
>
>   Rainer

OK, thanks.

Richard


Re: [PATCH] gm2: Fix mc/mc.flex compilation on Solaris

2023-12-04 Thread Gaius Mulley
Rainer Orth  writes:

> The recent warning changes broke gm2 bootstrap on Solaris:
>
> /vol/gcc/src/hg/master/local/gcc/m2/mc/mc.flex: In function 'handleFile':
> /vol/gcc/src/hg/master/local/gcc/m2/mc/mc.flex:297:21: error: implicit 
> declaration of function 'alloca' [-Wimplicit-function-declaration]
>   297 |   char *s = (char *)alloca (strlen (filename) + 2 + 1);
>   | ^~
>
> alloca needs <alloca.h> on Solaris, which isn't universally available.
> Since mc.flex doesn't include any config header, I chose to switch to
> __builtin_alloca instead.
>
> /vol/gcc/src/hg/master/local/gcc/m2/mc/mc.flex:332:19: error: implicit 
> declaration of function 'index' [-Wimplicit-function-declaration]
>   332 |   char   *p = index(sdate, '\n');
>   |   ^
>
> index is declared in <strings.h> on Solaris, again not a standard
> header.  I simply switched to using strchr to avoid that issue.
>
> Bootstrapped without regressions on i386-pc-solaris2.11,
> sparc-sun-solaris2.11, x86_64-pc-linux-gnu, and
> x86_64-apple-darwin23.1.0.
>
> Ok for trunk?
>
>   Rainer

yes, lgtm, - thanks for fixing index as well,

regards,
Gaius




Re: [PATCH] aarch64: fix eh_return-3.c test

2023-12-04 Thread Richard Sandiford
Szabolcs Nagy  writes:
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/eh_return-3.c: Fix when retaa is available.

OK, thanks.

Richard

> ---
>  gcc/testsuite/gcc.target/aarch64/eh_return-3.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/eh_return-3.c 
> b/gcc/testsuite/gcc.target/aarch64/eh_return-3.c
> index a17baa86501..d180fa7c455 100644
> --- a/gcc/testsuite/gcc.target/aarch64/eh_return-3.c
> +++ b/gcc/testsuite/gcc.target/aarch64/eh_return-3.c
> @@ -12,8 +12,12 @@
>  **   cbz x4, .*
>  **   add sp, sp, x5
>  **   br  x6
> +** (
>  **   hint29 // autiasp
>  **   ret
> +** |
> +**   retaa
> +** )
>  **   mov x5, x0
>  **   mov x4, 1
>  **   mov x6, x1


[committed] Fix HImode load mnemonic on microblaze port

2023-12-04 Thread Jeff Law

The tester recently started failing va-arg-22.c on microblaze-linux:

gcc.c-torture/execute/va-arg-22.c   -O0  (test for excess errors)

It was failing with an undefined reference to "r7" at link time.  This 
was ultimately tracked down to a HImode load using (reg+reg) addressing 
mode, but which used the lhui instruction instead of lhu.  The "i" means 
it's supposed to be (reg+disp) so the assembler tried to interpret "r7" 
as an immediate/symbol.


The port uses %i as an output modifier to select between sh/shi 
and various other mnemonics for loads/stores.  The movhi pattern simply 
failed to use it for the two cases where it's loading from memory 
(interestingly enough it was used for stores).


Clearly we aren't using reg+reg much for HImode loads as this didn't fix 
anything else in the testsuite.


Installing on the trunk,
Jeff

commit b544ec681bdc9c48587d2e014f9559674097738a
Author: Jeff Law 
Date:   Mon Dec 4 10:06:49 2023 -0700

[committed] Fix HImode load mnemonic on microblaze port

The tester recently started failing va-arg-22.c on microblaze-linux:

gcc.c-torture/execute/va-arg-22.c   -O0  (test for excess errors)

It was failing with an undefined reference to "r7" at link time.  This was
ultimately tracked down to a HImode load using (reg+reg) addressing mode, 
but
which used the lhui instruction instead of lhu.  The "i" means it's 
supposed to
be (reg+disp) so the assembler tried to interpret "r7" as an 
immediate/symbol.

The port uses %i as an output modifier to select between sh/shi and
various other mnemonics for loads/stores.  The movhi pattern simply failed 
to
use it for the two cases where it's loading from memory (interestingly 
enough
it was used for stores).

Clearly we aren't using reg+reg much for HImode loads as this didn't fix
anything else in the testsuite.

gcc/
* config/microblaze/microblaze.md (movhi): Use %i for half-word
loads to properly select between lhu/lhui.

diff --git a/gcc/config/microblaze/microblaze.md 
b/gcc/config/microblaze/microblaze.md
index 671667b537c..a8ee886d36b 100644
--- a/gcc/config/microblaze/microblaze.md
+++ b/gcc/config/microblaze/microblaze.md
@@ -1089,8 +1089,8 @@ (define_insn "*movhi_internal2"
   "@
addik\t%0,r0,%1\t# %X1
addk\t%0,%1,r0
-   lhui\t%0,%1
-   lhui\t%0,%1
+   lhu%i1\t%0,%1
+   lhu%i1\t%0,%1
sh%i0\t%z1,%0
sh%i0\t%z1,%0"
   [(set_attr "type""arith,move,load,no_delay_load,store,no_delay_store")


Re: Re: [PATCH] RISC-V: Normalize user vsetvl intrinsics[PR112092]

2023-12-04 Thread Maciej W. Rozycki
On Wed, 8 Nov 2023, Kito Cheng wrote:

> OK, then LGTM, thanks for the explanation :)

 Please don't top-post on a GCC mailing list (and preferably in off-list 
replies to such mailing list messages unless it's been agreed to somehow 
with the participants), as it makes it difficult to make context replies.

 Best practice is to reply inline, quoting the relevant original paragraph 
(or enough context) referred to above, and with all the other parts of the 
message replied to discarded.  We may even have it written down somewhere 
(though I haven't checked; in the old days it used to be assumed), and I 
do hope any sane modern MUA can handle it.

 Otherwise the discussion thread quickly grows into an illegible mess.

 So this change does indeed fix PR 112092, however we now have an issue 
with several other test cases and the new `-mmovcc' option.  For example 
vsetvl-13.c fails with "-mmovcc -mbranch-cost=8" test options and assembly 
produced is like:

vsetvli a6,a6,e8,mf4,ta,ma
sneza5,a5
neg a5,a5
and a6,a5,a6
not a5,a5
andia5,a5,55
or  a5,a6,a5
beq a4,zero,.L10
li  a6,0
vsetvli zero,a5,e32,m1,tu,ma
.L4:
vle32.v v1,0(a0)
vle32.v v1,0(a1)
vle32.v v1,0(a2)
vse32.v v1,0(a3)
addia6,a6,1
bne a4,a6,.L4
.L10:
ret

As far as I can tell code produced is legitimate, and for the record 
analogous assembly is produced with `-march=rv32gcv_zicond' too:

vsetvli a6,a6,e8,mf4,ta,ma
czero.eqz   a6,a6,a5
li  a7,55
czero.nez   a5,a7,a5
or  a5,a5,a6
beq a4,zero,.L10
li  a6,0
vsetvli zero,a5,e32,m1,tu,ma
.L4:
vle32.v v1,0(a0)
vle32.v v1,0(a1)
vle32.v v1,0(a2)
vse32.v v1,0(a3)
addia6,a6,1
bne a4,a6,.L4
.L10:
ret

-- it's just that you can't see it with regression testing, because the 
test case overrides `-march='.  Presumably we do want to execute VSETVLI 
twice here on the basis that to avoid the second one by means of branches 
would be more costly than not to.

 Shall we just silence false failures like this with `-mno-movcc' then or 
shall we handle the conditional-move case somehow?

 For reference plain branched assembly is like:

li  a7,55
beq a5,zero,.L13
vsetvli zero,a6,e32,m1,tu,ma
.L2:
beq a4,zero,.L11
li  a5,0
.L4:
vle32.v v1,0(a0)
vle32.v v1,0(a1)
vle32.v v1,0(a2)
vse32.v v1,0(a3)
addia5,a5,1
bne a4,a5,.L4
.L11:
ret
.L13:
vsetvli zero,a7,e32,m1,tu,ma
j   .L2

  Maciej


Re: [PATCH] gcc: Disallow trampolines when -fhardened

2023-12-04 Thread Martin Uecker
Am Montag, dem 04.12.2023 um 11:46 -0500 schrieb Siddhesh Poyarekar:
> On 2023-12-04 11:39, Andreas Schwab wrote:
> > On Dez 04 2023, Siddhesh Poyarekar wrote:
> > 
> > > For hardened code in C, I think we really should look to step away from
> > > nested functions instead of adding ways to continue supporting it. There's
> > > probably a larger conversation to be had about the utility of nested
> > > functions in general for C (and whether this GCC extension should be
> > > deprecated altogether in future), but I feel like the -fhardened subset
> > > gives us the opportunity to enforce at least a safe subset for now,
> > > possibly extending it in future.
> > 
> > Nested functions by itself don't need a trampoline, only if the address
> > of it is passed outside the containing function's scope (as a callback,
> > for example).
> 
> Yes, that's why I said that the conversation about deprecating the C 
> nested functions extension is a broader one (and hence for gcc 15) that 
> will likely involve the question of whether dropping the extension 
> altogether gives any benefit or if dropping support for on-stack 
> trampolines is sufficient.  On-heap trampolines are maybe slightly 
> better in that they don't need an executable stack, but defaulting to 
> on-heap trampolines for -fhardened seems like a lost opportunity to 
> enforce better user code.

I do not really agree with that.  Nested functions can substantially
improve code quality and in C can avoid type unsafe use of
void* pointers in callbacks. The code is often much better with
nested functions than without.  Nested functions and lambdas
(i.e. anonymous nested functions) are used in many languages
because they make code better and GNU's nested functions are no
exception.

So I disagree with the idea that discouraging nested functions leads 
to better code - I think the exact opposite is true.

I am generally wary of mitigations that may make exploitation of
buffer overflows a bit harder  while increasing the likelihood
of buffer overflows by reducing type safety and/or code quality.

But I would agree that trampolines are generally problematic. A
better strategy would be a wide function pointer type (as in Apple's
Blocks extension). Alternatively, an explicit way to obtain the
static chain for a nested function which could be used with 
__builtin_call_with_static_chain  could also work.

But in any case, I think it diminishes the value of -fhardened
if it requires source code changes, because then it is not as easy
to simply turn it on in larger projects / distributions.

Martin



> 
> Thanks,
> Sid



[PATCH] Maintain a validity flag for REG_UNUSED notes [PR112760] (was Re: [PATCH] pro_and_epilogue: Call df_note_add_problem () if SHRINK_WRAPPING_ENABLED [PR112760])

2023-12-04 Thread Richard Sandiford
Richard Sandiford  writes:
> Jakub Jelinek  writes:
>> On Sat, Dec 02, 2023 at 11:04:04AM +, Richard Sandiford wrote:
>>> I still maintain that so much stuff relies on the lack of false-positive
>>> REG_UNUSED notes that (whatever the intention might have been) we need
>>> to prevent the false positive.  Like Andrew says, any use of single_set
>>> is suspect if there's a REG_UNUSED note for something that is in fact used.
>>
>> The false positive REG_UNUSED in that case comes from
>> (insn 15 14 35 2 (set (reg:CCZ 17 flags)
>> (compare:CCZ (reg:DI 0 ax [111])
>> (reg:DI 1 dx [112]))) "pr112760.c":11:22 12 {*cmpdi_1}
>>  (expr_list:REG_UNUSED (reg:CCZ 17 flags)
>> (nil)))
>> (insn 35 15 36 2 (set (reg:CCZ 17 flags)
>> (compare:CCZ (reg:DI 0 ax [111])
>> (reg:DI 1 dx [112]))) "pr112760.c":11:22 12 {*cmpdi_1}
>>  (expr_list:REG_DEAD (reg:DI 1 dx [112])
>> (expr_list:REG_DEAD (reg:DI 0 ax [111])
>> (nil
>> ...
>> use of flags
>> Haven't verified what causes the redundant comparison, but postreload cse
>> then does:
>> 110if (!count && cselib_redundant_set_p (body))
>> 111  {
>> 112if (check_for_inc_dec (insn))
>> 113  delete_insn_and_edges (insn);
>> 114/* We're done with this insn.  */
>> 115goto done;
>> 116  }
>> So, we'd in such cases need to look up what instruction was the earlier
>> setter and if it has REG_UNUSED note, drop it.
>
> Hmm, OK.  I guess it's not as simple as I'd imagined.  cselib does have
> some code to track which instruction established which equivalence,
> but it doesn't currently record what we want, and it would be difficult
> to reuse that information here anyway.  Something "simple" like a map of
> register numbers to instructions, populated only for REG_UNUSED sets,
> would be enough, and low overhead.  But it's not very natural.
>
> Perhaps DF should maintain a flag to say "the current pass keeps
> notes up-to-date", with the assumption being that any pass that
> uses the notes problem does that.  Then single_set and the
> regcprop.cc uses can check that flag.
>
> I don't think it's worth adding the note problem to shrink-wrapping
> just for the regcprop code.  If we're prepared to take that compile-time
> hit, we might as well run a proper (fast) DCE.

Here's a patch that tries to do that.  Bootstrapped & regression tested
on aarch64-linux-gnu.  Also tested on x86_64-linux-gnu for the testcase.
(I'll run full x86_64-linux-gnu testing overnight.)

OK to install if that passes?  Not an elegant fix, but it's probably
too much to hope for one of those.

Richard



PR112760 is a miscompilation caused by a stale, false-positive
REG_UNUSED note.  There were originally two consecutive,
identical instructions that set the CC flags.  The first
originally had a REG_UNUSED note, but postreload later deleted
the second in favour of the first, based on cselib_redundant_set_p.

Although in principle it would be possible to remove the note
when making the optimisation, the required bookkeeping wouldn't
fit naturally into what cselib already does.  Doing that would also
arguably be a change of policy.

This patch instead adds a global flag that says whether REG_UNUSED
notes are trustworthy.  The assumption is that any pass that calls
df_note_add_problem cares about REG_UNUSED notes and will keep them
sufficiently up-to-date to support the pass's use of things like
single_set.

gcc/
PR rtl-optimization/112760
* df.h (df_d::can_trust_reg_unused_notes): New member variable.
* df-problems.cc (df_note_add_problem): Set can_trust_reg_unused_notes
to true.
* passes.cc (execute_one_pass): Clear can_trust_reg_unused_notes
after each pass.
* rtlanal.cc (single_set_2): Check can_trust_reg_unused_notes.
* regcprop.cc (copyprop_hardreg_forward_1): Likewise.

gcc/testsuite/
* gcc.dg/pr112760.c: New test.
---
 gcc/df-problems.cc  |  1 +
 gcc/df.h|  4 
 gcc/passes.cc   |  3 +++
 gcc/regcprop.cc |  4 +++-
 gcc/rtlanal.cc  |  8 ++--
 gcc/testsuite/gcc.dg/pr112760.c | 22 ++
 6 files changed, 39 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr112760.c

diff --git a/gcc/df-problems.cc b/gcc/df-problems.cc
index d2cfaf7f50f..d2eb95d35ad 100644
--- a/gcc/df-problems.cc
+++ b/gcc/df-problems.cc
@@ -3782,6 +3782,7 @@ void
 df_note_add_problem (void)
 {
   df_add_problem (&problem_NOTE);
+  df->can_trust_reg_unused_notes = true;
 }
 
 
diff --git a/gcc/df.h b/gcc/df.h
index 402657a7076..a405c000235 100644
--- a/gcc/df.h
+++ b/gcc/df.h
@@ -614,6 +614,10 @@ public:
   /* True if someone added or deleted something from regs_ever_live so
  that the entry and exit blocks need be reprocessed.  */
   bool redo_entry_and_exit;
+
+  /* True if REG_UNUSED notes 

Re: [PATCH] gettext: disable install, docs targets, libasprintf, threads

2023-12-04 Thread Arsen Arsenović

Richard Biener  writes:

> OK.

Thanks.  I'll wait for the Binutils and GDB maintainers to weigh in
before pushing (plus, I can't push there).

Have a lovely day!
-- 
Arsen Arsenović


signature.asc
Description: PGP signature


[PATCH v4] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-04 Thread Manos Anagnostakis
This is an RTL pass that detects store forwarding from stores to larger loads 
(load pairs).

This optimization is SPEC2017-driven and was found to be beneficial for some 
benchmarks,
through testing on ampere1/ampere1a machines.

For example, it can transform cases like

str  d5, [sp, #320]
fmul d5, d31, d29
ldp  d31, d17, [sp, #312] # Large load from small store

to

str  d5, [sp, #320]
fmul d5, d31, d29
ldr  d31, [sp, #312]
ldr  d17, [sp, #320]

Currently, the pass is disabled by default on all architectures and enabled by 
a target-specific option.

If deemed beneficial enough for a default, it will be enabled on 
ampere1/ampere1a,
or other architectures as well, without needing to be turned on by this option.

Bootstrapped and regtested on aarch64-linux.

gcc/ChangeLog:

* config.gcc: Add aarch64-store-forwarding.o to extra_objs.
* config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New pass.
* config/aarch64/aarch64-protos.h (make_pass_avoid_store_forwarding): 
Declare.
* config/aarch64/aarch64.opt (mavoid-store-forwarding): New option.
(aarch64-store-forwarding-threshold): New param.
* config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
* doc/invoke.texi: Document new option and new param.
* config/aarch64/aarch64-store-forwarding.cc: New file.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
* gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
* gcc.target/aarch64/ldp_ssll_overlap.c: New test.

Signed-off-by: Manos Anagnostakis 
Co-Authored-By: Manolis Tsamis 
Co-Authored-By: Philipp Tomsich 
---
Changes in v4:
- I had problems making cselib_subst_to_values work correctly
  so I used cselib_lookup to implement the exact same behaviour and
  record the store value at the time we iterate over it.
- Removed the store/load_mem_addr check from is_forwarding as
  unnecessary.
- The pass is called on all optimization levels right now.
- The threshold check should remain as it is as we only care for
  the front element of the list. The comment above the check explains
  why a single if is enough.
- The documentation changes requested.
- Adjusted a comment.

 gcc/config.gcc|   1 +
 gcc/config/aarch64/aarch64-passes.def |   1 +
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 .../aarch64/aarch64-store-forwarding.cc   | 321 ++
 gcc/config/aarch64/aarch64.opt|   9 +
 gcc/config/aarch64/t-aarch64  |  10 +
 gcc/doc/invoke.texi   |  11 +-
 .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
 .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
 .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
 10 files changed, 452 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 748430194f3..2ee3b61c4fa 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -350,6 +350,7 @@ aarch64*-*-*)
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o 
falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
+   extra_objs="${extra_objs} aarch64-store-forwarding.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.h 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
target_has_targetm_common=yes
;;
diff --git a/gcc/config/aarch64/aarch64-passes.def 
b/gcc/config/aarch64/aarch64-passes.def
index 6ace797b738..fa79e8adca8 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -23,3 +23,4 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, 
pass_track_speculation);
 INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
 INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
 INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+INSERT_PASS_AFTER (pass_peephole2, 1, pass_avoid_store_forwarding);
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index d2718cc87b3..7d9dfa06af9 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1050,6 +1050,7 @@ rtl_opt_pass *make_pass_track_speculation (gcc::context 
*);
 rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
 r

Re: [PATCH v2] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-12-04 Thread Wilco Dijkstra
Hi Richard,

>> Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible 
>> with
>> existing binaries, gives better performance than locking atomics and is what
>> most users expect.
>
> Please add a justification for why it's backwards compatible, rather
> than just stating that it's so.

This isn't any different than the LSE2 support which also switches some CPUs to
lock-free implementations. This is basically switching the rest. It trivially 
follows
from the fact that GCC always calls libatomic so that you switch all atomics in 
a
process. I'll add that to the description.

Note the compatibility story is even better than this. We are also compatible
with LLVM and future GCC versions which may inline these sequences.

> Thanks for adding this.  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95722
> suggests that it's still an open question whether this is a correct thing
> to do, but it sounds from Joseph's comment that he isn't sure whether
> atomic loads from read-only data are valid.

Yes it's not useful to do an atomic read if it is a read-only value... It should
be feasible to mark atomic types as mutable to force them to .data (see eg.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108659 and
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109553).

> Linus's comment in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70490
> suggests that a reasonable compromise might be to use a storing
> implementation but not advertise that it is lock-free.  Also,
> the comment above libat_is_lock_free says:
>
> /* Note that this can return that a size/alignment is not lock-free even if
>    all the operations that we use to implement the respective accesses provide
>    lock-free forward progress as specified in C++14:  Users likely expect
>    "lock-free" to also mean "fast", which is why we do not return true if, for
>    example, we implement loads with this size/alignment using a CAS.  */

I don't believe lying about being lock-free like that is a good idea. When
you use a faster lock-free implementation, you want to tell users about it
(so they aren't forced to use nasty inline assembler hacks for example).

> We don't use a CAS for the fallbacks, but like you say, we do use a
> load/store exclusive loop.  So did you consider not doing this:

> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16    1

That would result in __atomic_is_lock_free incorrectly returning false.
Note that __atomic_always_lock_free remains false for 128-bit since there
is no inlining in the compiler, but __atomic_is_lock_free should be true.

> -   /* RELEASE.  */
> -5: ldxp    res0, res1, [x5]
> +   /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4: ldaxp   res0, res1, [x5]
>  stlxp   w4, in0, in1, [x5]
> -   cbnz    w4, 5b
> +   cbnz    w4, 4b
>  ret
> +END (libat_exchange_16)

> Please explain (here and in the commit message) why you're adding
> acquire semantics to the RELEASE case.

That merges the RELEASE with ACQ_REL/SEQ_CST cases to keep the code
short and simple like much of the code. I've added a note in the commit msg.

Cheers,
Wilco

Here is v2 - this also incorporates the PR111404 fix to compare-exchange:

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries (as for these GCC always calls into libatomic, so all 128-bit
atomic uses in a process are switched), gives better performance than locking
atomics and is what most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not 
supported.
This results in an implicit store which is invisible to software as long as the
given address is writeable (which will be true when using atomics in actual 
code).

Passes regress, OK for commit?

libatomic/
config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
(libat_exchange_16): Merge RELEASE and ACQ_REL/SEQ_CST cases.
config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
b/libatomic/config/linux/aarch64/atomic_16.S
index 
05439ce394b9653c9bcb582761ff7aaa7c8f9643..a099037179b3f1210145baea02a9d43418629813
 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,22 @@
.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries (as we swap all uses
+   of 128-bit atomics via an ifunc) and gives better performance than locking
+   atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   w

[gcc15] nested functions in C

2023-12-04 Thread Siddhesh Poyarekar
[Branching this into a separate conversation to avoid derailing the 
patch, which isn't directly related]


On 2023-12-04 12:21, Martin Uecker wrote:

I do not really agree with that.  Nested functions can substantially
improve code quality and in C can avoid type unsafe use of
void* pointers in callbacks. The code is often much better with
nested functions than without.  Nested functions and lambdas
(i.e. anonymous nested functions) are used in many languages
because they make code better and GNU's nested functions are no
exception.

So I disagree with the idea that discouraging nested functions leads
to better code - I think the exact opposite is true.


I would argue that GNU's nested functions *are* an exception because 
they're like feathers stuck on a pig to try and make it fly; I think a 
significant specification effort is required to actually make it a 
cleanly usable feature.  It *may* be possible to implement patterns that 
use C nested functions well enough *and* result in readable code, but 
IMO it is easier to write clunky and unmaintainable code with it.


I empathize with Jakub's stated use case though of keeping the C 
frontend support for testing purposes, but that could easily be done 
behind a flag, or by putting nested C func deprecation behind a flag.



I am generally wary of mitigations that may make exploitation of
buffer overflows a bit harder  while increasing the likelihood
of buffer overflows by reducing type safety and/or code quality.

But I would agree that trampolines are generally problematic. A
better strategy would be a wide function pointer type (as in Apple's
Blocks extension). Alternatively, an explicit way to obtain the
static chain for a nested function which could be used with
__builtin_call_with_static_chain  could also work.

But in any case, I think it diminishes the value of -fhardening
if it requires source code changes, because then it is not as easy
to simply turn it on in larger projects / distributions.


I suppose you mean source code changes even in correct code just to 
comply with the flag?  I don't disagree for cases like -Warray-bounds, 
but for warnings/errors that are more deterministic in nature (like 
-Werror=trampolines), they're going to point at actual problems and 
larger projects and distributions will usually prefer to at least track 
them, if not actually fix them.  For Fedora we tend to provide macro 
overrides for packages that need to explicitly disable a security 
related flag.


Thanks,
Sid


Re: [PATCH] gettext: disable install, docs targets, libasprintf, threads

2023-12-04 Thread Tom Tromey
> "Arsen" == Arsen Arsenović  writes:

Arsen> Thanks.  I'll wait for the Binutils and GDB maintainers to weigh in
Arsen> before pushing (plus, I can't push there).

Seems fine to me.  Thank you.

Tom


Re: [gcc15] nested functions in C

2023-12-04 Thread Martin Uecker
Am Montag, dem 04.12.2023 um 13:27 -0500 schrieb Siddhesh Poyarekar:
> [Branching this into a separate conversation to avoid derailing the 
> patch, which isn't directly related]
> 
> On 2023-12-04 12:21, Martin Uecker wrote:
> > I do not really agree with that.  Nested functions can substantially
> > improve code quality and in C can avoid type unsafe use of
> > void* pointers in callbacks. The code is often much better with
> > nested functions than without.  Nested functions and lambdas
> > (i.e. anonymous nested functions) are used in many languages
> > because they make code better and GNU's nested functions are no
> > exception.
> > 
> > So I disagree with the idea that discouraging nested functions leads
> > to better code - I think the exact opposite is true.
> 
> I would argue that GNU's nested functions *are* an exception because 
> they're like feathers stuck on a pig to try and make it fly; I think a 
> significant specification effort is required to actually make it a 
> cleanly usable feature.  It *may* be possible to implement patterns that 
> use C nested functions well enough *and* result in readable code, but 
> IMO it is easier to write clunky and unmaintainable code with it.

I use them in my code a lot and I think they improve
code quality.  For example:

/* Look up KEY among the N elements of IN_ARRAY by forwarding to find ()
   with a predicate that compares each element's name against KEY.

   The predicate is a GNU C nested function, so it reads KEY directly from
   the enclosing scope; no void * cookie has to be threaded through find ().
   KEY is a plain C string (const char *), which is what strcmp () expects.  */
int foo_find(int N, struct foo in_array[N], const char *key)
{
  /* True when X's name is equal to the KEY of the enclosing call.  */
  bool cond(struct foo* x)
  {
    return 0 == strcmp(x->name, key);
  }
  return find(N, in_array, cond);
}

is a lot cleaner and safer than what you need to write
without nested functions:

/* Closure data for foo_cond (): the name being looked for.  Without nested
   functions this has to be packaged by hand and passed as a void *.  */
struct foo_find {
  const char* name;
};

/* Callback: compare element A against the key carried in VDATA.
   VDATA must point to a struct foo_find (see foo_sort () below, which
   passes &data); that is not checked, which is the type-safety hole the
   nested-function version avoids.  */
int foo_cond(void *vdata, struct foo* a)
{
  struct foo_find *key = vdata;
  return 0 == strcmp(a->name, key->name);
}

/* Package KEY into a struct foo_find and hand it to sort () together with
   the foo_cond () callback.  */
void foo_sort(int N, struct foo in_array[N], const char* key)
{
  struct foo_find data = { key };
  sort(N, in_array, foo_cond, &data);
}

and this is a toy example, the improvement gets more 
substantial with more complicated logic.

> 
> I empathize with Jakub's stated use case though of keeping the C 
> frontend support for testing purposes, but that could easily be done 
> behind a flag, or by putting nested C func deprecation behind a flag.

I am relatively sure C will get some form of nested functions.
Maybe as anonymous nested functions, i.e. lambdas, but I do
not see a fundamental difference here (I personally like naming
things for clarity, so I prefer named nested functions).

> > I am generally wary of mitigations that may make exploitation of
> > buffer overflows a bit harder  while increasing the likelihood
> > of buffer overflows by reducing type safety and/or code quality.
> > 
> > But I would agree that trampolines are generally problematic. A
> > better strategy would be a wide function pointer type (as in Apple's
> > Blocks extension). Alternatively, an explicit way to obtain the
> > static chain for a nested function which could be used with
> > __builtin_call_with_static_chain  could also work.
> > 
> > But in any case, I think it diminishes the value of -fhardening
> > if it requires source code changes, because then it is not as easy
> > to simply turn it on in larger projects / distributions.
> 
> I suppose you mean source code changes even in correct code just to 
> comply with the flag?  

Yes

> I don't disagree for cases like -Warray-bounds, 
> but for warnings/errors that are more deterministic in nature (like 
> -Werror=trampolines), they're going to point at actual problems and 
> larger projects and distributions will usually prefer to at least track 
> them, if not actually fix them.  For Fedora we tend to provide macro 
> overrides for packages that need to explicitly disable a security 
> related flag.

In projects such as mine, this will lead to a lot of code
transformations as indicated above, i.e. much worse code. 

One could get away with it, since nested functions are rarely
used, but I think this is bad, because a lot of code would
improve if it used them.

Martin

> 
> Thanks,
> Sid



Re: [gcc15] nested functions in C

2023-12-04 Thread Jakub Jelinek
On Mon, Dec 04, 2023 at 01:27:32PM -0500, Siddhesh Poyarekar wrote:
> [Branching this into a separate conversation to avoid derailing the patch,
> which isn't directly related]
> 
> On 2023-12-04 12:21, Martin Uecker wrote:
> > I do not really agree with that.  Nested functions can substantially
> > improve code quality and in C can avoid type unsafe use of
> > void* pointers in callbacks. The code is often much better with
> > nested functions than without.  Nested functions and lambdas
> > (i.e. anonymous nested functions) are used in many languages
> > because they make code better and GNU's nested functions are no
> > exception.
> > 
> > So I disagree with the idea that discouraging nested functions leads
> > to better code - I think the exact opposite is true.
> 
> I would argue that GNU's nested functions *are* an exception because they're
> like feathers stuck on a pig to try and make it fly; I think a significant
> specification effort is required to actually make it a cleanly usable
> feature.

Why?  The syntax doesn't seem to be something unexpected, and as C doesn't
have lambdas, one can use the nested functions instead.
The only problem is if you need to pass function pointers somewhere else
(and target doesn't have function descriptors or something similar), if it
is only done to make code more readable compared to say use of macros, I
think the nested functions are better, one doesn't have to worry about
multiple evaluations of argument side-effects etc.  And if everything is
inlined and SRA optimized, there is no extra cost.
The problem of passing it as a function pointer to other functions is
common with C++, only lambdas which don't capture anything actually can be
convertible to function pointer, for anything else you need a template and
instantiate it for a particular lambda (which is something you can't do in
C).

Jakub



Re: [PATCH v5] Introduce strub: machine-independent stack scrubbing

2023-12-04 Thread Alexandre Oliva
The recently-installed patch for interprocedural value-range propagation
enabled some folding that was not expected by the strub-const testcases,
causing them to fail.

I'm making the following adjustments to them to restore the behavior
they tested for, and to make them more future-proof to future
improvements of ivrp.

I intend to install this as part of the monster patch upthread.


--- a/gcc/testsuite/c-c++-common/torture/strub-const1.c
+++ b/gcc/testsuite/c-c++-common/torture/strub-const1.c
@@ -1,18 +1,22 @@
 /* { dg-do compile } */
 /* { dg-options "-fstrub=strict -fdump-ipa-strub" } */
 
-/* Check that, along with a strub const function call, we issue an asm 
statement
-   to make sure the watermark passed to it is held in memory before the call,
-   and another to make sure it is not assumed to be unchanged.  */
+/* Check that, along with a strub const function call, we issue an asm
+   statement to make sure the watermark passed to it is held in memory before
+   the call, and another to make sure it is not assumed to be unchanged.  f
+   should not be inlined into g, but if it were too simple it might be folded
+   by interprocedural value-range propagation.  */
+
+extern int __attribute__ ((__strub__ ("callable"), __const__)) c ();
 
 int __attribute__ ((__strub__, __const__))
-f() {
-  return 0;
+f () {
+  return c ();
 }
 
 int
-g() {
-  return f();
+g () {
+  return f ();
 }
 
 /* { dg-final { scan-ipa-dump-times "__asm__" 2 "strub" } } */
--- a/gcc/testsuite/c-c++-common/torture/strub-const2.c
+++ b/gcc/testsuite/c-c++-common/torture/strub-const2.c
@@ -6,17 +6,19 @@
before the call, and another to make sure it is not assumed to be
unchanged.  */
 
+extern int __attribute__ ((__strub__ ("callable"), __const__)) c ();
+
 int __attribute__ ((__strub__))
 #if ! __OPTIMIZE__
 __attribute__ ((__const__))
 #endif
-f() {
-  return 0;
+f () {
+  return c ();
 }
 
 int
-g() {
-  return f();
+g () {
+  return f ();
 }
 
 /* { dg-final { scan-ipa-dump-times "__asm__" 2 "strub" } } */


-- 
Alexandre Oliva, happy hacker            https://FSFLA.org/blogs/lxo/
   Free Software Activist   GNU Toolchain Engineer
More tolerance and less prejudice are key for inclusion and diversity
Excluding neuro-others for not behaving "normal" is *not* inclusive


[PATCH 09/17] [APX NDD] Support APX NDD for not insn

2023-12-04 Thread Hongyu Wang
From: Kong Lingling 

For *one_cmplsi2_2_zext, it will be split into xor, so its NDD form will be
added together with xor NDD support.

gcc/ChangeLog:

* config/i386/i386.md (one_cmpl2): Add new constraints for NDD
and adjust output template.
(*one_cmpl2_1): Likewise.
(*one_cmplqi2_1): Likewise.
(*one_cmpl2_doubleword): Likewise.
(*one_cmpl2_2): Likewise.
(*one_cmplsi2_1_zext): Likewise, and use nonimmediate_operand for
operands[1] to accept memory input for NDD alternative.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Add not test.
---
 gcc/config/i386/i386.md | 58 ++---
 gcc/testsuite/gcc.target/i386/apx-ndd.c | 11 +
 2 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 1a2fb116f01..050779273a7 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14001,57 +14001,63 @@ (define_expand "one_cmpl2"
   [(set (match_operand:SDWIM 0 "nonimmediate_operand")
(not:SDWIM (match_operand:SDWIM 1 "nonimmediate_operand")))]
   ""
-  "ix86_expand_unary_operator (NOT, mode, operands); DONE;")
+  "ix86_expand_unary_operator (NOT, mode, operands,
+  TARGET_APX_NDD); DONE;")
 
 (define_insn_and_split "*one_cmpl2_doubleword"
-  [(set (match_operand: 0 "nonimmediate_operand" "=ro")
-   (not: (match_operand: 1 "nonimmediate_operand" "0")))]
-  "ix86_unary_operator_ok (NOT, mode, operands)"
+  [(set (match_operand: 0 "nonimmediate_operand" "=ro,r")
+   (not: (match_operand: 1 "nonimmediate_operand" "0,ro")))]
+  "ix86_unary_operator_ok (NOT, mode, operands, TARGET_APX_NDD)"
   "#"
   "&& reload_completed"
   [(set (match_dup 0)
(not:DWIH (match_dup 1)))
(set (match_dup 2)
(not:DWIH (match_dup 3)))]
-  "split_double_mode (mode, &operands[0], 2, &operands[0], 
&operands[2]);")
+  "split_double_mode (mode, &operands[0], 2, &operands[0], &operands[2]);"
+  [(set_attr "isa" "*,apx_ndd")])
 
 (define_insn "*one_cmpl2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,?k")
-   (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
-  "ix86_unary_operator_ok (NOT, mode, operands)"
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k")
+   (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,rm,k")))]
+  "ix86_unary_operator_ok (NOT, mode, operands, TARGET_APX_NDD)"
   "@
not{}\t%0
+   not{}\t{%1, %0|%0, %1}
#"
-  [(set_attr "isa" "*,")
-   (set_attr "type" "negnot,msklog")
+  [(set_attr "isa" "*,apx_ndd,")
+   (set_attr "type" "negnot,negnot,msklog")
(set_attr "mode" "")])
 
 (define_insn "*one_cmplsi2_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,?k")
+  [(set (match_operand:DI 0 "register_operand" "=r,r,?k")
(zero_extend:DI
- (not:SI (match_operand:SI 1 "register_operand" "0,k"]
-  "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
+ (not:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm,k"]
+  "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands,
+  TARGET_APX_NDD)"
   "@
not{l}\t%k0
+   not{l}\t{%1, %k0|%k0, %1}
#"
-  [(set_attr "isa" "x64,avx512bw_512")
-   (set_attr "type" "negnot,msklog")
-   (set_attr "mode" "SI,SI")])
+  [(set_attr "isa" "x64,apx_ndd,avx512bw_512")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set_attr "mode" "SI,SI,SI")])
 
 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,?k")
-   (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
-  "ix86_unary_operator_ok (NOT, QImode, operands)"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,r,?k")
+   (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,rm,k")))]
+  "ix86_unary_operator_ok (NOT, QImode, operands, TARGET_APX_NDD)"
   "@
not{b}\t%0
not{l}\t%k0
+   not{b}\t{%1, %0|%0, %1}
#"
-  [(set_attr "isa" "*,*,avx512f")
-   (set_attr "type" "negnot,negnot,msklog")
+  [(set_attr "isa" "*,*,apx_ndd,avx512f")
+   (set_attr "type" "negnot,negnot,negnot,msklog")
(set (attr "mode")
(cond [(eq_attr "alternative" "1")
 (const_string "SI")
-   (and (eq_attr "alternative" "2")
+   (and (eq_attr "alternative" "3")
 (match_test "!TARGET_AVX512DQ"))
 (const_string "HI")
   ]
@@ -14081,14 +14087,16 @@ (define_insn_and_split "*one_cmpl_1_slp"
 
 (define_insn "*one_cmpl2_2"
   [(set (reg FLAGS_REG)
-   (compare (not:SWI (match_operand:SWI 1 "nonimmediate_operand" "0"))
+   (compare (not:SWI (match_operand:SWI 1 "nonimmediate_operand" "0,rm"))
 (const_int 0)))
-   (set (match_operand:SWI 0 "nonimmediate_operand" "=m")
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=m,r")
(not:SWI (match_dup 1)))]
   "ix86

Re: [PATCH v6 1/1] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-12-04 Thread waffl3x
On Monday, December 4th, 2023 at 9:39 PM, waffl3x  
wrote:

> On Monday, December 4th, 2023 at 9:35 PM, waffl3x waff...@protonmail.com 
> wrote:
>
>
>
> > > > @@ -15402,6 +15450,8 @@ tsubst_decl (tree t, tree args, tsubst_flags_t 
> > > > complain,
> >
> > > > gcc_checking_assert (TYPE_MAIN_VARIANT (TREE_TYPE (ve))
> > > > == TYPE_MAIN_VARIANT (type));
> > > > SET_DECL_VALUE_EXPR (r, ve);
> > > > + if (is_capture_proxy (t))
> > > > + type = TREE_TYPE (ve);
> >
> > > That should have close to the same effect as the lambda_proxy_type
> > > adjustment I was talking about, since that function basically returns
> > > the TREE_TYPE of the COMPONENT_REF. But the underlying problem is that
> > > finish_non_static_data_member assumes that 'object' is '*this', for
> > > which you can trust the cv-quals; for auto&&, you can't.
> > > capture_decltype has the same problem. I'm attaching a patch to address
> > > this in both places.
> >
> > Regarding this, was my change actually okay, and was your change
> > supposed to address it? I applied my patch to the latest commit in
> > master yesterday and started tests and whatnot with this change
> > commented out as I wasn't sure. It seems like my tests for constness of
> > captures no longer work with or without this change commented out.
> >
> > If you wish I can go over everything again and figure out a new
> > solution with your changes but stepping through all this code was quite
> > a task that I'm weary of doing again. Even if the second time through
> > won't be so arduous I would like to avoid it.
> >
> > You know what, I'll give it a go anyway but I don't want to spend too
> > much time on it, I still have a few tests to clean up and this crash to
> > fix.
> >
> > template  void f()
> >
> > {
> > int i;
> > [=](this T&& self){ return i; }(); // error, unrelated
> > }
> > int main() { f(); }
> >
> > If this crash doesn't take too long (I don't think it will, it seems
> > straightforward enough) then I'll look at fixing the captures with a
> > const xobject parameter bug the correct way.
> >
> > Alex
>
>
> WAIT Scratch that, I made a mistake, there's only a single case that is
> broken, I read the test log wrong. Ah, I swear I'm cursed to realize
> things the moment I hit the send button.
>
> I have to take a closer look, I'll get back to you when I know more,
> just trying to make sure you don't waste your time on this due to my
> mistake.
>
> Alex

tl;dr it wasn't important, I just have to fix my test.

Okay that was faster than I anticipated, but unfortunately I don't know
how to handle it. I think your change in finish_non_static_data_member
might have been too heavy handed, but I don't know if there's a middle
ground. Or that's what I was going to say until I tested my assumption
on godbolt.

void f(auto const& a) { a = 5; }

Clang, MSVC and GCC all accept this until it is actually instantiated.

So, the true answer to my test failing is to just instantiate the
template. The test in question that was failing looks like this.

auto f2 = [n = 5](this auto const&){ n = 10; }; // { dg-error {} }

With the way things were before, this actually worked, so what my
assumption is now is that for us to actually diagnose this before a
template is instantiated would take some significant reworking of how
things are currently done. AND, I don't even know if it's legal for us
to make this diagnostic before instantiation for either of these cases.

Hah, come to think of it, we can't, there could be an overloaded
operator= that this is valid for... how disappointing.

We can for lambdas since the type is not dependent (on the lambda
instantiation) but it just isn't worth the effort I reckon.

Whatever, moving on, spending time on these things always drains me
because I think "oh boy I can do something better" and finding out it's
just not possible sucks. It's worse when it's because I overlooked
something that's obvious in hindsight.

Oh well, only that crash left I believe.

Alex


[PATCH v2 00/17] Support Intel APX NDD

2023-12-04 Thread Hongyu Wang
Hi,

APX NDD patches have been posted at
https://gcc.gnu.org/pipermail/gcc-patches/2023-November/636604.html

Thanks to Hongtao's review, the V2 patch adds support for zext semantics with
memory input, as NDD by default clears the upper bits of dest for any operand size.

Also we support TImode shift with new split helper functions, which allow NDD
form split but still restrict the memory src usage, as in the post-reload splitter
the register number is restricted and no new register can be used for
shld/shrd.

Also fixed several typos, formatting issues, and pieces of redundant code.

Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.

OK for trunk?

Hongyu Wang (8):
  [APX NDD] Restrict TImode register usage when NDD enabled
  [APX NDD] Disable seg_prefixed memory usage for NDD add
  [APX NDD] Support APX NDD for left shift insns
  [APX NDD] Support APX NDD for right shift insns
  [APX NDD] Support APX NDD for rotate insns
  [APX NDD] Support APX NDD for shld/shrd insns
  [APX NDD] Support APX NDD for cmove insns
  [APX NDD] Support TImode shift for NDD

Kong Lingling (9):
  [APX NDD] Support Intel APX NDD for legacy add insn
  [APX NDD] Support APX NDD for optimization patterns of add
  [APX NDD] Support APX NDD for adc insns
  [APX NDD] Support APX NDD for sub insns
  [APX NDD] Support APX NDD for sbb insn
  [APX NDD] Support APX NDD for neg insn
  [APX NDD] Support APX NDD for not insn
  [APX NDD] Support APX NDD for and insn
  [APX NDD] Support APX NDD for or/xor insn

 gcc/config/i386/constraints.md|5 +
 gcc/config/i386/i386-expand.cc|  164 +-
 gcc/config/i386/i386-options.cc   |2 +
 gcc/config/i386/i386-protos.h |   16 +-
 gcc/config/i386/i386.cc   |   40 +-
 gcc/config/i386/i386.md   | 2323 +++--
 gcc/testsuite/gcc.target/i386/apx-ndd-adc.c   |   15 +
 gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c  |   16 +
 gcc/testsuite/gcc.target/i386/apx-ndd-sbb.c   |6 +
 .../gcc.target/i386/apx-ndd-shld-shrd.c   |   24 +
 .../gcc.target/i386/apx-ndd-ti-shift.c|   91 +
 gcc/testsuite/gcc.target/i386/apx-ndd.c   |  202 ++
 12 files changed, 2149 insertions(+), 755 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-adc.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-sbb.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-shld-shrd.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-ti-shift.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd.c

-- 
2.31.1



Re: [PATCH v4] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-04 Thread Richard Sandiford
Manos Anagnostakis  writes:
> This is an RTL pass that detects store forwarding from stores to larger loads 
> (load pairs).
>
> This optimization is SPEC2017-driven and was found to be beneficial for some 
> benchmarks,
> through testing on ampere1/ampere1a machines.
>
> For example, it can transform cases like
>
> str  d5, [sp, #320]
> fmul d5, d31, d29
> ldp  d31, d17, [sp, #312] # Large load from small store
>
> to
>
> str  d5, [sp, #320]
> fmul d5, d31, d29
> ldr  d31, [sp, #312]
> ldr  d17, [sp, #320]
>
> Currently, the pass is disabled by default on all architectures and enabled 
> by a target-specific option.
>
> If deemed beneficial enough for a default, it will be enabled on 
> ampere1/ampere1a,
> or other architectures as well, without needing to be turned on by this 
> option.
>
> Bootstrapped and regtested on aarch64-linux.
>
> gcc/ChangeLog:
>
> * config.gcc: Add aarch64-store-forwarding.o to extra_objs.
> * config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New pass.
> * config/aarch64/aarch64-protos.h (make_pass_avoid_store_forwarding): 
> Declare.
> * config/aarch64/aarch64.opt (mavoid-store-forwarding): New option.
>   (aarch64-store-forwarding-threshold): New param.
> * config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
> * doc/invoke.texi: Document new option and new param.
> * config/aarch64/aarch64-store-forwarding.cc: New file.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
> * gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
> * gcc.target/aarch64/ldp_ssll_overlap.c: New test.
>
> Signed-off-by: Manos Anagnostakis 
> Co-Authored-By: Manolis Tsamis 
> Co-Authored-By: Philipp Tomsich 
> ---
> Changes in v4:
>   - I had problems to make cselib_subst_to_values work correctly
> so I used cselib_lookup to implement the exact same behaviour and
> record the store value at the time we iterate over it.
>   - Removed the store/load_mem_addr check from is_forwarding as
> unnecessary.
>   - The pass is called on all optimization levels right now.
>   - The threshold check should remain as it is as we only care for
> the front element of the list. The comment above the check explains
> why a single if is enough.

I still think this is structurally better as a while.  There's no reason
in principle why we wouldn't want to record the stores in:

stp x0, x1, [x4, #8]
ldp x0, x1, [x4, #0]
ldp x2, x3, [x4, #16]

and then the two stores should have the same distance value.
I realise we don't do that yet, but still.

>   - The documentation changes requested.
>   - Adjusted a comment.
>
>  gcc/config.gcc|   1 +
>  gcc/config/aarch64/aarch64-passes.def |   1 +
>  gcc/config/aarch64/aarch64-protos.h   |   1 +
>  .../aarch64/aarch64-store-forwarding.cc   | 321 ++
>  gcc/config/aarch64/aarch64.opt|   9 +
>  gcc/config/aarch64/t-aarch64  |  10 +
>  gcc/doc/invoke.texi   |  11 +-
>  .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
>  .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
>  .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
>  10 files changed, 452 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c
>
> diff --git a/gcc/config.gcc b/gcc/config.gcc
> index 748430194f3..2ee3b61c4fa 100644
> --- a/gcc/config.gcc
> +++ b/gcc/config.gcc
> @@ -350,6 +350,7 @@ aarch64*-*-*)
>   cxx_target_objs="aarch64-c.o"
>   d_target_objs="aarch64-d.o"
>   extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
> aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
> aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o 
> falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
> + extra_objs="${extra_objs} aarch64-store-forwarding.o"
>   target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc 
> \$(srcdir)/config/aarch64/aarch64-sve-builtins.h 
> \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
>   target_has_targetm_common=yes
>   ;;
> diff --git a/gcc/config/aarch64/aarch64-passes.def 
> b/gcc/config/aarch64/aarch64-passes.def
> index 6ace797b738..fa79e8adca8 100644
> --- a/gcc/config/aarch64/aarch64-passes.def
> +++ b/gcc/config/aarch64/aarch64-passes.def
> @@ -23,3 +23,4 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, 
> pass_track_speculation);
>  INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
>  INSERT_PASS_BEFORE (pass

[PATCH 08/17] [APX NDD] Support APX NDD for neg insn

2023-12-04 Thread Hongyu Wang
From: Kong Lingling 

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_unary_operator): Add use_ndd
parameter and adjust for NDD.
* config/i386/i386-protos.h: Add use_ndd parameter for
ix86_unary_operator_ok and ix86_expand_unary_operator.
* config/i386/i386.cc (ix86_unary_operator_ok): Add use_ndd parameter
and adjust for NDD.
* config/i386/i386.md (neg2): Add new constraint for NDD and
adjust output template.
(*neg_1): Likewise.
(*neg2_doubleword): Likewise.
(*neg_2): Likewise.
(*neg_ccc_1): Likewise.
(*neg_ccc_2): Likewise.
(*negsi_1_zext): Likewise, and use nonimmediate_operand for operands[1]
to accept memory input for NDD alternatives.
(*negsi_2_zext): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Add neg test.
---
 gcc/config/i386/i386-expand.cc  |  4 +-
 gcc/config/i386/i386-protos.h   |  5 +-
 gcc/config/i386/i386.cc |  5 +-
 gcc/config/i386/i386.md | 77 -
 gcc/testsuite/gcc.target/i386/apx-ndd.c | 29 ++
 5 files changed, 87 insertions(+), 33 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 93ecde4b4a8..d4bbd33ce07 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1494,7 +1494,7 @@ ix86_binary_operator_ok (enum rtx_code code, machine_mode 
mode,
 
 void
 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
-   rtx operands[])
+   rtx operands[], bool use_ndd)
 {
   bool matching_memory = false;
   rtx src, dst, op, clob;
@@ -1513,7 +1513,7 @@ ix86_expand_unary_operator (enum rtx_code code, 
machine_mode mode,
 }
 
   /* When source operand is memory, destination must match.  */
-  if (MEM_P (src) && !matching_memory)
+  if (!use_ndd && MEM_P (src) && !matching_memory)
 src = force_reg (mode, src);
 
   /* Emit the instruction.  */
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 481527872e8..fa952409729 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -127,7 +127,7 @@ extern bool ix86_vec_interleave_v2df_operator_ok (rtx 
operands[3], bool high);
 extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn);
 extern bool ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn);
 extern void ix86_expand_unary_operator (enum rtx_code, machine_mode,
-   rtx[]);
+   rtx[], bool = false);
 extern rtx ix86_build_const_vector (machine_mode, bool, rtx);
 extern rtx ix86_build_signbit_mask (machine_mode, bool, bool);
 extern HOST_WIDE_INT ix86_convert_const_vector_to_integer (rtx,
@@ -147,7 +147,8 @@ extern void ix86_split_fp_absneg_operator (enum rtx_code, 
machine_mode,
   rtx[]);
 extern void ix86_expand_copysign (rtx []);
 extern void ix86_expand_xorsign (rtx []);
-extern bool ix86_unary_operator_ok (enum rtx_code, machine_mode, rtx[2]);
+extern bool ix86_unary_operator_ok (enum rtx_code, machine_mode, rtx[2],
+   bool = false);
 extern bool ix86_match_ccmode (rtx, machine_mode);
 extern bool ix86_match_ptest_ccmode (rtx);
 extern void ix86_expand_branch (enum rtx_code, rtx, rtx, rtx);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3e670330ef6..a3b628d2f6d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -16209,11 +16209,12 @@ ix86_dep_by_shift_count (const_rtx set_insn, 
const_rtx use_insn)
 bool
 ix86_unary_operator_ok (enum rtx_code,
machine_mode,
-   rtx operands[2])
+   rtx operands[2],
+   bool use_ndd)
 {
   /* If one of operands is memory, source and destination must match.  */
   if ((MEM_P (operands[0])
-   || MEM_P (operands[1]))
+   || (!use_ndd && MEM_P (operands[1])))
   && ! rtx_equal_p (operands[0], operands[1]))
 return false;
   return true;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e2705ada31a..1a2fb116f01 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -13282,13 +13282,14 @@ (define_expand "neg2"
   [(set (match_operand:SDWIM 0 "nonimmediate_operand")
(neg:SDWIM (match_operand:SDWIM 1 "nonimmediate_operand")))]
   ""
-  "ix86_expand_unary_operator (NEG, mode, operands); DONE;")
+  "ix86_expand_unary_operator (NEG, mode, operands,
+  TARGET_APX_NDD); DONE;")
 
 (define_insn_and_split "*neg2_doubleword"
-  [(set (match_operand: 0 "nonimmediate_operand" "=ro")
-   (neg: (match_operand: 1 "nonimmediate_operand" "0")))
+  [(set (match_operand: 0 "nonimmediate_operand" "=ro,r")
+   (neg: (match_operand: 1 "nonimmediate_operand" "0,ro")))
(

Re: [PATCH v2 00/17] Support Intel APX NDD

2023-12-04 Thread Hongtao Liu
On Tue, Dec 5, 2023 at 10:32 AM Hongyu Wang  wrote:
>
> Hi,
>
> APX NDD patches have been posted at
> https://gcc.gnu.org/pipermail/gcc-patches/2023-November/636604.html
>
> Thanks to Hongtao's review, the V2 patch adds support for zext semantics with
> memory input, as NDD by default clears the upper bits of dest for any operand size.
>
> We also support TImode shifts with new split helper functions, which allow the
> NDD form to be split but still restrict memory src usage, since in the
> post-reload splitter the register number is restricted and no new register can
> be used for shld/shrd.
>
> Also fixed several typo/formatting/redundant code.
Patches LGTM. Please wait a few more days before committing in case
other folks have comments.
>
> Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
>
> OK for trunk?
>
> Hongyu Wang (8):
>   [APX NDD] Restrict TImode register usage when NDD enabled
>   [APX NDD] Disable seg_prefixed memory usage for NDD add
>   [APX NDD] Support APX NDD for left shift insns
>   [APX NDD] Support APX NDD for right shift insns
>   [APX NDD] Support APX NDD for rotate insns
>   [APX NDD] Support APX NDD for shld/shrd insns
>   [APX NDD] Support APX NDD for cmove insns
>   [APX NDD] Support TImode shift for NDD
>
> Kong Lingling (9):
>   [APX NDD] Support Intel APX NDD for legacy add insn
>   [APX NDD] Support APX NDD for optimization patterns of add
>   [APX NDD] Support APX NDD for adc insns
>   [APX NDD] Support APX NDD for sub insns
>   [APX NDD] Support APX NDD for sbb insn
>   [APX NDD] Support APX NDD for neg insn
>   [APX NDD] Support APX NDD for not insn
>   [APX NDD] Support APX NDD for and insn
>   [APX NDD] Support APX NDD for or/xor insn
>
>  gcc/config/i386/constraints.md|5 +
>  gcc/config/i386/i386-expand.cc|  164 +-
>  gcc/config/i386/i386-options.cc   |2 +
>  gcc/config/i386/i386-protos.h |   16 +-
>  gcc/config/i386/i386.cc   |   40 +-
>  gcc/config/i386/i386.md   | 2323 +++--
>  gcc/testsuite/gcc.target/i386/apx-ndd-adc.c   |   15 +
>  gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c  |   16 +
>  gcc/testsuite/gcc.target/i386/apx-ndd-sbb.c   |6 +
>  .../gcc.target/i386/apx-ndd-shld-shrd.c   |   24 +
>  .../gcc.target/i386/apx-ndd-ti-shift.c|   91 +
>  gcc/testsuite/gcc.target/i386/apx-ndd.c   |  202 ++
>  12 files changed, 2149 insertions(+), 755 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-adc.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-sbb.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-shld-shrd.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-ti-shift.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd.c
>
> --
> 2.31.1
>


-- 
BR,
Hongtao


[PATCH] btf: avoid wrong DATASEC entries for extern vars [PR112849]

2023-12-04 Thread David Faust
The process of creating BTF_KIND_DATASEC records involves iterating
through variable declarations, determining which section they will be
placed in, and creating an entry in the appropriate DATASEC record
accordingly.

For variables without e.g. an explicit __attribute__((section)), we use
categorize_decl_for_section () to identify the appropriate named section
and corresponding BTF_KIND_DATASEC record.

This was incorrectly being done for 'extern' variable declarations as
well as non-extern ones, which meant that extern variable declarations
could result in BTF_KIND_DATASEC entries claiming the variable is
allocated in some section such as '.bss' without any knowledge whether
that is actually true. That resulted in errors building the Linux kernel
BPF selftests.

This patch corrects btf_collect_datasec () to avoid assuming a section
for extern variables, and only emit BTF_KIND_DATASEC entries for them if
they have a known section.

Bootstrapped + tested on x86_64-linux-gnu.
Tested on x86_64-linux-gnu host for bpf-unknown-none.

gcc/
PR debug/112849
* btfout.cc (btf_collect_datasec): Avoid incorrectly creating an
entry in a BTF_KIND_DATASEC record for extern variable decls without
a known section.

gcc/testsuite/
PR debug/112849
* gcc.dg/debug/btf/btf-datasec-3.c: New test.
---
 gcc/btfout.cc | 10 ++-
 .../gcc.dg/debug/btf/btf-datasec-3.c  | 27 +++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/debug/btf/btf-datasec-3.c

diff --git a/gcc/btfout.cc b/gcc/btfout.cc
index a5e0d640e19..db4f1084f85 100644
--- a/gcc/btfout.cc
+++ b/gcc/btfout.cc
@@ -486,7 +486,15 @@ btf_collect_datasec (ctf_container_ref ctfc)
 
   /* Mark extern variables.  */
   if (DECL_EXTERNAL (node->decl))
-   dvd->dvd_visibility = BTF_VAR_GLOBAL_EXTERN;
+   {
+ dvd->dvd_visibility = BTF_VAR_GLOBAL_EXTERN;
+
+ /* PR112849: avoid assuming a section for extern decls without
+an explicit section, which would result in incorrectly
+emitting a BTF_KIND_DATASEC entry for them.  */
+ if (node->get_section () == NULL)
+   continue;
+   }
 
   const char *section_name = get_section_name (node);
   if (section_name == NULL)
diff --git a/gcc/testsuite/gcc.dg/debug/btf/btf-datasec-3.c 
b/gcc/testsuite/gcc.dg/debug/btf/btf-datasec-3.c
new file mode 100644
index 000..3c1c7a28c2a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/debug/btf/btf-datasec-3.c
@@ -0,0 +1,27 @@
+/* PR debug/112849
+   Test that we do not incorrectly create BTF_KIND_DATASEC entries for
+   extern decls with no known section.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O0 -gbtf -dA" } */
+
+extern int VERSION __attribute__((section (".version")));
+
+extern int test_bss1;
+extern int test_data1;
+
+int test_bss2;
+int test_data2 = 2;
+
+int
+foo (void)
+{
+  test_bss2 = VERSION;
+  return test_bss1 + test_data1 + test_data2;
+}
+
+/* There should only be a DATASEC entries for VERSION out of the extern decls. 
 */
+/* { dg-final { scan-assembler-times "bts_type" 3 } } */
+/* { dg-final { scan-assembler-times "bts_type: \\(BTF_KIND_VAR 
'test_data2'\\)" 1 } } */
+/* { dg-final { scan-assembler-times "bts_type: \\(BTF_KIND_VAR 
'test_bss2'\\)" 1 } } */
+/* { dg-final { scan-assembler-times "bts_type: \\(BTF_KIND_VAR 'VERSION'\\)" 
1 } } */
-- 
2.42.0



[PATCH v2 0/2] Delete ISA_BASE_LA64V110 related definitions.

2023-12-04 Thread Lulu Cheng
1. Rebase Xi Ruoyao's patch to the latest commit.
https://gcc.gnu.org/pipermail/gcc-patches/2023-November/636798.html

2. remove the #if
!defined(IN_LIBGCC2) && !defined(IN_TARGET_LIBS) && !defined(IN_RTS)
guards in loongarch-def.h and loongarch-opts.h as they'll be unneeded.

3. Described in LoongArch Reference Manual v1.1:
The new functional subsets in each new version have independent identification
bits in the return value of the CPUCFG instruction. It is recommended that the
software determines the running process based on this information rather than
the version number of the Loongson architecture.

So delete the ISA_BASE_LA64V110 related definitions here.

*** BLURB HERE ***

Lulu Cheng (1):
  LoongArch: Remove the definition of ISA_BASE_LA64V110 from the code.

Xi Ruoyao (1):
  LoongArch: Switch loongarch-def from C to C++ to make it possible.

 .../loongarch/genopts/loongarch-strings   |   1 -
 gcc/config/loongarch/genopts/loongarch.opt.in |   3 -
 gcc/config/loongarch/loongarch-cpu.cc |  23 +-
 gcc/config/loongarch/loongarch-def-array.h|  40 +++
 gcc/config/loongarch/loongarch-def.c  | 227 --
 gcc/config/loongarch/loongarch-def.cc | 193 +++
 gcc/config/loongarch/loongarch-def.h  |  67 +++---
 gcc/config/loongarch/loongarch-opts.cc|  10 +-
 gcc/config/loongarch/loongarch-opts.h |   9 +-
 gcc/config/loongarch/loongarch-str.h  |   1 -
 gcc/config/loongarch/loongarch-tune.h | 123 +-
 gcc/config/loongarch/loongarch.opt|   3 -
 gcc/config/loongarch/t-loongarch  |   4 +-
 13 files changed, 405 insertions(+), 299 deletions(-)
 create mode 100644 gcc/config/loongarch/loongarch-def-array.h
 delete mode 100644 gcc/config/loongarch/loongarch-def.c
 create mode 100644 gcc/config/loongarch/loongarch-def.cc

-- 
2.31.1



[PATCH 02/17] [APX NDD] Restrict TImode register usage when NDD enabled

2023-12-04 Thread Hongyu Wang
Under APX NDD, the previous TImode allocation has an issue: it was
originally allocated using consecutive pairs, like rax:rdi, rdi:rdx.

This causes issues for all TImode NDD patterns. For NDD we do not assume
that arithmetic operations like add have a dependency between dest and
src1, so the write to the 1st highpart rdi will be overridden by the 2nd
lowpart rdi if the 2nd lowpart rdi has a different src as input; the write
to the 1st highpart rdi is then lost, causing miscompilation.

To resolve this, under TARGET_APX_NDD we only allow registers with an even
regno to be allocated for TImode, so that TImode values are allocated
in non-overlapping pairs.

There could be errors for inline assembly if it forcibly allocates an __int128
in an odd-numbered general register.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_hard_regno_mode_ok): Restrict even regno
for TImode if APX NDD enabled.
---
 gcc/config/i386/i386.cc | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 93a9cb556a5..3efeed396c4 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20873,6 +20873,16 @@ ix86_hard_regno_mode_ok (unsigned int regno, 
machine_mode mode)
return true;
   return !can_create_pseudo_p ();
 }
+  /* With TImode we previously have assumption that src1/dest will use same
+ register, so the allocation of highpart/lowpart can be consecutive, and
+ 2 TImode insn would held their low/highpart in continuous sequence like
+ rax:rdx, rdx:rcx. This will not work for APX_NDD since NDD allows
+ different registers as dest/src1, when writes to 2nd lowpart will impact
+ the writes to 1st highpart, then the insn will be optimized out. So for
+ TImode pattern if we support NDD form, the allowed register number should
+ be even to avoid such mixed high/low part override. */
+  else if (TARGET_APX_NDD && mode == TImode)
+return regno % 2 == 0;
   /* We handle both integer and floats in the general purpose registers.  */
   else if (VALID_INT_MODE_P (mode)
   || VALID_FP_MODE_P (mode))
-- 
2.31.1



[PATCH v2 2/2] LoongArch: Remove the definition of ISA_BASE_LA64V110 from the code.

2023-12-04 Thread Lulu Cheng
The instructions defined in LoongArch Reference Manual v1.1 do not constitute
an instruction set version v1.1. A CPU defined later may support only some of
the instructions in LoongArch Reference Manual v1.1. Therefore, the macro
ISA_BASE_LA64V110 and related definitions are removed here.

gcc/ChangeLog:

* config/loongarch/genopts/loongarch-strings: Delete 
STR_ISA_BASE_LA64V110.
* config/loongarch/genopts/loongarch.opt.in: Likewise.
* config/loongarch/loongarch-cpu.cc (ISA_BASE_LA64V110_FEATURES): 
Delete macro.
(fill_native_cpu_config): Define a new variable hw_isa_evolution record 
the
extended instruction set support read from cpucfg.
* config/loongarch/loongarch-def.cc: Set evolution at initialization.
* config/loongarch/loongarch-def.h (ISA_BASE_LA64V100): Delete.
(ISA_BASE_LA64V110): Likewise.
(N_ISA_BASE_TYPES): Likewise.
(defined): Likewise.
* config/loongarch/loongarch-opts.cc: Likewise.
* config/loongarch/loongarch-opts.h (TARGET_64BIT): Likewise.
(ISA_BASE_IS_LA64V110): Likewise.
* config/loongarch/loongarch-str.h (STR_ISA_BASE_LA64V110): Likewise.
* config/loongarch/loongarch.opt: Regenerate.
---
 .../loongarch/genopts/loongarch-strings   |  1 -
 gcc/config/loongarch/genopts/loongarch.opt.in |  3 ---
 gcc/config/loongarch/loongarch-cpu.cc | 23 +--
 gcc/config/loongarch/loongarch-def.cc | 14 +++
 gcc/config/loongarch/loongarch-def.h  | 12 ++
 gcc/config/loongarch/loongarch-opts.cc|  3 ---
 gcc/config/loongarch/loongarch-opts.h |  4 +---
 gcc/config/loongarch/loongarch-str.h  |  1 -
 gcc/config/loongarch/loongarch.opt|  3 ---
 9 files changed, 19 insertions(+), 45 deletions(-)

diff --git a/gcc/config/loongarch/genopts/loongarch-strings 
b/gcc/config/loongarch/genopts/loongarch-strings
index b2070c83ed0..7bc4824007e 100644
--- a/gcc/config/loongarch/genopts/loongarch-strings
+++ b/gcc/config/loongarch/genopts/loongarch-strings
@@ -30,7 +30,6 @@ STR_CPU_LA664   la664
 
 # Base architecture
 STR_ISA_BASE_LA64V100 la64
-STR_ISA_BASE_LA64V110 la64v1.1
 
 # -mfpu
 OPTSTR_ISA_EXT_FPUfpu
diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in 
b/gcc/config/loongarch/genopts/loongarch.opt.in
index 8af6cc6f532..483b185b059 100644
--- a/gcc/config/loongarch/genopts/loongarch.opt.in
+++ b/gcc/config/loongarch/genopts/loongarch.opt.in
@@ -32,9 +32,6 @@ Basic ISAs of LoongArch:
 EnumValue
 Enum(isa_base) String(@@STR_ISA_BASE_LA64V100@@) Value(ISA_BASE_LA64V100)
 
-EnumValue
-Enum(isa_base) String(@@STR_ISA_BASE_LA64V110@@) Value(ISA_BASE_LA64V110)
-
 ;; ISA extensions / adjustments
 Enum
 Name(isa_ext_fpu) Type(int)
diff --git a/gcc/config/loongarch/loongarch-cpu.cc 
b/gcc/config/loongarch/loongarch-cpu.cc
index 622df47916f..4033320d0e1 100644
--- a/gcc/config/loongarch/loongarch-cpu.cc
+++ b/gcc/config/loongarch/loongarch-cpu.cc
@@ -23,7 +23,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
-#include "tm.h"
 #include "diagnostic-core.h"
 
 #include "loongarch-def.h"
@@ -32,19 +31,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "loongarch-cpucfg-map.h"
 #include "loongarch-str.h"
 
-/* loongarch_isa_base_features defined here instead of loongarch-def.c
-   because we need to use options.h.  Pay attention on the order of elements
-   in the initializer becaue ISO C++ does not allow C99 designated
-   initializers!  */
-
-#define ISA_BASE_LA64V110_FEATURES \
-  (OPTION_MASK_ISA_DIV32 | OPTION_MASK_ISA_LD_SEQ_SA \
-   | OPTION_MASK_ISA_LAM_BH | OPTION_MASK_ISA_LAMCAS)
-
-int64_t loongarch_isa_base_features[N_ISA_BASE_TYPES] = {
-  /* [ISA_BASE_LA64V100] = */ 0,
-  /* [ISA_BASE_LA64V110] = */ ISA_BASE_LA64V110_FEATURES,
-};
 
 /* Native CPU detection with "cpucfg" */
 static uint32_t cpucfg_cache[N_CPUCFG_WORDS] = { 0 };
@@ -235,18 +221,20 @@ fill_native_cpu_config (struct loongarch_target *tgt)
   /* Use the native value anyways.  */
   preset.simd = tmp;
 
+
+  int64_t hw_isa_evolution = 0;
+
   /* Features added during ISA evolution.  */
   for (const auto &entry: cpucfg_map)
if (cpucfg_cache[entry.cpucfg_word] & entry.cpucfg_bit)
- preset.evolution |= entry.isa_evolution_bit;
+ hw_isa_evolution |= entry.isa_evolution_bit;
 
   if (native_cpu_type != CPU_NATIVE)
{
  /* Check if the local CPU really supports the features of the base
 ISA of probed native_cpu_type.  If any feature is not detected,
 either GCC or the hardware is buggy.  */
- auto base_isa_feature = loongarch_isa_base_features[preset.base];
- if ((preset.evolution & base_isa_feature) != base_isa_feature)
+ if ((preset.evolution & hw_isa_evolution) != hw_isa_evolution)
warning (0,
 "detected base architect

[PATCH v2 1/2] LoongArch: Switch loongarch-def from C to C++ to make it possible.

2023-12-04 Thread Lulu Cheng
From: Xi Ruoyao 

We'll use HOST_WIDE_INT in LoongArch static properties in following patches.

To keep the same readability as C99 designated initializers, create a
std::array like data structure with position setter function, and add
field setter functions for structs used in loongarch-def.cc.

Remove unneeded guards #if
!defined(IN_LIBGCC2) && !defined(IN_TARGET_LIBS) && !defined(IN_RTS)
in loongarch-def.h and loongarch-opts.h.


gcc/ChangeLog:

* config/loongarch/loongarch-def.h: Remove extern "C".
(loongarch_isa_base_strings): Declare as loongarch_def_array
instead of plain array.
(loongarch_isa_ext_strings): Likewise.
(loongarch_abi_base_strings): Likewise.
(loongarch_abi_ext_strings): Likewise.
(loongarch_cmodel_strings): Likewise.
(loongarch_cpu_strings): Likewise.
(loongarch_cpu_default_isa): Likewise.
(loongarch_cpu_issue_rate): Likewise.
(loongarch_cpu_multipass_dfa_lookahead): Likewise.
(loongarch_cpu_cache): Likewise.
(loongarch_cpu_align): Likewise.
(loongarch_cpu_rtx_cost_data): Likewise.
(loongarch_isa): Add a constructor and field setter functions.
* config/loongarch/loongarch-opts.h (loongarch-defs.h): Do not
include for target libraries.
* config/loongarch/loongarch-tune.h (LOONGARCH_TUNE_H): Likewise.
(struct loongarch_rtx_cost_data): Likewise.
(struct loongarch_cache): Likewise.
(struct loongarch_align): Likewise.
* config/loongarch/t-loongarch: Compile loongarch-def.cc with the
C++ compiler.
* config/loongarch/loongarch-def-array.h: New file for a
std::array like data structure with position setter function.
* config/loongarch/loongarch-def.c: Rename to ...
* config/loongarch/loongarch-def.cc: ... here.
(loongarch_cpu_strings): Define as loongarch_def_array instead
of plain array.
(loongarch_cpu_default_isa): Likewise.
(loongarch_cpu_cache): Likewise.
(loongarch_cpu_align): Likewise.
(loongarch_cpu_rtx_cost_data): Likewise.
(loongarch_cpu_issue_rate): Likewise.
(loongarch_cpu_multipass_dfa_lookahead): Likewise.
(loongarch_isa_base_strings): Likewise.
(loongarch_isa_ext_strings): Likewise.
(loongarch_abi_base_strings): Likewise.
(loongarch_abi_ext_strings): Likewise.
(loongarch_cmodel_strings): Likewise.
(abi_minimal_isa): Likewise.
(loongarch_rtx_cost_optimize_size): Use field setter functions
instead of designated initializers.
(loongarch_rtx_cost_data): Implement default constructor.
---
 gcc/config/loongarch/loongarch-def-array.h |  40 
 gcc/config/loongarch/loongarch-def.c   | 227 -
 gcc/config/loongarch/loongarch-def.cc  | 187 +
 gcc/config/loongarch/loongarch-def.h   |  55 ++---
 gcc/config/loongarch/loongarch-opts.cc |   7 +
 gcc/config/loongarch/loongarch-opts.h  |   5 +-
 gcc/config/loongarch/loongarch-tune.h  | 123 ++-
 gcc/config/loongarch/t-loongarch   |   4 +-
 8 files changed, 390 insertions(+), 258 deletions(-)
 create mode 100644 gcc/config/loongarch/loongarch-def-array.h
 delete mode 100644 gcc/config/loongarch/loongarch-def.c
 create mode 100644 gcc/config/loongarch/loongarch-def.cc

diff --git a/gcc/config/loongarch/loongarch-def-array.h 
b/gcc/config/loongarch/loongarch-def-array.h
new file mode 100644
index 000..bdb3e9c6a2b
--- /dev/null
+++ b/gcc/config/loongarch/loongarch-def-array.h
@@ -0,0 +1,40 @@
+/* A std::array like data structure for LoongArch static properties.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+.  */
+
+#ifndef _LOONGARCH_DEF_ARRAY_H
+#define _LOONGARCH_DEF_ARRAY_H 1
+
+template 
+class loongarch_def_array {
+private:
+  T arr[N];
+public:
+  loongarch_def_array () : arr{} {}
+
+  T &operator[] (int n) { return arr[n]; }
+  const T &operator[] (int n) const { return arr[n]; }
+
+  loongarch_def_array set (int idx, T &&value)
+  {
+(*this)[idx] = value;
+return *this;
+  }
+};
+
+#endif
diff --git a/gcc/config/loongarch/loongarch-def.c 
b/gcc/config/loongarch/loongarch-def.c
deleted file mode 100644
index f22d488acb2..000
--- a/gcc/config/loongarch/loongarch-def.c
+++

[PATCH 03/17] [APX NDD] Support APX NDD for optimization patterns of add

2023-12-04 Thread Hongyu Wang
From: Kong Lingling 

gcc/ChangeLog:

* config/i386/i386.md: (addsi_1_zext): Add new alternatives for
NDD and adjust output templates.
(*add_2): Likewise.
(*addsi_2_zext): Likewise.
(*add_3): Likewise.
(*addsi_3_zext): Likewise.
(*adddi_4): Likewise.
(*add_4): Likewise.
(*add_5): Likewise.
(*addv4): Likewise.
(*addv4_1): Likewise.
(*add3_cconly_overflow_1): Likewise.
(*add3_cc_overflow_1): Likewise.
(*addsi3_zext_cc_overflow_1): Likewise.
(*add3_cconly_overflow_2): Likewise.
(*add3_cc_overflow_2): Likewise.
(*addsi3_zext_cc_overflow_2): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Add more test.
---
 gcc/config/i386/i386.md | 310 +++-
 gcc/testsuite/gcc.target/i386/apx-ndd.c |  53 ++--
 2 files changed, 232 insertions(+), 131 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index cb227d19f40..2a73f6dcaec 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6476,13 +6476,15 @@ (define_insn "*add_1"
 ;; patterns constructed from addsi_1 to match.
 
 (define_insn "addsi_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r,r,r")
(zero_extend:DI
- (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,r,r")
-  (match_operand:SI 2 "x86_64_general_operand" "rBMe,0,le"
+ (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,r,r,r,rm")
+  (match_operand:SI 2 "x86_64_general_operand" 
"rBMe,0,le,rBMe,re"
(clobber (reg:CC FLAGS_REG))]
-  "TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)"
+  "TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands,
+   TARGET_APX_NDD)"
 {
+  bool use_ndd = (which_alternative == 3 || which_alternative == 4);
   switch (get_attr_type (insn))
 {
 case TYPE_LEA:
@@ -6490,11 +6492,13 @@ (define_insn "addsi_1_zext"
 
 case TYPE_INCDEC:
   if (operands[2] == const1_rtx)
-return "inc{l}\t%k0";
+return use_ndd ? "inc{l}\t{%1, %k0|%k0, %1}"
+  : "inc{l}\t%k0";
   else
 {
  gcc_assert (operands[2] == constm1_rtx);
-  return "dec{l}\t%k0";
+ return use_ndd ? "dec{l}\t{%1, %k0|%k0, %1}"
+: "dec{l}\t%k0";
}
 
 default:
@@ -6504,12 +6508,15 @@ (define_insn "addsi_1_zext"
 std::swap (operands[1], operands[2]);
 
   if (x86_maybe_negate_const_int (&operands[2], SImode))
-return "sub{l}\t{%2, %k0|%k0, %2}";
+return use_ndd ? "sub{l}\t{%2 ,%1, %k0|%k0, %1, %2}"
+  : "sub{l}\t{%2, %k0|%k0, %2}";
 
-  return "add{l}\t{%2, %k0|%k0, %2}";
+  return use_ndd ? "add{l}\t{%2 ,%1, %k0|%k0, %1, %2}"
+: "add{l}\t{%2, %k0|%k0, %2}";
 }
 }
-  [(set (attr "type")
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd")
+   (set (attr "type")
  (cond [(eq_attr "alternative" "2")
  (const_string "lea")
(match_operand:SI 2 "incdec_operand")
@@ -6811,37 +6818,42 @@ (define_insn "*add_2"
   [(set (reg FLAGS_REG)
(compare
  (plus:SWI
-   (match_operand:SWI 1 "nonimmediate_operand" "%0,0,")
-   (match_operand:SWI 2 "" ",,0"))
+   (match_operand:SWI 1 "nonimmediate_operand" "%0,0,,rm,r")
+   (match_operand:SWI 2 "" ",,0,r,"))
  (const_int 0)))
-   (set (match_operand:SWI 0 "nonimmediate_operand" "=m,,")
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=m,,,r,r")
(plus:SWI (match_dup 1) (match_dup 2)))]
   "ix86_match_ccmode (insn, CCGOCmode)
-   && ix86_binary_operator_ok (PLUS, mode, operands)"
+   && ix86_binary_operator_ok (PLUS, mode, operands, TARGET_APX_NDD)"
 {
+  bool use_ndd = (which_alternative == 3 || which_alternative == 4);
   switch (get_attr_type (insn))
 {
 case TYPE_INCDEC:
   if (operands[2] == const1_rtx)
-return "inc{}\t%0";
+return use_ndd ? "inc{}\t{%1, %0|%0, %1}"
+  : "inc{}\t%0";
   else
 {
  gcc_assert (operands[2] == constm1_rtx);
-  return "dec{}\t%0";
+ return use_ndd ? "dec{}\t{%1, %0|%0, %1}"
+: "dec{}\t%0";
}
 
 default:
   if (which_alternative == 2)
 std::swap (operands[1], operands[2]);
 
-  gcc_assert (rtx_equal_p (operands[0], operands[1]));
   if (x86_maybe_negate_const_int (&operands[2], mode))
-return "sub{}\t{%2, %0|%0, %2}";
+return use_ndd ? "sub{}\t{%2, %1, %0|%0, %1, %2}"
+  : "sub{}\t{%2, %0|%0, %2}";
 
-  return "add{}\t{%2, %0|%0, %2}";
+  return use_ndd ? "add{}\t{%2, %1, %0|%0, %1, %2}"
+: "add{}\t{%2, %0|%0, %2}";
 }
 }
-  [(set (attr "t

[PATCH 16/17] [APX NDD] Support APX NDD for cmove insns

2023-12-04 Thread Hongyu Wang
gcc/ChangeLog:

* config/i386/i386.md (*movcc_noc): Extend with new constraints
to support NDD.
(*movsicc_noc_zext): Likewise.
(*movsicc_noc_zext_1): Likewise.
(*movqicc_noc): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd-cmov.c: New test.
---
 gcc/config/i386/i386.md  | 48 
 gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c | 16 +++
 2 files changed, 45 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 0af7e82deee..853f53c2bb9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -24412,47 +24412,56 @@ (define_split
(neg:SWI (ltu:SWI (reg:CCC FLAGS_REG) (const_int 0])
 
 (define_insn "*movcc_noc"
-  [(set (match_operand:SWI248 0 "register_operand" "=r,r")
+  [(set (match_operand:SWI248 0 "register_operand" "=r,r,r,r")
(if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
   [(reg FLAGS_REG) (const_int 0)])
- (match_operand:SWI248 2 "nonimmediate_operand" "rm,0")
- (match_operand:SWI248 3 "nonimmediate_operand" "0,rm")))]
+ (match_operand:SWI248 2 "nonimmediate_operand" "rm,0,rm,r")
+ (match_operand:SWI248 3 "nonimmediate_operand" "0,rm,r,rm")))]
   "TARGET_CMOVE && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
   "@
cmov%O2%C1\t{%2, %0|%0, %2}
-   cmov%O2%c1\t{%3, %0|%0, %3}"
-  [(set_attr "type" "icmov")
+   cmov%O2%c1\t{%3, %0|%0, %3}
+   cmov%O2%C1\t{%2, %3, %0|%0, %3, %2}
+   cmov%O2%c1\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "isa" "*,*,apx_ndd,apx_ndd")
+   (set_attr "type" "icmov")
(set_attr "mode" "")])
 
 (define_insn "*movsicc_noc_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,r")
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r,r")
(if_then_else:DI (match_operator 1 "ix86_comparison_operator"
   [(reg FLAGS_REG) (const_int 0)])
  (zero_extend:DI
-   (match_operand:SI 2 "nonimmediate_operand" "rm,0"))
+   (match_operand:SI 2 "nonimmediate_operand" "rm,0,rm,r"))
  (zero_extend:DI
-   (match_operand:SI 3 "nonimmediate_operand" "0,rm"]
+   (match_operand:SI 3 "nonimmediate_operand" "0,rm,r,rm"]
   "TARGET_64BIT
&& TARGET_CMOVE && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
   "@
cmov%O2%C1\t{%2, %k0|%k0, %2}
-   cmov%O2%c1\t{%3, %k0|%k0, %3}"
-  [(set_attr "type" "icmov")
+   cmov%O2%c1\t{%3, %k0|%k0, %3}
+   cmov%O2%C1\t{%2, %3, %k0|%k0, %3, %2}
+   cmov%O2%c1\t{%3, %2, %k0|%k0, %2, %3}"
+  [(set_attr "isa" "*,*,apx_ndd,apx_ndd")
+   (set_attr "type" "icmov")
(set_attr "mode" "SI")])
 
 (define_insn "*movsicc_noc_zext_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r")
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r,r")
(zero_extend:DI
  (if_then_else:SI (match_operator 1 "ix86_comparison_operator"
 [(reg FLAGS_REG) (const_int 0)])
-(match_operand:SI 2 "nonimmediate_operand" "rm,0")
-(match_operand:SI 3 "nonimmediate_operand" "0,rm"]
+(match_operand:SI 2 "nonimmediate_operand" "rm,0,rm,r")
+(match_operand:SI 3 "nonimmediate_operand" "0,rm,r,rm"]
   "TARGET_64BIT
&& TARGET_CMOVE && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
   "@
cmov%O2%C1\t{%2, %k0|%k0, %2}
-   cmov%O2%c1\t{%3, %k0|%k0, %3}"
-  [(set_attr "type" "icmov")
+   cmov%O2%c1\t{%3, %k0|%k0, %3}
+   cmov%O2%C1\t{%2, %3, %k0|%k0, %3, %2}
+   cmov%O2%c1\t{%3, %2, %k0|%k0, %2, %3}"
+  [(set_attr "isa" "*,*,apx_ndd,apx_ndd")
+   (set_attr "type" "icmov")
(set_attr "mode" "SI")])
 
 
@@ -24477,14 +24486,15 @@ (define_split
 })
 
 (define_insn "*movqicc_noc"
-  [(set (match_operand:QI 0 "register_operand" "=r,r")
+  [(set (match_operand:QI 0 "register_operand" "=r,r,r")
(if_then_else:QI (match_operator 1 "ix86_comparison_operator"
   [(reg FLAGS_REG) (const_int 0)])
- (match_operand:QI 2 "register_operand" "r,0")
- (match_operand:QI 3 "register_operand" "0,r")))]
+ (match_operand:QI 2 "register_operand" "r,0,r")
+ (match_operand:QI 3 "register_operand" "0,r,r")))]
   "TARGET_CMOVE && !TARGET_PARTIAL_REG_STALL"
   "#"
-  [(set_attr "type" "icmov")
+  [(set_attr "isa" "*,*,apx_ndd")
+   (set_attr "type" "icmov")
(set_attr "mode" "QI")])
 
 (define_split
diff --git a/gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c 
b/gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c
new file mode 100644
index 000..459dc965342
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -m64 -mapxf" } */
+/* { dg-final { scan-assembler-times "cmove\[^\n\r]*, %eax" 1 } } */
+/* 

[PATCH 04/17] [APX NDD] Disable seg_prefixed memory usage for NDD add

2023-12-04 Thread Hongyu Wang
NDD uses the EVEX prefix, so when a segment prefix is also applied, the instruction
could exceed its 15-byte length limit, especially when adding immediates. This can
happen when the "e" constraint accepts any UNSPEC_TPOFF/UNSPEC_NTPOFF constant, which
adds the offset to the segment register and is encoded using a segment prefix.
Disable the use of those *POFF constants in NDD add alternatives with a new constraint.

gcc/ChangeLog:

* config/i386/constraints.md (je): New constraint.
* config/i386/i386-protos.h (x86_poff_operand_p): New prototype.
* config/i386/i386.cc (x86_poff_operand_p): New function to
check any *POFF constant in operand.
* config/i386/i386.md (*add_1): Split out je alternative for add.
---
 gcc/config/i386/constraints.md |  5 +
 gcc/config/i386/i386-protos.h  |  1 +
 gcc/config/i386/i386.cc| 25 +
 gcc/config/i386/i386.md| 10 +-
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index cbee31fa40a..f4c3c3dd952 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -433,3 +433,8 @@ (define_address_constraint "jb"
 
 (define_register_constraint  "jc"
  "TARGET_APX_EGPR && !TARGET_AVX ? GENERAL_GPR16 : GENERAL_REGS")
+
+(define_constraint  "je"
+  "@internal constant that do not allow any unspec global offsets"
+  (and (match_operand 0 "x86_64_immediate_operand")
+   (match_test "!x86_poff_operand_p (op)")))
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index a9d0c568bba..7dfeb6af225 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -66,6 +66,7 @@ extern bool x86_extended_QIreg_mentioned_p (rtx_insn *);
 extern bool x86_extended_reg_mentioned_p (rtx);
 extern bool x86_extended_rex2reg_mentioned_p (rtx);
 extern bool x86_evex_reg_mentioned_p (rtx [], int);
+extern bool x86_poff_operand_p (rtx);
 extern bool x86_maybe_negate_const_int (rtx *, machine_mode);
 extern machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3efeed396c4..3e670330ef6 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23341,6 +23341,31 @@ x86_evex_reg_mentioned_p (rtx operands[], int nops)
   return false;
 }
 
+/* Return true when rtx OPERAND contains an UNSPEC_*POFF related constant,
+   which could make APX_NDD instructions exceed the encoding length limit.  */
+bool
+x86_poff_operand_p (rtx operand)
+{
+  if (GET_CODE (operand) == CONST)
+{
+  rtx op = XEXP (operand, 0);
+  if (GET_CODE (op) == PLUS)
+   op = XEXP (op, 0);
+   
+  if (GET_CODE (op) == UNSPEC)
+   {
+ int unspec = XINT (op, 1);
+ return (unspec == UNSPEC_NTPOFF
+ || unspec == UNSPEC_TPOFF
+ || unspec == UNSPEC_DTPOFF
+ || unspec == UNSPEC_GOTTPOFF
+ || unspec == UNSPEC_GOTNTPOFF
+ || unspec == UNSPEC_INDNTPOFF);
+   }
+}
+  return false;
+}
+
 /* If profitable, negate (without causing overflow) integer constant
of mode MODE at location LOC.  Return true in this case.  */
 bool
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 2a73f6dcaec..6b316e698bb 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6415,15 +6415,15 @@ (define_insn_and_split "*add3_doubleword_concat_zext"
  "split_double_mode (mode, &operands[0], 1, &operands[0], &operands[5]);")
 
 (define_insn "*add_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r,r,r")
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r,r,r,r,r")
(plus:SWI48
- (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r,rm,r")
- (match_operand:SWI48 2 "x86_64_general_operand" "re,BM,0,le,re,BM")))
+ (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r,rm,r,m,r")
+ (match_operand:SWI48 2 "x86_64_general_operand" "re,BM,0,le,r,e,je,BM")))
(clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (PLUS, mode, operands,
TARGET_APX_NDD)"
 {
-  bool use_ndd = (which_alternative == 4 || which_alternative == 5);
+  bool use_ndd = (which_alternative >= 4);
   switch (get_attr_type (insn))
 {
 case TYPE_LEA:
@@ -6454,7 +6454,7 @@ (define_insn "*add_1"
: "add{}\t{%2, %0|%0, %2}";
 }
 }
-  [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd")
+  [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd")
(set (attr "type")
  (cond [(eq_attr "alternative" "3")
   (const_string "lea")
-- 
2.31.1



  1   2   >