[PATCH v4] RISC-V: Fix one bug for floating-point static frm

2023-07-05 Thread Pan Li via Gcc-patches
From: Pan Li 

This patch would like to fix one bug to align below items of spec.

1. By default, the RVV floating-point will take dyn mode.
2. DYN is invalid in FRM register for RVV floating-point.

When mode switching the function entry and exit, it will take DYN as
the frm mode.

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_emit_mode_set): Avoid emit insn
when FRM_MODE_DYN.
(riscv_mode_entry): Take FRM_MODE_DYN as entry mode.
(riscv_mode_exit): Likewise for exit mode.
(riscv_mode_needed): Likewise for needed mode.
(riscv_mode_after): Likewise for after mode.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-frm-insert-6.c: New test.
---
 gcc/config/riscv/riscv.cc | 16 +++---
 .../riscv/rvv/base/float-point-frm-insert-6.c | 31 +++
 2 files changed, 42 insertions(+), 5 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index e4dc8115e69..4db32de5696 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7670,7 +7670,7 @@ riscv_emit_mode_set (int entity, int mode, int prev_mode,
emit_insn (gen_vxrmsi (gen_int_mode (mode, SImode)));
   break;
 case RISCV_FRM:
-  if (mode != FRM_MODE_NONE && mode != prev_mode)
+  if (mode != FRM_MODE_DYN && mode != prev_mode)
{
  rtx scaler = gen_reg_rtx (SImode);
  rtx imm = gen_int_mode (mode, SImode);
@@ -7697,7 +7697,9 @@ riscv_mode_needed (int entity, rtx_insn *insn)
 case RISCV_VXRM:
   return code >= 0 ? get_attr_vxrm_mode (insn) : VXRM_MODE_NONE;
 case RISCV_FRM:
-  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_NONE;
+  /* According to RVV 1.0 spec, all vector floating-point operations use
+the dynamic rounding mode in the frm register.  */
+  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
@@ -7757,7 +7759,7 @@ riscv_mode_after (int entity, int mode, rtx_insn *insn)
 case RISCV_FRM:
   return riscv_entity_mode_after (FRM_REGNUM, insn, mode,
  (int (*)(rtx_insn *)) get_attr_frm_mode,
- FRM_MODE_NONE);
+ FRM_MODE_DYN);
 default:
   gcc_unreachable ();
 }
@@ -7774,7 +7776,9 @@ riscv_mode_entry (int entity)
 case RISCV_VXRM:
   return VXRM_MODE_NONE;
 case RISCV_FRM:
-  return FRM_MODE_NONE;
+  /* According to RVV 1.0 spec, all vector floating-point operations use
+the dynamic rounding mode in the frm register.  */
+  return FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
@@ -7791,7 +7795,9 @@ riscv_mode_exit (int entity)
 case RISCV_VXRM:
   return VXRM_MODE_NONE;
 case RISCV_FRM:
-  return FRM_MODE_NONE;
+  /* According to RVV 1.0 spec, all vector floating-point operations use
+the dynamic rounding mode in the frm register.  */
+  return FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
new file mode 100644
index 000..6d896e0953e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat32m1_t
+test_riscv_vfadd_vv_f32m1_rm (vfloat32m1_t op1, vfloat32m1_t op2, size_t vl) {
+  return __riscv_vfadd_vv_f32m1_rm (op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vv_f32m1_m_rm(vbool32_t mask, vfloat32m1_t op1, vfloat32m1_t op2,
+size_t vl) {
+  return __riscv_vfadd_vv_f32m1_m_rm(mask, op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vf_f32m1_rm(vfloat32m1_t op1, float32_t op2, size_t vl) {
+  return __riscv_vfadd_vf_f32m1_rm(op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vf_f32m1_m_rm(vbool32_t mask, vfloat32m1_t op1, float32_t op2,
+size_t vl) {
+  return __riscv_vfadd_vf_f32m1_m_rm(mask, op1, op2, 7, vl);
+}
+
+/* { dg-final { scan-assembler-times 
{vfadd\.v[vf]\s+v[0-9]+,\s*v[0-9]+,\s*[fav]+[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-not {fsrm\s+[ax][0-9]+,\s*[ax][0-9]+} } } */
-- 
2.34.1



Re: [PATCH v4] RISC-V: Fix one bug for floating-point static frm

2023-07-05 Thread Kito Cheng via Gcc-patches
LGTM, thanks :)

On Wed, Jul 5, 2023 at 3:03 PM Pan Li via Gcc-patches
 wrote:
>
> From: Pan Li 
>
> This patch would like to fix one bug to align below items of spec.
>
> 1. By default, the RVV floating-point will take dyn mode.
> 2. DYN is invalid in FRM register for RVV floating-point.
>
> When mode switching the function entry and exit, it will take DYN as
> the frm mode.
>
> Signed-off-by: Pan Li 
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.cc (riscv_emit_mode_set): Avoid emit insn
> when FRM_MODE_DYN.
> (riscv_mode_entry): Take FRM_MODE_DYN as entry mode.
> (riscv_mode_exit): Likewise for exit mode.
> (riscv_mode_needed): Likewise for needed mode.
> (riscv_mode_after): Likewise for after mode.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/float-point-frm-insert-6.c: New test.
> ---
>  gcc/config/riscv/riscv.cc | 16 +++---
>  .../riscv/rvv/base/float-point-frm-insert-6.c | 31 +++
>  2 files changed, 42 insertions(+), 5 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index e4dc8115e69..4db32de5696 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7670,7 +7670,7 @@ riscv_emit_mode_set (int entity, int mode, int 
> prev_mode,
> emit_insn (gen_vxrmsi (gen_int_mode (mode, SImode)));
>break;
>  case RISCV_FRM:
> -  if (mode != FRM_MODE_NONE && mode != prev_mode)
> +  if (mode != FRM_MODE_DYN && mode != prev_mode)
> {
>   rtx scaler = gen_reg_rtx (SImode);
>   rtx imm = gen_int_mode (mode, SImode);
> @@ -7697,7 +7697,9 @@ riscv_mode_needed (int entity, rtx_insn *insn)
>  case RISCV_VXRM:
>return code >= 0 ? get_attr_vxrm_mode (insn) : VXRM_MODE_NONE;
>  case RISCV_FRM:
> -  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_NONE;
> +  /* According to RVV 1.0 spec, all vector floating-point operations use
> +the dynamic rounding mode in the frm register.  */
> +  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_DYN;
>  default:
>gcc_unreachable ();
>  }
> @@ -7757,7 +7759,7 @@ riscv_mode_after (int entity, int mode, rtx_insn *insn)
>  case RISCV_FRM:
>return riscv_entity_mode_after (FRM_REGNUM, insn, mode,
>   (int (*)(rtx_insn *)) get_attr_frm_mode,
> - FRM_MODE_NONE);
> + FRM_MODE_DYN);
>  default:
>gcc_unreachable ();
>  }
> @@ -7774,7 +7776,9 @@ riscv_mode_entry (int entity)
>  case RISCV_VXRM:
>return VXRM_MODE_NONE;
>  case RISCV_FRM:
> -  return FRM_MODE_NONE;
> +  /* According to RVV 1.0 spec, all vector floating-point operations use
> +the dynamic rounding mode in the frm register.  */
> +  return FRM_MODE_DYN;
>  default:
>gcc_unreachable ();
>  }
> @@ -7791,7 +7795,9 @@ riscv_mode_exit (int entity)
>  case RISCV_VXRM:
>return VXRM_MODE_NONE;
>  case RISCV_FRM:
> -  return FRM_MODE_NONE;
> +  /* According to RVV 1.0 spec, all vector floating-point operations use
> +the dynamic rounding mode in the frm register.  */
> +  return FRM_MODE_DYN;
>  default:
>gcc_unreachable ();
>  }
> diff --git 
> a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
> new file mode 100644
> index 000..6d896e0953e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
> +
> +#include "riscv_vector.h"
> +
> +typedef float float32_t;
> +
> +vfloat32m1_t
> +test_riscv_vfadd_vv_f32m1_rm (vfloat32m1_t op1, vfloat32m1_t op2, size_t vl) 
> {
> +  return __riscv_vfadd_vv_f32m1_rm (op1, op2, 7, vl);
> +}
> +
> +vfloat32m1_t
> +test_vfadd_vv_f32m1_m_rm(vbool32_t mask, vfloat32m1_t op1, vfloat32m1_t op2,
> +size_t vl) {
> +  return __riscv_vfadd_vv_f32m1_m_rm(mask, op1, op2, 7, vl);
> +}
> +
> +vfloat32m1_t
> +test_vfadd_vf_f32m1_rm(vfloat32m1_t op1, float32_t op2, size_t vl) {
> +  return __riscv_vfadd_vf_f32m1_rm(op1, op2, 7, vl);
> +}
> +
> +vfloat32m1_t
> +test_vfadd_vf_f32m1_m_rm(vbool32_t mask, vfloat32m1_t op1, float32_t op2,
> +size_t vl) {
> +  return __riscv_vfadd_vf_f32m1_m_rm(mask, op1, op2, 7, vl);
> +}
> +
> +/* { dg-final { scan-assembler-times 
> {vfadd\.v[vf]\s+v[0-9]+,\s*v[0-9]+,\s*[fav]+[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-not {fsrm\s+[ax][0-9]+,\s*[ax][0-9]+} } } */
> --
> 2.34.1
>


Re: [PATCH v4] RISC-V: Fix one bug for floating-point static frm

2023-07-05 Thread Robin Dapp via Gcc-patches
> LGTM, thanks :)

just a moment please, I still wanted to reply ;)

Regards
 Robin



Re: [PATCH v4] RISC-V: Fix one bug for floating-point static frm

2023-07-05 Thread Kito Cheng via Gcc-patches
On Wed, Jul 5, 2023 at 3:12 PM Robin Dapp via Gcc-patches
 wrote:
>
> > LGTM, thanks :)
>
> just a moment please, I still wanted to reply ;)

Sure :)

>
> Regards
>  Robin
>


RE: [PATCH v4] RISC-V: Fix one bug for floating-point static frm

2023-07-05 Thread Li, Pan2 via Gcc-patches
Thanks Robin, it passed all tests of riscv.exp and rvv.exp on my side. Could 
you please double-check whether the issue you encountered is resolved?

Pan

-----Original Message-----
From: Robin Dapp  
Sent: Wednesday, July 5, 2023 3:11 PM
To: Kito Cheng ; Li, Pan2 
Cc: rdapp@gmail.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; 
jeffreya...@gmail.com; Wang, Yanzhang 
Subject: Re: [PATCH v4] RISC-V: Fix one bug for floating-point static frm

> LGTM, thanks :)

just a moment please, I still wanted to reply ;)

Regards
 Robin



Re: [PATCH v1] RISC-V: Use FRM_DYN when add the rounding mode operand

2023-07-05 Thread Kito Cheng via Gcc-patches
LGTM

On Wed, Jul 5, 2023 at 10:08 AM juzhe.zh...@rivai.ai
 wrote:
>
> LGTM.
>
>
>
> juzhe.zh...@rivai.ai
>
> From: pan2.li
> Date: 2023-07-04 20:26
> To: gcc-patches
> CC: juzhe.zhong; rdapp.gcc; jeffreyalaw; pan2.li; yanzhang.wang; kito.cheng
> Subject: [PATCH v1] RISC-V: Use FRM_DYN when add the rounding mode operand
> From: Pan Li 
>
> This patch would like to take FRM_DYN const rtx as the rounding mode
> operand according to the RVV spec, which takes the dyn as the only
> rounding mode for floating-point.
>
> Signed-off-by: Pan Li 
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vector-builtins.cc
> (function_expander::use_exact_insn): Use FRM_DYN instead of const0.
> ---
> gcc/config/riscv/riscv-vector-builtins.cc | 7 +++
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/riscv/riscv-vector-builtins.cc 
> b/gcc/config/riscv/riscv-vector-builtins.cc
> index 648c765a5d1..3a53b56effa 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins.cc
> @@ -3569,11 +3569,10 @@ function_expander::use_exact_insn (insn_code icode)
>if (base->has_rounding_mode_operand_p ())
>  add_input_operand (call_expr_nargs (exp) - 2);
> -  /* TODO: Currently, we don't support intrinsic that is modeling rounding 
> mode.
> - We add default rounding mode for the intrinsics that didn't model 
> rounding
> - mode yet.  */
> +  /* The RVV floating-point only support dynamic rounding mode in the
> + FRM register.  */
>if (opno != insn_data[icode].n_generator_args)
> -add_input_operand (Pmode, const0_rtx);
> +add_input_operand (Pmode, gen_int_mode (riscv_vector::FRM_DYN, Pmode));
>return generate_insn (icode);
> }
> --
> 2.34.1
>
>


Re: [PATCH] middle-end/110541 - VEC_PERM_EXPR documentation is off

2023-07-05 Thread Richard Sandiford via Gcc-patches
Richard Biener via Gcc-patches  writes:
> The following adjusts the tree.def documentation about VEC_PERM_EXPR
> which wasn't adjusted when the restrictions of permutes with constant
> mask were relaxed.

I was going to complain about having two copies of the documentation,
but then I realised that generic.texi doesn't document VEC_PERM_EXPR.
So... oops.

>
> OK?
>
> Thanks,
> Richard.
>
>   PR middle-end/110541
>   * tree.def (VEC_PERM_EXPR): Adjust documentation to reflect
>   reality.
> ---
>  gcc/tree.def | 19 +--
>  1 file changed, 13 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/tree.def b/gcc/tree.def
> index 1fc2ca7a724..9e1a54ac2f9 100644
> --- a/gcc/tree.def
> +++ b/gcc/tree.def
> @@ -565,13 +565,20 @@ DEFTREECODE (VEC_COND_EXPR, "vec_cond_expr", 
> tcc_expression, 3)
>  
> N = length(mask)
> foreach i in N:
> - M = mask[i] % (2*N)
> - A = M < N ? v0[M] : v1[M-N]
> + M = mask[i] % (length(v0) + length(v1))
> + A[i] = M < length(v0) ? v0[M] : v1[M - length(v0)]
>  
> -   V0 and V1 are vectors of the same type.  MASK is an integer-typed
> -   vector.  The number of MASK elements must be the same with the
> -   number of elements in V0 and V1.  The size of the inner type
> -   of the MASK and of the V0 and V1 must be the same.
> +   V0 and V1 are vectors of the same type.
> +
> +   When MASK is not constant:
> + MASK is an integer-typed vector.  The number of MASK elements must
> + be the same with the number of elements in V0 and V1.  The size of

Preexisting, but s/same with/same as/

> + the inner type of the MASK and of the V0 and V1 must be the same.
> +
> +   When MASK is constant:
> + MASK is an integer-typed vector.   MASK elements outside of
> + [0, length(V0) + length(V1) - 1] invoke undefined behavior (the
> + modulo operation above doesn't apply).

I don't remember the last rule.  I thought the modulo did still apply.
(But the canonical form is to remove obvious modulo opportunities.)

E.g. a VLA reverse-and-rotate pattern might have { N-2, N-3, N-4, ... }.
That will wrap at the final position to 2N-1, but that seems OK.

LGTM otherwise FWIW.

Thanks,
Richard


Re: [PATCH] middle-end/110541 - VEC_PERM_EXPR documentation is off

2023-07-05 Thread Richard Biener via Gcc-patches
On Wed, 5 Jul 2023, Richard Sandiford wrote:

> Richard Biener via Gcc-patches  writes:
> > The following adjusts the tree.def documentation about VEC_PERM_EXPR
> > which wasn't adjusted when the restrictions of permutes with constant
> > mask were relaxed.
> 
> I was going to complain about having two copies of the documentation,
> but then I realised that generic.texi doesn't document VEC_PERM_EXPR.
> So... oops.

Yeah, also noticed that ...

> >
> > OK?
> >
> > Thanks,
> > Richard.
> >
> > PR middle-end/110541
> > * tree.def (VEC_PERM_EXPR): Adjust documentation to reflect
> > reality.
> > ---
> >  gcc/tree.def | 19 +--
> >  1 file changed, 13 insertions(+), 6 deletions(-)
> >
> > diff --git a/gcc/tree.def b/gcc/tree.def
> > index 1fc2ca7a724..9e1a54ac2f9 100644
> > --- a/gcc/tree.def
> > +++ b/gcc/tree.def
> > @@ -565,13 +565,20 @@ DEFTREECODE (VEC_COND_EXPR, "vec_cond_expr", 
> > tcc_expression, 3)
> >  
> > N = length(mask)
> > foreach i in N:
> > - M = mask[i] % (2*N)
> > - A = M < N ? v0[M] : v1[M-N]
> > + M = mask[i] % (length(v0) + length(v1))
> > + A[i] = M < length(v0) ? v0[M] : v1[M - length(v0)]
> >  
> > -   V0 and V1 are vectors of the same type.  MASK is an integer-typed
> > -   vector.  The number of MASK elements must be the same with the
> > -   number of elements in V0 and V1.  The size of the inner type
> > -   of the MASK and of the V0 and V1 must be the same.
> > +   V0 and V1 are vectors of the same type.
> > +
> > +   When MASK is not constant:
> > + MASK is an integer-typed vector.  The number of MASK elements must
> > + be the same with the number of elements in V0 and V1.  The size of
> 
> Preexisting, but s/same with/same as/

Fixed.

> > + the inner type of the MASK and of the V0 and V1 must be the same.
> > +
> > +   When MASK is constant:
> > + MASK is an integer-typed vector.   MASK elements outside of
> > + [0, length(V0) + length(V1) - 1] invoke undefined behavior (the
> > + modulo operation above doesn't apply).
> 
> I don't remember the last rule.  I thought the modulo did still apply.
> (But the canonical form is to remove obvious modulo opportunities.)
> 
> E.g. a VLA reverse-and-rotate pattern might have { N-2, N-3, N-4, ... }.
> That will wrap at the final position to 2N-1, but that seems OK.

OK, I'll remove that sentence.

Pushed as follows.

Richard.

>From 38f2d33e5119e6ae39f2702caced7e9b224cbc4f Mon Sep 17 00:00:00 2001
From: Richard Biener 
Date: Wed, 5 Jul 2023 08:53:01 +0200
Subject: [PATCH] middle-end/110541 - VEC_PERM_EXPR documentation is off
To: gcc-patches@gcc.gnu.org

The following adjusts the tree.def documentation about VEC_PERM_EXPR
which wasn't adjusted when the restrictions of permutes with constant
mask were relaxed.

PR middle-end/110541
* tree.def (VEC_PERM_EXPR): Adjust documentation to reflect
reality.
---
 gcc/tree.def | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/gcc/tree.def b/gcc/tree.def
index 1fc2ca7a724..be94b7ece0a 100644
--- a/gcc/tree.def
+++ b/gcc/tree.def
@@ -565,13 +565,18 @@ DEFTREECODE (VEC_COND_EXPR, "vec_cond_expr", 
tcc_expression, 3)
 
N = length(mask)
foreach i in N:
- M = mask[i] % (2*N)
- A = M < N ? v0[M] : v1[M-N]
+ M = mask[i] % (length(v0) + length(v1))
+ A[i] = M < length(v0) ? v0[M] : v1[M - length(v0)]
 
-   V0 and V1 are vectors of the same type.  MASK is an integer-typed
-   vector.  The number of MASK elements must be the same with the
-   number of elements in V0 and V1.  The size of the inner type
-   of the MASK and of the V0 and V1 must be the same.
+   V0 and V1 are vectors of the same type.
+
+   When MASK is not constant:
+ MASK is an integer-typed vector.  The number of MASK elements must
+ be the same as the number of elements in V0 and V1.  The size of
+ the inner type of the MASK and of the V0 and V1 must be the same.
+
+   When MASK is constant:
+ MASK is an integer-typed vector.
 */
 DEFTREECODE (VEC_PERM_EXPR, "vec_perm_expr", tcc_expression, 3)
 
-- 
2.35.3



GTY: Explicitly reject 'string_length' option for (fields in) global variables (was: [PATCH] pch: Fix streaming of strings with embedded null bytes)

2023-07-05 Thread Thomas Schwinge
Hi!

On 2022-10-18T18:14:54-0400, Lewis Hyatt via Gcc-patches 
 wrote:
> [...] add a new
> GTY option "string_length" so that gt_pch_note_object can be informed the
> actual length it ought to use, [...]

> --- a/gcc/doc/gty.texi
> +++ b/gcc/doc/gty.texi
> @@ -196,7 +196,26 @@ static GTY((length("reg_known_value_size"))) rtx 
> *reg_known_value;
>  Note that the @code{length} option is only meant for use with arrays of
>  non-atomic objects, that is, objects that contain pointers pointing to
>  other GTY-managed objects.  For other GC-allocated arrays and strings
> -you should use @code{atomic}.
> +you should use @code{atomic} or @code{string_length}.
> +
> +@findex string_length
> +@item string_length ("@var{expression}")
> +
> +In order to simplify production of PCH, a structure member that is a plain
> +array of bytes (an optionally @code{const} and/or @code{unsigned} @code{char
> +*}) is treated specially by the infrastructure. Even if such an array has not
> +been allocated in GC-controlled memory, it will still be written properly 
> into
> +a PCH.  The machinery responsible for this needs to know the length of the
> +data; by default, the length is determined by calling @code{strlen} on the
> +pointer.  The @code{string_length} option specifies an alternate way to
> +determine the length, such as by inspecting another struct member:
> +
> +@smallexample
> +struct GTY(()) non_terminated_string @{
> +  size_t sz;
> +  const char * GTY((string_length ("%h.sz"))) data;
> +@};
> +@end smallexample

In preparation for another thing I'm working on, OK to push the attached
"GTY: Explicitly reject 'string_length' option for (fields in) global variables"
(with  pointing to this message)?


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From 9130fe7873c2e1b44ab2449bfe022837e26f710c Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 4 Jul 2023 11:46:50 +0200
Subject: [PATCH] GTY: Explicitly reject 'string_length' option for (fields in)
 global variables

This is preparational for another thing that I'm working on.  No change in
behavior -- other than a more explicit error message.

The 'string_length' option currently is not supported for (fields in) global
variables.  For example, if we apply the following (made-up) changes:

--- gcc/c-family/c-cppbuiltin.cc
+++ gcc/c-family/c-cppbuiltin.cc
@@ -1777 +1777 @@ struct GTY(()) lazy_hex_fp_value_struct
-  const char *hex_str;
+  const char * GTY((string_length("strlen(%h.hex_str) + 1"))) hex_str;

--- gcc/varasm.cc
+++ gcc/varasm.cc
@@ -66 +66 @@ along with GCC; see the file COPYING3.  If not see
-extern GTY(()) const char *first_global_object_name;
+extern GTY((string_length("strlen(%h.first_global_object_name) + 1"))) const char *first_global_object_name;

..., we get:

[...]
build/gengtype  \
-S [...]/source-gcc/gcc -I gtyp-input.list -w tmp-gtype.state
/bin/sh [...]/source-gcc/gcc/../move-if-change tmp-gtype.state gtype.state
build/gengtype  \
-r gtype.state
[...]/source-gcc/gcc/varasm.cc:66: global `first_global_object_name' has unknown option `string_length'
[...]/source-gcc/gcc/c-family/c-cppbuiltin.cc:1789: field `hex_str' of global `lazy_hex_fp_values[0]' has unknown option `string_length'
make[2]: *** [Makefile:2890: s-gtype] Error 1
[...]

These errors occur when writing "GC roots", where -- per my understanding --
'string_length' isn't relevant for actual GC purposes.  However, like
elsewhere, it is for PCH purposes, and simply accepting 'string_length' here
isn't sufficient: we'll still get '(gt_pointer_walker) &gt_pch_n_S' used in the
'struct ggc_root_tab' instances, and there's no easy way to change that to
instead use 'gt_pch_n_S2' with explicit 'size_t string_len' argument.  (At
least not sufficiently easy to justify spending any further time on, given that
I don't have an actual use for that feature.)

So, until an actual need arises, and/or to avoid the next person looking into
this having to figure out the same thing again, let's just document this
limitation:

[...]/source-gcc/gcc/varasm.cc:66: option `string_length' not supported for global `first_global_object_name'
[...]/source-gcc/gcc/c-family/c-cppbuiltin.cc:1789: option `string_length' not supported for field `hex_str' of global `lazy_hex_fp_values[0]'

This amends commit f3b957ea8b9dadfb1ed30f24f463529684b7a36a
"pch: Fix streaming of strings with embedded null bytes".

	gcc/
	* gengtype.cc (write_root, write_roots): Explicitly reject
	'string_length' option.
	* doc/gty.texi (GTY Options) <string_length>: Document.
---
 gcc/doc/gty.texi |  4 
 gcc/gengtype.cc  | 10 ++
 2 files changed, 14 insert

[PATCH] x86: suppress avx512f-copysign.c testcase for 32-bit

2023-07-05 Thread Jan Beulich via Gcc-patches
The test installed by "x86: make VPTERNLOG* usable on less than 512-bit
operands with just AVX512F" won't succeed on 32-bit, because floating-point
operations are done there (by default) without using SIMD insns.

gcc/testsuite/
* gcc.target/i386/avx512f-copysign.c: Suppress for 32-bit.
---
Committing right away based on previous communication with maintainer.

--- a/gcc/testsuite/gcc.target/i386/avx512f-copysign.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c
@@ -1,4 +1,4 @@
-/* { dg-do compile } */
+/* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
 /* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ 
\\t\]+\\\$(?:216|228|0xd8|0xe4)," 5 } } */
 


GTY: Enhance 'string_length' option documentation (was: 'unsigned int len' field in 'libcpp/include/symtab.h:struct ht_identifier' (was: [PATCH] pch: Fix streaming of strings with embedded null bytes)

2023-07-05 Thread Thomas Schwinge
Hi!

On 2023-07-04T15:56:23-0400, Lewis Hyatt via Gcc-patches 
 wrote:
> On Tue, Jul 4, 2023 at 11:50 AM Thomas Schwinge  
> wrote:
>> I came across this one here on my way working through another (somewhat
>> related) GTY issue.  I generally do understand the issue here, but do
>> have a question about 'unsigned int len' field in
>> 'libcpp/include/symtab.h:struct ht_identifier': [...]

> I don't think there is currently any possibility for a null byte to
> end up in an ht_identifier's string. I assumed that ht_identifier
> stores the length as an optimization (especially since it doesn't take
> up any extra space on 64-bit platforms, given the 32-bit hash code is
> stored as well there.) I created the string_length GTY markup mainly
> to support another patch that I have still pending review, which I
> thought would increase the likelihood of PCH needing to handle null
> bytes in general. When I did that, I added the markup to ht_identifier
> simply because the length was already there, so there was no reason
> not to add it. It does save a few cycles when streaming out the PCH,
> but I doubt it is meaningful.

Thanks for confirming.  OK thus to push the attached
"GTY: Enhance 'string_length' option documentation"?


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From a31b6657c26ac70c6e03b8ad81cdcb873f905716 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Wed, 5 Jul 2023 08:38:49 +0200
Subject: [PATCH] GTY: Enhance 'string_length' option documentation

We're (currently) not aware of any actual use of 'ht_identifier's with NUL
characters embedded; its 'len' field appears to exist for optimization
purposes, since "forever".  Before 'struct ht_identifier' was added in
commit 2a967f3d3a45294640e155381ef549e0b8090ad4 (Subversion r42334), we had in
'gcc/cpplib.h:struct cpp_hashnode': 'unsigned short len', or earlier 'length',
earlier in 'gcc/cpphash.h:struct hashnode': 'unsigned short length', earlier
'size_t length' with comment: "length of token, for quick comparison", earlier
'int length', ever since the 'gcc/cpp*' files were added in
commit 7f2935c734c36f84ab62b20a04de465e19061333 (Subversion r9191).

This amends commit f3b957ea8b9dadfb1ed30f24f463529684b7a36a
"pch: Fix streaming of strings with embedded null bytes".

	gcc/
	* doc/gty.texi (GTY Options) <string_length>: Enhance.
	libcpp/
	* include/symtab.h (struct ht_identifier): Document different
	rationale.
---
 gcc/doc/gty.texi| 11 +++
 libcpp/include/symtab.h |  4 +---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/gcc/doc/gty.texi b/gcc/doc/gty.texi
index 7bd064b5781..15f9fa07405 100644
--- a/gcc/doc/gty.texi
+++ b/gcc/doc/gty.texi
@@ -217,6 +217,17 @@ struct GTY(()) non_terminated_string @{
 @};
 @end smallexample
 
+Similarly, this is useful for (regular NUL-terminated) strings with
+NUL characters embedded (that the default @code{strlen} use would run
+afoul of):
+
+@smallexample
+struct GTY(()) multi_string @{
+  const char * GTY((string_length ("%h.len + 1"))) str;
+  size_t len;
+@};
+@end smallexample
+
 The @code{string_length} option currently is not supported for (fields
 in) global variables.
 @c 
diff --git a/libcpp/include/symtab.h b/libcpp/include/symtab.h
index c7ccc6db9f0..0c713f2ad30 100644
--- a/libcpp/include/symtab.h
+++ b/libcpp/include/symtab.h
@@ -29,9 +29,7 @@ along with this program; see the file COPYING3.  If not see
 typedef struct ht_identifier ht_identifier;
 typedef struct ht_identifier *ht_identifier_ptr;
 struct GTY(()) ht_identifier {
-  /* This GTY markup arranges that the null-terminated identifier would still
- stream to PCH correctly, if a null byte were to make its way into an
- identifier somehow.  */
+  /* We know the 'len'gth of the 'str'ing; use it in the GTY markup.  */
   const unsigned char * GTY((string_length ("1 + %h.len"))) str;
   unsigned int len;
   unsigned int hash_value;
-- 
2.34.1



[PATCH 0/2] x86: vec_extract_* adjustments

2023-07-05 Thread Jan Beulich via Gcc-patches
1: correct / simplify @vec_extract_hi_<mode> and vec_extract_hi_v32qi
2: slightly correct / simplify *vec_extractv2ti

Jan


[PATCH 1/2] x86: correct / simplify @vec_extract_hi_<mode> and vec_extract_hi_v32qi

2023-07-05 Thread Jan Beulich via Gcc-patches
The middle alternative of each insn was unusable without enabling AVX512DQ (in
addition to AVX512VL), which is entirely unrelated here. The last
alternative is usable with AVX512VL only (due to type restrictions on
what may be put in the upper 16 YMM registers), and hence is pointlessly
forcing 512-bit mode (without actually reflecting that in the "mode"
attribute).

gcc/

* config/i386/sse.md (@vec_extract_hi_<mode>): Drop last
alternative. Switch new last alternative's "isa" attribute to
"avx512vl".
(vec_extract_hi_v32qi): Likewise.
---
Like elsewhere I suspect "prefix_extra" is bogus here and should be
dropped.

Is "sselog1" actually appropriate here? Extracts are special forms of
moves after all, not logical operations. Even "sseshuf1" would seem to
come closer.

--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12029,9 +12029,9 @@
   "operands[1] = gen_lowpart (mode, operands[1]);")
 
 (define_insn "@vec_extract_hi_"
-  [(set (match_operand: 0 "nonimmediate_operand" "=xm,vm,vm")
+  [(set (match_operand: 0 "nonimmediate_operand" "=xm,vm")
(vec_select:
- (match_operand:V16_256 1 "register_operand" "x,v,v")
+ (match_operand:V16_256 1 "register_operand" "x,v")
  (parallel [(const_int 8) (const_int 9)
 (const_int 10) (const_int 11)
 (const_int 12) (const_int 13)
@@ -12039,13 +12039,12 @@
   "TARGET_AVX"
   "@
vextract%~128\t{$0x1, %1, %0|%0, %1, 0x1}
-   vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}
-   vextracti32x4\t{$0x1, %g1, %0|%0, %g1, 0x1}"
+   vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}"
   [(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
(set_attr "length_immediate" "1")
-   (set_attr "isa" "*,avx512dq,avx512f")
-   (set_attr "prefix" "vex,evex,evex")
+   (set_attr "isa" "*,avx512vl")
+   (set_attr "prefix" "vex,evex")
(set_attr "mode" "OI")])
 
 (define_insn_and_split "vec_extract_lo_v64qi"
@@ -12144,9 +12143,9 @@
   "operands[1] = gen_lowpart (V16QImode, operands[1]);")
 
 (define_insn "vec_extract_hi_v32qi"
-  [(set (match_operand:V16QI 0 "nonimmediate_operand" "=xm,vm,vm")
+  [(set (match_operand:V16QI 0 "nonimmediate_operand" "=xm,vm")
(vec_select:V16QI
- (match_operand:V32QI 1 "register_operand" "x,v,v")
+ (match_operand:V32QI 1 "register_operand" "x,v")
  (parallel [(const_int 16) (const_int 17)
 (const_int 18) (const_int 19)
 (const_int 20) (const_int 21)
@@ -12158,13 +12157,12 @@
   "TARGET_AVX"
   "@
vextract%~128\t{$0x1, %1, %0|%0, %1, 0x1}
-   vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}
-   vextracti32x4\t{$0x1, %g1, %0|%0, %g1, 0x1}"
+   vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}"
   [(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
(set_attr "length_immediate" "1")
-   (set_attr "isa" "*,avx512dq,avx512f")
-   (set_attr "prefix" "vex,evex,evex")
+   (set_attr "isa" "*,avx512vl")
+   (set_attr "prefix" "vex,evex")
(set_attr "mode" "OI")])
 
 ;; NB: *vec_extract_0 must be placed before *vec_extracthf.



[PATCH 2/2] x86: slightly correct / simplify *vec_extractv2ti

2023-07-05 Thread Jan Beulich via Gcc-patches
V2TImode values cannot appear in the upper 16 YMM registers without
AVX512VL being enabled. Therefore forcing 512-bit mode (also not
reflected in the "mode" attribute) is pointless.

gcc/

* config/i386/sse.md (*vec_extractv2ti): Drop g modifiers.

--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -20115,7 +20115,7 @@
   "TARGET_AVX"
   "@
vextract%~128\t{%2, %1, %0|%0, %1, %2}
-   vextracti32x4\t{%2, %g1, %0|%0, %g1, %2}"
+   vextracti32x4\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sselog")
(set_attr "prefix_extra" "1")
(set_attr "length_immediate" "1")



Re: [PATCH v4] RISC-V: Fix one bug for floating-point static frm

2023-07-05 Thread Robin Dapp via Gcc-patches
Hi Pan,

yes, the problem is fixed for me.  Still some comments ;)  Sorry
it took a while.

> 1. By default, the RVV floating-point will take dyn mode.
> 2. DYN is invalid in FRM register for RVV floating-point.
> 
> When mode switching the function entry and exit, it will take DYN as
> the frm mode.

We need to clarify this as it is misleading (even if it's just
a patch description, at least I was confused):

RVV floating-point instructions always (implicitly) use the dynamic
rounding mode.  That's IMHO not a default but rather an unchangeable
fact.  This implies that rounding is performed according to the
rounding mode set in the FRM register.  The FRM register itself
only holds proper rounding modes and never the dynamic rounding mode. 

> -  if (mode != FRM_MODE_NONE && mode != prev_mode)
> +  if (mode != FRM_MODE_DYN && mode != prev_mode)
>   {

Please add a comment like "Switching to the dynamic rounding mode is not
necessary.  When an instruction requests it, it effectively uses
the rounding mode already set in the FRM register.  All other rounding
modes require us to switch the rounding mode via the FRM register."

> -  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_NONE;
> +  /* According to RVV 1.0 spec, all vector floating-point operations use
> +  the dynamic rounding mode in the frm register.  */
> +  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_DYN;

As you reverted the previous patch get_attr_frm_mode is no longer
problematic because it returns FRM_MODE_NONE for instructions with
a dynamic rounding mode (instead of FRM_MODE_DYN).  I still find
that a bit confusing or at least halfway inconsistent and somebody
reading it will suppose something is wrong.  Could you either fix
the enum or add a TODO here that explains the situation?

The normal flow is that mode switching asks us if we need a mode
switch for an instruction and returning "NO MODE" means no.  But
we return FRM_MODE_DYN by default and FRM_MODE_NONE for vector float
which appears odd.

In riscv_mode_after the default mode is again FRM_MODE_NONE.  Wouldn't
we also want FRM_MODE_DYN here?

> @@ -7791,7 +7795,9 @@ riscv_mode_exit (int entity)
>  case RISCV_VXRM:
>return VXRM_MODE_NONE;
>  case RISCV_FRM:
> -  return FRM_MODE_NONE;
> +  /* According to RVV 1.0 spec, all vector floating-point operations use
> +  the dynamic rounding mode in the frm register.  */
> +  return FRM_MODE_DYN;

I'd rather not have the comment duplicated all over the place.  I
know I asked for it but I'd rather have it at a single spot explaining
what we need to do.

Regards
 Robin



[PATCH] adjust testcase for now happening epilogue vectorization

2023-07-05 Thread Richard Biener via Gcc-patches
gcc.dg/vect/slp-perm-9.c is reported to FAIL with -march=cascadelake
now which is because we now vectorize the epilogue with V2HImode
vectors after the recent change to not scrap too large vector
epilogues during transform but during analysis time.

The following adjusts the testcase to always use the existing alternate
N which avoids epilogue vectorization.

Tested on x86_64-unknown-linux-gnu, pushed.

* gcc.dg/vect/slp-perm-9.c: Always use alternate N.
---
 gcc/testsuite/gcc.dg/vect/slp-perm-9.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c 
b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
index 154c00af598..f1f5d4f95a0 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
@@ -3,11 +3,7 @@
 #include <stdarg.h>
 #include "tree-vect.h"
 
-#if VECTOR_BITS > 512
 #define N (VECTOR_BITS * 6 / 16)
-#else
-#define N 200
-#endif
 
 void __attribute__((noinline))
 foo (unsigned short *__restrict__ pInput, unsigned short *__restrict__ pOutput)
-- 
2.35.3


Re: [PATCH] gimple-isel: Recognize vec_extract pattern.

2023-07-05 Thread Richard Biener via Gcc-patches
On Tue, 4 Jul 2023, Robin Dapp wrote:

> Hi Richard,
> 
> changed the patch according to your comments and I agree that
> it is more readable that way.  I hope using lhs as target for
> the extract directly is possible the way I did it.  Richard's
> patch for aarch64 is already in, therefore testsuites on aarch64 and
> i386 are unchanged.
> 
> Regards
>  Robin
> 
> Subject: [PATCH v2] gimple-isel: Recognize vec_extract pattern.
> 
> In gimple-isel we already deduce a vec_set pattern from an
> ARRAY_REF(VIEW_CONVERT_EXPR).  This patch does the same for a
> vec_extract.
> 
> The code is largely similar to the vec_set one
> including the addition of a can_vec_extract_var_idx_p function
> in optabs.cc to check if the backend can handle a register
> operand as index.  We already have can_vec_extract in
> optabs-query but that one checks whether we can extract
> specific modes.
> 
> With the introduction of an internal function for vec_extract
> the expander must not FAIL.  For vec_set this has already been
> the case so adjust the documentation accordingly.
> 
> Additionally, clarify the wording of the vector-vector case for
> vec_extract.
> 
> gcc/ChangeLog:
> 
>   * doc/md.texi: Document that vec_set and vec_extract must not
>   fail.
>   * gimple-isel.cc (gimple_expand_vec_set_expr): Rename this...
>   (gimple_expand_vec_set_extract_expr): ...to this.
>   (gimple_expand_vec_exprs): Call renamed function.
>   * internal-fn.cc (vec_extract_direct): Add.
>   (expand_vec_extract_optab_fn): New function to expand
>   vec_extract optab.
>   (direct_vec_extract_optab_supported_p): Add.
>   * internal-fn.def (VEC_EXTRACT): Add.
>   * optabs.cc (can_vec_extract_var_idx_p): New function.
>   * optabs.h (can_vec_extract_var_idx_p): Declare.
> ---
>  gcc/doc/md.texi |   7 +++-
>  gcc/gimple-isel.cc  | 100 
>  gcc/internal-fn.cc  |  39 +
>  gcc/internal-fn.def |   1 +
>  gcc/optabs.cc   |  24 +++
>  gcc/optabs.h|   1 +
>  6 files changed, 144 insertions(+), 28 deletions(-)
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index f14dd32b2dc..b30a824488b 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5091,6 +5091,8 @@ Mask elements @var{i} with @var{i} > (operand 5 + 
> operand 6) are ignored.
>  Set given field in the vector value.  Operand 0 is the vector to modify,
>  operand 1 is new value of field and operand 2 specify the field index.
>  
> +This pattern is not allowed to @code{FAIL}.
> +
>  @cindex @code{vec_extract@var{m}@var{n}} instruction pattern
>  @item @samp{vec_extract@var{m}@var{n}}
>  Extract given field from the vector value.  Operand 1 is the vector, operand 
> 2
> @@ -5098,7 +5100,10 @@ specify field index and operand 0 place to store value 
> into.  The
>  @var{n} mode is the mode of the field or vector of fields that should be
>  extracted, should be either element mode of the vector mode @var{m}, or
>  a vector mode with the same element mode and smaller number of elements.
> -If @var{n} is a vector mode, the index is counted in units of that mode.
> +If @var{n} is a vector mode the index is counted in multiples of
> +mode @var{n}.
> +
> +This pattern is not allowed to @code{FAIL}.
>  
>  @cindex @code{vec_init@var{m}@var{n}} instruction pattern
>  @item @samp{vec_init@var{m}@var{n}}
> diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
> index ef688ddb57f..a18b26dec7b 100644
> --- a/gcc/gimple-isel.cc
> +++ b/gcc/gimple-isel.cc
> @@ -42,17 +42,27 @@ along with GCC; see the file COPYING3.  If not see
>  
>  /* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to
> internal function based on vector type of selected expansion.
> -   i.e.:
> +
> +   For vec_set:
> +
>   VIEW_CONVERT_EXPR(u)[_1] = i_4(D);
> =>
>   _7 = u;
>   _8 = .VEC_SET (_7, i_4(D), _1);
> - u = _8;  */
> + u = _8;
> +
> +   For vec_extract:
> +
> +  _3 = VIEW_CONVERT_EXPR(vD.2208)[idx_2(D)];
> +   =>
> +  _4 = vD.2208;
> +  _5 = .VEC_EXTRACT (_4, idx_2(D));
> +  _3 = _5;  */

I think you are doing

 _3 = .VEC_EXTRACT (_4, idx_2(D));

and avoiding the SSA name copy correctly.  Can you double-check?

OK with the comment adjusted.

Thanks,
Richard.

>  
>  static bool
> -gimple_expand_vec_set_expr (struct function *fun, gimple_stmt_iterator *gsi)
> +gimple_expand_vec_set_extract_expr (struct function *fun,
> + gimple_stmt_iterator *gsi)
>  {
> -  enum tree_code code;
>gcall *new_stmt = NULL;
>gassign *ass_stmt = NULL;
>bool cfg_changed = false;
> @@ -62,49 +72,84 @@ gimple_expand_vec_set_expr (struct function *fun, 
> gimple_stmt_iterator *gsi)
>if (!stmt)
>  return false;
>  
> +  bool is_extract = false;
> +
>tree lhs = gimple_assign_lhs (stmt);
> -  code = TREE_CODE (lhs);
> -  if (code != ARRAY_REF)
> +  tree rhs = gimple_assign_rhs1 (stmt);
> +  tree val, ref;

GTY: Clean up obsolete 'bool needs_cast_p' field of 'gcc/gengtype.cc:struct walk_type_data' (was: [PATCH 3/3] remove gengtype support for param_is use_param, if_marked and splay tree allocators)

2023-07-05 Thread Thomas Schwinge
Hi!

On 2014-11-23T23:11:36-0500, tsaund...@mozilla.com wrote:
> gcc/
>
>   * plugin.c, plugin.def, ggc.h, ggc-common.c, gengtype.h, gengtype.c,
>   gengtype-state.c, gengtype-parse.c, gentype-lex.l, gcc-plugin.h,
>   doc/plugins.texi, doc/gty.texi: Remove support for if_marked and
>   param_is.

> --- a/gcc/gengtype.c
> +++ b/gcc/gengtype.c

> @@ -2861,39 +2733,6 @@ walk_type (type_p t, struct walk_type_data *d)
>if (d->used_length)
>  length = NULL;
>
> -[...]
> -   d->needs_cast_p = (t->kind != TYPE_POINTER
> -  && (nt->kind == TYPE_POINTER
> -  || nt->kind == TYPE_STRING));
> -[...]

OK to push the attached
"GTY: Clean up obsolete 'bool needs_cast_p' field of 'gcc/gengtype.cc:struct 
walk_type_data'"?
No change in the 'gengtype'-generated files in my test build.


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
From 12cad7f5b3bfd8b01c90f9e7817fd2d837f2802b Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 4 Jul 2023 22:47:48 +0200
Subject: [PATCH] GTY: Clean up obsolete 'bool needs_cast_p' field of
 'gcc/gengtype.cc:struct walk_type_data'

Last use disappeared in 2014 with
commit 63f5d5b818319129217e41bcb23db53f99ff11b0 (Subversion r218558)
"remove gengtype support for param_is use_param, if_marked and splay tree allocators".

	gcc/
	* gengtype.cc (struct walk_type_data): Remove 'needs_cast_p'.
	Adjust all users.
---
 gcc/gengtype.cc | 19 ---
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/gcc/gengtype.cc b/gcc/gengtype.cc
index 04dbb0de8bd..54d3c8aaec3 100644
--- a/gcc/gengtype.cc
+++ b/gcc/gengtype.cc
@@ -2447,7 +2447,6 @@ struct walk_type_data
   int used_length;
   type_p orig_s;
   const char *reorder_fn;
-  bool needs_cast_p;
   bool fn_wants_lvalue;
   bool in_record_p;
   int loopcounter;
@@ -2663,7 +2662,6 @@ walk_type (type_p t, struct walk_type_data *d)
   options_p oo;
   const struct nested_ptr_data *nested_ptr_d = NULL;
 
-  d->needs_cast_p = false;
   for (oo = d->opt; oo; oo = oo->next)
 if (strcmp (oo->name, "length") == 0 && oo->kind == OPTION_STRING)
   length = oo->info.string;
@@ -3186,7 +3184,6 @@ static void
 write_types_process_field (type_p f, const struct walk_type_data *d)
 {
   const struct write_types_data *wtd;
-  const char *cast = d->needs_cast_p ? "(void *)" : "";
   wtd = (const struct write_types_data *) d->cookie;
 
   switch (f->kind)
@@ -3195,8 +3192,8 @@ write_types_process_field (type_p f, const struct walk_type_data *d)
 case TYPE_UNDEFINED:
   gcc_unreachable ();
 case TYPE_POINTER:
-  oprintf (d->of, "%*s%s (%s%s", d->indent, "",
-	   wtd->subfield_marker_routine, cast, d->val);
+  oprintf (d->of, "%*s%s (%s", d->indent, "",
+	   wtd->subfield_marker_routine, d->val);
   if (wtd->param_prefix)
 	{
 	  if (f->u.p->kind == TYPE_SCALAR)
@@ -3229,8 +3226,8 @@ write_types_process_field (type_p f, const struct walk_type_data *d)
 	}
   oprintf (d->of, ");\n");
   if (d->reorder_fn && wtd->reorder_note_routine)
-	oprintf (d->of, "%*s%s (%s%s, %s, %s);\n", d->indent, "",
-		 wtd->reorder_note_routine, cast, d->val,
+	oprintf (d->of, "%*s%s (%s, %s, %s);\n", d->indent, "",
+		 wtd->reorder_note_routine, d->val,
 		 d->prev_val[3], d->reorder_fn);
   break;
 
@@ -3262,16 +3259,16 @@ write_types_process_field (type_p f, const struct walk_type_data *d)
 	   : nullptr);
 	  if (length_override)
 	{
-	  oprintf (d->of, "2 (%s%s, ", cast, d->val);
+	  oprintf (d->of, "2 (%s, ", d->val);
 	  output_escaped_param (d, length_override, "string_length");
 	}
 	  else
-	oprintf (d->of, " (%s%s", cast, d->val);
+	oprintf (d->of, " (%s", d->val);
 
 	  oprintf (d->of, ");\n");
 	  if (d->reorder_fn && wtd->reorder_note_routine)
-	oprintf (d->of, "%*s%s (%s%s, %s%s, %s);\n", d->indent, "",
-		 wtd->reorder_note_routine, cast, d->val, cast, d->val,
+	oprintf (d->of, "%*s%s (%s, %s, %s);\n", d->indent, "",
+		 wtd->reorder_note_routine, d->val, d->val,
 		 d->reorder_fn);
 	}
   break;
-- 
2.34.1



[PATCH] RISC-V: Handle rounding mode correctly on zfinx

2023-07-05 Thread Kito Cheng via Gcc-patches
Zfinx provides fcsr like F, so the rounding mode should use fcsr instead
of the `soft` fenv.

libgcc/ChangeLog:

* config/riscv/sfp-machine.h (FP_INIT_ROUNDMODE): Check zfinx.
(FP_HANDLE_EXCEPTIONS): Ditto.
---
 libgcc/config/riscv/sfp-machine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libgcc/config/riscv/sfp-machine.h 
b/libgcc/config/riscv/sfp-machine.h
index ded594d75d87..38e2817bffa2 100644
--- a/libgcc/config/riscv/sfp-machine.h
+++ b/libgcc/config/riscv/sfp-machine.h
@@ -113,7 +113,7 @@ typedef int __gcc_CMPtype __attribute__ ((mode 
(__libgcc_cmp_return__)));
 
 #define _FP_TININESS_AFTER_ROUNDING 1
 
-#ifdef __riscv_flen
+#if defined(__riscv_flen) || defined(__riscv_zfinx)
 #define FP_INIT_ROUNDMODE  \
 do {   \
   __asm__ volatile ("frrm %0" : "=r" (_frm));  \
-- 
2.40.1



Re: [PATCH] gimple-isel: Recognize vec_extract pattern.

2023-07-05 Thread Robin Dapp via Gcc-patches
>> +  _4 = vD.2208;
>> +  _5 = .VEC_EXTRACT (_4, idx_2(D));
>> +  _3 = _5;  */
> 
> I think you are doing
> 
>  _3 = .VEC_EXTRACT (_4, idx_2(D));
> 
> and avoiding the SSA name copy correctly.  Can you double-check?
> 
> OK with the comment adjusted.

Argh, yes, thanks.

Regards
 Robin


GTY: Clean up obsolete parametrized structs remnants (was: [PATCH 3/3] remove gengtype support for param_is use_param, if_marked and splay tree allocators)

2023-07-05 Thread Thomas Schwinge
Hi!

On 2014-11-23T23:11:36-0500, tsaund...@mozilla.com wrote:
> gcc/
>
>   * plugin.c, plugin.def, ggc.h, ggc-common.c, gengtype.h, gengtype.c,
>   gengtype-state.c, gengtype-parse.c, gentype-lex.l, gcc-plugin.h,
>   doc/plugins.texi, doc/gty.texi: Remove support for if_marked and
>   param_is.

> --- a/gcc/gengtype.h
> +++ b/gcc/gengtype.h

> @@ -153,11 +152,6 @@ enum typekind {
>TYPE_LANG_STRUCT, /* GCC front-end language specific structs.
> Various languages may have homonymous but
> different structs.  */
> -  TYPE_PARAM_STRUCT,/* Type for parametrized structs, e.g. hash_t
> -   hash-tables, ...  See (param_is, use_param,
> -   param1_is, param2_is,... use_param1,
> -   use_param_2,... use_params) GTY
> -   options.  */
>TYPE_USER_STRUCT   /* User defined type.  Walkers and markers for
>  this type are assumed to be provided by the
>  user.  */

OK to push the attached
"GTY: Clean up obsolete parametrized structs remnants"?


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
From 100039989c2bca5d98cb76a1bc3ef5b40bfc27ce Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 4 Jul 2023 22:47:48 +0200
Subject: [PATCH] GTY: Clean up obsolete parametrized structs remnants

Support removed in 2014 with
commit 63f5d5b818319129217e41bcb23db53f99ff11b0 (Subversion r218558)
"remove gengtype support for param_is use_param, if_marked and splay tree allocators".

	gcc/
	* gengtype-parse.cc: Clean up obsolete parametrized structs
	remnants.
	* gengtype.cc: Likewise.
---
 gcc/gengtype-parse.cc | 1 -
 gcc/gengtype.cc   | 6 ++
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/gcc/gengtype-parse.cc b/gcc/gengtype-parse.cc
index 2b2156c5f45..efee4171674 100644
--- a/gcc/gengtype-parse.cc
+++ b/gcc/gengtype-parse.cc
@@ -80,7 +80,6 @@ static const char *const token_names[] = {
   "...",
   "ptr_alias",
   "nested_ptr",
-  "a param_is option",
   "a number",
   "a scalar type",
   "an identifier",
diff --git a/gcc/gengtype.cc b/gcc/gengtype.cc
index 54d3c8aaec3..49ddba684af 100644
--- a/gcc/gengtype.cc
+++ b/gcc/gengtype.cc
@@ -1388,8 +1388,6 @@ adjust_field_rtx_def (type_p t, options_p ARG_UNUSED (opt))
At present:
- Converts pointer-to-char, with no length parameter, to TYPE_STRING;
- Similarly for arrays of pointer-to-char;
-   - Converts structures for which a parameter is provided to
- TYPE_PARAM_STRUCT;
- Handles "special" options.
 */
 
@@ -3654,7 +3652,7 @@ write_func_for_structure (type_p orig_s, type_p s,
 }
 
 
-/* Write out marker routines for STRUCTURES and PARAM_STRUCTS.  */
+/* Write out marker routines for STRUCTURES.  */
 
 static void
 write_types (outf_p output_header, type_p structures,
@@ -4002,7 +4000,7 @@ write_local_func_for_structure (const_type_p orig_s, type_p s)
   }
 }
 
-/* Write out local marker routines for STRUCTURES and PARAM_STRUCTS.  */
+/* Write out local marker routines for STRUCTURES.  */
 
 static void
 write_local (outf_p output_header, type_p structures)
-- 
2.34.1



Re: GTY: Explicitly reject 'string_length' option for (fields in) global variables (was: [PATCH] pch: Fix streaming of strings with embedded null bytes)

2023-07-05 Thread Richard Biener via Gcc-patches
On Wed, Jul 5, 2023 at 9:51 AM Thomas Schwinge  wrote:
>
> Hi!
>
> On 2022-10-18T18:14:54-0400, Lewis Hyatt via Gcc-patches 
>  wrote:
> > [...] add a new
> > GTY option "string_length" so that gt_pch_note_object can be informed the
> > actual length it ought to use, [...]
>
> > --- a/gcc/doc/gty.texi
> > +++ b/gcc/doc/gty.texi
> > @@ -196,7 +196,26 @@ static GTY((length("reg_known_value_size"))) rtx 
> > *reg_known_value;
> >  Note that the @code{length} option is only meant for use with arrays of
> >  non-atomic objects, that is, objects that contain pointers pointing to
> >  other GTY-managed objects.  For other GC-allocated arrays and strings
> > -you should use @code{atomic}.
> > +you should use @code{atomic} or @code{string_length}.
> > +
> > +@findex string_length
> > +@item string_length ("@var{expression}")
> > +
> > +In order to simplify production of PCH, a structure member that is a plain
> > +array of bytes (an optionally @code{const} and/or @code{unsigned} 
> > @code{char
> > +*}) is treated specially by the infrastructure. Even if such an array has 
> > not
> > +been allocated in GC-controlled memory, it will still be written properly 
> > into
> > +a PCH.  The machinery responsible for this needs to know the length of the
> > +data; by default, the length is determined by calling @code{strlen} on the
> > +pointer.  The @code{string_length} option specifies an alternate way to
> > +determine the length, such as by inspecting another struct member:
> > +
> > +@smallexample
> > +struct GTY(()) non_terminated_string @{
> > +  size_t sz;
> > +  const char * GTY((string_length ("%h.sz"))) data;
> > +@};
> > +@end smallexample
>
> In preparation for another thing I'm working on, OK to push the attached
> "GTY: Explicitly reject 'string_length' option for (fields in) global 
> variables"
> (with  pointing to this message)?

OK

>
> Grüße
>  Thomas
>
>
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955


Re: GTY: Enhance 'string_length' option documentation (was: 'unsigned int len' field in 'libcpp/include/symtab.h:struct ht_identifier' (was: [PATCH] pch: Fix streaming of strings with embedded null bytes))

2023-07-05 Thread Richard Biener via Gcc-patches
On Wed, Jul 5, 2023 at 9:57 AM Thomas Schwinge  wrote:
>
> Hi!
>
> On 2023-07-04T15:56:23-0400, Lewis Hyatt via Gcc-patches 
>  wrote:
> > On Tue, Jul 4, 2023 at 11:50 AM Thomas Schwinge  
> > wrote:
> >> I came across this one here on my way working through another (somewhat
> >> related) GTY issue.  I generally do understand the issue here, but do
> >> have a question about 'unsigned int len' field in
> >> 'libcpp/include/symtab.h:struct ht_identifier': [...]
>
> > I don't think there is currently any possibility for a null byte to
> > end up in an ht_identifier's string. I assumed that ht_identifier
> > stores the length as an optimization (especially since it doesn't take
> > up any extra space on 64-bit platforms, given the 32-bit hash code is
> > stored as well there.) I created the string_length GTY markup mainly
> > to support another patch that I have still pending review, which I
> > thought would increase the likelihood of PCH needing to handle null
> > bytes in general. When I did that, I added the markup to ht_identifier
> > simply because the length was already there, so there was no reason
> > not to add it. It does save a few cycles when streaming out the PCH,
> > but I doubt it is meaningful.
>
> Thanks for confirming.  OK thus to push the attached
> "GTY: Enhance 'string_length' option documentation"?

OK.

> Grüße
>  Thomas
>
>
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955


Re: [PATCH 1/2] x86: correct / simplify @vec_extract_hi_<mode> and vec_extract_hi_v32qi

2023-07-05 Thread Hongtao Liu via Gcc-patches
On Wed, Jul 5, 2023 at 4:00 PM Jan Beulich via Gcc-patches
 wrote:
>
> The middle alternative each was unusable without enabling AVX512DQ (in
> addition to AVX512VL), which is entirely unrelated here. The last
> alternative is usable with AVX512VL only (due to type restrictions on
> what may be put in the upper 16 YMM registers), and hence is pointlessly
> forcing 512-bit mode (without actually reflecting that in the "mode"
> attribute).
Ok.
>
> gcc/
>
> * config/i386/sse.md (@vec_extract_hi_<mode>): Drop last
> alternative. Switch new last alternative's "isa" attribute to
> "avx512vl".
> (vec_extract_hi_v32qi): Likewise.
> ---
> Like elsewhere I suspect "prefix_extra" is bogus here and should be
> dropped.
>
> Is "sselog1" actually appropriate here? Extracts are special forms of
> moves after all, not logical operations. Even "sseshuf1" would seem to
> come closer.
Honestly, I don't know why it's marked as sselog1, but looking at the
code,  almost all vec_extract patterns are marked as sselog1, guess
it's originally from pextr.
Agree that it should be closer to shuffle instructions.
>
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -12029,9 +12029,9 @@
>"operands[1] = gen_lowpart (mode, operands[1]);")
>
>  (define_insn "@vec_extract_hi_"
> -  [(set (match_operand: 0 "nonimmediate_operand" "=xm,vm,vm")
> +  [(set (match_operand: 0 "nonimmediate_operand" "=xm,vm")
> (vec_select:
> - (match_operand:V16_256 1 "register_operand" "x,v,v")
> + (match_operand:V16_256 1 "register_operand" "x,v")
>   (parallel [(const_int 8) (const_int 9)
>  (const_int 10) (const_int 11)
>  (const_int 12) (const_int 13)
> @@ -12039,13 +12039,12 @@
>"TARGET_AVX"
>"@
> vextract%~128\t{$0x1, %1, %0|%0, %1, 0x1}
> -   vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}
> -   vextracti32x4\t{$0x1, %g1, %0|%0, %g1, 0x1}"
> +   vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}"
>[(set_attr "type" "sselog1")
> (set_attr "prefix_extra" "1")
> (set_attr "length_immediate" "1")
> -   (set_attr "isa" "*,avx512dq,avx512f")
> -   (set_attr "prefix" "vex,evex,evex")
> +   (set_attr "isa" "*,avx512vl")
> +   (set_attr "prefix" "vex,evex")
> (set_attr "mode" "OI")])
>
>  (define_insn_and_split "vec_extract_lo_v64qi"
> @@ -12144,9 +12143,9 @@
>"operands[1] = gen_lowpart (V16QImode, operands[1]);")
>
>  (define_insn "vec_extract_hi_v32qi"
> -  [(set (match_operand:V16QI 0 "nonimmediate_operand" "=xm,vm,vm")
> +  [(set (match_operand:V16QI 0 "nonimmediate_operand" "=xm,vm")
> (vec_select:V16QI
> - (match_operand:V32QI 1 "register_operand" "x,v,v")
> + (match_operand:V32QI 1 "register_operand" "x,v")
>   (parallel [(const_int 16) (const_int 17)
>  (const_int 18) (const_int 19)
>  (const_int 20) (const_int 21)
> @@ -12158,13 +12157,12 @@
>"TARGET_AVX"
>"@
> vextract%~128\t{$0x1, %1, %0|%0, %1, 0x1}
> -   vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}
> -   vextracti32x4\t{$0x1, %g1, %0|%0, %g1, 0x1}"
> +   vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}"
>[(set_attr "type" "sselog1")
> (set_attr "prefix_extra" "1")
> (set_attr "length_immediate" "1")
> -   (set_attr "isa" "*,avx512dq,avx512f")
> -   (set_attr "prefix" "vex,evex,evex")
> +   (set_attr "isa" "*,avx512vl")
> +   (set_attr "prefix" "vex,evex")
> (set_attr "mode" "OI")])
>
>  ;; NB: *vec_extract_0 must be placed before *vec_extracthf.
>


-- 
BR,
Hongtao


Re: [PATCH 2/2] x86: slightly correct / simplify *vec_extractv2ti

2023-07-05 Thread Hongtao Liu via Gcc-patches
On Wed, Jul 5, 2023 at 4:01 PM Jan Beulich via Gcc-patches
 wrote:
>
> V2TImode values cannot appear in the upper 16 YMM registers without
> AVX512VL being enabled. Therefore forcing 512-bit mode (also not
> reflected in the "mode" attribute) is pointless.
Please set isa attribute for alternative 1 to avx512vl.
>
> gcc/
>
> * config/i386/sse.md (*vec_extractv2ti): Drop g modifiers.
>
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -20115,7 +20115,7 @@
>"TARGET_AVX"
>"@
> vextract%~128\t{%2, %1, %0|%0, %1, %2}
> -   vextracti32x4\t{%2, %g1, %0|%0, %g1, %2}"
> +   vextracti32x4\t{%2, %1, %0|%0, %1, %2}"
>[(set_attr "type" "sselog")
> (set_attr "prefix_extra" "1")
> (set_attr "length_immediate" "1")
>


-- 
BR,
Hongtao


[PATCH] Vect: select small VF for epilog of unrolled loop (PR tree-optimization/110474)

2023-07-05 Thread Hao Liu OS via Gcc-patches
Hi,

If a loop is unrolled during vectorization (i.e. suggested_unroll_factor > 1),
the VFs of both main and epilog loop are enlarged.  The epilog vect loop is
specific for a loop with small iteration counts, so a large VF may hurt
performance.

This patch unscales the main loop VF by suggested_unroll_factor while selecting
the epilog loop VF, so that it will be the same as vectorized loop without
unrolling (i.e. suggested_unroll_factor = 1).

gcc/ChangeLog:

PR tree-optimization/110474
* tree-vect-loop.cc (vect_analyze_loop_2): Unscale the VF by suggested
unroll factor while selecting the epilog vect loop VF.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr110474.c: New testcase.
---
 gcc/testsuite/gcc.target/aarch64/pr110474.c | 37 +
 gcc/tree-vect-loop.cc   | 16 +
 2 files changed, 47 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110474.c

diff --git a/gcc/testsuite/gcc.target/aarch64/pr110474.c 
b/gcc/testsuite/gcc.target/aarch64/pr110474.c
new file mode 100644
index 000..e548416162a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr110474.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mtune=neoverse-n2 -mcpu=neoverse-n1 
-fdump-tree-vect-details --param aarch64-vect-unroll-limit=2" } */
+/* { dg-final { scan-tree-dump "Choosing vector mode V8HI"  "vect" } } */
+/* { dg-final { scan-tree-dump "Choosing epilogue vector mode V8QI"  "vect" } 
} */
+
+/* Do not increase the vector factor of the epilog vectorized loop
+   for a loop with suggested_unroll_factor > 1.
+
+   before (suggested_unroll_factor=1):
+ if N >= 16:
+ main vect loop
+ if N >= 8:
+ epilog vect loop
+ scalar code
+
+   before (suggested_unroll_factor=2):
+ if N >= 32:
+ main vect loop
+ if N >= 16:  // May fail to execute vectorized code (e.g. N is 8)
+ epilog vect loop
+ scalar code
+
+   after  (suggested_unroll_factor=2):
+ if N >= 32:
+ main vect loop
+ if N >= 8:  // The same VF as suggested_unroll_factor=1
+ epilog vect loop
+ scalar code  */
+
+int
+foo (short *A, char *B, int N)
+{
+  int sum = 0;
+  for (int i = 0; i < N; ++i)
+sum += A[i] * B[i];
+  return sum;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 3b46c58a8d8..4d9abd035ea 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3021,12 +3021,16 @@ start_over:
  to be able to handle fewer than VF scalars, or needs to have a lower VF
  than the main loop.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-  && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
-  && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
-  LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
-return opt_result::failure_at (vect_location,
-  "Vectorization factor too high for"
-  " epilogue loop.\n");
+  && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+{
+  poly_uint64 unscaled_vf
+   = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
+orig_loop_vinfo->suggested_unroll_factor);
+  if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
+   return opt_result::failure_at (vect_location,
+  "Vectorization factor too high for"
+  " epilogue loop.\n");
+}
 
   /* Decide whether this loop_vinfo should use partial vectors or peeling,
  assuming that the loop will be used as a main loop.  We will redo
-- 
2.34.1


Re: [PATCH 1/2] x86: correct / simplify @vec_extract_hi_<mode> and vec_extract_hi_v32qi

2023-07-05 Thread Jan Beulich via Gcc-patches
On 05.07.2023 10:40, Hongtao Liu wrote:
> On Wed, Jul 5, 2023 at 4:00 PM Jan Beulich via Gcc-patches
>  wrote:
>>
>> The middle alternative each was unusable without enabling AVX512DQ (in
>> addition to AVX512VL), which is entirely unrelated here. The last
>> alternative is usable with AVX512VL only (due to type restrictions on
>> what may be put in the upper 16 YMM registers), and hence is pointlessly
>> forcing 512-bit mode (without actually reflecting that in the "mode"
>> attribute).
> Ok.

Thanks.

>> ---
>> Like elsewhere I suspect "prefix_extra" is bogus here and should be
>> dropped.
>>
>> Is "sselog1" actually appropriate here? Extracts are special forms of
>> moves after all, not logical operations. Even "sseshuf1" would seem to
>> come closer.
> Honestly, I don't know why it's marked as sselog1, but looking at the
> code,  almost all vec_extract patterns are marked as sselog1, guess
> it's originally from pextr.
> Agree that it should be closer to shuffle instructions.

Yet as said I think these are special forms of moves. To me "shuffle"
involves more than one element. Yet then I don't really know what
the "type" attributes are used for (other than vaguely "for
scheduling"), and hence whether treating extracts as shuffles would
be more appropriate. (IOW I'd be happy to make a patch to convert all
extracts, but I'd need to know whether the conversion should be to
"sseshuf", "sseshuf1", or "ssemov". In the former two cases knowing
the "Why?" would also help, especially for writing a sensible
description. I also haven't found any explanation towards the
difference between sse<xyz> and sse<xyz>1: The "memory" attribute
evaluates to "both" for the <xyz>1 forms if operand 1 is in memory, yet
that doesn't seem to fit any of the uses here.)

Jan


Re: [PATCH 2/2] x86: slightly correct / simplify *vec_extractv2ti

2023-07-05 Thread Jan Beulich via Gcc-patches
On 05.07.2023 10:47, Hongtao Liu wrote:
> On Wed, Jul 5, 2023 at 4:01 PM Jan Beulich via Gcc-patches
>  wrote:
>>
>> V2TImode values cannot appear in the upper 16 YMM registers without
>> AVX512VL being enabled. Therefore forcing 512-bit mode (also not
>> reflected in the "mode" attribute) is pointless.
> Please set isa attribute for alternative 1 to avx512vl.

Since that looks redundant to me (as per the description), would you
mind explaining why that's necessary / wanted? It also feels orthogonal
to the change I'm making, as there was no "isa" attribute so far (which
would have wanted to be "avx512f" as per what you ask for, prior to the
change I'm making). Again me asking back is primarily to properly
describe the changes I'm making, of course along with me still needing
to properly understand when what attribute needs specifying explicitly.

Jan


[PATCH] RISC-V: Allow variable index for vec_set.

2023-07-05 Thread Robin Dapp via Gcc-patches
Hi,

this patch enables a variable index for vec_set and
adjusts/cleans up the tests.

Regards
 Robin

gcc/ChangeLog:

* config/riscv/autovec.md: Allow register index operand.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-1.c: Adjust
test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-run.c: Ditto.
---
 gcc/config/riscv/autovec.md   |  17 +-
 .../riscv/rvv/autovec/vls-vlmax/vec_set-1.c   |  32 +++-
 .../riscv/rvv/autovec/vls-vlmax/vec_set-2.c   |  32 +++-
 .../riscv/rvv/autovec/vls-vlmax/vec_set-3.c   |  33 +++-
 .../riscv/rvv/autovec/vls-vlmax/vec_set-4.c   |  33 +++-
 .../riscv/rvv/autovec/vls-vlmax/vec_set-run.c | 153 ++
 .../rvv/autovec/vls-vlmax/vec_set-zvfh-run.c  |  49 +++---
 7 files changed, 185 insertions(+), 164 deletions(-)

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index aeeb8807928..6ee1af7990e 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1022,7 +1022,7 @@ (define_expand "select_vl"
 (define_expand "vec_set"
   [(match_operand:V0 "register_operand")
(match_operand: 1 "register_operand")
-   (match_operand  2 "immediate_operand")]
+   (match_operand  2 "nonmemory_operand")]
   "TARGET_VECTOR"
 {
   /* If we set the first element, emit an v(f)mv.s.[xf].  */
@@ -1039,12 +1039,17 @@ (define_expand "vec_set"
 it at the proper position using vslideup with an
 "effective length" of 1 i.e. a VL 1 past the offset.  */
 
-  /* Slide offset = element index.  */
-  int offset = INTVAL (operands[2]);
-
-  /* Only insert one element, i.e. VL = offset + 1.  */
+  /* Here we set VL = offset + 1.  */
   rtx length = gen_reg_rtx (Pmode);
-  emit_move_insn (length, GEN_INT (offset + 1));
+  operands[2] = gen_lowpart (Pmode, operands[2]);
+  if (CONST_INT_P (operands[2]))
+ emit_move_insn (length, GEN_INT (INTVAL (operands[2]) + 1));
+  else
+   {
+ rtx add = gen_rtx_PLUS (GET_MODE (operands[2]),
+ operands[2], GEN_INT (1));
+ emit_move_insn (length, add);
+   }
 
   /* Move operands[1] into a vector register via vmv.v.x using the same
 VL we need for the slide.  */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-1.c
index 3d60e635869..e97f6f5f8ee 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-1.c
@@ -20,6 +20,15 @@ typedef double vnx2df __attribute__((vector_size (16)));
 return v;  \
   }
 
+#define VEC_SET_VAR1(S,V)  \
+  V\
+  __attribute__((noipa))   \
+  vec_set_var_##V (V v, int8_t idx, S s)   \
+  {\
+v[idx] = s;\
+return v;  \
+  }\
+
 #define TEST_ALL1(T)   \
   T (_Float16, vnx8hf, 0)  \
   T (_Float16, vnx8hf, 3)  \
@@ -43,20 +52,31 @@ typedef double vnx2df __attribute__((vector_size (16)));
   T (int8_t, vnx16qi, 11)  \
   T (int8_t, vnx16qi, 15)  \
 
+#define TEST_ALL_VAR1(T)   \
+  T (_Float16, vnx8hf) \
+  T (float, vnx4sf)\
+  T (double, vnx2df)   \
+  T (int64_t, vnx2di)  \
+  T (int32_t, vnx4si)  \
+  T (int16_t, vnx8hi)  \
+  T (int8_t, vnx16qi)  \
+
 TEST_ALL1 (VEC_SET)
+TEST_ALL_VAR1 (VEC_SET_VAR1)
 
 /* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e8,\s*m1,\s*ta,\s*ma} 1 } } */
-/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e8,\s*m1,\s*tu,\s*ma} 4 } } */
+/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e8,\s*m1,\s*tu,\s*ma} 5 } } */
 /* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e16,\s*m1,\s*ta,\s*ma} 2 } } */
-/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e16,\s*m1,\s*tu,\s*ma} 4 } } */
+/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e16,\s*m1,\s*tu,\s*ma} 6 } } */
 /* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e32,\s*m1,\s*ta,\s*ma} 2 } } */
-/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e32,\s*m1,\s*tu,\s*ma} 4 } } */
+/* { dg-final { scan-assembler-times 
{vse

[PATCH] RISC-V: Support variable index in vec_extract.

2023-07-05 Thread Robin Dapp via Gcc-patches
Hi,

this patch adds a gen_lowpart in the vec_extract expander so it properly
works with a variable index and adds tests.

Regards
 Robin

gcc/ChangeLog:

* config/riscv/autovec.md: Add gen_lowpart.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-1.c: Add
tests for variable index.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-run.c:
Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-zvfh-run.c:
Ditto.
---
 gcc/config/riscv/autovec.md   |   1 +
 .../rvv/autovec/vls-vlmax/vec_extract-1.c |  33 +++-
 .../rvv/autovec/vls-vlmax/vec_extract-2.c |  32 +++-
 .../rvv/autovec/vls-vlmax/vec_extract-3.c |  32 +++-
 .../rvv/autovec/vls-vlmax/vec_extract-4.c |  32 +++-
 .../rvv/autovec/vls-vlmax/vec_extract-run.c   | 154 ++
 .../autovec/vls-vlmax/vec_extract-zvfh-run.c  |  49 +++---
 7 files changed, 171 insertions(+), 162 deletions(-)

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 6ee1af7990e..466b27d5c49 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1088,6 +1088,7 @@ (define_expand "vec_extract"
 {
   /* Emit the slide down to index 0 in a new vector.  */
   tmp = gen_reg_rtx (mode);
+  operands[2] = gen_lowpart (Pmode, operands[2]);
   rtx ops[] = {tmp, RVV_VUNDEF (mode), operands[1], operands[2]};
   riscv_vector::emit_vlmax_slide_insn
(code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode), ops);
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-1.c
index 9cb167a8cdc..34a82128042 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-1.c
@@ -11,7 +11,6 @@ typedef _Float16 vnx8hf __attribute__((vector_size (16)));
 typedef float vnx4sf __attribute__((vector_size (16)));
 typedef double vnx2df __attribute__((vector_size (16)));
 
-
 #define VEC_EXTRACT(S,V,IDX)   \
   S\
   __attribute__((noipa))   \
@@ -20,6 +19,14 @@ typedef double vnx2df __attribute__((vector_size (16)));
 return v[IDX]; \
   }
 
+#define VEC_EXTRACT_VAR1(S,V)  \
+  S\
+  __attribute__((noipa))   \
+  vec_extract_var_##V (V v, int8_t idx)\
+  {\
+return v[idx]; \
+  }
+
 #define TEST_ALL1(T)   \
   T (_Float16, vnx8hf, 0)  \
   T (_Float16, vnx8hf, 3)  \
@@ -43,17 +50,27 @@ typedef double vnx2df __attribute__((vector_size (16)));
   T (int8_t, vnx16qi, 11)  \
   T (int8_t, vnx16qi, 15)  \
 
+#define TEST_ALL_VAR1(T)   \
+  T (_Float16, vnx8hf) \
+  T (float, vnx4sf)\
+  T (double, vnx2df)   \
+  T (int64_t, vnx2di)  \
+  T (int32_t, vnx4si)  \
+  T (int16_t, vnx8hi)  \
+  T (int8_t, vnx16qi)  \
+
 TEST_ALL1 (VEC_EXTRACT)
+TEST_ALL_VAR1 (VEC_EXTRACT_VAR1)
 
-/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e8,\s*m1,\s*ta,\s*ma} 5 } } */
-/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e16,\s*m1,\s*ta,\s*ma} 6 } } */
-/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e32,\s*m1,\s*ta,\s*ma} 6 } } */
-/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e64,\s*m1,\s*ta,\s*ma} 4 } } */
+/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e8,\s*m1,\s*ta,\s*ma} 6 } } */
+/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e16,\s*m1,\s*ta,\s*ma} 8 } } */
+/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e32,\s*m1,\s*ta,\s*ma} 8 } } */
+/* { dg-final { scan-assembler-times 
{vset[i]*vli\s+[a-z0-9,]+,\s*e64,\s*m1,\s*ta,\s*ma} 6 } } */
 
 /* { dg-final { scan-assembler-times {\tvslidedown.vi} 14 } } */
-/* { dg-final { scan-assembler-times {\tvslidedown.vx} 0 } } */
+/* { dg-final { scan-assembler-times {\tvslidedown.vx} 7 } } */
 
-/* { dg-final { scan-assembler-times {\tvfmv.f.s} 8 } } */
-/* { dg-final { scan-assembler-times {\tvmv.x.s} 13 } } */
+/* { dg-final { scan-assembler-times {\tvfmv.f.s} 11 } } */
+/* { dg-final { scan-assembler-times {\tvmv.x.s} 17 } } */
 
 /* { dg-final { scan-assembler-not {\tsext} } } */
diff --git

RE: [PATCH v4] RISC-V: Fix one bug for floating-point static frm

2023-07-05 Thread Li, Pan2 via Gcc-patches
Thanks Robin for reviewing; I will address the comments in PATCH v5 later, as I 
am in the middle of something.

> In riscv_mode_after the default mode is again FRM_MODE_NONE.  Wouldn't
> we also want FRM_MODE_DYN here?
All of FRM should be aligned to DYN in PATCH v4; I will double-check this 
when preparing v5.

Pan

-Original Message-
From: Robin Dapp  
Sent: Wednesday, July 5, 2023 4:03 PM
To: Li, Pan2 ; gcc-patches@gcc.gnu.org
Cc: rdapp@gmail.com; juzhe.zh...@rivai.ai; jeffreya...@gmail.com; Wang, 
Yanzhang ; kito.ch...@gmail.com
Subject: Re: [PATCH v4] RISC-V: Fix one bug for floating-point static frm

Hi Pan,

yes, the problem is fixed for me.  Still some comments ;)  Sorry
it took a while.

> 1. By default, the RVV floating-point will take dyn mode.
> 2. DYN is invalid in FRM register for RVV floating-point.
> 
> When mode switching the function entry and exit, it will take DYN as
> the frm mode.

We need to clarify this as it is misleading (even if it's just
a patch description, at least I was confused):

RVV floating-point instructions always (implicitly) use the dynamic
rounding mode.  That's IMHO not a default but rather an unchangeable
fact.  This implies that rounding is performed according to the
rounding mode set in the FRM register.  The FRM register itself
only holds proper rounding modes and never the dynamic rounding mode. 

> -  if (mode != FRM_MODE_NONE && mode != prev_mode)
> +  if (mode != FRM_MODE_DYN && mode != prev_mode)
>   {

Adding a comment like "Switching to the dynamic rounding mode is not
necessary.  When an instruction requests it, it effectively uses
the rounding mode already set in the FRM register.  All other rounding
modes require us to switch the rounding mode via the FRM register."

> -  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_NONE;
> +  /* According to RVV 1.0 spec, all vector floating-point operations use
> +  the dynamic rounding mode in the frm register.  */
> +  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_DYN;

As you reverted the previous patch get_attr_frm_mode is no longer
problematic because it returns FRM_MODE_NONE for instructions with
a dynamic rounding mode (instead of FRM_MODE_DYN).  I still find
that a bit confusing or at least halfway inconsistent and somebody
reading it will suppose something is wrong.  Could you either fix
the enum or add a TODO here that explains the situation?

The normal flow is that mode switching asks us if we need a mode
switch for an instruction and returning "NO MODE" means no.  But
we return FRM_MODE_DYN by default and FRM_MODE_NONE for vector float
which appears odd.

In riscv_mode_after the default mode is again FRM_MODE_NONE.  Wouldn't
we also want FRM_MODE_DYN here?

> @@ -7791,7 +7795,9 @@ riscv_mode_exit (int entity)
>  case RISCV_VXRM:
>return VXRM_MODE_NONE;
>  case RISCV_FRM:
> -  return FRM_MODE_NONE;
> +  /* According to RVV 1.0 spec, all vector floating-point operations use
> +  the dynamic rounding mode in the frm register.  */
> +  return FRM_MODE_DYN;

I'd rather not have the comment duplicated all over the place.  I
know I asked for it but I'd rather have it at a single spot explaining
what we need to do.

Regards
 Robin



Re: [PATCH 1/2] x86: correct / simplify @vec_extract_hi_ and vec_extract_hi_v32qi

2023-07-05 Thread Hongtao Liu via Gcc-patches
On Wed, Jul 5, 2023 at 4:55 PM Jan Beulich  wrote:
>
> On 05.07.2023 10:40, Hongtao Liu wrote:
> > On Wed, Jul 5, 2023 at 4:00 PM Jan Beulich via Gcc-patches
> >  wrote:
> >>
> >> The middle alternative each was unusable without enabling AVX512DQ (in
> >> addition to AVX512VL), which is entirely unrelated here. The last
> >> alternative is usable with AVX512VL only (due to type restrictions on
> >> what may be put in the upper 16 YMM registers), and hence is pointlessly
> >> forcing 512-bit mode (without actually reflecting that in the "mode"
> >> attribute).
> > Ok.
>
> Thanks.
>
> >> ---
> >> Like elsewhere I suspect "prefix_extra" is bogus here and should be
> >> dropped.
> >>
> >> Is "sselog1" actually appropriate here? Extracts are special forms of
> >> moves after all, not logical operations. Even "sseshuf1" would seem to
> >> come closer.
> > Honestly, I don't know why it's marked as sselog1, but looking at the
> > code,  almost all vec_extract patterns are marked as sselog1, guess
> > it's originally from pextr.
> > Agree that it should be closer to shuffle instructions.
>
> Yet as said I think these are special forms of moves. To me "shuffle"
> involves more than one element. Yet then I don't really know what
I think if it only extracts from the low part, it's close to a move;
otherwise it's more like a shuffle (shuffling the specific elements to the
low part).
I guess one possible reason it's marked as sselog1 is that, from a port usage
perspective, it's closer to vector logic instructions?
> the "type" attributes are used for (other than vaguely "for
> scheduling"), and hence whether treating extracts as shuffles would
AFAIK, it's only used for scheduling; I don't know if there are tools
based on GCC's scheduling model.
> be more appropriate. (IOW I'd be happy to make a patch to convert all
> extracts, but I'd need to know whether the conversion should be to
> "sseshuf", "sseshuf1", or "ssemov". In the former two cases knowing
> the "Why?" would also help, especially for writing a sensible
> description. I also haven't found any explanation towards the
> difference between sse and sse1: The "memory" attribute
> evaluates to "both" for the 1 forms if operand 1 is in memory, yet
> that doesn't seem to fit any of the uses here.)
I think sse1 only has one input operand, but sse may have
two or more.
From an instruction perspective, they're the same type; sse1 was
introduced to avoid a segmentation fault in define_memory_attr, which will
check operands[2] or operands[3].
(Similarly for other attribute default settings.)
>
> Jan




--
BR,
Hongtao


Re: [PATCH 2/2] x86: slightly correct / simplify *vec_extractv2ti

2023-07-05 Thread Hongtao Liu via Gcc-patches
On Wed, Jul 5, 2023 at 5:03 PM Jan Beulich  wrote:
>
> On 05.07.2023 10:47, Hongtao Liu wrote:
> > On Wed, Jul 5, 2023 at 4:01 PM Jan Beulich via Gcc-patches
> >  wrote:
> >>
> >> V2TImode values cannot appear in the upper 16 YMM registers without
> >> AVX512VL being enabled. Therefore forcing 512-bit mode (also not
> >> reflected in the "mode" attribute) is pointless.
> > Please set isa attribute for alternative 1 to avx512vl.
>
> Since that looks redundant to me (as per the description), would you
> mind explaining why that's necessary / wanted? It also feels orthogonal
> to the change I'm making, as there was no "isa" attribute so far (which
> would have wanted to be "avx512f" as per what you ask for, prior to the
> change I'm making). Again me asking back is primarily to properly
> describe the changes I'm making, of course along with me still needing
> to properly understand when what attribute needs specifying explicitly.
I checked ix86_hard_regno_ok: TImode/V2TImode will be allocated
to EVEX SSE registers only under TARGET_AVX512VL; otherwise
alternative 0 is matched.
So yes, there is no need to set the isa attribute here; patch LGTM.
>
> Jan




--
BR,
Hongtao


GTY: Repair 'enum gty_token', 'token_names' desynchronization (was: [cxx-conversion] Support garbage-collected C++ templates)

2023-07-05 Thread Thomas Schwinge
Hi!

On 2012-08-10T11:06:46-0400, Diego Novillo  wrote:
>  * gengtype-lex.l (USER_GTY): Add pattern for "user".
>  * gengtype-parse.c (option): Handle USER_GTY.
>  (opts_have): New.
>  (type): Call it.
>  If the keyword 'user' is used, do not walk the fields
>  of the structure.
>  * gengtype.h (USER_GTY): Add.

These changes got incorporated in
commit 0823efedd0fb8669b7e840954bc54c3b2cf08d67 (Subversion r190402).

> --- a/gcc/gengtype-lex.l
> +++ b/gcc/gengtype-lex.l
> @@ -108,6 +108,7 @@ EOID  [^[:alnum:]_]
>   "enum"/{EOID}   { return ENUM; }
>   "ptr_alias"/{EOID}  { return PTR_ALIAS; }
>   "nested_ptr"/{EOID} { return NESTED_PTR; }
> +"user"/{EOID}{ return USER_GTY; }
>   [0-9]+  { return NUM; }
>   "param"[0-9]*"_is"/{EOID}   {
> *yylval = XDUPVAR (const char, yytext, yyleng, yyleng+1);

> --- a/gcc/gengtype-parse.c
> +++ b/gcc/gengtype-parse.c
> @@ -499,6 +499,10 @@ option (options_p prev)
> [...]

> --- a/gcc/gengtype.h
> +++ b/gcc/gengtype.h
> @@ -463,6 +463,7 @@ enum
>   ELLIPSIS,
>   PTR_ALIAS,
>   NESTED_PTR,
> +USER_GTY,
>   PARAM_IS,
>   NUM,
>   SCALAR,

This did add 'USER_GTY' to what nowadays is known as 'enum gty_token',
but didn't accordingly update 'gcc/gengtype-parse.c:token_names', leaving
those out of sync.  Updating 'gcc/gengtype-parse.c:token_value_format'
wasn't necessary, as:

/* print_token assumes that any token >= FIRST_TOKEN_WITH_VALUE may have
   a meaningful value to be printed.  */
FIRST_TOKEN_WITH_VALUE = PARAM_IS

This, in turn, got further confused -- or "fixed" -- by later changes:
2014 commit 63f5d5b818319129217e41bcb23db53f99ff11b0 (Subversion r218558)
"remove gengtype support for param_is use_param, if_marked and splay tree 
allocators",
which reciprocally missed corresponding clean-up.

OK to push the attached
"GTY: Repair 'enum gty_token', 'token_names' desynchronization"?


On top of that, I'll then re-submit an adjusted

"GTY: Clean up obsolete parametrized structs remnants".


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From 8d2b040e825acdcddb7e1ff991fd538db13392f2 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Wed, 5 Jul 2023 11:10:55 +0200
Subject: [PATCH] GTY: Repair 'enum gty_token', 'token_names' desynchronization

For example, for the following (made-up) changes:

--- gcc/ggc-tests.cc
+++ gcc/ggc-tests.cc
@@ -258 +258 @@ class GTY((tag("1"))) some_subclass : public example_base
-class GTY((tag("2"))) some_other_subclass : public example_base
+class GTY((tag(user))) some_other_subclass : public example_base
@@ -384 +384 @@ test_chain_next ()
-struct GTY((user)) user_struct
+struct GTY((user user)) user_struct

..., we get unexpected "have a param_is option" diagnostics:

[...]
build/gengtype  \
-S [...]/source-gcc/gcc -I gtyp-input.list -w tmp-gtype.state
[...]/source-gcc/gcc/ggc-tests.cc:258: parse error: expected a string constant, have a param_is option
[...]/source-gcc/gcc/ggc-tests.cc:384: parse error: expected ')', have a param_is option
make[2]: *** [Makefile:2888: s-gtype] Error 1
[...]

This traces back to 2012 "Support garbage-collected C++ templates", which got
incorporated in commit 0823efedd0fb8669b7e840954bc54c3b2cf08d67
(Subversion r190402), which did add 'USER_GTY' to what nowadays is known as
'enum gty_token', but didn't accordingly update
'gcc/gengtype-parse.c:token_names', leaving those out of sync.  Updating
'gcc/gengtype-parse.c:token_value_format' wasn't necessary, as:

/* print_token assumes that any token >= FIRST_TOKEN_WITH_VALUE may have
   a meaningful value to be printed.  */
FIRST_TOKEN_WITH_VALUE = PARAM_IS

This, in turn, got further confused -- or "fixed" -- by later changes:
2014 commit 63f5d5b818319129217e41bcb23db53f99ff11b0 (Subversion r218558)
"remove gengtype support for param_is use_param, if_marked and splay tree allocators",
which reciprocally missed corresponding clean-up.

With that addressed via adding the missing '"user"' to 'token_names', and,
until that is properly fixed, a temporary 'UNUSED_PARAM_IS' (re-)added for use
with 'FIRST_TOKEN_WITH_VALUE', we then get the expected:

[...]/source-gcc/gcc/ggc-tests.cc:258: parse error: expected a string constant, have 'user'
[...]/source-gcc/gcc/ggc-tests.cc:384: parse error: expected ')', have 'user'

	gcc/
	* gengtype-parse.cc (token_names): Add '"user"'.
	* gengtype.h (gty_token): Add 'UNUSED_PARAM_IS' for use with
	'FIRST_TOKEN_WITH_VALUE'.
---
 gcc/gengtype-par

Re: [PATCH 2/2] x86: slightly correct / simplify *vec_extractv2ti

2023-07-05 Thread Hongtao Liu via Gcc-patches
On Wed, Jul 5, 2023 at 6:22 PM Hongtao Liu  wrote:
>
> On Wed, Jul 5, 2023 at 5:03 PM Jan Beulich  wrote:
> >
> > On 05.07.2023 10:47, Hongtao Liu wrote:
> > > On Wed, Jul 5, 2023 at 4:01 PM Jan Beulich via Gcc-patches
> > >  wrote:
> > >>
> > >> V2TImode values cannot appear in the upper 16 YMM registers without
> > >> AVX512VL being enabled. Therefore forcing 512-bit mode (also not
> > >> reflected in the "mode" attribute) is pointless.
> > > Please set isa attribute for alternative 1 to avx512vl.
> >
> > Since that looks redundant to me (as per the description), would you
> > mind explaining why that's necessary / wanted? It also feels orthogonal
> > to the change I'm making, as there was no "isa" attribute so far (which
> > would have wanted to be "avx512f" as per what you ask for, prior to the
> > change I'm making). Again me asking back is primarily to properly
> > describe the changes I'm making, of course along with me still needing
> > to properly understand when what attribute needs specifying explicitly.
It's decided by many factors: the instruction's isa requirement, the possible
register allocation for the alternative, and also how
recog_memoized (constrain_operands) decides which_alternative.
For *vec_extractv2ti the alternative is implicitly guarded by
ix86_hard_regno_ok, so there is no need for an explicit isa attribute.
> I checked ix86_hard_regno_ok, TImode/V2TImode will be allocated
> with evex sse register only under TARGET_AVX512VL. otherwise
> alternative 0 is matched.
> So yes, no need to set isa attribute here, patch LGTM.
> >
> > Jan
>
>
>
>
> --
> BR,
> Hongtao



-- 
BR,
Hongtao


RE: [PATCH V5] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer

2023-07-05 Thread Li, Pan2 via Gcc-patches
Passed both the bootstrap and regression tests in X86.

Pan

-Original Message-
From: Gcc-patches  On Behalf 
Of juzhe.zh...@rivai.ai
Sent: Tuesday, July 4, 2023 9:10 PM
To: gcc-patches@gcc.gnu.org
Cc: richard.sandif...@arm.com; rguent...@suse.de; Ju-Zhe Zhong 

Subject: [PATCH V5] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into 
vectorizer

From: Ju-Zhe Zhong 

Hi, Richard and Richi.

Address comments from Richi.

Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.

I have fully tested these 4 formats:

length = vf is a dummy length,
mask = {-1,-1, ... } is a dummy mask.

1. no length, no mask
   LEN_MASK_GATHER_LOAD (..., length = vf, mask = {-1,-1,...})
2. exist length, no mask
   LEN_MASK_GATHER_LOAD (..., len, mask = {-1,-1,...})
3. exist mask, no length
   LEN_MASK_GATHER_LOAD (..., length = vf, mask)
4. both mask and length exist
   LEN_MASK_GATHER_LOAD (..., length, mask)

All of these work fine in this patch.

Here is the example:

void
f (int *restrict a,
   int *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
{
  if (cond[i])
a[i * 4] = b[i];
}
}

Gimple IR:

   [local count: 105119324]:
  _58 = (unsigned long) n_13(D);

   [local count: 630715945]:
  # vectp_cond.7_45 = PHI 
  # vectp_b.11_51 = PHI 
  # vectp_a.14_55 = PHI 
  # ivtmp_59 = PHI 
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
  ivtmp_44 = _61 * 4;
  vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
  mask__24.10_49 = vect__4.9_47 != { 0, ... };
  vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
  ivtmp_54 = _61 * 16;
  .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, 
_61, 0, mask__24.10_49);
  vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
  vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
  vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
  ivtmp_60 = ivtmp_59 - _61;
  if (ivtmp_60 != 0)
goto ; [83.33%]
  else
goto ; [16.67%]

Ok for trunk ?

gcc/ChangeLog:

* internal-fn.cc (internal_fn_len_index): Apply 
LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
(internal_fn_mask_index): Ditto.
* optabs-query.cc (supports_vec_gather_load_p): Ditto.
(supports_vec_scatter_store_p): Ditto.
* tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
* tree-vect-patterns.cc (vect_recog_gather_scatter_pattern): Ditto.
* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
(vect_get_strided_load_store_ops): Ditto.
(vectorizable_store): Ditto.
(vectorizable_load): Ditto.

---
 gcc/internal-fn.cc |   6 +-
 gcc/optabs-query.cc|   2 +
 gcc/tree-vect-data-refs.cc |  18 +-
 gcc/tree-vect-patterns.cc  |   4 +-
 gcc/tree-vect-stmts.cc | 122 +++--
 5 files changed, 129 insertions(+), 23 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 303df102d81..bec60cdf4d0 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4472,7 +4472,7 @@ internal_fn_len_index (internal_fn fn)
 
 case IFN_LEN_MASK_GATHER_LOAD:
 case IFN_LEN_MASK_SCATTER_STORE:
-  return 4;
+  return 5;
 
 default:
   return -1;
@@ -4497,11 +4497,9 @@ internal_fn_mask_index (internal_fn fn)
 case IFN_MASK_SCATTER_STORE:
 case IFN_LEN_MASK_LOAD:
 case IFN_LEN_MASK_STORE:
-  return 4;
-
 case IFN_LEN_MASK_GATHER_LOAD:
 case IFN_LEN_MASK_SCATTER_STORE:
-  return 6;
+  return 4;
 
 default:
   return (conditional_internal_fn_code (fn) != ERROR_MARK
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 2fdd0d34354..bf1f484e874 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
 this_fn_optabs->supports_vec_gather_load[mode]
   = (supports_vec_convert_optab_p (gather_load_optab, mode)
 || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+|| supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
 ? 1 : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
@@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
 this_fn_optabs->supports_vec_scatter_store[mode]
   = (supports_vec_convert_optab_p (scatter_store_optab, mode)
 || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+|| supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
 ? 1 : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..ab2af103cb4 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, 
bool masked_p,
 return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn;
+  internal_fn

Re: RE: [PATCH V5] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer

2023-07-05 Thread juzhe.zh...@rivai.ai
Thank you for testing it on Intel's machines for me.



juzhe.zh...@rivai.ai
 
From: Li, Pan2
Date: 2023-07-05 19:15
To: juzhe.zh...@rivai.ai; gcc-patches@gcc.gnu.org
CC: richard.sandif...@arm.com; rguent...@suse.de
Subject: RE: [PATCH V5] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into 
vectorizer
Passed both the bootstrap and regression tests in X86.
 
Pan
 
-Original Message-
From: Gcc-patches  On Behalf 
Of juzhe.zh...@rivai.ai
Sent: Tuesday, July 4, 2023 9:10 PM
To: gcc-patches@gcc.gnu.org
Cc: richard.sandif...@arm.com; rguent...@suse.de; Ju-Zhe Zhong 

Subject: [PATCH V5] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into 
vectorizer
 
From: Ju-Zhe Zhong 
 
Hi, Richard and Richi.
 
Address comments from Richi.
 
Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
 
I have fully tested these 4 formats:
 
length = vf is a dummy length,
mask = {-1,-1, ... } is a dummy mask.
 
1. no length, no mask
   LEN_MASK_GATHER_LOAD (..., length = vf, mask = {-1,-1,...})
2. exist length, no mask
   LEN_MASK_GATHER_LOAD (..., len, mask = {-1,-1,...})
3. exist mask, no length
   LEN_MASK_GATHER_LOAD (..., length = vf, mask)
4. both mask and length exist
   LEN_MASK_GATHER_LOAD (..., length, mask)
 
All of these work fine in this patch.
 
Here is the example:
 
void
f (int *restrict a,
   int *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
{
  if (cond[i])
a[i * 4] = b[i];
}
}
 
Gimple IR:
 
   [local count: 105119324]:
  _58 = (unsigned long) n_13(D);
 
   [local count: 630715945]:
  # vectp_cond.7_45 = PHI 
  # vectp_b.11_51 = PHI 
  # vectp_a.14_55 = PHI 
  # ivtmp_59 = PHI 
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
  ivtmp_44 = _61 * 4;
  vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
  mask__24.10_49 = vect__4.9_47 != { 0, ... };
  vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
  ivtmp_54 = _61 * 16;
  .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, 
_61, 0, mask__24.10_49);
  vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
  vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
  vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
  ivtmp_60 = ivtmp_59 - _61;
  if (ivtmp_60 != 0)
goto ; [83.33%]
  else
goto ; [16.67%]
 
Ok for trunk ?
 
gcc/ChangeLog:
 
* internal-fn.cc (internal_fn_len_index): Apply 
LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
(internal_fn_mask_index): Ditto.
* optabs-query.cc (supports_vec_gather_load_p): Ditto.
(supports_vec_scatter_store_p): Ditto.
* tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
* tree-vect-patterns.cc (vect_recog_gather_scatter_pattern): Ditto.
* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
(vect_get_strided_load_store_ops): Ditto.
(vectorizable_store): Ditto.
(vectorizable_load): Ditto.
 
---
gcc/internal-fn.cc |   6 +-
gcc/optabs-query.cc|   2 +
gcc/tree-vect-data-refs.cc |  18 +-
gcc/tree-vect-patterns.cc  |   4 +-
gcc/tree-vect-stmts.cc | 122 +++--
5 files changed, 129 insertions(+), 23 deletions(-)
 
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 303df102d81..bec60cdf4d0 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4472,7 +4472,7 @@ internal_fn_len_index (internal_fn fn)
 case IFN_LEN_MASK_GATHER_LOAD:
 case IFN_LEN_MASK_SCATTER_STORE:
-  return 4;
+  return 5;
 default:
   return -1;
@@ -4497,11 +4497,9 @@ internal_fn_mask_index (internal_fn fn)
 case IFN_MASK_SCATTER_STORE:
 case IFN_LEN_MASK_LOAD:
 case IFN_LEN_MASK_STORE:
-  return 4;
-
 case IFN_LEN_MASK_GATHER_LOAD:
 case IFN_LEN_MASK_SCATTER_STORE:
-  return 6;
+  return 4;
 default:
   return (conditional_internal_fn_code (fn) != ERROR_MARK
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 2fdd0d34354..bf1f484e874 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
 this_fn_optabs->supports_vec_gather_load[mode]
   = (supports_vec_convert_optab_p (gather_load_optab, mode)
|| supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+ || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
? 1 : -1);
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
@@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
 this_fn_optabs->supports_vec_scatter_store[mode]
   = (supports_vec_convert_optab_p (scatter_store_optab, mode)
|| supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+ || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
? 1 : -1);
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..ab2af103cb4 100644
--- a/gcc/t

[PATCH] rs6000: Don't ICE when generating vector pair load/store insns [PR110411]

2023-07-05 Thread P Jeevitha via Gcc-patches
Hi All,

The following patch has been bootstrapped and regtested on powerpc64le-linux.

While generating vector-pair load and store instructions, the source address
was treated as an altivec-type address, and that type of address is invalid for
lxvp and stxvp insns.  The solution is to avoid altivec-type addresses
for OOmode and XOmode.

2023-07-05  Jeevitha Palanisamy  

gcc/
PR target/110411
* config/rs6000/rs6000.cc (rs6000_legitimate_address_p): Avoid altivec
address for OOmode and XOmode.

gcc/testsuite/
PR target/110411
* gcc.target/powerpc/pr110411.c: New testcase.

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 07c3a3d15ac..b914c65e5c9 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -9894,6 +9894,8 @@ rs6000_legitimate_address_p (machine_mode mode, rtx x, 
bool reg_ok_strict)
 
   /* Handle unaligned altivec lvx/stvx type addresses.  */
   if (VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)
+  && mode !=  OOmode
+  && mode !=  XOmode
   && GET_CODE (x) == AND
   && CONST_INT_P (XEXP (x, 1))
   && INTVAL (XEXP (x, 1)) == -16)
diff --git a/gcc/testsuite/gcc.target/powerpc/pr110411.c 
b/gcc/testsuite/gcc.target/powerpc/pr110411.c
new file mode 100644
index 000..83ef0638fb2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr110411.c
@@ -0,0 +1,21 @@
+/* PR target/110411 */
+/* { dg-options "-O2 -mdejagnu-cpu=power10 -S -mblock-ops-vector-pair" } */
+
+/* Verify we do not ICE on the following.  */
+
+#include 
+
+struct s {
+  long a;
+  long b;
+  long c;
+  long d: 1;
+};
+unsigned long ptr;
+
+void
+foo (struct s *dst)
+{
+  struct s *src = (struct s *)(ptr & ~0xFUL);
+  memcpy (dst, src, sizeof(struct s));
+}



Re: [PATCH v1 0/6] Add Loongson SX/ASX instruction support to LoongArch target.

2023-07-05 Thread Xi Ruoyao via Gcc-patches
A question: is vld/vst guaranteed to be atomic if the accessed address
is aligned?  If true we can use them to implement lock-free 128-bit
atomic load and store.  See https://gcc.gnu.org/bugzilla/PR104688 for
the background, and some people really hate using a lock for atomics.

On Fri, 2023-06-30 at 10:16 +0800, Chenghui Pan wrote:
> These patches add the Loongson SX/ASX instruction support to the
> LoongArch
> target, and can be utilized by using the new "-mlsx" and
> "-mlasx" option.
> 
> Patches are bootstrapped and tested on loongarch64-linux-gnu target.
> 
> Lulu Cheng (6):
>   LoongArch: Added Loongson SX vector directive compilation framework.
>   LoongArch: Added Loongson SX base instruction support.
>   LoongArch: Added Loongson SX directive builtin function support.
>   LoongArch: Added Loongson ASX vector directive compilation
> framework.
>   LoongArch: Added Loongson ASX base instruction support.
>   LoongArch: Added Loongson ASX directive builtin function support.
> 
>  gcc/config.gcc    |    2 +-
>  gcc/config/loongarch/constraints.md   |  128 +-
>  .../loongarch/genopts/loongarch-strings   |    4 +
>  gcc/config/loongarch/genopts/loongarch.opt.in |   16 +-
>  gcc/config/loongarch/lasx.md  | 5147 
>  gcc/config/loongarch/lasxintrin.h | 5342
> +
>  gcc/config/loongarch/loongarch-builtins.cc    | 2686 -
>  gcc/config/loongarch/loongarch-c.cc   |   18 +
>  gcc/config/loongarch/loongarch-def.c  |    6 +
>  gcc/config/loongarch/loongarch-def.h  |    9 +-
>  gcc/config/loongarch/loongarch-driver.cc  |   10 +
>  gcc/config/loongarch/loongarch-driver.h   |    2 +
>  gcc/config/loongarch/loongarch-ftypes.def |  666 +-
>  gcc/config/loongarch/loongarch-modes.def  |   39 +
>  gcc/config/loongarch/loongarch-opts.cc    |   89 +-
>  gcc/config/loongarch/loongarch-opts.h |    3 +
>  gcc/config/loongarch/loongarch-protos.h   |   35 +
>  gcc/config/loongarch/loongarch-str.h  |    3 +
>  gcc/config/loongarch/loongarch.cc | 4615 +-
>  gcc/config/loongarch/loongarch.h  |  117 +-
>  gcc/config/loongarch/loongarch.md |   56 +-
>  gcc/config/loongarch/loongarch.opt    |   16 +-
>  gcc/config/loongarch/lsx.md   | 4490 ++
>  gcc/config/loongarch/lsxintrin.h  | 5181 
>  gcc/config/loongarch/predicates.md    |  333 +-
>  25 files changed, 28723 insertions(+), 290 deletions(-)
>  create mode 100644 gcc/config/loongarch/lasx.md
>  create mode 100644 gcc/config/loongarch/lasxintrin.h
>  create mode 100644 gcc/config/loongarch/lsx.md
>  create mode 100644 gcc/config/loongarch/lsxintrin.h
> 

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


[PATCH] RISC-V: Change truncate to float_truncate in narrowing

2023-07-05 Thread Robin Dapp via Gcc-patches
Hi,

Juzhe noticed that several floating-point conversion tests
FAIL on 32 bit.  This is due to the autovect FP narrowing patterns
using a truncate instead of a float_truncate which results in
a combine ICE.  It would try to e.g. simplify a unary operation by
simplify_const_unary_operation which obviously expects a float_truncate
and not a truncate for a floating-point mode.

Regards
 Robin

gcc/ChangeLog:

* config/riscv/autovec.md: Use float_truncate.
---
 gcc/config/riscv/autovec.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 466b27d5c49..3884dfc363c 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -473,7 +473,7 @@ (define_expand "extend2"
 ;; -
 (define_insn_and_split "trunc2"
   [(set (match_operand: 0 "register_operand" "=vr")
-(truncate:
+(float_truncate:
  (match_operand:VWEXTF_ZVFHMIN 1 "register_operand"  " vr")))]
   "TARGET_VECTOR && can_create_pseudo_p ()"
   "#"
@@ -493,7 +493,7 @@ (define_insn_and_split "trunc2"
 ;; -
 (define_expand "trunc2"
   [(set (match_operand: 0 "register_operand")
-(truncate:
+(float_truncate:
  (match_operand:VQEXTF 1 "register_operand")))]
   "TARGET_VECTOR && (TARGET_ZVFHMIN || TARGET_ZVFH)"
 {
-- 
2.41.0



Re: [PATCH] RISC-V: Change truncate to float_truncate in narrowing

2023-07-05 Thread juzhe.zh...@rivai.ai
LGTM. Thanks for fixing this.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-05 21:00
To: gcc-patches; palmer; Kito Cheng; juzhe.zh...@rivai.ai; jeffreyalaw
CC: rdapp.gcc
Subject: [PATCH] RISC-V: Change truncate to float_truncate in narrowing
Hi,
 
Juzhe noticed that several floating-point conversion tests
FAIL on 32 bit.  This is due to the autovect FP narrowing patterns
using a truncate instead of a float_truncate which results in
a combine ICE.  It would try to e.g. simplify a unary operation by
simplify_const_unary_operation which obviously expects a float_truncate
and not a truncate for a floating-point mode.
 
Regards
Robin
 
gcc/ChangeLog:
 
* config/riscv/autovec.md: Use float_truncate.
---
gcc/config/riscv/autovec.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 466b27d5c49..3884dfc363c 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -473,7 +473,7 @@ (define_expand "extend2"
;; -
(define_insn_and_split "trunc2"
   [(set (match_operand: 0 "register_operand" "=vr")
-(truncate:
+(float_truncate:
  (match_operand:VWEXTF_ZVFHMIN 1 "register_operand"  " vr")))]
   "TARGET_VECTOR && can_create_pseudo_p ()"
   "#"
@@ -493,7 +493,7 @@ (define_insn_and_split "trunc2"
;; -
(define_expand "trunc2"
   [(set (match_operand: 0 "register_operand")
-(truncate:
+(float_truncate:
  (match_operand:VQEXTF 1 "register_operand")))]
   "TARGET_VECTOR && (TARGET_ZVFHMIN || TARGET_ZVFH)"
{
-- 
2.41.0
 
 


Re: [PATCH V5] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer

2023-07-05 Thread Richard Biener via Gcc-patches
On Tue, 4 Jul 2023, juzhe.zh...@rivai.ai wrote:

> From: Ju-Zhe Zhong 
> 
> Hi, Richard and Richi.
> 
> Address comments from Richi.
> 
> Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
> 
> I have fully tested these 4 formats:
> 
> length = vf is a dummy length,
> mask = {-1,-1, ... } is a dummy mask.
> 
> 1. no length, no mask
>LEN_MASK_GATHER_LOAD (..., length = vf, mask = {-1,-1,...})
> 2. exist length, no mask
>LEN_MASK_GATHER_LOAD (..., len, mask = {-1,-1,...})
> 3. exist mask, no length
>LEN_MASK_GATHER_LOAD (..., length = vf, mask)
> 4. both mask and length exist
>LEN_MASK_GATHER_LOAD (..., length, mask)
> 
> All of these work fine in this patch.
> 
> Here is the example:
> 
> void
> f (int *restrict a,
>int *restrict b, int n,
>int base, int step,
>int *restrict cond)
> {
>   for (int i = 0; i < n; ++i)
> {
>   if (cond[i])
> a[i * 4] = b[i];
> }
> }
> 
> Gimple IR:
> 
>[local count: 105119324]:
>   _58 = (unsigned long) n_13(D);
> 
>[local count: 630715945]:
>   # vectp_cond.7_45 = PHI 
>   # vectp_b.11_51 = PHI 
>   # vectp_a.14_55 = PHI 
>   # ivtmp_59 = PHI 
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
>   ivtmp_44 = _61 * 4;
>   vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
>   mask__24.10_49 = vect__4.9_47 != { 0, ... };
>   vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
>   ivtmp_54 = _61 * 16;
>   .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, 
> vect__8.13_53, _61, 0, mask__24.10_49);
>   vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
>   vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
>   vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
>   ivtmp_60 = ivtmp_59 - _61;
>   if (ivtmp_60 != 0)
> goto ; [83.33%]
>   else
> goto ; [16.67%]
> 
> Ok for trunk ?

I think it matches the spirit of the existing code, thus OK.

Thanks,
Richard.

> gcc/ChangeLog:
> 
> * internal-fn.cc (internal_fn_len_index): Apply 
> LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
> (internal_fn_mask_index): Ditto.
> * optabs-query.cc (supports_vec_gather_load_p): Ditto.
> (supports_vec_scatter_store_p): Ditto.
> * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
> * tree-vect-patterns.cc (vect_recog_gather_scatter_pattern): Ditto.
> * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
> (vect_get_strided_load_store_ops): Ditto.
> (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
> 
> ---
>  gcc/internal-fn.cc |   6 +-
>  gcc/optabs-query.cc|   2 +
>  gcc/tree-vect-data-refs.cc |  18 +-
>  gcc/tree-vect-patterns.cc  |   4 +-
>  gcc/tree-vect-stmts.cc | 122 +++--
>  5 files changed, 129 insertions(+), 23 deletions(-)
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 303df102d81..bec60cdf4d0 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4472,7 +4472,7 @@ internal_fn_len_index (internal_fn fn)
>  
>  case IFN_LEN_MASK_GATHER_LOAD:
>  case IFN_LEN_MASK_SCATTER_STORE:
> -  return 4;
> +  return 5;
>  
>  default:
>return -1;
> @@ -4497,11 +4497,9 @@ internal_fn_mask_index (internal_fn fn)
>  case IFN_MASK_SCATTER_STORE:
>  case IFN_LEN_MASK_LOAD:
>  case IFN_LEN_MASK_STORE:
> -  return 4;
> -
>  case IFN_LEN_MASK_GATHER_LOAD:
>  case IFN_LEN_MASK_SCATTER_STORE:
> -  return 6;
> +  return 4;
>  
>  default:
>return (conditional_internal_fn_code (fn) != ERROR_MARK
> diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> index 2fdd0d34354..bf1f484e874 100644
> --- a/gcc/optabs-query.cc
> +++ b/gcc/optabs-query.cc
> @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
>  this_fn_optabs->supports_vec_gather_load[mode]
>= (supports_vec_convert_optab_p (gather_load_optab, mode)
>|| supports_vec_convert_optab_p (mask_gather_load_optab, mode)
> +  || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
>? 1 : -1);
>  
>return this_fn_optabs->supports_vec_gather_load[mode] > 0;
> @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
>  this_fn_optabs->supports_vec_scatter_store[mode]
>= (supports_vec_convert_optab_p (scatter_store_optab, mode)
>|| supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
> +  || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
>? 1 : -1);
>  
>return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index ebe93832b1e..ab2af103cb4 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool 
> read_p, bool masked_p,
>  return false;
>  
>/* Work out which function we need

Re: [PATCH][RFC] target/110456 - avoid loop masking with zero distance dependences

2023-07-05 Thread Richard Biener via Gcc-patches
On Tue, 4 Jul 2023, Richard Sandiford wrote:

> Richard Biener  writes:
> > On Thu, 29 Jun 2023, Richard Biener wrote:
> >
> >> On Thu, 29 Jun 2023, Richard Sandiford wrote:
> >> 
> >> > Richard Biener  writes:
> >> > > With applying loop masking to epilogues on x86_64 AVX512 we see
> >> > > some significant performance regressions when evaluating SPEC CPU 2017
> >> > > that are caused by store-to-load forwarding fails across outer
> >> > > loop iterations when the inner loop does not iterate.  Consider
> >> > >
> >> > >   for (j = 0; j < m; ++j)
> >> > > for (i = 0; i < n; ++i)
> >> > >   a[j*n + i] += b[j*n + i];
> >> > >
> >> > > with 'n' chosen so that the inner loop vectorized code is fully
> >> > > executed by the masked epilogue and that masked epilogue
> >> > > storing O > n elements (with elements >= n masked of course).
> >> > > Then the masked load performed for the next outer loop iteration
> >> > > will get a hit in the store queue but it obviously cannot forward
> >> > > so we have to wait for the store to retire.
> >> > >
> >> > > That causes a significant hit to performance especially if 'n'
> >> > > would have made a non-masked epilogue to fully cover 'n' as well
> >> > > (say n == 4 for a V4DImode epilogue), avoiding the need for
> >> > > store-forwarding and waiting for the retiring of the store.
> >> > >
> >> > > The following applies a very simple heuristic, disabling
> >> > > the use of loop masking when there's a memory reference pair
> >> > > with dependence distance zero.  That resolves the issue
> >> > > (other problematic dependence distances seem to be less common
> >> > > at least).
> >> > >
> >> > > I have applied this heuristic in generic vectorizer code but
> >> > > restricted it to non-VL vector sizes.  There currently isn't
> >> > > a way for the target to request disabling of masking only,
> >> > > while we can reject the vectorization at costing time that will
> >> > > not re-consider the same vector mode but without masking.
> >> > > It seems simply re-costing with masking disabled should be
> >> > > possible though, we'd just need an indication whether that
> >> > > should be done?  Maybe always when the current vector mode is
> >> > > of fixed size?
> >> > >
> >> > > I wonder how SVE vectorized code behaves in these situations?
> >> > > The affected SPEC CPU 2017 benchmarks were 527.cam4_r and
> >> > > 503.bwaves_r though I think both will need a hardware vector
> >> > > size covering at least 8 doubles to show the issue.  527.cam4_r
> >> > > has 4 elements in the inner loop, 503.bwaves_r 5 IIRC.
> >> > >
> >> > > Bootstrap / regtest running on x86_64-unknown-linux-gnu.
> >> > >
> >> > > Any comments?
> >> > >
> >> > > Thanks,
> >> > > Richard.
> >> > >
> >> > >PR target/110456
> >> > >* tree-vectorizer.h (vec_info_shared::has_zero_dep_dist): New.
> >> > >* tree-vectorizer.cc (vec_info_shared::vec_info_shared):
> >> > >Initialize has_zero_dep_dist.
> >> > >* tree-vect-data-refs.cc (vect_analyze_data_ref_dependence):
> >> > >Remember if we've seen a dependence distance of zero.
> >> > >* tree-vect-stmts.cc (check_load_store_for_partial_vectors):
> >> > >When we've seen a dependence distance of zero and the vector
> >> > >type has constant size disable the use of partial vectors.
> >> > > ---
> >> > >  gcc/tree-vect-data-refs.cc |  2 ++
> >> > >  gcc/tree-vect-stmts.cc | 10 ++
> >> > >  gcc/tree-vectorizer.cc |  1 +
> >> > >  gcc/tree-vectorizer.h  |  3 +++
> >> > >  4 files changed, 16 insertions(+)
> >> > >
> >> > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> >> > > index ebe93832b1e..40cde95c16a 100644
> >> > > --- a/gcc/tree-vect-data-refs.cc
> >> > > +++ b/gcc/tree-vect-data-refs.cc
> >> > > @@ -470,6 +470,8 @@ vect_analyze_data_ref_dependence (struct 
> >> > > data_dependence_relation *ddr,
> >> > > "dependence distance == 0 between %T and 
> >> > > %T\n",
> >> > > DR_REF (dra), DR_REF (drb));
> >> > >  
> >> > > +loop_vinfo->shared->has_zero_dep_dist = true;
> >> > > +
> >> > >  /* When we perform grouped accesses and perform implicit CSE
> >> > > by detecting equal accesses and doing disambiguation with
> >> > > runtime alias tests like for
> >> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> >> > > index d642d3c257f..3bcbc000323 100644
> >> > > --- a/gcc/tree-vect-stmts.cc
> >> > > +++ b/gcc/tree-vect-stmts.cc
> >> > > @@ -1839,6 +1839,16 @@ check_load_store_for_partial_vectors 
> >> > > (loop_vec_info loop_vinfo, tree vectype,
> >> > >using_partial_vectors_p = true;
> >> > >  }
> >> > >  
> >> > > +  if (loop_vinfo->shared->has_zero_dep_dist
> >> > > +  && TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
> >> > 
> >> > I don't think it makes sense to treat VLA and VLS differently here.
> >> > 
> >> > But RMW 

RE: [PATCH V5] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer

2023-07-05 Thread Li, Pan2 via Gcc-patches
Committed, thanks Richard.

Pan

-Original Message-
From: Gcc-patches  On Behalf 
Of Richard Biener via Gcc-patches
Sent: Wednesday, July 5, 2023 9:21 PM
To: Ju-Zhe Zhong 
Cc: gcc-patches@gcc.gnu.org; richard.sandif...@arm.com
Subject: Re: [PATCH V5] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into 
vectorizer

On Tue, 4 Jul 2023, juzhe.zh...@rivai.ai wrote:

> From: Ju-Zhe Zhong 
> 
> Hi, Richard and Richi.
> 
> Address comments from Richi.
> 
> Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
> 
> I have fully tested these 4 formats:
> 
> length = vf is a dummy length,
> mask = {-1,-1, ... } is a dummy mask.
> 
> 1. no length, no mask
>LEN_MASK_GATHER_LOAD (..., length = vf, mask = {-1,-1,...})
> 2. exist length, no mask
>LEN_MASK_GATHER_LOAD (..., len, mask = {-1,-1,...})
> 3. exist mask, no length
>LEN_MASK_GATHER_LOAD (..., length = vf, mask)
> 4. both mask and length exist
>LEN_MASK_GATHER_LOAD (..., length, mask)
> 
> All of these work fine in this patch.
> 
> Here is the example:
> 
> void
> f (int *restrict a,
>int *restrict b, int n,
>int base, int step,
>int *restrict cond)
> {
>   for (int i = 0; i < n; ++i)
> {
>   if (cond[i])
> a[i * 4] = b[i];
> }
> }
> 
> Gimple IR:
> 
>[local count: 105119324]:
>   _58 = (unsigned long) n_13(D);
> 
>[local count: 630715945]:
>   # vectp_cond.7_45 = PHI 
>   # vectp_b.11_51 = PHI 
>   # vectp_a.14_55 = PHI 
>   # ivtmp_59 = PHI 
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
>   ivtmp_44 = _61 * 4;
>   vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
>   mask__24.10_49 = vect__4.9_47 != { 0, ... };
>   vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
>   ivtmp_54 = _61 * 16;
>   .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, 
> vect__8.13_53, _61, 0, mask__24.10_49);
>   vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
>   vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
>   vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
>   ivtmp_60 = ivtmp_59 - _61;
>   if (ivtmp_60 != 0)
> goto ; [83.33%]
>   else
> goto ; [16.67%]
> 
> Ok for trunk ?

I think it matches the spirit of the existing code, thus OK.

Thanks,
Richard.

> gcc/ChangeLog:
> 
> * internal-fn.cc (internal_fn_len_index): Apply 
> LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
> (internal_fn_mask_index): Ditto.
> * optabs-query.cc (supports_vec_gather_load_p): Ditto.
> (supports_vec_scatter_store_p): Ditto.
> * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
> * tree-vect-patterns.cc (vect_recog_gather_scatter_pattern): Ditto.
> * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
> (vect_get_strided_load_store_ops): Ditto.
> (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
> 
> ---
>  gcc/internal-fn.cc |   6 +-
>  gcc/optabs-query.cc|   2 +
>  gcc/tree-vect-data-refs.cc |  18 +-
>  gcc/tree-vect-patterns.cc  |   4 +-
>  gcc/tree-vect-stmts.cc | 122 +++--
>  5 files changed, 129 insertions(+), 23 deletions(-)
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 303df102d81..bec60cdf4d0 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4472,7 +4472,7 @@ internal_fn_len_index (internal_fn fn)
>  
>  case IFN_LEN_MASK_GATHER_LOAD:
>  case IFN_LEN_MASK_SCATTER_STORE:
> -  return 4;
> +  return 5;
>  
>  default:
>return -1;
> @@ -4497,11 +4497,9 @@ internal_fn_mask_index (internal_fn fn)
>  case IFN_MASK_SCATTER_STORE:
>  case IFN_LEN_MASK_LOAD:
>  case IFN_LEN_MASK_STORE:
> -  return 4;
> -
>  case IFN_LEN_MASK_GATHER_LOAD:
>  case IFN_LEN_MASK_SCATTER_STORE:
> -  return 6;
> +  return 4;
>  
>  default:
>return (conditional_internal_fn_code (fn) != ERROR_MARK
> diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> index 2fdd0d34354..bf1f484e874 100644
> --- a/gcc/optabs-query.cc
> +++ b/gcc/optabs-query.cc
> @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
>  this_fn_optabs->supports_vec_gather_load[mode]
>= (supports_vec_convert_optab_p (gather_load_optab, mode)
>|| supports_vec_convert_optab_p (mask_gather_load_optab, mode)
> +  || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
>? 1 : -1);
>  
>return this_fn_optabs->supports_vec_gather_load[mode] > 0;
> @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
>  this_fn_optabs->supports_vec_scatter_store[mode]
>= (supports_vec_convert_optab_p (scatter_store_optab, mode)
>|| supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
> +  || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
>? 1 : -1);
>  
>return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
> diff --git a/gcc/tr

[PATCH] match.pd: Implement missed optimization (~X | Y) ^ X -> ~(X & Y) [PR109986]

2023-07-05 Thread Drew Ross via Gcc-patches
Adds a simplification for (~X | Y) ^ X to be folded into ~(X & Y).
Tested successfully on x86_64 and x86 targets.

PR middle-end/109986

gcc/ChangeLog:

* match.pd ((~X | Y) ^ X -> ~(X & Y)): New simplification.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/pr109986.c: New test.
* gcc.dg/tree-ssa/pr109986.c: New test.
---
 gcc/match.pd  |  11 ++
 .../gcc.c-torture/execute/pr109986.c  |  41 
 gcc/testsuite/gcc.dg/tree-ssa/pr109986.c  | 177 ++
 3 files changed, 229 insertions(+)
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr109986.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr109986.c

diff --git a/gcc/match.pd b/gcc/match.pd
index a17d6838c14..d9d7d932881 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1627,6 +1627,17 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
   (convert (bit_and @1 (bit_not @0)
 
+/* (~X | Y) ^ X -> ~(X & Y).  */
+(simplify
+ (bit_xor:c (nop_convert1?
+ (bit_ior:c (nop_convert2? (bit_not (nop_convert3? @0)))
+@1)) (nop_convert4? @0))
+  (if (types_match (type, @1))
+   (bit_not (bit_and @1 (convert @0)))
+   (if (types_match (type, @0))
+(bit_not (bit_and (convert @1) @0))
+(convert (bit_not (bit_and @0 (convert @1)))
+
 /* Convert ~X ^ ~Y to X ^ Y.  */
 (simplify
  (bit_xor (convert1? (bit_not @0)) (convert2? (bit_not @1)))
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr109986.c 
b/gcc/testsuite/gcc.c-torture/execute/pr109986.c
new file mode 100644
index 000..00ee9888539
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr109986.c
@@ -0,0 +1,41 @@
+/* PR middle-end/109986 */
+
+#include "../../gcc.dg/tree-ssa/pr109986.c"
+
+int 
+main ()
+{
+  if (t1 (29789, 29477) != -28678) __builtin_abort ();
+  if (t2 (20196, -18743) != 4294965567) __builtin_abort ();
+  if (t3 (127, 99) != -100) __builtin_abort ();
+  if (t4 (100, 53) != 219) __builtin_abort ();
+  if (t5 (20100, 1283) != -1025) __builtin_abort ();
+  if (t6 (20100, 10283) != 63487) __builtin_abort ();
+  if (t7 (2136614690L, 1136698390L) != -1128276995L) __builtin_abort ();
+  if (t8 (1136698390L, 2136614690L) != -1128276995UL) __builtin_abort ();
+  if (t9 (9176690219839792930LL, 3176690219839721234LL) != 
-3175044472123688707LL)
+__builtin_abort ();
+  if (t10 (9176690219839792930LL, 3176690219839721234LL) != 
15271699601585862909ULL)
+__builtin_abort ();
+  if (t11 (29789, 29477) != -28678) __builtin_abort ();
+  if (t12 (20196, -18743) != 4294965567) __builtin_abort ();
+  if (t13 (127, 99) != -100) __builtin_abort ();
+  if (t14 (100, 53) != 219) __builtin_abort ();
+  if (t15 (20100, 1283) != -1025) __builtin_abort ();
+  if (t16 (20100, 10283) != 63487) __builtin_abort ();
+  if (t17 (2136614690, 1136698390) != -1128276995) __builtin_abort ();
+  if (t18 (1136698390L, 2136614690L) != -1128276995UL) __builtin_abort ();
+  if (t19 (9176690219839792930LL, 3176690219839721234LL) != 
-3175044472123688707LL)
+__builtin_abort ();
+  if (t20 (9176690219839792930LL, 3176690219839721234LL) != 
15271699601585862909ULL)
+__builtin_abort ();
+  v4si a1 = {1, 2, 3, 4};
+  v4si a2 = {6, 7, 8, 9}; 
+  v4si r1 = {-1, -3, -1, -1}; 
+  v4si b1 = t21 (a1, a2);
+  v4si b2 = t22 (a1, a2);
+  if (__builtin_memcmp (&b1,  &r1,  sizeof (b1) != 0)) __builtin_abort();  
+  if (__builtin_memcmp (&b2,  &r1,  sizeof (b2) != 0)) __builtin_abort();
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr109986.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr109986.c
new file mode 100644
index 000..45f099b5656
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr109986.c
@@ -0,0 +1,177 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dse1 -Wno-psabi" } */
+
+typedef int v4si __attribute__((vector_size(16)));
+
+/* Generic */
+__attribute__((noipa)) int 
+t1 (int a, int b)
+{
+  return (~a | b) ^ a;
+}
+
+__attribute__((noipa)) unsigned int 
+t2 (int a, int b)
+{
+  return a ^ (~a | (unsigned int) b);
+}
+
+__attribute__((noipa)) char
+t3 (char a, char b)
+{
+  return (b | ~a) ^ a;
+}
+
+__attribute__((noipa)) unsigned char
+t4 (char a, char b)
+{
+  return ((unsigned char) a) ^ (b | ~a);
+}
+
+__attribute__((noipa)) short
+t5 (short a, short b)
+{
+  return a ^ (b | ~a);
+}
+
+__attribute__((noipa)) unsigned short
+t6 (short a, short b)
+{
+  return ((unsigned short) a) ^ (b | ~a);
+}
+
+__attribute__((noipa)) long
+t7 (long a, long b)
+{
+  return a ^ (b | ~a);
+}
+
+__attribute__((noipa)) unsigned long
+t8 (long a, long b)
+{
+  return ((unsigned long) a) ^ (b | ~a);
+}
+
+__attribute__((noipa)) long long
+t9 (long long a, long long b)
+{
+  return a ^ (b | ~a);
+}
+
+__attribute__((noipa)) unsigned long long
+t10 (long long a, long long b)
+{
+  return ((unsigned long long) a) ^ (b | ~a);
+}
+
+__attribute__((noipa)) v4si
+t21 (v4si a, v4si b)
+{
+  r

Re: [PATCH] RISC-V: Change truncate to float_truncate in narrowing

2023-07-05 Thread Kito Cheng via Gcc-patches
Lgtm

juzhe.zh...@rivai.ai 於 2023年7月5日 週三,21:04寫道:

> LGTM. Thanks for fixing this.
>
>
>
> juzhe.zh...@rivai.ai
>
> From: Robin Dapp
> Date: 2023-07-05 21:00
> To: gcc-patches; palmer; Kito Cheng; juzhe.zh...@rivai.ai; jeffreyalaw
> CC: rdapp.gcc
> Subject: [PATCH] RISC-V: Change truncate to float_truncate in narrowing
> Hi,
>
> Juzhe noticed that several floating-point conversion tests
> FAIL on 32 bit.  This is due to the autovect FP narrowing patterns
> using a truncate instead of a float_truncate which results in
> a combine ICE.  It would try to e.g. simplify a unary operation by
> simplify_const_unary_operation which obviously expects a float_truncate
> and not a truncate for a floating-point mode.
>
> Regards
> Robin
>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md: Use float_truncate.
> ---
> gcc/config/riscv/autovec.md | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index 466b27d5c49..3884dfc363c 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -473,7 +473,7 @@ (define_expand "extend2"
> ;;
> -
> (define_insn_and_split "trunc2"
>[(set (match_operand: 0 "register_operand" "=vr")
> -(truncate:
> +(float_truncate:
>   (match_operand:VWEXTF_ZVFHMIN 1 "register_operand"  " vr")))]
>"TARGET_VECTOR && can_create_pseudo_p ()"
>"#"
> @@ -493,7 +493,7 @@ (define_insn_and_split "trunc2"
> ;;
> -
> (define_expand "trunc2"
>[(set (match_operand: 0 "register_operand")
> -(truncate:
> +(float_truncate:
>   (match_operand:VQEXTF 1 "register_operand")))]
>"TARGET_VECTOR && (TARGET_ZVFHMIN || TARGET_ZVFH)"
> {
> --
> 2.41.0
>
>
>


Re: [PATCH] Add -Wmissing-variable-declarations [PR65213].

2023-07-05 Thread Hamza Mahfooz

Ping?

On Tue, Jun 13 2023 at 09:05:29 AM -04:00:00, Hamza Mahfooz 
 wrote:

Resolves:
PR c/65213 - Extend -Wmissing-declarations to variables [i.e. add
-Wmissing-variable-declarations]

gcc/c-family/ChangeLog:

PR c/65213
* c.opt (-Wmissing-variable-declarations): New option.

gcc/c/ChangeLog:

PR c/65213
* c-decl.cc (start_decl): Handle -Wmissing-variable-declarations

gcc/ChangeLog:

PR c/65213
* doc/invoke.texi (-Wmissing-variable-declarations): Document
new option.

gcc/testsuite/ChangeLog:

PR c/65213
* gcc.dg/Wmissing-variable-declarations.c: New test.

Signed-off-by: Hamza Mahfooz 
---
 gcc/c-family/c.opt|  4 +++
 gcc/c/c-decl.cc   | 10 +-
 gcc/doc/invoke.texi   | 11 +--
 .../gcc.dg/Wmissing-variable-declarations.c   | 33 
+++

 4 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.dg/Wmissing-variable-declarations.c


diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index cead1995561..42ad447f39b 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -1043,6 +1043,10 @@ Wmissing-prototypes
 C ObjC Var(warn_missing_prototypes) Warning
 Warn about global functions without prototypes.

+Wmissing-variable-declarations
+C ObjC Var(warn_missing_variable_declarations) Warning
+Warn about global variables without previous declarations.
+
 Wmudflap
 C ObjC C++ ObjC++ WarnRemoved

diff --git a/gcc/c/c-decl.cc b/gcc/c/c-decl.cc
index 1af51c4acfc..8e276b2a846 100644
--- a/gcc/c/c-decl.cc
+++ b/gcc/c/c-decl.cc
@@ -5340,6 +5340,7 @@ start_decl (struct c_declarator *declarator, 
struct c_declspecs *declspecs,

location_t *lastloc /* = NULL */)
 {
   tree decl;
+  tree old_decl;
   tree tem;
   tree expr = NULL_TREE;
   enum deprecated_states deprecated_state = DEPRECATED_NORMAL;
@@ -5360,7 +5361,9 @@ start_decl (struct c_declarator *declarator, 
struct c_declspecs *declspecs,

   if (!decl || decl == error_mark_node)
 return NULL_TREE;

-  if (tree lastdecl = lastloc ? lookup_last_decl (decl) : NULL_TREE)
+  old_decl = lookup_last_decl (decl);
+
+  if (tree lastdecl = lastloc ? old_decl : NULL_TREE)
 if (lastdecl != error_mark_node)
   *lastloc = DECL_SOURCE_LOCATION (lastdecl);

@@ -5372,6 +5375,11 @@ start_decl (struct c_declarator *declarator, 
struct c_declspecs *declspecs,

   && TREE_PUBLIC (decl))
 warning (OPT_Wmain, "%q+D is usually a function", decl);

+  if (warn_missing_variable_declarations && VAR_P (decl)
+  && !DECL_EXTERNAL (decl) && TREE_PUBLIC (decl) && old_decl == 
NULL_TREE)
+warning_at (DECL_SOURCE_LOCATION (decl), 
OPT_Wmissing_variable_declarations,

+   "no previous declaration for %qD", decl);
+
   if (initialized)
 /* Is it valid for this decl to have an initializer at all?
If not, set INITIALIZED to zero, which will indirectly
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 8fa3f9fae01..e9b51842234 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -496,8 +496,8 @@ Objective-C and Objective-C++ Dialects}.

 @item C and Objective-C-only Warning Options
 @gccoptlist{-Wbad-function-cast  -Wmissing-declarations
--Wmissing-parameter-type  -Wmissing-prototypes  -Wnested-externs
--Wold-style-declaration  -Wold-style-definition
+-Wmissing-parameter-type -Wmissing-prototypes 
-Wmissing-variable-declarations

+-Wnested-externs -Wold-style-declaration  -Wold-style-definition
 -Wstrict-prototypes  -Wtraditional  -Wtraditional-conversion
 -Wdeclaration-after-statement  -Wpointer-sign}

@@ -9565,6 +9565,13 @@ provide prototypes and a non-matching 
declaration declares an

 overload rather than conflict with an earlier declaration.
 Use @option{-Wmissing-declarations} to detect missing declarations 
in C++.


+@opindex Wmissing-variable-declarations
+@opindex Wno-missing-variable-declarations
+@item -Wmissing-variable-declarations @r{(C and Objective-C only)}
+Warn if a global variable is defined without a previous declaration.
+Use this option to detect global variables that do not have a 
matching

+extern declaration in a header file.
+
 @opindex Wmissing-declarations
 @opindex Wno-missing-declarations
 @item -Wmissing-declarations
diff --git a/gcc/testsuite/gcc.dg/Wmissing-variable-declarations.c 
b/gcc/testsuite/gcc.dg/Wmissing-variable-declarations.c

new file mode 100644
index 000..b292dbe8c22
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/Wmissing-variable-declarations.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-Wmissing-variable-declarations" } */
+
+int b0; /* { dg-warning "no previous declaration for 'b0'" } */
+
+int b1 = 1; /* { dg-warning "no previous declaration for 'b1'" } */
+
+int b2; /* { dg-warning "no previous declaration for 'b2'" } */
+int b2 = 2;
+
+struct {
+int g0;
+} b3; /* { dg-warning "no previous declaration for 'b3'" } */
+
+int b4; /* { dg-w

Re: [PATCH][RFC] target/110456 - avoid loop masking with zero distance dependences

2023-07-05 Thread Richard Sandiford via Gcc-patches
Richard Biener  writes:
> On Tue, 4 Jul 2023, Richard Sandiford wrote:
>
>> Richard Biener  writes:
>> > On Thu, 29 Jun 2023, Richard Biener wrote:
>> >
>> >> On Thu, 29 Jun 2023, Richard Sandiford wrote:
>> >> 
>> >> > Richard Biener  writes:
>> >> > > With applying loop masking to epilogues on x86_64 AVX512 we see
>> >> > > some significant performance regressions when evaluating SPEC CPU 2017
>> >> > > that are caused by store-to-load forwarding fails across outer
>> >> > > loop iterations when the inner loop does not iterate.  Consider
>> >> > >
>> >> > >   for (j = 0; j < m; ++j)
>> >> > > for (i = 0; i < n; ++i)
>> >> > >   a[j*n + i] += b[j*n + i];
>> >> > >
>> >> > > with 'n' chosen so that the inner loop vectorized code is fully
>> >> > > executed by the masked epilogue and that masked epilogue
>> >> > > storing O > n elements (with elements >= n masked of course).
>> >> > > Then the masked load performed for the next outer loop iteration
>> >> > > will get a hit in the store queue but it obviously cannot forward
>> >> > > so we have to wait for the store to retire.
>> >> > >
>> >> > > That causes a significant hit to performance especially if 'n'
>> >> > > would have made a non-masked epilogue to fully cover 'n' as well
>> >> > > (say n == 4 for a V4DImode epilogue), avoiding the need for
>> >> > > store-forwarding and waiting for the retiring of the store.
>> >> > >
>> >> > > The following applies a very simple heuristic, disabling
>> >> > > the use of loop masking when there's a memory reference pair
>> >> > > with dependence distance zero.  That resolves the issue
>> >> > > (other problematic dependence distances seem to be less common
>> >> > > at least).
>> >> > >
>> >> > > I have applied this heuristic in generic vectorizer code but
>> >> > > restricted it to non-VL vector sizes.  There currently isn't
>> >> > > a way for the target to request disabling of masking only,
>> >> > > while we can reject the vectorization at costing time that will
>> >> > > not re-consider the same vector mode but without masking.
>> >> > > It seems simply re-costing with masking disabled should be
>> >> > > possible though, we'd just need an indication whether that
>> >> > > should be done?  Maybe always when the current vector mode is
>> >> > > of fixed size?
>> >> > >
>> >> > > I wonder how SVE vectorized code behaves in these situations?
>> >> > > The affected SPEC CPU 2017 benchmarks were 527.cam4_r and
>> >> > > 503.bwaves_r though I think both will need a hardware vector
>> >> > > size covering at least 8 doubles to show the issue.  527.cam4_r
>> >> > > has 4 elements in the inner loop, 503.bwaves_r 5 IIRC.
>> >> > >
>> >> > > Bootstrap / regtest running on x86_64-unknown-linux-gnu.
>> >> > >
>> >> > > Any comments?
>> >> > >
>> >> > > Thanks,
>> >> > > Richard.
>> >> > >
>> >> > >   PR target/110456
>> >> > >   * tree-vectorizer.h (vec_info_shared::has_zero_dep_dist): New.
>> >> > >   * tree-vectorizer.cc (vec_info_shared::vec_info_shared):
>> >> > >   Initialize has_zero_dep_dist.
>> >> > >   * tree-vect-data-refs.cc (vect_analyze_data_ref_dependence):
>> >> > >   Remember if we've seen a dependence distance of zero.
>> >> > >   * tree-vect-stmts.cc (check_load_store_for_partial_vectors):
>> >> > >   When we've seen a dependence distance of zero and the vector
>> >> > >   type has constant size disable the use of partial vectors.
>> >> > > ---
>> >> > >  gcc/tree-vect-data-refs.cc |  2 ++
>> >> > >  gcc/tree-vect-stmts.cc | 10 ++
>> >> > >  gcc/tree-vectorizer.cc |  1 +
>> >> > >  gcc/tree-vectorizer.h  |  3 +++
>> >> > >  4 files changed, 16 insertions(+)
>> >> > >
>> >> > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
>> >> > > index ebe93832b1e..40cde95c16a 100644
>> >> > > --- a/gcc/tree-vect-data-refs.cc
>> >> > > +++ b/gcc/tree-vect-data-refs.cc
>> >> > > @@ -470,6 +470,8 @@ vect_analyze_data_ref_dependence (struct 
>> >> > > data_dependence_relation *ddr,
>> >> > >"dependence distance == 0 between %T and 
>> >> > > %T\n",
>> >> > >DR_REF (dra), DR_REF (drb));
>> >> > >  
>> >> > > +   loop_vinfo->shared->has_zero_dep_dist = true;
>> >> > > +
>> >> > > /* When we perform grouped accesses and perform implicit CSE
>> >> > >by detecting equal accesses and doing disambiguation with
>> >> > >runtime alias tests like for
>> >> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> >> > > index d642d3c257f..3bcbc000323 100644
>> >> > > --- a/gcc/tree-vect-stmts.cc
>> >> > > +++ b/gcc/tree-vect-stmts.cc
>> >> > > @@ -1839,6 +1839,16 @@ check_load_store_for_partial_vectors 
>> >> > > (loop_vec_info loop_vinfo, tree vectype,
>> >> > >using_partial_vectors_p = true;
>> >> > >  }
>> >> > >  
>> >> > > +  if (loop_vinfo->shared->has_zero_dep_dist
>> >> > > +  && TYPE_VECTOR_SUBPARTS (vectype).is_constant 

Re: [PATCH] RISC-V: Support variable index in vec_extract.

2023-07-05 Thread Jeff Law via Gcc-patches




On 7/5/23 03:13, Robin Dapp wrote:

Hi,

this patch adds a gen_lowpart in the vec_extract expander so it properly
works with a variable index and adds tests.

Regards
  Robin

gcc/ChangeLog:

* config/riscv/autovec.md: Add gen_lowpart.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-1.c: Add
tests for variable index.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-run.c:
Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_extract-zvfh-run.c:
Ditto.

OK
jeff


Re: [PATCH] RISC-V: Handle rouding mode correctly on zfinx

2023-07-05 Thread Jeff Law via Gcc-patches




On 7/5/23 02:11, Kito Cheng wrote:

Zfinx provides fcsr like F, so the rounding mode should use fcsr instead
of `soft` fenv.

libgcc/ChangeLog:

* config/riscv/sfp-machine.h (FP_INIT_ROUNDMODE): Check zfinx.
(FP_HANDLE_EXCEPTIONS): Ditto.

OK
jeff


Re: [PATCH] RISC-V: Allow variable index for vec_set.

2023-07-05 Thread Jeff Law via Gcc-patches




On 7/5/23 03:12, Robin Dapp wrote:

Hi,

this patch enables a variable index for vec_set and
adjusts/cleans up the tests.

Regards
  Robin

gcc/ChangeLog:

* config/riscv/autovec.md: Allow register index operand.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-1.c: Adjust
test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls-vlmax/vec_set-run.c: Ditto.

OK
jeff


RE: [PATCH v1] RISC-V: Use FRM_DYN when add the rounding mode operand

2023-07-05 Thread Li, Pan2 via Gcc-patches
Committed, thanks Juzhe and Kito.

Pan

-Original Message-
From: Kito Cheng  
Sent: Wednesday, July 5, 2023 3:16 PM
To: juzhe.zh...@rivai.ai
Cc: Li, Pan2 ; gcc-patches ; Robin 
Dapp ; jeffreyalaw ; Wang, Yanzhang 

Subject: Re: [PATCH v1] RISC-V: Use FRM_DYN when add the rounding mode operand

LGTM

On Wed, Jul 5, 2023 at 10:08 AM juzhe.zh...@rivai.ai
 wrote:
>
> LGTM.
>
>
>
> juzhe.zh...@rivai.ai
>
> From: pan2.li
> Date: 2023-07-04 20:26
> To: gcc-patches
> CC: juzhe.zhong; rdapp.gcc; jeffreyalaw; pan2.li; yanzhang.wang; kito.cheng
> Subject: [PATCH v1] RISC-V: Use FRM_DYN when add the rounding mode operand
> From: Pan Li 
>
> This patch would like to take FRM_DYN const rtx as the rounding mode
> operand according to the RVV spec, which takes the dyn as the only
> rounding mode for floating-point.
>
> Signed-off-by: Pan Li 
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vector-builtins.cc
> (function_expander::use_exact_insn): Use FRM_DYN instead of const0.
> ---
> gcc/config/riscv/riscv-vector-builtins.cc | 7 +++
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/riscv/riscv-vector-builtins.cc 
> b/gcc/config/riscv/riscv-vector-builtins.cc
> index 648c765a5d1..3a53b56effa 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins.cc
> @@ -3569,11 +3569,10 @@ function_expander::use_exact_insn (insn_code icode)
>if (base->has_rounding_mode_operand_p ())
>  add_input_operand (call_expr_nargs (exp) - 2);
> -  /* TODO: Currently, we don't support intrinsic that is modeling rounding 
> mode.
> - We add default rounding mode for the intrinsics that didn't model 
> rounding
> - mode yet.  */
> +  /* The RVV floating-point only support dynamic rounding mode in the
> + FRM register.  */
>if (opno != insn_data[icode].n_generator_args)
> -add_input_operand (Pmode, const0_rtx);
> +add_input_operand (Pmode, gen_int_mode (riscv_vector::FRM_DYN, Pmode));
>return generate_insn (icode);
> }
> --
> 2.34.1
>
>


Re: [PATCH] Fortran: fixes for procedures with ALLOCATABLE,INTENT(OUT) arguments [PR92178]

2023-07-05 Thread Mikael Morin

Le 04/07/2023 à 21:37, Mikael Morin a écrit :

Le 04/07/2023 à 21:00, Harald Anlauf a écrit :

Hi Mikael, all,

I think I've found it: there is a call to gfc_conv_class_to_class
that - according to a comment - does a repackaging to a class array.
Deferring that repackaging along with the deallocation not only fixes
the regression, but also the cases I tested.

Attached is a "sneak preview", hoping that the experts (Paul, Mikael,
...) can tell if I am going down the wrong road.


I think that's it mostly.  There is one last thing that I am not sure...


diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index 16e8f037cfc..a68c8d33acc 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc
@@ -6858,6 +6860,10 @@ gfc_conv_procedure_call (gfc_se * se, 
gfc_symbol * sym,

  && e->symtree->n.sym->attr.optional,
  CLASS_DATA (fsym)->attr.class_pointer
  || CLASS_DATA (fsym)->attr.allocatable);
+
+  /* Defer repackaging after deallocation.  */
+  if (defer_repackage)
+    gfc_add_block_to_block (&dealloc_blk, &parmse.pre);
 }
   else
 {


... whether you will not be deferring too much here.  That is parmse.pre 
contains both the argument evaluation and the class container setup from 
gfc_conv_class_to_class.  If it's safe to defer both, that's fine, 
otherwise a separate gfc_se struct should be passed to 
gfc_conv_class_to_class so that only the latter part can be deferred.

Need to think of an example...


Here is an example, admittedly artificial.  Fails with the above change, 
but fails with master as well.


program p
  implicit none
  type t
integer :: i
  end type t
  type u
class(t), allocatable :: ta(:)
  end type u
  type(u), allocatable, target :: c(:)
  c = [u([t(1), t(3)]), u([t(4), t(9)])]
  call bar (allocated (c(c(1)%ta(1)%i)%ta), c(c(1)%ta(1)%i)%ta, 
allocated (c(c(1)%ta(1)%i)%ta))

  if (allocated(c(1)%ta)) stop 11
  if (.not. allocated(c(2)%ta)) stop 12
contains
  subroutine bar (alloc, x, alloc2)
logical :: alloc, alloc2
class(t), allocatable, intent(out) :: x(:)
if (allocated (x)) stop 1
if (.not. alloc)   stop 2
if (.not. alloc2)  stop 3
  end subroutine bar
end



[committed] sched: Change return type of predicate functions from int to bool

2023-07-05 Thread Uros Bizjak via Gcc-patches
Also change some internal variables to bool.

gcc/ChangeLog:

* sched-int.h (struct haifa_sched_info): Change can_schedule_ready_p,
schedule_more_p and contributes_to_priority indirect function
type from int to bool.
(no_real_insns_p): Change return type from int to bool.
(contributes_to_priority): Ditto.
* haifa-sched.cc (no_real_insns_p): Change return type from
int to bool and adjust function body accordingly.
* modulo-sched.cc (try_scheduling_node_in_cycle): Change "success"
variable type from int to bool.
(ps_insn_advance_column): Change return type from int to bool.
(ps_has_conflicts): Ditto. Change "has_conflicts"
variable type from int to bool.
* sched-deps.cc (deps_may_trap_p): Change return type from int to bool.
(conditions_mutex_p): Ditto.
* sched-ebb.cc (schedule_more_p): Ditto.
(ebb_contributes_to_priority): Change return type from
int to bool and adjust function body accordingly.
* sched-rgn.cc (is_cfg_nonregular): Ditto.
(check_live_1): Ditto.
(is_pfree): Ditto.
(find_conditional_protection): Ditto.
(is_conditionally_protected): Ditto.
(is_prisky): Ditto.
(is_exception_free): Ditto.
(haifa_find_rgns): Change "unreachable" and "too_large_failure"
variables from int to bool.
(extend_rgns): Change "rescan" variable from int to bool.
(check_live): Change return type from
int to bool and adjust function body accordingly.
(can_schedule_ready_p): Ditto.
(schedule_more_p): Ditto.
(contributes_to_priority): Ditto.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
diff --git a/gcc/haifa-sched.cc b/gcc/haifa-sched.cc
index 2c881ede0ec..01a2a80d982 100644
--- a/gcc/haifa-sched.cc
+++ b/gcc/haifa-sched.cc
@@ -5033,18 +5033,18 @@ get_ebb_head_tail (basic_block beg, basic_block end,
   *tailp = end_tail;
 }
 
-/* Return nonzero if there are no real insns in the range [ HEAD, TAIL ].  */
+/* Return true if there are no real insns in the range [ HEAD, TAIL ].  */
 
-int
+bool
 no_real_insns_p (const rtx_insn *head, const rtx_insn *tail)
 {
   while (head != NEXT_INSN (tail))
 {
   if (!NOTE_P (head) && !LABEL_P (head))
-   return 0;
+   return false;
   head = NEXT_INSN (head);
 }
-  return 1;
+  return true;
 }
 
 /* Restore-other-notes: NOTE_LIST is the end of a chain of notes
diff --git a/gcc/modulo-sched.cc b/gcc/modulo-sched.cc
index 26752213d19..c5a392dd511 100644
--- a/gcc/modulo-sched.cc
+++ b/gcc/modulo-sched.cc
@@ -2119,7 +2119,7 @@ try_scheduling_node_in_cycle (partial_schedule_ptr ps,
  sbitmap must_follow)
 {
   ps_insn_ptr psi;
-  bool success = 0;
+  bool success = false;
 
   verify_partial_schedule (ps, sched_nodes);
   psi = ps_add_node_check_conflicts (ps, u, cycle, must_precede, must_follow);
@@ -2127,7 +2127,7 @@ try_scheduling_node_in_cycle (partial_schedule_ptr ps,
 {
   SCHED_TIME (u) = cycle;
   bitmap_set_bit (sched_nodes, u);
-  success = 1;
+  success = true;
   *num_splits = 0;
   if (dump_file)
fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);
@@ -3067,7 +3067,7 @@ ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr 
ps_i,
in failure and true in success.  Bit N is set in MUST_FOLLOW if
the node with cuid N must be come after the node pointed to by
PS_I when scheduled in the same cycle.  */
-static int
+static bool
 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
sbitmap must_follow)
 {
@@ -3158,7 +3158,7 @@ advance_one_cycle (void)
 /* Checks if PS has resource conflicts according to DFA, starting from
FROM cycle to TO cycle; returns true if there are conflicts and false
if there are no conflicts.  Assumes DFA is being used.  */
-static int
+static bool
 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
 {
   int cycle;
@@ -3214,7 +3214,8 @@ ps_add_node_check_conflicts (partial_schedule_ptr ps, int 
n,
 int c, sbitmap must_precede,
 sbitmap must_follow)
 {
-  int i, first, amount, has_conflicts = 0;
+  int i, first, amount;
+  bool has_conflicts = false;
   ps_insn_ptr ps_i;
 
   /* First add the node to the PS, if this succeeds check for
diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc
index 998fe930804..c23218890f3 100644
--- a/gcc/sched-deps.cc
+++ b/gcc/sched-deps.cc
@@ -472,7 +472,7 @@ static int cache_size;
 /* True if we should mark added dependencies as a non-register deps.  */
 static bool mark_as_hard;
 
-static int deps_may_trap_p (const_rtx);
+static bool deps_may_trap_p (const_rtx);
 static void add_dependence_1 (rtx_insn *, rtx_insn *, enum reg_note);
 static void add_dependence_list (rtx_insn *, rtx_insn_list *, int,
 enum reg_note, bool);
@@ -488,7 +488,7 @@ static void sched_analyze_2 (class deps_desc *, rtx, 
rtx_insn *);
 static void sched_analyze_i

[PATCH] libstdc++: Fix fwrite error parameter

2023-07-05 Thread shuaitq via Gcc-patches
The first argument passed to fwrite should be the const char* __s that is to be
written to FILE *__file, rather than FILE *__file itself.

0001-Fix-fwrite-error-parameter.txt
Description: Binary data


[PATCH] libstdc++: Fix fwrite error parameter

2023-07-05 Thread Tianqiang Shuai via Gcc-patches
The first argument passed to fwrite should be the const char* __s that is to be
written to FILE *__file, rather than FILE *__file itself.

0001-Fix-fwrite-error-parameter.txt
Description: Binary data


[PATCH] libstdc++: Fix fwrite error parameter

2023-07-05 Thread Tianqiang Shuai via Gcc-patches
The first argument passed to fwrite should be the const char* __s that is to be
written to FILE *__file, rather than FILE *__file itself.

0001-Fix-fwrite-error-parameter.txt
Description: Binary data


Re: [PATCH] libstdc++: Fix fwrite error parameter

2023-07-05 Thread Jonathan Wakely via Gcc-patches
On Wed, 5 Jul 2023 at 16:54, shuaitq via Libstdc++
 wrote:
>
> the first parameter of fwrite should be the const char* __s which want write 
> to FILE *__file,
> rather than the FILE *__file write to the FILE *__file.

Thank you. The patch is correct and small enough to not require a
copyright assignment or DCO sign-off, so I'll commit it for you.



Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-07-05 Thread Stamatis Markianos-Wright via Gcc-patches
Thank you Andre for reviewing! I'll attach the updated version of the 
patch to the third review email (your final one thus far ;)


On 22/06/2023 16:54, Andre Vieira (lists) wrote:
Some comments below, all quite minor. I'll continue to review 
tomorrow, I need a fresher brain for 
arm_mve_check_df_chain_back_for_implic_predic  ;)


+static int
+arm_mve_get_vctp_lanes (rtx x)
+{
+  if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC
+  && (XINT (XEXP (x, 1), 1) == VCTP || XINT (XEXP (x, 1), 1) == 
VCTP_M))

+    {
+  switch (GET_MODE (XEXP (x, 1)))
+    {
+  case V16BImode:
+    return 16;
+  case V8BImode:
+    return 8;
+  case V4BImode:
+    return 4;
+  case V2QImode:
+    return 2;
+  default:
+    break;
+    }
+    }
+  return 0;
+}

I think you can replace the switch with something along the lines of:
machine_mode mode = GET_MODE (XEXP (x, 1));
return VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 0;


Ah true, especially now that there are no HImode predicates!

I added an additional check of `&& VALID_MVE_PRED_MODE (mode)` as well, 
just to make sure we could never pick up V4SImode, etc. (although I'd 
never expect that to happen if `rtx x` came from a valid instruction)





+/* Check if an insn requires the use of the VPR_REG, if it does, 
return the

+   sub-rtx of the VPR_REG.  The `type` argument controls whether
+   this function should:
+   * For type == 0, check all operands, including the OUT operands,
+ and return the first occurance of the VPR_REG.

s/occurance/occurrence/

Done


+  bool requires_vpr;
+  extract_constrain_insn (insn);

indent of requires_vpr is off.

Done


+  if (type == 1 && (recog_data.operand_type[op] == OP_OUT
+    || recog_data.operand_type[op] == OP_INOUT))
+    continue;
+  else if (type == 2 && (recog_data.operand_type[op] == OP_IN
+ || recog_data.operand_type[op] == OP_INOUT))
+    continue;

Why skip INOUT? I guess this will become clear when I see the uses, 
but I'm wondering whether 'only check the input operands.' is clear 
enough. Maybe 'check operands that are input only.' would be more 
accurate?
Oh! Thanks for spotting this. It also doesn't work with my comment at 
the top:

`(INOUT operands are considered both as input and output operands)`

It's been a long time since I wrote this piece, but it might be that I 
added this after realising that there are no insns with an OP_INOUT VPR 
reg. Since I don't think it's functional, I changed the code to align 
with the comment, instead.




+  /* Fetch the reg_class for each entry and check it against the
+   * VPR_REG reg_class.  */

Remove leading * on the second line.

Damn auto-formatters ;)
Done


+
+/* Wrapper function of arm_get_required_vpr_reg with type == 1, so 
return

+   something only if the VPR reg is an input operand to the insn.  */

When talking about a function parameter in comments capitalize (INSN) 
the name. Same for:

Done


+/* Wrapper function of arm_get_required_vpr_reg with type == 2, so 
return
+   something only if the VPR reg is the return value, an output of, 
or is

+   clobbered by the insn.  */

+/* Return true if an insn is an MVE instruction that VPT-predicable, 
but in
+   its unpredicated form, or if it is predicated, but on a predicate 
other

+   than vpr_reg.  */

In this one also 'is a MVE instruction that is VPT-predicable' would 
be better I think.

Oops, thanks for spotting. Done.



On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:
>  Hi all,
>
>  This is the 2/2 patch that contains the functional changes needed
>  for MVE Tail Predicated Low Overhead Loops.  See my previous email
>  for a general introduction of MVE LOLs.
>
>  This support is added through the already existing loop-doloop
>  mechanisms that are used for non-MVE dls/le looping.
>
>  Mid-end changes are:
>
>  1) Relax the loop-doloop mechanism in the mid-end to allow for
> decrement numbers other than -1 and for `count` to be an
> rtx containing a simple REG (which in this case will contain
> the number of elements to be processed), rather
> than an expression for calculating the number of iterations.
>  2) Added a new df utility function: `df_bb_regno_only_def_find` 
that
> will return the DEF of a REG only if it is DEF-ed once 
within the

> basic block.
>
>  And many things in the backend to implement the above 
optimisation:

>
>  3)  Implement the `arm_predict_doloop_p` target hook to 
instruct the

>  mid-end about Low Overhead Loops (MVE or not), as well as
>  `arm_loop_unroll_adjust` which will prevent unrolling of 
any loops
>  that are valid for becoming MVE Tail_Predicated Low 
Overhead Loops
>  (unrolling can transform a loop in ways that invalidate the 
dlstp/
>  letp transformation logic and the benefit of the dlstp/letp 
loop

>

[v2] GTY: Clean up obsolete parametrized structs remnants (was: [PATCH 3/3] remove gengtype support for param_is use_param, if_marked and splay tree allocators)

2023-07-05 Thread Thomas Schwinge
Hi!

On 2023-07-05T10:16:09+0200, I wrote:
> On 2014-11-23T23:11:36-0500, tsaund...@mozilla.com wrote:
>> gcc/
>>
>>   * plugin.c, plugin.def, ggc.h, ggc-common.c, gengtype.h, gengtype.c,
>>   gengtype-state.c, gengtype-parse.c, gentype-lex.l, gcc-plugin.h,
>>   doc/plugins.texi, doc/gty.texi: Remove support for if_marked and
>>   param_is.
>
>> --- a/gcc/gengtype.h
>> +++ b/gcc/gengtype.h
>
>> @@ -153,11 +152,6 @@ enum typekind {
>>TYPE_LANG_STRUCT, /* GCC front-end language specific structs.
>> Various languages may have homonymous but
>> different structs.  */
>> -  TYPE_PARAM_STRUCT,/* Type for parametrized structs, e.g. hash_t
>> -   hash-tables, ...  See (param_is, use_param,
>> -   param1_is, param2_is,... use_param1,
>> -   use_param_2,... use_params) GTY
>> -   options.  */
>>TYPE_USER_STRUCT   /* User defined type.  Walkers and markers for
>>  this type are assumed to be provided by the
>>  user.  */
>
> OK to push the attached
> "GTY: Clean up obsolete parametrized structs remnants"?

Updated per

"GTY: Repair 'enum gty_token', 'token_names' desynchronization", OK to
push the attached
v2 "GTY: Clean up obsolete parametrized structs remnants"?


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From fed7429b8c19c4bed570d343169ba6a35ed088b0 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 4 Jul 2023 22:47:48 +0200
Subject: [PATCH] GTY: Clean up obsolete parametrized structs remnants

Support removed in 2014 with
commit 63f5d5b818319129217e41bcb23db53f99ff11b0 (Subversion r218558)
"remove gengtype support for param_is use_param, if_marked and splay tree allocators".

	gcc/
	* gengtype-parse.cc: Clean up obsolete parametrized structs
	remnants.
	* gengtype.cc: Likewise.
	* gengtype.h: Likewise.
---
 gcc/gengtype-parse.cc | 2 --
 gcc/gengtype.cc   | 6 ++
 gcc/gengtype.h| 3 +--
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/gcc/gengtype-parse.cc b/gcc/gengtype-parse.cc
index 8791b35a3da..6b6331cc228 100644
--- a/gcc/gengtype-parse.cc
+++ b/gcc/gengtype-parse.cc
@@ -82,7 +82,6 @@ static const char *const token_names[] = {
   "ptr_alias",
   "nested_ptr",
   "user",
-  "a param_is option",
   "a number",
   "a scalar type",
   "an identifier",
@@ -95,7 +94,6 @@ static const char *const token_names[] = {
 /* This array is indexed by token code minus FIRST_TOKEN_WITH_VALUE.  */
 /* Keep in sync with 'gengtype.h:enum gty_token'.  */
 static const char *const token_value_format[] = {
-  "%s",
   "'%s'",
   "'%s'",
   "'%s'",
diff --git a/gcc/gengtype.cc b/gcc/gengtype.cc
index 54d3c8aaec3..49ddba684af 100644
--- a/gcc/gengtype.cc
+++ b/gcc/gengtype.cc
@@ -1388,8 +1388,6 @@ adjust_field_rtx_def (type_p t, options_p ARG_UNUSED (opt))
At present:
- Converts pointer-to-char, with no length parameter, to TYPE_STRING;
- Similarly for arrays of pointer-to-char;
-   - Converts structures for which a parameter is provided to
- TYPE_PARAM_STRUCT;
- Handles "special" options.
 */
 
@@ -3654,7 +3652,7 @@ write_func_for_structure (type_p orig_s, type_p s,
 }
 
 
-/* Write out marker routines for STRUCTURES and PARAM_STRUCTS.  */
+/* Write out marker routines for STRUCTURES.  */
 
 static void
 write_types (outf_p output_header, type_p structures,
@@ -4002,7 +4000,7 @@ write_local_func_for_structure (const_type_p orig_s, type_p s)
   }
 }
 
-/* Write out local marker routines for STRUCTURES and PARAM_STRUCTS.  */
+/* Write out local marker routines for STRUCTURES.  */
 
 static void
 write_local (outf_p output_header, type_p structures)
diff --git a/gcc/gengtype.h b/gcc/gengtype.h
index 2122373edf2..13942900910 100644
--- a/gcc/gengtype.h
+++ b/gcc/gengtype.h
@@ -478,7 +478,6 @@ enum gty_token
   PTR_ALIAS,
   NESTED_PTR,
   USER_GTY,
-  UNUSED_PARAM_IS,
   NUM,
   SCALAR,
   ID,
@@ -489,7 +488,7 @@ enum gty_token
 
   /* print_token assumes that any token >= FIRST_TOKEN_WITH_VALUE may have
  a meaningful value to be printed.  */
-  FIRST_TOKEN_WITH_VALUE = UNUSED_PARAM_IS
+  FIRST_TOKEN_WITH_VALUE = NUM
 };
 
 
-- 
2.34.1



Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-07-05 Thread Stamatis Markianos-Wright via Gcc-patches



On 23/06/2023 11:23, Andre Vieira (lists) wrote:

+  if (insn != arm_mve_get_loop_vctp (body))
+    {

probably a good idea to invert the condition here and return false, 
helps reducing the indenting in this function.


Done, thanks





+    /* Starting from the current insn, scan backwards through the insn
+   chain until BB_HEAD: "for each insn in the BB prior to the 
current".

+    */

There's a trailing whitespace after insn, but also I'd rewrite this 
bit. The "for each insn in the BB prior to the current" is superfluous 
and even confusing to me. How about:
"Scan backwards from the current INSN through the instruction chain 
until the start of the basic block.  "

Yes, agreed, it wasn't very clear. Done.



 I find 'that previous insn' to be confusing as you don't mention any 
previous insn before. So how about something along the lines of:
'If a previous insn defines a register that INSN uses then return true 
if...'

Done



Do we need to check: 'insn != prev_insn' ? Any reason why you can't 
start the loop with:

'for (rtx_insn *prev_insn = PREV_INSN (insn);'


True! Done.



Now I also found a case where things might go wrong in:
+    /* Look at all the DEFs of that previous insn: if one of them 
is on
+   the same REG as our current insn, then recurse in order to 
check

+   that insn's USEs.  If any of these insns return true as
+   MVE_VPT_UNPREDICATED_INSN_Ps, then the whole chain is 
affected
+   by the change in behaviour from being placed in dlstp/letp 
loop.

+    */
+    df_ref prev_insn_defs = NULL;
+    FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn)
+  {
+    if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
+    && insn != prev_insn
+    && body == BLOCK_FOR_INSN (prev_insn)
+    && !arm_mve_vec_insn_is_predicated_with_this_predicate
+ (insn, vctp_vpr_generated)
+    && arm_mve_check_df_chain_back_for_implic_predic
+ (prev_insn, vctp_vpr_generated))
+  return true;
+  }

The body == BLOCK_FOR_INSN (prev_insn) hinted me at it, if a def comes 
from outside of the BB (so outside of the loop's body) then its by 
definition unpredicated by vctp.  I think you want to check that if 
prev_insn defines a register used by insn then return true if 
prev_insn isn't in the same BB or has a chain that is not predicated, 
i.e.: '!arm_mve_vec_insn_is_predicated_with_this_predicate (insn, 
vctp_vpr_generated) && arm_mve_check_df_chain_back_for_implic_predic 
prev_insn, vctp_vpr_generated))' you check body != BLOCK_FOR_INSN 
(prev_insn)'


Yes, you're right, this is vulnerable here. A neater fix to this (I 
think?) is to make the above REGNO_REG_SET_P more generic, so that it 
covers all scalar values and scalar ops, as well.
Then it's a "if this insn in the loop has any input that originates 
outside the bb, then it's unsafe" check and the recursive loop backwards 
is only for the recursive "are any previous insns unsafe"






I also found some other issues, this currently loloops:

uint16_t  test (uint16_t *a, int n)
{
  uint16_t res =0;
  while (n > 0)
    {
  mve_pred16_t p = vctp16q (n);
  uint16x8_t va = vldrhq_u16 (a);
  res = vaddvaq_u16 (res, va);
  res = vaddvaq_p_u16 (res, va, p);
  a += 8;
  n -= 8;
    }
  return res;
}

But it shouldn't, this is because there's a lack of handling of across 
vector instructions. Luckily in MVE all across vector instructions 
have the side-effect that they write to a scalar register, even the 
vshlcq instruction (it writes to a scalar carry output).


Added support for them (you were right, there was some special handling 
needed!)





This did lead me to find an ICE with:

uint16x8_t  test (uint16_t *a, int n)
{
  uint16x8_t res = vdupq_n_u16 (0);
  while (n > 0)
    {
  uint16_t carry = 0;
  mve_pred16_t p = vctp16q (n);
  uint16x8_t va = vldrhq_u16 (a);
  res = vshlcq_u16 (va, &carry, 1);
  res = vshlcq_m_u16 (res, &carry, 1 , p);
  a += 8;
  n -= 8;
    }
  return res;
}

This is because:
+  /* If the USE is outside the loop body bb, or it is inside, 
but

+ is an unpredicated store to memory.  */
+  if (BLOCK_FOR_INSN (insn) != BLOCK_FOR_INSN (next_use_insn)
+ || (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate
+ (next_use_insn, vctp_vpr_generated)
+    && mve_memory_operand
+    (SET_DEST (single_set (next_use_insn)),
+ GET_MODE (SET_DEST (single_set (next_use_insn))
+    return true;

Assumes single_set doesn't return 0.


Thanks! That is indeed correct.

Corrected this by having a utility function to scan insn operands and 
check against mve_memory_operand that supports any number of 
operands/SETs in the insn




Let's deal with these issues and I'll continue to review.

On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:

 Hi all,

   

GGC, GTY: Tighten up a few things re 'reorder' option and strings

2023-07-05 Thread Thomas Schwinge
Hi!

OK to push the attached
"GGC, GTY: Tighten up a few things re 'reorder' option and strings"?


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From 8751deeb7afdc8ece6a0645c8404f615144b1bd4 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Wed, 5 Jul 2023 15:34:56 +0200
Subject: [PATCH] GGC, GTY: Tighten up a few things re 'reorder' option and
 strings

..., which doesn't make sense in combination.

This, again, is primarily preparational for another change.

	gcc/
	* ggc-common.cc (gt_pch_note_reorder, gt_pch_save): Tighten up a
	few things re 'reorder' option and strings.
	* stringpool.cc (gt_pch_p_S): This is now 'gcc_unreachable'.
---
 gcc/ggc-common.cc | 18 ++
 gcc/stringpool.cc |  1 +
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/gcc/ggc-common.cc b/gcc/ggc-common.cc
index 173ab64cb73..bed7a9d4d02 100644
--- a/gcc/ggc-common.cc
+++ b/gcc/ggc-common.cc
@@ -314,6 +314,9 @@ gt_pch_note_reorder (void *obj, void *note_ptr_cookie,
   data = (struct ptr_data *)
 saving_htab->find_with_hash (obj, POINTER_HASH (obj));
   gcc_assert (data && data->note_ptr_cookie == note_ptr_cookie);
+  /* The GTY 'reorder' option doesn't make sense if we don't walk pointers,
+ such as for strings.  */
+  gcc_checking_assert (data->note_ptr_fn != gt_pch_p_S);
 
   data->reorder_fn = reorder_fn;
 }
@@ -636,12 +639,19 @@ gt_pch_save (FILE *f)
 	state.ptrs[i]->reorder_fn (state.ptrs[i]->obj,
    state.ptrs[i]->note_ptr_cookie,
    relocate_ptrs, &state);
-  state.ptrs[i]->note_ptr_fn (state.ptrs[i]->obj,
-  state.ptrs[i]->note_ptr_cookie,
-  relocate_ptrs, &state);
+  gt_note_pointers note_ptr_fn = state.ptrs[i]->note_ptr_fn;
+  gcc_checking_assert (note_ptr_fn != NULL);
+  /* 'gt_pch_p_S' enables certain special handling, but otherwise
+ corresponds to no 'note_ptr_fn'.  */
+  if (note_ptr_fn == gt_pch_p_S)
+	note_ptr_fn = NULL;
+  if (note_ptr_fn != NULL)
+	note_ptr_fn (state.ptrs[i]->obj, state.ptrs[i]->note_ptr_cookie,
+		 relocate_ptrs, &state);
   ggc_pch_write_object (state.d, state.f, state.ptrs[i]->obj,
 			state.ptrs[i]->new_addr, state.ptrs[i]->size);
-  if (state.ptrs[i]->note_ptr_fn != gt_pch_p_S)
+  if (state.ptrs[i]->reorder_fn != NULL
+	  || note_ptr_fn != NULL)
 	memcpy (state.ptrs[i]->obj, this_object, state.ptrs[i]->size);
 #if defined ENABLE_VALGRIND_ANNOTATIONS && defined VALGRIND_GET_VBITS
   if (UNLIKELY (get_vbits == 1))
diff --git a/gcc/stringpool.cc b/gcc/stringpool.cc
index 46aff39d7d5..8658e6ab52a 100644
--- a/gcc/stringpool.cc
+++ b/gcc/stringpool.cc
@@ -185,6 +185,7 @@ gt_pch_p_S (void *obj ATTRIBUTE_UNUSED, void *x ATTRIBUTE_UNUSED,
 	gt_pointer_operator op ATTRIBUTE_UNUSED,
 	void *cookie ATTRIBUTE_UNUSED)
 {
+  gcc_unreachable ();
 }
 
 /* PCH pointer-walking routine for strings.  */
-- 
2.34.1



[PATCH] doc: Update my Contributors entry

2023-07-05 Thread Jonathan Wakely via Gcc-patches
Gerald suggested I update this. Pushed to trunk.

-- >8 --

gcc/ChangeLog:

* doc/contrib.texi (Contributors): Update my entry.
---
 gcc/doc/contrib.texi | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gcc/doc/contrib.texi b/gcc/doc/contrib.texi
index 758805dc5db..fa551c5f900 100644
--- a/gcc/doc/contrib.texi
+++ b/gcc/doc/contrib.texi
@@ -1095,8 +1095,7 @@ Todd Vierling for contributions for NetBSD ports.
 Andrew Waterman for contributing the RISC-V port, as well as maintaining it.
 
 @item
-Jonathan Wakely for contributing libstdc++ Doxygen notes and XHTML
-guidance and maintaining libstdc++.
+Jonathan Wakely for contributing to and maintaining libstdc++.
 
 @item
 Dean Wakerley for converting the install documentation from HTML to texinfo
-- 
2.41.0



GGC, GTY: No pointer walking for 'atomic' in PCH 'gt_pch_note_object' (was: Patch: New GTY ((atomic)) option)

2023-07-05 Thread Thomas Schwinge
Hi!

My original motivation for the following exercise was that, for example,
for: 'const unsigned char * GTY((atomic)) mode_table', we currently run
into 'const' mismatches, 'error: invalid conversion':

[...]
gtype-desc.cc: In function 'void gt_pch_nx_lto_file_decl_data(void*)':
gtype-desc.cc:6531:34: error: invalid conversion from 'const void*' to 
'void*' [-fpermissive]
 gt_pch_note_object ((*x).mode_table, x, 
gt_pch_p_18lto_file_decl_data);
  ^
In file included from [...]/source-gcc/gcc/hash-table.h:247:0,
 from [...]/source-gcc/gcc/coretypes.h:486,
 from gtype-desc.cc:23:
[...]/source-gcc/gcc/ggc.h:47:12: note:   initializing argument 1 of 'int 
gt_pch_note_object(void*, void*, gt_note_pointers, size_t)'
 extern int gt_pch_note_object (void *, void *, gt_note_pointers,
^
make[2]: *** [Makefile:1180: gtype-desc.o] Error 1
[...]

..., as I had reported as "'GTY' issues: (1) 'const' build error" in

'Adjust LTO mode tables for "Machine_Mode: Extend machine_mode from 8 to 16 
bits"'.

That said:

On 2011-05-16T02:13:56+0200, "Nicola Pero"  
wrote:
> This patch adds a new GTY option, "atomic", which is similar to the identical 
> option you have with Boehm GC
> and which can be used with pointers to inform the GC/PCH machinery that they 
> point to an area of memory that
> contains no pointers (and hence needs no scanning).
>
> [...]

On top of that, OK to push the attached
"GGC, GTY: No pointer walking for 'atomic' in PCH 'gt_pch_note_object'"?
Appreciate review from a GGC, GTY-savvy person.

This depends on

"GGC, GTY: Tighten up a few things re 'reorder' option and strings".


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From 2f12ce94166f411e4b9084b1c89738bb480343cc Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 4 Jul 2023 11:46:00 +0200
Subject: [PATCH] GGC, GTY: No pointer walking for 'atomic' in PCH
 'gt_pch_note_object'

Since its inception in 2011 commit 555c3771903cc461949f06acf28f92fc067b6a1c
(Subversion r173996) "New GTY ((atomic)) option", the following rationale has
been given in (nowadays) 'gcc/gengtype.cc':

If a pointer type is marked as "atomic", we process the
field itself, but we don't walk the data that they point to.

There are two main cases where we walk types: to mark
pointers that are reachable, and to relocate pointers when
writing a PCH file.  In both cases, an atomic pointer is
itself marked or relocated, but the memory that it points
to is left untouched.  In the case of PCH, that memory will
be read/written unchanged to the PCH file.

Therefore, we may completely skip the boilerplate pointer walking, which we
didn't for PCH 'gt_pch_note_object'.

--- build-gcc/gcc/gt-c-c-decl.h	2023-06-26 08:59:55.120395571 +0200
+++ build-gcc/gcc/gt-c-c-decl.h	2023-07-05 15:58:36.286165439 +0200
@@ -1138,7 +1138,7 @@
 case TS_OPTIMIZATION:
   gt_pch_n_15cl_optimization ((*x).generic.optimization.opts);
   if ((*x).generic.optimization.optabs != NULL) {
-gt_pch_note_object ((*x).generic.optimization.optabs, x, gt_pch_p_14lang_tree_node);
+gt_pch_note_object ((*x).generic.optimization.optabs, x, NULL);
   }
   break;
 case TS_TARGET_OPTION:
--- build-gcc/gcc/gt-cp-tree.h	2023-06-26 08:59:55.120395571 +0200
+++ build-gcc/gcc/gt-cp-tree.h	2023-07-05 15:58:36.286165439 +0200
@@ -1452,7 +1452,7 @@
 case TS_OPTIMIZATION:
   gt_pch_n_15cl_optimization ((*x).generic.optimization.opts);
   if ((*x).generic.optimization.optabs != NULL) {
-gt_pch_note_object ((*x).generic.optimization.optabs, x, gt_pch_p_14lang_tree_node);
+gt_pch_note_object ((*x).generic.optimization.optabs, x, NULL);
   }
   break;
 case TS_TARGET_OPTION:
[...]

..., which is for 'gcc/tree-core.h':

struct GTY(()) tree_optimization_option {
  struct tree_base base;
  [...]
  struct cl_optimization *opts;
  [...]
  void *GTY ((atomic)) optabs;
  [...]
  struct target_optabs *GTY ((skip)) base_optabs;
};

..., which means we'll still 'gt_pch_note_object' 'optabs' itself.  (That's
important; if we skip those completely, we'll later fail the
'gcc_assert (result)' in 'gcc/ggc-common.cc:relocate_ptrs'.)  However, we no
longer attempt to walk 'optabs' via 'gt_pc

[PATCH] Fix PR 110554: vec lowering introduces scalar signed-boolean:32 comparisons

2023-07-05 Thread Andrew Pinski via Gcc-patches
So the problem is vector generic decided to do comparisons in signed-boolean:32
types but the rest of the middle-end was not ready for that. Since we are 
building
the comparison which will feed into a cond_expr here, using boolean_type_node is
better and also correct. The rest of the compiler thinks the ranges for
comparison is always [0,1] too.

Note this code does not currently lower bigger vector sizes into smaller
vector sizes so using boolean_type_node here is better.

OK? bootstrapped and tested on x86_64-linux-gnu with no regressions.

gcc/ChangeLog:

PR middle-end/110554
* tree-vect-generic.cc (expand_vector_condition): For comparisons,
just build using boolean_type_node instead of the cond_type.
For non-comparisons/non-scalar-bitmask, build a ` != 0` gimple
that will feed into the COND_EXPR.
---
 gcc/tree-vect-generic.cc | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index df04a0db68d..a7e6cb87a5e 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1121,7 +1121,7 @@ expand_vector_condition (gimple_stmt_iterator *gsi, 
bitmap dce_ssa_names)
   comp_width, comp_index);
  tree aa2 = tree_vec_extract (gsi, comp_inner_type, a2,
   comp_width, comp_index);
- aa = gimplify_build2 (gsi, code, cond_type, aa1, aa2);
+ aa = gimplify_build2 (gsi, code, boolean_type_node, aa1, aa2);
}
   else if (a_is_scalar_bitmask)
{
@@ -1132,7 +1132,11 @@ expand_vector_condition (gimple_stmt_iterator *gsi, 
bitmap dce_ssa_names)
build_zero_cst (TREE_TYPE (a)));
}
   else
-   aa = tree_vec_extract (gsi, cond_type, a, comp_width, comp_index);
+   {
+ result = tree_vec_extract (gsi, cond_type, a, comp_width, comp_index);
+ aa = gimplify_build2 (gsi, NE_EXPR, boolean_type_node, result,
+   build_zero_cst (cond_type));
+   }
   result = gimplify_build3 (gsi, COND_EXPR, inner_type, aa, bb, cc);
   if (!CONSTANT_CLASS_P (result))
constant_p = false;
-- 
2.31.1



RE: [PATCH] arm: Fix MVE intrinsics support with LTO (PR target/110268)

2023-07-05 Thread Kyrylo Tkachov via Gcc-patches
Hi Christophe,

> -Original Message-
> From: Christophe Lyon 
> Sent: Monday, June 26, 2023 4:03 PM
> To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
> Richard Sandiford 
> Cc: Christophe Lyon 
> Subject: [PATCH] arm: Fix MVE intrinsics support with LTO (PR target/110268)
> 
> After the recent MVE intrinsics re-implementation, LTO stopped working
> because the intrinsics would no longer be defined.
> 
> The main part of the patch is simple and similar to what we do for
> AArch64:
> - call handle_arm_mve_h() from arm_init_mve_builtins to declare the
>   intrinsics when the compiler is in LTO mode
> - actually implement arm_builtin_decl for MVE.
> 
> It was just a bit tricky to handle __ARM_MVE_PRESERVE_USER_NAMESPACE:
> its value in the user code cannot be guessed at LTO time, so we always
> have to assume that it was not defined.  This led to a few fixes in the
> way we register MVE builtins as placeholders or not.  Without this
> patch, we would just omit some versions of the intrinsics when
> __ARM_MVE_PRESERVE_USER_NAMESPACE is true. In fact, like for the C/C++
> placeholders, we need to always keep entries for all of them to ensure
> that we have a consistent numbering scheme.
> 
>   2023-06-26  Christophe Lyon   
> 
>   PR target/110268
>   gcc/
>   * config/arm/arm-builtins.cc (arm_init_mve_builtins): Handle LTO.
>   (arm_builtin_decl): Handle MVE builtins.
>   * config/arm/arm-mve-builtins.cc (builtin_decl): New function.
>   (add_unique_function): Fix handling of
>   __ARM_MVE_PRESERVE_USER_NAMESPACE.
>   (add_overloaded_function): Likewise.
>   * config/arm/arm-protos.h (builtin_decl): New declaration.
> 
>   gcc/testsuite/
>   * gcc.target/arm/pr110268-1.c: New test.
>   * gcc.target/arm/pr110268-2.c: New test.
> ---
>  gcc/config/arm/arm-builtins.cc| 11 +++-
>  gcc/config/arm/arm-mve-builtins.cc| 61 ---
>  gcc/config/arm/arm-protos.h   |  1 +
>  gcc/testsuite/gcc.target/arm/pr110268-1.c | 11 
>  gcc/testsuite/gcc.target/arm/pr110268-2.c | 22 
>  5 files changed, 76 insertions(+), 30 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/pr110268-1.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/pr110268-2.c
> 
> diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc
> index 36365e40a5b..fca7dcaf565 100644
> --- a/gcc/config/arm/arm-builtins.cc
> +++ b/gcc/config/arm/arm-builtins.cc
> @@ -1918,6 +1918,15 @@ arm_init_mve_builtins (void)
>arm_builtin_datum *d = &mve_builtin_data[i];
>arm_init_builtin (fcode, d, "__builtin_mve");
>  }
> +
> +  if (in_lto_p)
> +{
> +  arm_mve::handle_arm_mve_types_h ();
> +  /* Under LTO, we cannot know whether
> +  __ARM_MVE_PRESERVE_USER_NAMESPACE was defined, so assume
> it
> +  was not.  */
> +  arm_mve::handle_arm_mve_h (false);
> +}
>  }
> 
>  /* Set up all the NEON builtins, even builtins for instructions that are not
> @@ -2723,7 +2732,7 @@ arm_builtin_decl (unsigned code, bool initialize_p
> ATTRIBUTE_UNUSED)
>  case ARM_BUILTIN_GENERAL:
>return arm_general_builtin_decl (subcode);
>  case ARM_BUILTIN_MVE:
> -  return error_mark_node;
> +  return arm_mve::builtin_decl (subcode);
>  default:
>gcc_unreachable ();
>  }
> diff --git a/gcc/config/arm/arm-mve-builtins.cc b/gcc/config/arm/arm-mve-
> builtins.cc
> index 7033e41a571..e9a12f27411 100644
> --- a/gcc/config/arm/arm-mve-builtins.cc
> +++ b/gcc/config/arm/arm-mve-builtins.cc
> @@ -493,6 +493,16 @@ handle_arm_mve_h (bool
> preserve_user_namespace)
>preserve_user_namespace);
>  }
> 
> +/* Return the function decl with SVE function subcode CODE, or
> error_mark_node
> +   if no such function exists.  */
> +tree
> +builtin_decl (unsigned int code)
> +{
> +  if (code >= vec_safe_length (registered_functions))
> +return error_mark_node;
> +  return (*registered_functions)[code]->decl;
> +}
> +
>  /* Return true if CANDIDATE is equivalent to MODEL_TYPE for overloading
> purposes.  */
>  static bool
> @@ -849,7 +859,6 @@ function_builder::add_function (const
> function_instance &instance,
>  ? integer_zero_node
>  : simulate_builtin_function_decl (input_location, name, fntype,
> code, NULL, attrs);
> -
>registered_function &rfn = *ggc_alloc  ();
>rfn.instance = instance;
>rfn.decl = decl;
> @@ -889,15 +898,12 @@ function_builder::add_unique_function (const
> function_instance &instance,
>gcc_assert (!*rfn_slot);
>*rfn_slot = &rfn;
> 
> -  /* Also add the non-prefixed non-overloaded function, if the user
> namespace
> - does not need to be preserved.  */
> -  if (!preserve_user_namespace)
> -{
> -  char *noprefix_name = get_name (instance, false, false);
> -  tree attrs = get_attributes (instance);
> -  add_function (instance, noprefix_name, fntyp

Re: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-07-05 Thread Wilco Dijkstra via Gcc-patches

ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 
[PR110061] 
 

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics and is what
most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not 
supported.
This results in an implicit store which is invisible to software as long as the 
given
address is writeable (which will be true when using atomics in actual code).

A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.

Passes regress, OK for commit?

libatomic/
    PR target/110061
    config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
    config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
    State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
b/libatomic/config/linux/aarch64/atomic_16.S
index 
05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf
 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@
    .  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use a exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat__16 entry points are ARMv8.0.
+   The libat__16_i1 entry points are used when LSE2 is available.  */
+
+
 .arch   armv8-a+lse
 
 #define ENTRY(name) \
@@ -37,6 +52,10 @@ name:    \
 .cfi_endproc;   \
 .size name, .-name;
 
+#define ALIAS(alias,name)  \
+   .global alias;  \
+   .set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@ name:    \
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+   mov x5, x0
+   cbnz    w1, 2f
+
+   /* RELAXED.  */
+1: ldxp    res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 1b
+   ret
+
+   /* ACQUIRE/CONSUME/SEQ_CST.  */
+2: ldaxp   res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 2b
+   ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
 cbnz    w1, 1f
 
@@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+   cbnz    w4, 2f
+
+   /* RELAXED.  */
+1: ldxp    xzr, tmp0, [x0]
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   ret
+
+   /* RELEASE/SEQ_CST.  */
+2: ldxp    xzr, tmp0, [x0]
+   stlxp   w4, in0, in1, [x0]
+   cbnz    w4, 2b
+   ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
 cbnz    w4, 1f
 
@@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
 ret
 
 /* RELEASE/SEQ_CST.  */
-1: ldaxp   xzr, tmp0, [x0]
+1: ldxp    xzr, tmp0, [x0]
 stlxp   w4, in0, in1, [x0]
 cbnz    w4, 1b
 ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
 mov x5, x0
 cbnz    w4, 2f
 
@@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
 stxp    w4, in0, in1, [x5]
 cbnz    w4, 3b
 ret
-4:
-   cmp w4, RELEASE
-   b.ne    6f
 
-   /* RELEASE.  */
-5: ldxp    res0, res1, [x5]
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   res0, res1, [x5]
 stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 5b
+   cbnz    w4, 4b
 ret
+END (libat_exchange_16)
 
-   /* ACQ_REL/SEQ_CST.  */
-6: ldaxp   res0, res1, [x5]
-   stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+   ldp exp0, exp1, [x1]
+   cbz w4, 3f
+   cmp w4, RELEASE
+   b.hs    4f
+
+   /* ACQUIRE/CONSUME.  */
+1: ldaxp   tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2f
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   mov x0, 1
 ret
-END (libat_exchange_16_i1)
+
+2: stp tmp0, tmp1, [x1]
+   mov x0, 0
+   ret
+
+   /* RELAXED.  */
+3: ldxp    tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2b
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 3b
+   mov x0, 1
+   ret
+
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   tmp0, tmp1, [

Re: Re: [PATCH] libstdc++: Fix fwrite error parameter

2023-07-05 Thread Tianqiang Shuai via Gcc-patches
Thanks a lot.

Re: [PATCH] Vect: select small VF for epilog of unrolled loop (PR tree-optimization/110474)

2023-07-05 Thread Richard Sandiford via Gcc-patches
Hao Liu OS via Gcc-patches  writes:
> Hi,
>
> If a loop is unrolled during vectorization (i.e. suggested_unroll_factor > 1),
> the VFs of both main and epilog loop are enlarged.  The epilog vect loop is
> specific for a loop with small iteration counts, so a large VF may hurt
> performance.
>
> This patch unscales the main loop VF by suggested_unroll_factor while 
> selecting
> the epilog loop VF, so that it will be the same as vectorized loop without
> unrolling (i.e. suggested_unroll_factor = 1).

I agree that unrolling the main loop shouldn't cause more iterations
to be handled by the scalar code.  It would be nice to support multiple
epilogues, but that's probably a lot of work.

> gcc/ChangeLog:
>
>   PR tree-optimization/110474
>   * tree-vect-loop.cc (vect_analyze_loop_2): unscale the VF by suggested
>   unroll factor while selecting the epilog vect loop VF.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/pr110474.c: New testcase.

OK, thanks.

Richard

> ---
>  gcc/testsuite/gcc.target/aarch64/pr110474.c | 37 +
>  gcc/tree-vect-loop.cc   | 16 +
>  2 files changed, 47 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110474.c
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110474.c 
> b/gcc/testsuite/gcc.target/aarch64/pr110474.c
> new file mode 100644
> index 000..e548416162a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr110474.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mtune=neoverse-n2 -mcpu=neoverse-n1 
> -fdump-tree-vect-details --param aarch64-vect-unroll-limit=2" } */
> +/* { dg-final { scan-tree-dump "Choosing vector mode V8HI"  "vect" } } */
> +/* { dg-final { scan-tree-dump "Choosing epilogue vector mode V8QI"  "vect" 
> } } */
> +
> +/* Do not increase the vector factor of the epilog vectorized loop
> +   for a loop with suggested_unroll_factor > 1.
> +
> +   before (suggested_unroll_factor=1):
> + if N >= 16:
> + main vect loop
> + if N >= 8:
> + epilog vect loop
> + scalar code
> +
> +   before (suggested_unroll_factor=2):
> + if N >= 32:
> + main vect loop
> + if N >= 16:  // May fail to execute vectorized code (e.g. N is 8)
> + epilog vect loop
> + scalar code
> +
> +   after  (suggested_unroll_factor=2):
> + if N >= 32:
> + main vect loop
> + if N >= 8:  // The same VF as suggested_unroll_factor=1
> + epilog vect loop
> + scalar code  */
> +
> +int
> +foo (short *A, char *B, int N)
> +{
> +  int sum = 0;
> +  for (int i = 0; i < N; ++i)
> +sum += A[i] * B[i];
> +  return sum;
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 3b46c58a8d8..4d9abd035ea 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3021,12 +3021,16 @@ start_over:
>   to be able to handle fewer than VF scalars, or needs to have a lower VF
>   than the main loop.  */
>if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> -  && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> -  && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
> -LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
> -return opt_result::failure_at (vect_location,
> -"Vectorization factor too high for"
> -" epilogue loop.\n");
> +  && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> +{
> +  poly_uint64 unscaled_vf
> + = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
> +  orig_loop_vinfo->suggested_unroll_factor);
> +  if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
> + return opt_result::failure_at (vect_location,
> +"Vectorization factor too high for"
> +" epilogue loop.\n");
> +}
>  
>/* Decide whether this loop_vinfo should use partial vectors or peeling,
>   assuming that the loop will be used as a main loop.  We will redo


Re: [PATCH] Fortran: fixes for procedures with ALLOCATABLE,INTENT(OUT) arguments [PR92178]

2023-07-05 Thread Harald Anlauf via Gcc-patches

Hi Mikael,

Am 05.07.23 um 16:54 schrieb Mikael Morin:

Here is an example, admittedly artificial.  Fails with the above change,
but fails with master as well.

program p
   implicit none
   type t
     integer :: i
   end type t
   type u
     class(t), allocatable :: ta(:)
   end type u
   type(u), allocatable, target :: c(:)
   c = [u([t(1), t(3)]), u([t(4), t(9)])]
   call bar (allocated (c(c(1)%ta(1)%i)%ta), c(c(1)%ta(1)%i)%ta,
allocated (c(c(1)%ta(1)%i)%ta))
   if (allocated(c(1)%ta)) stop 11
   if (.not. allocated(c(2)%ta)) stop 12
contains
   subroutine bar (alloc, x, alloc2)
     logical :: alloc, alloc2
     class(t), allocatable, intent(out) :: x(:)
     if (allocated (x)) stop 1
     if (.not. alloc)   stop 2
     if (.not. alloc2)  stop 3
   end subroutine bar
end


while it looks artificial, it is valid, and IMHO it is a beast...

I've played around and added another argument gfc_se *convse to
gfc_conv_class_to_class in an attempt to implement what I thought
you suggested (to get the .pre/.post separately), but in the end
this did not lead to working code.  And the tree-dump for your
example above is beyond what I can grasp.

I've noticed that my attempt does not properly handle the
parmse.post; at least this is what the above example shows:
there is a small part after the call to bar that should have
been executed before that call, which I attribute to .post.
But my attempts in moving that part regresses on a couple
of testcases with class and intent(out).  I am at a loss now.

I am attaching the latest version of my patch to give you or
Paul or others the opportunity to see what is wrong or add the
missing pieces.

Thanks for your help so far.

Harald

From 989030fc04eacf97a034ab1f7ed85b932669f82d Mon Sep 17 00:00:00 2001
From: Harald Anlauf 
Date: Wed, 5 Jul 2023 22:21:09 +0200
Subject: [PATCH] Fortran: fixes for procedures with ALLOCATABLE,INTENT(OUT)
 arguments [PR92178]

gcc/fortran/ChangeLog:

	PR fortran/92178
	* trans-expr.cc (gfc_conv_procedure_call): Check procedures for
	allocatable dummy arguments with INTENT(OUT) and move deallocation
	of actual arguments after evaluation of argument expressions before
	the procedure is executed.

gcc/testsuite/ChangeLog:

	PR fortran/92178
	* gfortran.dg/intent_out_16.f90: New test.
	* gfortran.dg/intent_out_17.f90: New test.
	* gfortran.dg/intent_out_18.f90: New test.

Co-authored-by: Steven G. Kargl 
---
 gcc/fortran/trans-expr.cc   | 54 +++--
 gcc/testsuite/gfortran.dg/intent_out_16.f90 | 89 +
 gcc/testsuite/gfortran.dg/intent_out_17.f90 | 46 +++
 gcc/testsuite/gfortran.dg/intent_out_18.f90 | 31 +++
 4 files changed, 215 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/intent_out_16.f90
 create mode 100644 gcc/testsuite/gfortran.dg/intent_out_17.f90
 create mode 100644 gcc/testsuite/gfortran.dg/intent_out_18.f90

diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index 30946ba3f63..7017b652d6e 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc
@@ -6085,9 +6085,10 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
   else
 info = NULL;
 
-  stmtblock_t post, clobbers;
+  stmtblock_t post, clobbers, dealloc_blk;
   gfc_init_block (&post);
   gfc_init_block (&clobbers);
+  gfc_init_block (&dealloc_blk);
   gfc_init_interface_mapping (&mapping);
   if (!comp)
 {
@@ -6117,6 +6118,32 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
 	   && UNLIMITED_POLY (sym)
 	   && comp && (strcmp ("_copy", comp->name) == 0);
 
+  /* Scan for allocatable actual arguments passed to allocatable dummy
+ arguments with INTENT(OUT).  As the corresponding actual arguments are
+ deallocated before execution of the procedure, we evaluate actual
+ argument expressions to avoid problems with possible dependencies.  */
+  bool force_eval_args = false;
+  gfc_formal_arglist *tmp_formal;
+  for (arg = args, tmp_formal = formal; arg != NULL;
+   arg = arg->next, tmp_formal = tmp_formal ? tmp_formal->next : NULL)
+{
+  e = arg->expr;
+  fsym = tmp_formal ? tmp_formal->sym : NULL;
+  if (e && fsym
+	  && e->expr_type == EXPR_VARIABLE
+	  && fsym->attr.intent == INTENT_OUT
+	  && (fsym->ts.type == BT_CLASS && fsym->attr.class_ok
+	  ? CLASS_DATA (fsym)->attr.allocatable
+	  : fsym->attr.allocatable)
+	  && e->symtree
+	  && e->symtree->n.sym
+	  && gfc_variable_attr (e, NULL).allocatable)
+	{
+	  force_eval_args = true;
+	  break;
+	}
+}
+
   /* Evaluate the arguments.  */
   for (arg = args, argc = 0; arg != NULL;
arg = arg->next, formal = formal ? formal->next : NULL, ++argc)
@@ -6680,7 +6707,7 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
 		  else
 			tmp = gfc_finish_block (&block);
 
-		  gfc_add_expr_to_block (&se->pre, tmp);
+		  gfc_add_expr_to_block (&dealloc_blk, tmp);
 		}
 
 		  /* A class array element needs converting back to be a
@

Re: [PATCH] analyzer: Add support of placement new and improved operator new [PR105948]

2023-07-05 Thread David Malcolm via Gcc-patches
On Tue, 2023-07-04 at 18:25 +0200, priour...@gmail.com wrote:
> From: benjamin priour 
> 
> Script contrib/check_GNU_style.sh complains about there being a space
> before a left square bracket ("operator new []").
> Though, it is actually within a literal string, and the space  
> is required to correctly detect the function.
> 
> Successfully regstrapped on x86_64-linux-gnu against trunk
> 3c776fdf1a8.
> Is it OK for trunk ?

Thanks for the patch.

Overall, looks almost ready, but some nitpicks below...

[..snip...]


> diff --git a/gcc/analyzer/kf-lang-cp.cc b/gcc/analyzer/kf-lang-cp.cc
> index 393b4f25e79..258d92919d7 100644
> --- a/gcc/analyzer/kf-lang-cp.cc
> +++ b/gcc/analyzer/kf-lang-cp.cc
> @@ -35,6 +35,34 @@ along with GCC; see the file COPYING3.  If not see
>  
>  #if ENABLE_ANALYZER
>  
> +/* Return TRUE if CALL is non-allocating operator new or operator
> new[]*/
> +
> +bool is_placement_new_p (const gcall *call)

Please can you extend the leading comment, giving the expected
signatures of the functions, and a link to cppreference.org.

In particular, there's some special-casing here of "nothrow_t" which
would make more sense with a comment up here.

> +{
> +  gcc_assert (call);
> +
> +  tree fndecl = gimple_call_fndecl (call);
> +  if (!fndecl)
> +return false;
> +
> +  if (!is_named_call_p (fndecl, "operator new", call, 2)
> +&& !is_named_call_p (fndecl, "operator new []", call, 2))
> +return false;
> +  tree arg1 = gimple_call_arg (call, 1);
> +
> +  if (!POINTER_TYPE_P (TREE_TYPE (arg1)))
> +return false;
> +
> +  /* Sadly, for non-throwing new, the second argument type
> +is not REFERENCE_TYPE but also POINTER_TYPE
> +so a simple check is out of the way.  */
> +  tree identifier = TYPE_IDENTIFIER (TREE_TYPE (TREE_TYPE (arg1)));
> +  if (!identifier)
> +return true;
> +  const char *name = IDENTIFIER_POINTER (identifier);
> +  return 0 != strcmp (name, "nothrow_t");
> +}
> +
>  namespace ana {
>  
>  /* Implementations of specific functions.  */
> @@ -46,7 +74,7 @@ class kf_operator_new : public known_function
>  public:
>bool matches_call_types_p (const call_details &cd) const final
> override
>{
> -return cd.num_args () == 1;
> +return cd.num_args () == 1 || cd.num_args () == 2;

Looks like we should also check that arg 0 is of integral type, and
that arg 1 is of pointer type.


>}
>  
>void impl_call_pre (const call_details &cd) const final override
> @@ -54,13 +82,60 @@ public:
>  region_model *model = cd.get_model ();
>  region_model_manager *mgr = cd.get_manager ();
>  const svalue *size_sval = cd.get_arg_svalue (0);
> -const region *new_reg
> -  = model->get_or_create_region_for_heap_alloc (size_sval,
> cd.get_ctxt ());
> -if (cd.get_lhs_type ())
> +region_model_context *ctxt = cd.get_ctxt ();
> +const gcall *call = cd.get_call_stmt ();
> +
> +/* If the call is an allocating new, then create a heap
> allocated
> +region.  */
> +if (!is_placement_new_p (call))
> +  {

You have:
   if (!condition)
 suite_a;
   else
 suite_b; // this is implicitly a double negative
 

Please change it to:

  if (condition)
suite_b;
  else
suite_a;

to avoid the implicit double negative.


> +   const region *new_reg
> + = model->get_or_create_region_for_heap_alloc (size_sval, ctxt);
> +   if (cd.get_lhs_type ())
> + {
> +   const svalue *ptr_sval
> + = mgr->get_ptr_svalue (cd.get_lhs_type (), new_reg);
> +   cd.maybe_set_lhs (ptr_sval);
> + }
> +  }
> +/* If the call was actually a placement new, check that
> accessing
> +the buffer lhs is placed into does not result in out-of-bounds. 
> */
> +else
>{
> +   const region *ptr_reg = cd.maybe_get_arg_region (1);
> +   if (ptr_reg && cd.get_lhs_type ())
> + {
> +   const region *base_reg = ptr_reg->get_base_region ();
> +   const svalue *num_bytes_sval = cd.get_arg_svalue (0);
> +   const region *sized_new_reg = mgr->get_sized_region (base_reg,
> + cd.get_lhs_type (),
> + num_bytes_sval);
> +   model->check_region_for_write (sized_new_reg,
> + nullptr,
> + ctxt);
> const svalue *ptr_sval
> - = mgr->get_ptr_svalue (cd.get_lhs_type (), new_reg);
> + = mgr->get_ptr_svalue (cd.get_lhs_type (), sized_new_reg);
> cd.maybe_set_lhs (ptr_sval);
> + }
> +  }
> +  }

[...snip...]

> diff --git a/gcc/testsuite/g++.dg/analyzer/new-2.C
> b/gcc/testsuite/g++.dg/analyzer/new-2.C
> new file mode 100644
> index 000..4e696040a54
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/analyzer/new-2.C
> @@ -0,0 +1,50 @@
> +// { dg-additional-options "-O0" }
> +
> +struct A
> +{
> +  int x;
> +  int y;
> +};

We've run into issues with bounds-checking testcases when using types
like "int" that have target-specific sizes.

Please use  in these t

RE: [EXTERNAL] Re: [PATCH] Collect both user and kernel events for autofdo tests and autoprofiledbootstrap

2023-07-05 Thread Eugene Rozenfeld via Gcc-patches
There is no warning and perf /uk succeeds when kptr_restrict is set to 1 and 
perf_event_paranoid set to 2. However, create_gcov may fail since it won't be 
able to understand kernel addresses and it requires at least 95% of events to 
be successfully mapped.

If I set both kptr_restrict and perf_event_paranoid to 1, then I do get 
warnings from perf (but it still succeeds and exits with a 0 code). And, of 
course create_gcov will also fail to map some events since it won't understand 
kernel addresses.

WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,
check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.

Samples in kernel functions may not be resolved if a suitable vmlinux
file is not found in the buildid cache or in the vmlinux path.

Samples in kernel modules won't be resolved at all.

If some relocation was applied (e.g. kexec) symbols may be misresolved
even with a suitable vmlinux or kallsyms file.

Couldn't record kernel reference relocation symbol
Symbol resolution may be skewed if relocation was used (e.g. kexec).
Check /proc/kallsyms permission or run as root.
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.037 MB 
/home/erozen/gcc1_objdir/gcc/testsuite/gcc/indir-call-prof.perf.data (86 
samples) ]

Eugene

-Original Message-
From: Richard Biener  
Sent: Monday, July 3, 2023 12:47 AM
To: Eugene Rozenfeld 
Cc: Sam James ; gcc-patches@gcc.gnu.org
Subject: Re: [EXTERNAL] Re: [PATCH] Collect both user and kernel events for 
autofdo tests and autoprofiledbootstrap

On Sat, Jul 1, 2023 at 12:05 AM Eugene Rozenfeld 
 wrote:
>
> I also set /proc/sys/kernel/perf_event_paranoid to 1 instead of the default 2.

Does the perf attempt fail when the privileges are not adjusted and you specify 
--all?  I see it adds /uk as flags, when I do

> perf record -e instructions//uk ./a.out

it doesn't complain in any way with

> cat /proc/sys/kernel/kptr_restrict
1
> cat /proc/sys/kernel/perf_event_paranoid
2

so in case the 'kernel' side is simply ignored when profiling there isn't 
permitted/possible then I guess the patch is OK?

Can you confirm?

Thanks,
Richard.

> -Original Message-
> From: Gcc-patches 
>  On Behalf Of 
> Eugene Rozenfeld via Gcc-patches
> Sent: Friday, June 30, 2023 2:44 PM
> To: Sam James ; Richard Biener 
> 
> Cc: gcc-patches@gcc.gnu.org
> Subject: RE: [EXTERNAL] Re: [PATCH] Collect both user and kernel 
> events for autofdo tests and autoprofiledbootstrap
>
> I don't run this with elevated privileges but I set 
> /proc/sys/kernel/kptr_restrict to 0. Setting that does require elevated 
> privileges.
>
> If that's not acceptable, the only fix I can think of is to make that event 
> mapping threshold percentage a parameter to create_gcov and pass something 
> low enough. 80% instead of the current threshold of 95% should work, although 
> it's a bit fragile.
>
> Eugene
>
> -Original Message-
> From: Sam James 
> Sent: Friday, June 30, 2023 1:59 AM
> To: Richard Biener 
> Cc: Eugene Rozenfeld ; 
> gcc-patches@gcc.gnu.org
> Subject: [EXTERNAL] Re: [PATCH] Collect both user and kernel events 
> for autofdo tests and autoprofiledbootstrap
>
> [You don't often get email from s...@gentoo.org. Learn why this is 
> important at https://aka.ms/LearnAboutSenderIdentification ]
>
> Richard Biener via Gcc-patches  writes:
>
> > On Fri, Jun 30, 2023 at 7:28 AM Eugene Rozenfeld via Gcc-patches 
> >  wrote:
> >>
> >> When we collect just user events for autofdo with lbr we get some 
> >> events where branch sources are kernel addresses and branch targets 
> >> are user addresses. Without kernel MMAP events create_gcov can't 
> >> make sense of kernel addresses. Currently create_gcov fails if it 
> >> can't map at least 95% of events. We sometimes get below this threshold 
> >> with just user events. The change is to collect both user events and 
> >> kernel events.
> >
> > Does this require elevated privileges?  Can we instead "fix" create_gcov 
> > here?
>
> Right, requiring privileges for this is going to be a no-go for a lot of 
> builders. In a distro context, for example, it means we can't consider 
> autofdo at all.


[PATCH] testsuite: fix dwarf2/utf-1.C with DWARF4

2023-07-05 Thread Marek Polacek via Gcc-patches
Running
$ make check-c++ RUNTESTFLAGS='--target_board=unix\{-gdwarf-5,-gdwarf-4\} 
dwarf2.exp=utf-1.C'
shows
FAIL: g++.dg/debug/dwarf2/utf-1.C  -std=gnu++20  scan-assembler-times 
DW_AT_encoding \\(0x10\\) 3
because with -gdwarf-4 the output is:

  .byte   0x10# DW_AT_encoding

but with -gdwarf-5 the output is the expected:

# DW_AT_encoding (0x10)

The difference is caused by the DWARF5 optimize_implicit_const
optimization:


I suppose we could do what testsuite/rust/debug/chartype.rs does
and just run the test with -gdwarf-4.

Tested on x86_64-pc-linux-gnu, ok for trunk?

gcc/testsuite/ChangeLog:

* g++.dg/debug/dwarf2/utf-1.C: Use -gdwarf-4.  Adjust expected
output.
---
 gcc/testsuite/g++.dg/debug/dwarf2/utf-1.C | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/g++.dg/debug/dwarf2/utf-1.C 
b/gcc/testsuite/g++.dg/debug/dwarf2/utf-1.C
index 43b354f1bb5..0ce4d8727d6 100644
--- a/gcc/testsuite/g++.dg/debug/dwarf2/utf-1.C
+++ b/gcc/testsuite/g++.dg/debug/dwarf2/utf-1.C
@@ -1,8 +1,13 @@
 // { dg-do compile { target c++20 } }
-// { dg-options { -gdwarf -dA } }
+// { dg-options { -gdwarf-4 -dA } }
 
 // Test that all three use DW_ATE_UTF.
-// { dg-final { scan-assembler-times {DW_AT_encoding \(0x10\)} 3 } }
+// This test uses -gdwarf-4 since in DWARF5 optimize_implicit_const
+// would optimize the output from:
+//   .byte   0x10# DW_AT_encoding
+// into:
+//   # DW_AT_encoding (0x10)
+// { dg-final { scan-assembler-times "0x10\[ \t]\[^\n\r]* DW_AT_encoding" 3 } }
 
 char8_t c8;
 char16_t c16;

base-commit: be240fc6acc9714e66afbfbe6dc193844bfcba05
-- 
2.41.0



Re: [PATCH] analyzer: Add support of placement new and improved operator new [PR105948]

2023-07-05 Thread Benjamin Priour via Gcc-patches

Hi David,

On 05/07/2023 22:59, David Malcolm wrote:

diff --git a/gcc/analyzer/kf-lang-cp.cc b/gcc/analyzer/kf-lang-cp.cc
index 393b4f25e79..258d92919d7 100644
--- a/gcc/analyzer/kf-lang-cp.cc
+++ b/gcc/analyzer/kf-lang-cp.cc
@@ -35,6 +35,34 @@ along with GCC; see the file COPYING3.  If not see
  
  #if ENABLE_ANALYZER
  
+/* Return TRUE if CALL is non-allocating operator new or operator

new[]*/
+
+bool is_placement_new_p (const gcall *call)

Please can you extend the leading comment, giving the expected
signatures of the functions, and a link to cppreference.org.

In particular, there's some special-casing here of "nothrow_t" which
would make more sense with a comment up here.


I've now extended the leading comment of is_placement_new_p so that the
special cases appear clearer.


Leading comment is now:

   /* Return true if CALL is a non-allocating operator new or operator
   new []
  that contains no user-defined args, i.e. having any signature of:

    - void* operator new  ( std::size_t count, void* ptr );
    - void* operator new[]( std::size_t count, void* ptr );

  See https://en.cppreference.com/w/cpp/memory/new/operator_new . */

Whereas above the "nothrow_t" special case now reads

    /* We must distinguish between an allocating non-throwing new
    and a non-allocating new.

    The former might have one of the following signatures :
    void* operator new  ( std::size_t count, const std::nothrow_t&
   tag );
    void* operator new[]( std::size_t count, const std::nothrow_t&
   tag );

    However, debugging has shown that TAG is actually a POINTER_TYPE,
    not a REFERENCE_TYPE.

    Thus, we cannot easily differentiate the types, but we instead
   have to
    check if the second argument's type identifies as nothrow_t.  */



+{
+  gcc_assert (call);
+
+  tree fndecl = gimple_call_fndecl (call);
+  if (!fndecl)
+return false;
+
+  if (!is_named_call_p (fndecl, "operator new", call, 2)
+&& !is_named_call_p (fndecl, "operator new []", call, 2))
+return false;
+  tree arg1 = gimple_call_arg (call, 1);
+
+  if (!POINTER_TYPE_P (TREE_TYPE (arg1)))
+return false;
+
+  /* Sadly, for non-throwing new, the second argument type
+is not REFERENCE_TYPE but also POINTER_TYPE
+so a simple check is out of the way.  */
+  tree identifier = TYPE_IDENTIFIER (TREE_TYPE (TREE_TYPE (arg1)));
+  if (!identifier)
+return true;
+  const char *name = IDENTIFIER_POINTER (identifier);
+  return 0 != strcmp (name, "nothrow_t");
+}
+
  namespace ana {
  
  /* Implementations of specific functions.  */

@@ -46,7 +74,7 @@ class kf_operator_new : public known_function
  public:
bool matches_call_types_p (const call_details &cd) const final
override
{
-return cd.num_args () == 1;
+return cd.num_args () == 1 || cd.num_args () == 2;

Looks like we should also check that arg 0 is of integral type, and
that arg 1 is of pointer type.


Well, technically some standard signatures use an align_val_t as a second
argument, which is a size_t value. But since we don't handle such signatures
properly yet, I'm going with your suggestion.


}
  
void impl_call_pre (const call_details &cd) const final override

@@ -54,13 +82,60 @@ public:
  region_model *model = cd.get_model ();
  region_model_manager *mgr = cd.get_manager ();
  const svalue *size_sval = cd.get_arg_svalue (0);
-const region *new_reg
-  = model->get_or_create_region_for_heap_alloc (size_sval,
cd.get_ctxt ());
-if (cd.get_lhs_type ())
+region_model_context *ctxt = cd.get_ctxt ();
+const gcall *call = cd.get_call_stmt ();
+
+/* If the call is an allocating new, then create a heap
allocated
+region.  */
+if (!is_placement_new_p (call))
+  {

You have:
if (!condition)
  suite_a;
else
  suite_b; // this is implicitly a double negative
  


Please change it to:

   if (condition)
 suite_b;
   else
 suite_a;

to avoid the implicit double negative.



(nods)

diff --git a/gcc/testsuite/g++.dg/analyzer/new-2.C
b/gcc/testsuite/g++.dg/analyzer/new-2.C
new file mode 100644
index 000..4e696040a54
--- /dev/null
+++ b/gcc/testsuite/g++.dg/analyzer/new-2.C
@@ -0,0 +1,50 @@
+// { dg-additional-options "-O0" }
+
+struct A
+{
+  int x;
+  int y;
+};

We've run into issues with bounds-checking testcases when using types
like "int" that have target-specific sizes.

Please use <stdint.h> in these test cases, and types with explicit
sizes, such as int32_t, to avoid the behavior of the test cases being
affected by sizeof the various types.

Thanks, I've now changed it in placement-new-size.C


[..snip...]

Other than those issues, looks good

Thanks again
Dave


Thanks for the review !

I'll submit the updated patch tomorrow on the mail list.

Benjamin



Re: [PATCH] libstdc++: Split up pstl/set.cc testcase

2023-07-05 Thread Thomas Rodgers via Gcc-patches
Comment added to each file.

Tested x86_64-linux. Pushed to trunk.

On Mon, Jul 3, 2023 at 4:16 PM Jonathan Wakely  wrote:

> On Mon, 3 Jul 2023 at 23:14, Thomas Rodgers via Libstdc++
>  wrote:
> >
> > This testcase is causing some timeout issues. This patch splits the
> > testcase up by individual set algorithm.
>
> I think the Apache license requires a notice saying the original file
> was modified. A comment in each new file noting it was derived from
> pstl/alg_sorting/set.cc (or whatever the file is called upstream)
> should be sufficient.
>
> OK with that change, thanks.
>
>


[COMMITTED 1/5] Move relation discovery into compute_operand_range

2023-07-05 Thread Andrew MacLeod via Gcc-patches

This is a set of 5 patches which cleans up GORIs compute_operand routines.

This is the mechanism GORI uses to calculate ranges from the bottom of 
the routine back thru definitions in the block to the name that is 
requested.


Currently, compute_operand_range() is called on a stmt, and it divides 
the work based on which operands are used to get back to the requested 
name.  It calls compute_operand1_range or compute_operand2_range or 
compute_operand1_and_operand2_range. If the specified name is not on 
this statement, then a call back to compute_operand_range on the 
definition statement is made.


This means the call chain is recursive, but involves alternating
functions.  This patch set changes compute_operand1_range and
compute_operand2_range to be leaf functions; compute_operand_range
is still recursive, but has a much smaller stack footprint, and also
becomes a tail call.


I tried removing the recursion, but at this point, removing the 
recursion is a performance hit :-P   stay tuned on that one.


This patch moves some common code for relation discovery from 
compute_operand[12]range into compute_operand_range.


Bootstraps on  x86_64-pc-linux-gnu  with no regressions.  Pushed.

Andrew
From 290798faef706c335bd346b13771f977ddedb415 Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Tue, 4 Jul 2023 11:28:52 -0400
Subject: [PATCH 1/6] Move relation discovery into compute_operand_range

compute_operand1_range and compute_operand2_range were both doing
relation discovery between the 2 operands... move it into a common area.

	* gimple-range-gori.cc (compute_operand_range): Check for
	a relation between op1 and op2 and use that instead.
	(compute_operand1_range): Don't look for a relation override.
	(compute_operand2_range): Ditto.
---
 gcc/gimple-range-gori.cc | 42 +---
 1 file changed, 13 insertions(+), 29 deletions(-)

diff --git a/gcc/gimple-range-gori.cc b/gcc/gimple-range-gori.cc
index 4ee0ae36014..b0d13a8ac53 100644
--- a/gcc/gimple-range-gori.cc
+++ b/gcc/gimple-range-gori.cc
@@ -623,6 +623,18 @@ gori_compute::compute_operand_range (vrange &r, gimple *stmt,
   tree op1 = gimple_range_ssa_p (handler.operand1 ());
   tree op2 = gimple_range_ssa_p (handler.operand2 ());
 
+  // If there is a relation betwen op1 and op2, use it instead as it is
+  // likely to be more applicable.
+  if (op1 && op2)
+{
+  relation_kind k = handler.op1_op2_relation (lhs);
+  if (k != VREL_VARYING)
+	{
+	  vrel.set_relation (k, op1, op2);
+	  vrel_ptr = &vrel;
+	}
+}
+
   // Handle end of lookup first.
   if (op1 == name)
 return compute_operand1_range (r, handler, lhs, name, src, vrel_ptr);
@@ -1079,7 +1091,6 @@ gori_compute::compute_operand1_range (vrange &r,
   const vrange &lhs, tree name,
   fur_source &src, value_relation *rel)
 {
-  value_relation local_rel;
   gimple *stmt = handler.stmt ();
   tree op1 = handler.operand1 ();
   tree op2 = handler.operand2 ();
@@ -1088,7 +1099,6 @@ gori_compute::compute_operand1_range (vrange &r,
   relation_trio trio;
   if (rel)
 trio = rel->create_trio (lhs_name, op1, op2);
-  relation_kind op_op = trio.op1_op2 ();
 
   Value_Range op1_range (TREE_TYPE (op1));
   Value_Range tmp (TREE_TYPE (op1));
@@ -1102,19 +1112,7 @@ gori_compute::compute_operand1_range (vrange &r,
 {
   src.get_operand (op2_range, op2);
 
-  // If there is a relation betwen op1 and op2, use it instead.
-  // This allows multiple relations to be processed in compound logicals.
-  if (gimple_range_ssa_p (op1) && gimple_range_ssa_p (op2))
-	{
-	  relation_kind k = handler.op1_op2_relation (lhs);
-	  if (k != VREL_VARYING)
-	{
-	  op_op = k;
-	  local_rel.set_relation (op_op, op1, op2);
-	  rel = &local_rel;
-	}
-	}
-
+  relation_kind op_op = trio.op1_op2 ();
   if (op_op != VREL_VARYING)
 	refine_using_relation (op1, op1_range, op2, op2_range, src, op_op);
 
@@ -1189,7 +1187,6 @@ gori_compute::compute_operand2_range (vrange &r,
   const vrange &lhs, tree name,
   fur_source &src, value_relation *rel)
 {
-  value_relation local_rel;
   gimple *stmt = handler.stmt ();
   tree op1 = handler.operand1 ();
   tree op2 = handler.operand2 ();
@@ -1207,19 +1204,6 @@ gori_compute::compute_operand2_range (vrange &r,
 trio = rel->create_trio (lhs_name, op1, op2);
   relation_kind op_op = trio.op1_op2 ();
 
-  // If there is a relation betwen op1 and op2, use it instead.
-  // This allows multiple relations to be processed in compound logicals.
-  if (gimple_range_ssa_p (op1) && gimple_range_ssa_p (op2))
-{
-  relation_kind k = handler.op1_op2_relation (lhs);
-  if (k != VREL_VARYING)
-	{
-	  op_op = k;
-	  local_rel.set_relation (op_op, op1, op2);
-	  rel = &local_rel;
-	}
-}
-
   if (op_op != VREL_VARYING)
 refine_using_relation (op1, op1_range, op2, op2_range, src, op_op);
 
-- 
2.40.1



[COMMITTED 2/5] Simplify compute_operand_range for op1 and op2 case.

2023-07-05 Thread Andrew MacLeod via Gcc-patches
This patch simplifies compute_operand1_and_operand2_range() such that it only
calls each routine once. This will simplify the next couple of patches.


It also moves the determination that op1 and op2 have an
interdependence to compute_operand_range().


Bootstraps on  x86_64-pc-linux-gnu  with no regressions.  Pushed.

Andrew
From 7276248946d3eae83e5e08fc023163614c9ea9ab Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Wed, 5 Jul 2023 13:36:27 -0400
Subject: [PATCH 2/6] Simplify compute_operand_range for op1 and op2 case.

Move the check for co-dependency between 2 operands into
compute_operand_range, resulting in a much cleaner
compute_operand1_and_operand2_range routine.

	* gimple-range-gori.cc (compute_operand_range): Check for
	operand interdependence when both op1 and op2 are computed.
	(compute_operand1_and_operand2_range): No checks required now.
---
 gcc/gimple-range-gori.cc | 25 +++--
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/gcc/gimple-range-gori.cc b/gcc/gimple-range-gori.cc
index b0d13a8ac53..5429c6e3c1a 100644
--- a/gcc/gimple-range-gori.cc
+++ b/gcc/gimple-range-gori.cc
@@ -650,6 +650,17 @@ gori_compute::compute_operand_range (vrange &r, gimple *stmt,
   if (!op1_in_chain && !op2_in_chain)
 return false;
 
+  // If either operand is in the def chain of the other (or they are equal), it
+  // will be evaluated twice and can result in an exponential time calculation.
+  // Instead just evaluate the one operand.
+  if (op1_in_chain && op2_in_chain)
+{
+  if (in_chain_p (op1, op2) || op1 == op2)
+	op1_in_chain = false;
+  else if (in_chain_p (op2, op1))
+	op2_in_chain = false;
+}
+
   bool res = false;
   // If the lhs doesn't tell us anything only a relation can possibly enhance
   // the result.
@@ -1275,24 +1286,10 @@ gori_compute::compute_operand1_and_operand2_range (vrange &r,
 {
   Value_Range op_range (TREE_TYPE (name));
 
-  // If op1 is in the def chain of op2, we'll do the work twice to evalaute
-  // op1.  This can result in an exponential time calculation.
-  // Instead just evaluate op2, which will eventualy get to op1.
-  if (in_chain_p (handler.operand1 (), handler.operand2 ()))
-return compute_operand2_range (r, handler, lhs, name, src, rel);
-
-  // Likewise if op2 is in the def chain of op1.
-  if (in_chain_p (handler.operand2 (), handler.operand1 ()))
-return compute_operand1_range (r, handler, lhs, name, src, rel);
-
   // Calculate a good a range through op2.
   if (!compute_operand2_range (r, handler, lhs, name, src, rel))
 return false;
 
-  // If op1 == op2 there is again no need to go further.
-  if (handler.operand1 () == handler.operand2 ())
-return true;
-
   // Now get the range thru op1.
   if (!compute_operand1_range (op_range, handler, lhs, name, src, rel))
 return false;
-- 
2.40.1



[COMMITTED 3/5] Make compute_operand1_range a leaf call.

2023-07-05 Thread Andrew MacLeod via Gcc-patches
Now operand1 alone is resolved and returned as the result.  This is much
cleaner, and removes it from the recursion stack.


compute_operand_range() will decide if further evaluation is required.

Bootstraps on  x86_64-pc-linux-gnu  with no regressions.  Pushed.

Andrew

From 912b5ac49677160aada7a2d862273251406dfca5 Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Wed, 5 Jul 2023 13:41:50 -0400
Subject: [PATCH 3/6] Make compute_operand1_range a leaf call.

Rather than creating long call chains, put the onus for finishing
the evaluation on the caller.

	* gimple-range-gori.cc (compute_operand_range): After calling
	compute_operand1_range, recursively call self if needed.
	(compute_operand1_range): Turn into a leaf function.
	(gori_compute::compute_operand1_and_operand2_range): Finish
	operand1 calculation.
	* gimple-range-gori.h (compute_operand1_range): Remove name param.
---
 gcc/gimple-range-gori.cc | 49 
 gcc/gimple-range-gori.h  |  2 +-
 2 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/gcc/gimple-range-gori.cc b/gcc/gimple-range-gori.cc
index 5429c6e3c1a..b66b9b0398c 100644
--- a/gcc/gimple-range-gori.cc
+++ b/gcc/gimple-range-gori.cc
@@ -637,7 +637,7 @@ gori_compute::compute_operand_range (vrange &r, gimple *stmt,
 
   // Handle end of lookup first.
   if (op1 == name)
-return compute_operand1_range (r, handler, lhs, name, src, vrel_ptr);
+return compute_operand1_range (r, handler, lhs, src, vrel_ptr);
   if (op2 == name)
 return compute_operand2_range (r, handler, lhs, name, src, vrel_ptr);
 
@@ -731,7 +731,15 @@ gori_compute::compute_operand_range (vrange &r, gimple *stmt,
 res = compute_operand1_and_operand2_range (r, handler, lhs, name, src,
 	   vrel_ptr);
   else if (op1_in_chain)
-res = compute_operand1_range (r, handler, lhs, name, src, vrel_ptr);
+{
+  Value_Range vr (TREE_TYPE (op1));
+  if (!compute_operand1_range (vr, handler, lhs, src, vrel_ptr))
+	return false;
+  gimple *src_stmt = SSA_NAME_DEF_STMT (op1);
+  gcc_checking_assert (src_stmt);
+  // Then feed this range back as the LHS of the defining statement.
+  return compute_operand_range (r, src_stmt, vr, name, src, vrel_ptr);
+}
   else if (op2_in_chain)
 res = compute_operand2_range (r, handler, lhs, name, src, vrel_ptr);
   else
@@ -1099,7 +1107,7 @@ gori_compute::refine_using_relation (tree op1, vrange &op1_range,
 bool
 gori_compute::compute_operand1_range (vrange &r,
   gimple_range_op_handler &handler,
-  const vrange &lhs, tree name,
+  const vrange &lhs,
   fur_source &src, value_relation *rel)
 {
   gimple *stmt = handler.stmt ();
@@ -1112,7 +1120,6 @@ gori_compute::compute_operand1_range (vrange &r,
 trio = rel->create_trio (lhs_name, op1, op2);
 
   Value_Range op1_range (TREE_TYPE (op1));
-  Value_Range tmp (TREE_TYPE (op1));
   Value_Range op2_range (op2 ? TREE_TYPE (op2) : TREE_TYPE (op1));
 
   // Fetch the known range for op1 in this block.
@@ -1130,7 +1137,7 @@ gori_compute::compute_operand1_range (vrange &r,
   // If op1 == op2, create a new trio for just this call.
   if (op1 == op2 && gimple_range_ssa_p (op1))
 	trio = relation_trio (trio.lhs_op1 (), trio.lhs_op2 (), VREL_EQ);
-  if (!handler.calc_op1 (tmp, lhs, op2_range, trio))
+  if (!handler.calc_op1 (r, lhs, op2_range, trio))
 	return false;
 }
   else
@@ -1138,7 +1145,7 @@ gori_compute::compute_operand1_range (vrange &r,
   // We pass op1_range to the unary operation.  Normally it's a
   // hidden range_for_type parameter, but sometimes having the
   // actual range can result in better information.
-  if (!handler.calc_op1 (tmp, lhs, op1_range, trio))
+  if (!handler.calc_op1 (r, lhs, op1_range, trio))
 	return false;
 }
 
@@ -1161,30 +1168,16 @@ gori_compute::compute_operand1_range (vrange &r,
   tracer.print (idx, "Computes ");
   print_generic_expr (dump_file, op1, TDF_SLIM);
   fprintf (dump_file, " = ");
-  tmp.dump (dump_file);
+  r.dump (dump_file);
   fprintf (dump_file, " intersect Known range : ");
   op1_range.dump (dump_file);
   fputc ('\n', dump_file);
 }
-  // Intersect the calculated result with the known result and return if done.
-  if (op1 == name)
-{
-  tmp.intersect (op1_range);
-  r = tmp;
-  if (idx)
-	tracer.trailer (idx, "produces ", true, name, r);
-  return true;
-}
-  // If the calculation continues, we're using op1_range as the new LHS.
-  op1_range.intersect (tmp);
 
+  r.intersect (op1_range);
   if (idx)
-tracer.trailer (idx, "produces ", true, op1, op1_range);
-  gimple *src_stmt = SSA_NAME_DEF_STMT (op1);
-  gcc_checking_assert (src_stmt);
-
-  // Then feed this range back as the LHS of the defining statement.
-  return compute_operand_range (r, src_stmt, op1_range, name, src, rel);
+tracer.trailer (idx, "produces ", true, op1, r);
+  return true;
 }
 
 
@@ -1291

[COMMITTED 4/5] Make compute_operand2_range a leaf call.

2023-07-05 Thread Andrew MacLeod via Gcc-patches
Now operand2 alone is resolved and returned as the result.  This is much
cleaner, and removes it from the recursion stack.


compute_operand_range() will decide if further evaluation is required.

Bootstraps on  x86_64-pc-linux-gnu  with no regressions.  Pushed.

Andrew
From 298952bcf05d298892e99adba1f4a75af17bc65a Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Wed, 5 Jul 2023 13:52:21 -0400
Subject: [PATCH 4/6] Make compute_operand2_range a leaf call.

Rather than creating long call chains, put the onus for finishing
the evaluation on the caller.

	* gimple-range-gori.cc (compute_operand_range): After calling
	compute_operand2_range, recursively call self if needed.
	(compute_operand2_range): Turn into a leaf function.
	(gori_compute::compute_operand1_and_operand2_range): Finish
	operand2 calculation.
	* gimple-range-gori.h (compute_operand2_range): Remove name param.
---
 gcc/gimple-range-gori.cc | 52 +++-
 gcc/gimple-range-gori.h  |  2 +-
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/gcc/gimple-range-gori.cc b/gcc/gimple-range-gori.cc
index b66b9b0398c..b036ed56f02 100644
--- a/gcc/gimple-range-gori.cc
+++ b/gcc/gimple-range-gori.cc
@@ -639,7 +639,7 @@ gori_compute::compute_operand_range (vrange &r, gimple *stmt,
   if (op1 == name)
 return compute_operand1_range (r, handler, lhs, src, vrel_ptr);
   if (op2 == name)
-return compute_operand2_range (r, handler, lhs, name, src, vrel_ptr);
+return compute_operand2_range (r, handler, lhs, src, vrel_ptr);
 
   // NAME is not in this stmt, but one of the names in it ought to be
   // derived from it.
@@ -741,7 +741,15 @@ gori_compute::compute_operand_range (vrange &r, gimple *stmt,
   return compute_operand_range (r, src_stmt, vr, name, src, vrel_ptr);
 }
   else if (op2_in_chain)
-res = compute_operand2_range (r, handler, lhs, name, src, vrel_ptr);
+{
+  Value_Range vr (TREE_TYPE (op2));
+  if (!compute_operand2_range (vr, handler, lhs, src, vrel_ptr))
+	return false;
+  gimple *src_stmt = SSA_NAME_DEF_STMT (op2);
+  gcc_checking_assert (src_stmt);
+  // Then feed this range back as the LHS of the defining statement.
+  return compute_operand_range (r, src_stmt, vr, name, src, vrel_ptr);
+}
   else
 gcc_unreachable ();
 
@@ -1188,7 +1196,7 @@ gori_compute::compute_operand1_range (vrange &r,
 bool
 gori_compute::compute_operand2_range (vrange &r,
   gimple_range_op_handler &handler,
-  const vrange &lhs, tree name,
+  const vrange &lhs,
   fur_source &src, value_relation *rel)
 {
   gimple *stmt = handler.stmt ();
@@ -1198,7 +1206,6 @@ gori_compute::compute_operand2_range (vrange &r,
 
   Value_Range op1_range (TREE_TYPE (op1));
   Value_Range op2_range (TREE_TYPE (op2));
-  Value_Range tmp (TREE_TYPE (op2));
 
   src.get_operand (op1_range, op1);
   src.get_operand (op2_range, op2);
@@ -1215,7 +1222,7 @@ gori_compute::compute_operand2_range (vrange &r,
   if (op1 == op2 && gimple_range_ssa_p (op1))
 trio = relation_trio (trio.lhs_op1 (), trio.lhs_op2 (), VREL_EQ);
   // Intersect with range for op2 based on lhs and op1.
-  if (!handler.calc_op2 (tmp, lhs, op1_range, trio))
+  if (!handler.calc_op2 (r, lhs, op1_range, trio))
 return false;
 
   unsigned idx;
@@ -1237,31 +1244,16 @@ gori_compute::compute_operand2_range (vrange &r,
   tracer.print (idx, "Computes ");
   print_generic_expr (dump_file, op2, TDF_SLIM);
   fprintf (dump_file, " = ");
-  tmp.dump (dump_file);
+  r.dump (dump_file);
   fprintf (dump_file, " intersect Known range : ");
   op2_range.dump (dump_file);
   fputc ('\n', dump_file);
 }
   // Intersect the calculated result with the known result and return if done.
-  if (op2 == name)
-{
-  tmp.intersect (op2_range);
-  r = tmp;
-  if (idx)
-	tracer.trailer (idx, " produces ", true, NULL_TREE, r);
-  return true;
-}
-  // If the calculation continues, we're using op2_range as the new LHS.
-  op2_range.intersect (tmp);
-
+  r.intersect (op2_range);
   if (idx)
-tracer.trailer (idx, " produces ", true, op2, op2_range);
-  gimple *src_stmt = SSA_NAME_DEF_STMT (op2);
-  gcc_checking_assert (src_stmt);
-//  gcc_checking_assert (!is_import_p (op2, find.bb));
-
-  // Then feed this range back as the LHS of the defining statement.
-  return compute_operand_range (r, src_stmt, op2_range, name, src, rel);
+tracer.trailer (idx, " produces ", true, op2, r);
+  return true;
 }
 
 // Calculate a range for NAME from both operand positions of S
@@ -1279,15 +1271,21 @@ gori_compute::compute_operand1_and_operand2_range (vrange &r,
 {
   Value_Range op_range (TREE_TYPE (name));
 
+  Value_Range vr (TREE_TYPE (handler.operand2 ()));
   // Calculate a good a range through op2.
-  if (!compute_operand2_range (r, handler, lhs, name, src, rel))
+  if (!compute_operand2_range (vr, handler, lhs, src, rel))
+return false;
+  gimple *src_stmt = SS

[COMMITTED 5/5] Make compute_operand_range a tail call.

2023-07-05 Thread Andrew MacLeod via Gcc-patches
This simply tweaks compute_operand_range a little so the recursion is a 
tail call.


With this, the patchset produces a modest speedup of 0.2% in VRP and 
0.4% in threading.  It will also have a much smaller stack profile.


Bootstraps on  x86_64-pc-linux-gnu  with no regressions.  Pushed.

Andrew

From 51ed3a6ce432e7e6226bb62125ef8a09b2ebf60c Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Wed, 5 Jul 2023 14:26:00 -0400
Subject: [PATCH 5/6] Make compute_operand_range a tail call.

Tweak the routine so it is making a tail call.

	* gimple-range-gori.cc (compute_operand_range): Convert to a tail
	call.
---
 gcc/gimple-range-gori.cc | 34 --
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/gcc/gimple-range-gori.cc b/gcc/gimple-range-gori.cc
index b036ed56f02..6dc15a0ce3f 100644
--- a/gcc/gimple-range-gori.cc
+++ b/gcc/gimple-range-gori.cc
@@ -725,36 +725,34 @@ gori_compute::compute_operand_range (vrange &r, gimple *stmt,
 			 op1_trange, op1_frange, op2_trange, op2_frange);
   if (idx)
 	tracer.trailer (idx, "compute_operand", res, name, r);
+  return res;
 }
   // Follow the appropriate operands now.
-  else if (op1_in_chain && op2_in_chain)
-res = compute_operand1_and_operand2_range (r, handler, lhs, name, src,
-	   vrel_ptr);
-  else if (op1_in_chain)
+  if (op1_in_chain && op2_in_chain)
+return compute_operand1_and_operand2_range (r, handler, lhs, name, src,
+		vrel_ptr);
+  Value_Range vr;
+  gimple *src_stmt;
+  if (op1_in_chain)
 {
-  Value_Range vr (TREE_TYPE (op1));
+  vr.set_type (TREE_TYPE (op1));
   if (!compute_operand1_range (vr, handler, lhs, src, vrel_ptr))
 	return false;
-  gimple *src_stmt = SSA_NAME_DEF_STMT (op1);
-  gcc_checking_assert (src_stmt);
-  // Then feed this range back as the LHS of the defining statement.
-  return compute_operand_range (r, src_stmt, vr, name, src, vrel_ptr);
+  src_stmt = SSA_NAME_DEF_STMT (op1);
 }
-  else if (op2_in_chain)
+  else
 {
-  Value_Range vr (TREE_TYPE (op2));
+  gcc_checking_assert (op2_in_chain);
+  vr.set_type (TREE_TYPE (op2));
   if (!compute_operand2_range (vr, handler, lhs, src, vrel_ptr))
 	return false;
-  gimple *src_stmt = SSA_NAME_DEF_STMT (op2);
-  gcc_checking_assert (src_stmt);
-  // Then feed this range back as the LHS of the defining statement.
-  return compute_operand_range (r, src_stmt, vr, name, src, vrel_ptr);
+  src_stmt = SSA_NAME_DEF_STMT (op2);
 }
-  else
-gcc_unreachable ();
 
+  gcc_checking_assert (src_stmt);
+  // Then feed this range back as the LHS of the defining statement.
+  return compute_operand_range (r, src_stmt, vr, name, src, vrel_ptr);
   // If neither operand is derived, this statement tells us nothing.
-  return res;
 }
 
 
-- 
2.40.1



[PATCH] x86: Properly find the maximum stack slot alignment

2023-07-05 Thread H.J. Lu via Gcc-patches
Don't assume that stack slots can only be accessed by stack or frame
registers.  Also check memory accesses from registers defined by
stack or frame registers.

gcc/

PR target/109780
* config/i386/i386.cc (ix86_set_with_register_source): New.
(ix86_find_all_stack_access): Likewise.
(ix86_find_max_used_stack_alignment): Also check memory accesses
from registers defined by stack or frame registers.

gcc/testsuite/

PR target/109780
* g++.target/i386/pr109780-1.C: New test.
* gcc.target/i386/pr109780-1.c: Likewise.
* gcc.target/i386/pr109780-2.c: Likewise.
---
 gcc/config/i386/i386.cc| 145 ++---
 gcc/testsuite/g++.target/i386/pr109780-1.C |  72 ++
 gcc/testsuite/gcc.target/i386/pr109780-1.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr109780-2.c |  21 +++
 4 files changed, 233 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr109780-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index caca74d6dec..85dd8cb0581 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8084,6 +8084,72 @@ output_probe_stack_range (rtx reg, rtx end)
   return "";
 }
 
+/* Check if PAT is a SET with register source.  */
+
+static void
+ix86_set_with_register_source (rtx, const_rtx pat, void *data)
+{
+  if (GET_CODE (pat) != SET)
+return;
+
+  rtx src = SET_SRC (pat);
+  if (MEM_P (src) || CONST_INT_P (src))
+return;
+
+  bool *may_use_register = (bool *) data;
+  *may_use_register = true;
+}
+
+/* Find all register access registers.  */
+
+static bool
+ix86_find_all_stack_access (HARD_REG_SET &stack_slot_access)
+{
+  bool repeat = false;
+
+  for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+if (GENERAL_REGNO_P (i)
+   && !TEST_HARD_REG_BIT (stack_slot_access, i))
+  for (df_ref def = DF_REG_DEF_CHAIN (i);
+  def != NULL;
+  def = DF_REF_NEXT_REG (def))
+   {
+ if (DF_REF_IS_ARTIFICIAL (def))
+   continue;
+
+ rtx_insn *insn = DF_REF_INSN (def);
+
+ bool may_use_register = false;
+ note_stores (insn, ix86_set_with_register_source,
+  &may_use_register);
+
+ if (!may_use_register)
+   continue;
+
+ df_ref use;
+ FOR_EACH_INSN_USE (use, insn)
+   {
+ rtx reg = DF_REF_REG (use);
+
+ if (!REG_P (reg))
+   continue;
+
+ /* Skip if stack slot access register isn't used.  */
+ if (!TEST_HARD_REG_BIT (stack_slot_access,
+ REGNO (reg)))
+   continue;
+
+ /* Add this register to stack_slot_access.  */
+ add_to_hard_reg_set (&stack_slot_access, Pmode, i);
+
+ /* Repeat if a register is added to stack_slot_access.  */
+ repeat = true;
+   }
+   }
+
+  return repeat;
+}
+
 /* Set stack_frame_required to false if stack frame isn't required.
Update STACK_ALIGNMENT to the largest alignment, in bits, of stack
slot used if stack frame is required and CHECK_STACK_SLOT is true.  */
@@ -8092,15 +8158,23 @@ static void
 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
bool check_stack_slot)
 {
-  HARD_REG_SET set_up_by_prologue, prologue_used;
+  HARD_REG_SET set_up_by_prologue, prologue_used, stack_slot_access;
   basic_block bb;
 
   CLEAR_HARD_REG_SET (prologue_used);
   CLEAR_HARD_REG_SET (set_up_by_prologue);
+  CLEAR_HARD_REG_SET (stack_slot_access);
   add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
   add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
   add_to_hard_reg_set (&set_up_by_prologue, Pmode,
   HARD_FRAME_POINTER_REGNUM);
+  /* Stack slot can be accessed by stack pointer, frame pointer or
+ registers defined by stack pointer or frame pointer.  */
+  add_to_hard_reg_set (&stack_slot_access, Pmode,
+  STACK_POINTER_REGNUM);
+  if (frame_pointer_needed)
+add_to_hard_reg_set (&stack_slot_access, Pmode,
+HARD_FRAME_POINTER_REGNUM);
 
   /* The preferred stack alignment is the minimum stack alignment.  */
   if (stack_alignment > crtl->preferred_stack_boundary)
@@ -8108,32 +8182,65 @@ ix86_find_max_used_stack_alignment (unsigned int 
&stack_alignment,
 
   bool require_stack_frame = false;
 
+  /* Find all register access registers.  */
+  while (ix86_find_all_stack_access (stack_slot_access))
+;
+
   FOR_EACH_BB_FN (bb, cfun)
 {
   rtx_insn *insn;
   FOR_BB_INSNS (bb, insn)
-   if (NONDEBUG_INSN_P (insn)
-   && requires_stack_frame_p (insn, prologue_used,
-  set_up_by_prologue))
+   if (NONDEBUG

Re: [PATCH V2] i386: Inline function with default arch/tune to caller

2023-07-05 Thread Hongyu Wang via Gcc-patches
Thanks, this is the updated patch I'm going to check in.

Uros Bizjak  于2023年7月4日周二 16:57写道:
>
> On Tue, Jul 4, 2023 at 10:32 AM Hongyu Wang  wrote:
> >
> > > In a follow-up patch, can you please document inlining rules involving
> > > -march and -mtune to "x86 Function Attributes" section? Currently, the
> > > inlining rules at the end of "target function attribute" section does
> > > not even mention -march and -mtune. Maybe a subsubsection "Inlining
> > > rules" should be added (like AArch64 has) to mention that only default
> > > arch and tune are inlined by default (but inline can be forced with
> > > always_inline for different mtune flags).
> >
> > The document has below at the end of 'target (OPTIONS)' section
> >
> > On the x86, the inliner does not inline a function that has
> > different target options than the caller, unless the callee
> > has a subset of the target options of the caller.  For example
> > a function declared with 'target("sse3")' can inline a
> > function with 'target("sse2")', since '-msse3' implies
> > '-msse2'.
> >
> > Do we need to move this part to a new section and combine with -march and
> > -mtune rule description to the new subsubsection?
> >
> > > Looking at the above, perhaps inlining of different arches can also be
> > > forced with always_inline? This would allow developers some control of
> > > inlining, and would not be surprising.
> >
> > If so, I'd like to add the always_inline change on arch to current
> > patch and leave the
> > document change alone in the next patch.
>
> Yes, this is OK.
>
> Thanks,
> Uros.
> >
> > Uros Bizjak via Gcc-patches  于2023年7月4日周二 14:19写道:
> > >
> > > On Tue, Jul 4, 2023 at 5:12 AM Hongyu Wang  wrote:
> > > >
> > > > Hi,
> > > >
> > > > For function with different target attributes, current logic rejects to
> > > > inline the callee when any arch or tune is mismatched. Relax the
> > > > condition to allow callee with default arch/tune to be inlined.
> > > >
> > > > Boostrapped/regtested on x86-64-linux-gnu{-m32,}.
> > > >
> > > > Ok for trunk?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * config/i386/i386.cc (ix86_can_inline_p): If callee has
> > > > default arch=x86-64 and tune=generic, do not block the
> > > > inlining to its caller.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > * gcc.target/i386/inline_target_clones.c: New test.
> > >
> > > OK.
> > >
> > > In a follow-up patch, can you please document inlining rules involving
> > > -march and -mtune to "x86 Function Attributes" section? Currently, the
> > > inlining rules at the end of "target function attribute" section does
> > > not even mention -march and -mtune. Maybe a subsubsection "Inlining
> > > rules" should be added (like AArch64 has) to mention that only default
> > > arch and tune are inlined by default (but inline can be forced with
> > > always_inline for different mtune flags).
> > >
> > > Looking at the above, perhaps inlining of different arches can also be
> > > forced with always_inline? This would allow developers some control of
> > > inlining, and would not be surprising.
> > >
> > > Thanks,
> > > Uros.
> > >
> > > > ---
> > > >  gcc/config/i386/i386.cc   | 22 +++--
> > > >  .../gcc.target/i386/inline_target_clones.c| 24 +++
> > > >  2 files changed, 39 insertions(+), 7 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/inline_target_clones.c
> > > >
> > > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > > index 8989985700a..4741c9b5364 100644
> > > > --- a/gcc/config/i386/i386.cc
> > > > +++ b/gcc/config/i386/i386.cc
> > > > @@ -605,13 +605,6 @@ ix86_can_inline_p (tree caller, tree callee)
> > > >!= (callee_opts->x_target_flags & 
> > > > ~always_inline_safe_mask))
> > > >  ret = false;
> > > >
> > > > -  /* See if arch, tune, etc. are the same.  */
> > > > -  else if (caller_opts->arch != callee_opts->arch)
> > > > -ret = false;
> > > > -
> > > > -  else if (!always_inline && caller_opts->tune != callee_opts->tune)
> > > > -ret = false;
> > > > -
> > > >else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
> > > >/* If the calle doesn't use FP expressions differences in
> > > >   ix86_fpmath can be ignored.  We are called from FEs
> > > > @@ -622,6 +615,21 @@ ix86_can_inline_p (tree caller, tree callee)
> > > >|| ipa_fn_summaries->get (callee_node)->fp_expressions))
> > > >  ret = false;
> > > >
> > > > +  /* At this point we cannot identify whether arch or tune setting
> > > > + comes from target attribute or not. So the most conservative way
> > > > + is to allow the callee that uses default arch and tune string to
> > > > + be inlined.  */
> > > > +  else if (!strcmp (callee_opts->x_ix86_arch_string, "x86-64")
> > > > +  && !strcmp (callee_opts->x_ix86_tune_string, "generic"))
> > > > +ret = true;
> > > 

[PATCH] Disparage slightly for the alternative which move DFmode between SSE_REGS and GENERAL_REGS.

2023-07-05 Thread liuhongt via Gcc-patches
For testcase

void __cond_swap(double* __x, double* __y) {
  bool __r = (*__x < *__y);
  auto __tmp = __r ? *__x : *__y;
  *__y = __r ? *__y : *__x;
  *__x = __tmp;
}

GCC-14 with -O2 and -march=x86-64 options generates the following code:

__cond_swap(double*, double*):
movsd   xmm1, QWORD PTR [rdi]
movsd   xmm0, QWORD PTR [rsi]
comisd  xmm0, xmm1
jbe .L2
movqrax, xmm1
movapd  xmm1, xmm0
movqxmm0, rax
.L2:
movsd   QWORD PTR [rsi], xmm1
movsd   QWORD PTR [rdi], xmm0
ret

rax is used to save and restore the DFmode value. In RA both GENERAL_REGS
and SSE_REGS cost zero since we didn't disparage the
alternative in the movdf_internal pattern, so according to register
allocation order, GENERAL_REGS is allocated. The patch adds ? for
alternatives (r,v) and (v,r) just like we did for the movsf/hf/bf_internal
patterns; after that we get optimal RA.

__cond_swap:
.LFB0:
.cfi_startproc
movsd   (%rdi), %xmm1
movsd   (%rsi), %xmm0
comisd  %xmm1, %xmm0
jbe .L2
movapd  %xmm1, %xmm2
movapd  %xmm0, %xmm1
movapd  %xmm2, %xmm0
.L2:
movsd   %xmm1, (%rsi)
movsd   %xmm0, (%rdi)
ret

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?


gcc/ChangeLog:

PR target/110170
* config/i386/i386.md (movdf_internal): Disparage slightly for
2 alternatives (r,v) and (v,r) by adding constraint modifier
'?'.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110170-3.c: New test.
---
 gcc/config/i386/i386.md|  4 ++--
 gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++
 2 files changed, 13 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index a82cc353cfd..e47ced1bb70 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3915,9 +3915,9 @@ (define_split
 ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
 (define_insn "*movdf_internal"
   [(set (match_operand:DF 0 "nonimmediate_operand"
-"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r  
,o ,r  ,m")
+"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r 
 ,o ,r  ,m")
(match_operand:DF 1 "general_operand"
-"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r 
,roF,rF,rmF,rC"))]
+"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, 
r,roF,rF,rmF,rC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& (lra_in_progress || reload_completed
|| !CONST_DOUBLE_P (operands[1])
diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c 
b/gcc/testsuite/gcc.target/i386/pr110170-3.c
new file mode 100644
index 000..70daa89e9aa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */
+/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */
+
+void __cond_swap(double* __x, double* __y) {
+  _Bool __r = (*__x < *__y);
+  double __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
-- 
2.39.1.388.g2fc9e9ca3c



[PATCH 2/2] Adjust rtx_cost for DF/SFmode AND/IOR/XOR/ANDN operations.

2023-07-05 Thread liuhongt via Gcc-patches
They should have same cost as vector mode since both generate
pand/pandn/pxor/por instruction.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for
DF/SFmode AND/IOR/XOR/ANDN operations.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110170-2.c: New test.
---
 gcc/config/i386/i386.cc|  6 --
 gcc/testsuite/gcc.target/i386/pr110170-2.c | 16 
 2 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56ee8dd..fe31acd7646 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21153,7 +21153,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 
 case IOR:
 case XOR:
-  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ || SSE_FLOAT_MODE_P (mode))
*total = ix86_vec_cost (mode, cost->sse_op);
   else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
*total = cost->add * 2;
@@ -21167,7 +21168,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
  *total = cost->lea;
  return true;
}
-  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+  || SSE_FLOAT_MODE_P (mode))
{
  /* pandn is a single instruction.  */
  if (GET_CODE (XEXP (x, 0)) == NOT)
diff --git a/gcc/testsuite/gcc.target/i386/pr110170-2.c 
b/gcc/testsuite/gcc.target/i386/pr110170-2.c
new file mode 100644
index 000..d43e322fc49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse2 -O2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-not "comi" } }  */
+
+double
+foo (double* a, double* b, double c, double d)
+{
+  return *a < *b ? c : d;
+}
+
+float
+foo1 (float* a, float* b, float c, float d)
+{
+  return *a < *b ? c : d;
+}
+
-- 
2.39.1.388.g2fc9e9ca3c



[PATCH 1/2] [x86] Add pre_reload splitter to detect fp min/max pattern.

2023-07-05 Thread liuhongt via Gcc-patches
We have ix86_expand_sse_fp_minmax to detect min/max semantics, but
it requires rtx_equal_p for cmp_op0/cmp_op1 and if_true/if_false. For
the testcase in the PR, there's an extra move from cmp_op0 to if_true,
and it fails ix86_expand_sse_fp_minmax.

This patch adds pre_reload splitter to detect the min/max pattern.

Operands order in MINSS matters for signed zero and NANs, since the
instruction always returns second operand when any operand is NAN or
both operands are zero.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/110170
* config/i386/i386.md (*ieee_minmax3_1): New pre_reload
splitter to detect fp min/max pattern.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr110170.C: New test.
* gcc.target/i386/pr110170.c: New test.
---
 gcc/config/i386/i386.md  | 30 +
 gcc/testsuite/g++.target/i386/pr110170.C | 78 
 gcc/testsuite/gcc.target/i386/pr110170.c | 18 ++
 3 files changed, 126 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/i386/pr110170.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e6ebc461e52..353bb21993d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -22483,6 +22483,36 @@ (define_insn "*ieee_s3"
(set_attr "type" "sseadd")
(set_attr "mode" "")])
 
+;; Operands order in min/max instruction matters for signed zero and NANs.
+(define_insn_and_split "*ieee_minmax3_1"
+  [(set (match_operand:MODEF 0 "register_operand")
+   (unspec:MODEF
+ [(match_operand:MODEF 1 "register_operand")
+  (match_operand:MODEF 2 "register_operand")
+  (lt:MODEF
+(match_operand:MODEF 3 "register_operand")
+(match_operand:MODEF 4 "register_operand"))]
+ UNSPEC_BLENDV))]
+  "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH
+  && ((rtx_equal_p (operands[1], operands[3])
+   && rtx_equal_p (operands[2], operands[4]))
+  || (rtx_equal_p (operands[1], operands[4])
+ && rtx_equal_p (operands[2], operands[3])))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  int u = (rtx_equal_p (operands[1], operands[3])
+  && rtx_equal_p (operands[2], operands[4]))
+  ? UNSPEC_IEEE_MAX : UNSPEC_IEEE_MIN;
+  emit_move_insn (operands[0],
+ gen_rtx_UNSPEC (mode,
+ gen_rtvec (2, operands[2], operands[1]),
+ u));
+  DONE;
+})
+
 ;; Make two stack loads independent:
 ;;   fld aa  fld aa
 ;;   fld %st(0) ->   fld bb
diff --git a/gcc/testsuite/g++.target/i386/pr110170.C 
b/gcc/testsuite/g++.target/i386/pr110170.C
new file mode 100644
index 000..1e9a781ca74
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr110170.C
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+/* { dg-options " -O2 -march=x86-64 -mfpmath=sse -std=gnu++20" } */
+#include 
+
+void
+__attribute__((noinline))
+__cond_swap(double* __x, double* __y) {
+  bool __r = (*__x < *__y);
+  auto __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
+auto test1() {
+double nan = -0.0;
+double x = 0.0;
+__cond_swap(&nan, &x);
+return x == -0.0 && nan == 0.0;
+}
+
+auto test1r() {
+double nan = NAN;
+double x = 1.0;
+__cond_swap(&x, &nan);
+return isnan(x) && signbit(x) == 0 && nan == 1.0;
+}
+
+auto test2() {
+double nan = NAN;
+double x = -1.0;
+__cond_swap(&nan, &x);
+return isnan(x) && signbit(x) == 0 && nan == -1.0;
+}
+
+auto test2r() {
+double nan = NAN;
+double x = -1.0;
+__cond_swap(&x, &nan);
+return isnan(x) && signbit(x) == 0 && nan == -1.0;
+}
+
+auto test3() {
+double nan = -NAN;
+double x = 1.0;
+__cond_swap(&nan, &x);
+return isnan(x) && signbit(x) == 1 && nan == 1.0;
+}
+
+auto test3r() {
+double nan = -NAN;
+double x = 1.0;
+__cond_swap(&x, &nan);
+return isnan(x) && signbit(x) == 1 && nan == 1.0;
+}
+
+auto test4() {
+double nan = -NAN;
+double x = -1.0;
+__cond_swap(&nan, &x);
+return isnan(x) && signbit(x) == 1 && nan == -1.0;
+}
+
+auto test4r() {
+double nan = -NAN;
+double x = -1.0;
+__cond_swap(&x, &nan);
+return isnan(x) && signbit(x) == 1 && nan == -1.0;
+}
+
+
+int main() {
+if (
+!test1() || !test1r()
+|| !test2() || !test2r()
+|| !test3() || !test4r()
+|| !test4() || !test4r()
+) __builtin_abort();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr110170.c 
b/gcc/testsuite/gcc.target/i386/pr110170.c
new file mode 100644
index 000..0f98545cce3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options " -O2 -march=x86-64-v2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times {(?n)mins[sd]} 2 } } */
+/* { dg-final { scan-assembl

[PATCH v5] RISC-V: Fix one bug for floating-point static frm

2023-07-05 Thread Pan Li via Gcc-patches
From: Pan Li 

This patch fixes one bug to align with the following items of the spec.

RVV floating-point instructions always (implicitly) use the dynamic
rounding mode.  This implies that rounding is performed according to the
rounding mode set in the FRM register.  The FRM register itself
only holds proper rounding modes and never the dynamic rounding mode.

Signed-off-by: Pan Li 
Co-Authored-By: Robin Dapp 

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_emit_mode_set): Avoid emit insn
when FRM_MODE_DYN.
(riscv_mode_entry): Take FRM_MODE_DYN as entry mode.
(riscv_mode_exit): Likewise for exit mode.
(riscv_mode_needed): Likewise for needed mode.
(riscv_mode_after): Likewise for after mode.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-frm-insert-6.c: New test.
---
 gcc/config/riscv/riscv.cc | 27 +---
 .../riscv/rvv/base/float-point-frm-insert-6.c | 31 +++
 2 files changed, 53 insertions(+), 5 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index e4dc8115e69..38d8eb2fcf5 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7670,8 +7670,19 @@ riscv_emit_mode_set (int entity, int mode, int prev_mode,
emit_insn (gen_vxrmsi (gen_int_mode (mode, SImode)));
   break;
 case RISCV_FRM:
-  if (mode != FRM_MODE_NONE && mode != prev_mode)
+  /* Switching to the dynamic rounding mode is not necessary.  When an
+instruction requests it, it effectively uses the rounding mode already
+set in the FRM register.  All other rounding modes require us to
+switch the rounding mode via the FRM register.  */
+  if (mode != FRM_MODE_DYN && mode != prev_mode)
{
+ /* TODO: By design, FRM_MODE_xxx used by mode switch which is
+different from the FRM value like FRM_RTZ defined in
+riscv-protos.h.  When mode switching we actually need a conversion
+function to convert the mode of mode switching to the actual
+FRM value like FRM_RTZ.  For now, the value between the mode of
+mode swith and the FRM value in riscv-protos.h take the same value,
+and then we leverage this assumption when emit.  */
  rtx scaler = gen_reg_rtx (SImode);
  rtx imm = gen_int_mode (mode, SImode);
 
@@ -7697,7 +7708,10 @@ riscv_mode_needed (int entity, rtx_insn *insn)
 case RISCV_VXRM:
   return code >= 0 ? get_attr_vxrm_mode (insn) : VXRM_MODE_NONE;
 case RISCV_FRM:
-  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_NONE;
+  /* TODO: Here we may return FRM_MODE_NONE from get_attr_frm_mode, as well
+as FRM_MODE_DYN as default.  It is kind of inconsistent and we will
+take care of it after dynamic rounding mode.  */
+  return code >= 0 ? get_attr_frm_mode (insn) : FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
@@ -7757,7 +7771,7 @@ riscv_mode_after (int entity, int mode, rtx_insn *insn)
 case RISCV_FRM:
   return riscv_entity_mode_after (FRM_REGNUM, insn, mode,
  (int (*)(rtx_insn *)) get_attr_frm_mode,
- FRM_MODE_NONE);
+ FRM_MODE_DYN);
 default:
   gcc_unreachable ();
 }
@@ -7774,7 +7788,10 @@ riscv_mode_entry (int entity)
 case RISCV_VXRM:
   return VXRM_MODE_NONE;
 case RISCV_FRM:
-  return FRM_MODE_NONE;
+  /* According to RVV 1.0 spec, all vector floating-point operations use
+the dynamic rounding mode in the frm register.  Likewise in other
+similar places.  */
+  return FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
@@ -7791,7 +7808,7 @@ riscv_mode_exit (int entity)
 case RISCV_VXRM:
   return VXRM_MODE_NONE;
 case RISCV_FRM:
-  return FRM_MODE_NONE;
+  return FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
new file mode 100644
index 000..6d896e0953e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat32m1_t
+test_riscv_vfadd_vv_f32m1_rm (vfloat32m1_t op1, vfloat32m1_t op2, size_t vl) {
+  return __riscv_vfadd_vv_f32m1_rm (op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vv_f32m1_m_rm(vbool32_t mask, vfloat32m1_t op1, vfloat32m1_t op2,
+size_t vl) {
+  return __riscv_vfadd_vv_f32m1_m_rm(mask, op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vf_f32m1_rm(vfloat32m1_t op1, float32_t op2, size

Re: [PATCH] Disparage slightly for the alternative which move DFmode between SSE_REGS and GENERAL_REGS.

2023-07-05 Thread Uros Bizjak via Gcc-patches
On Thu, Jul 6, 2023 at 3:14 AM liuhongt  wrote:
>
> For testcase
>
> void __cond_swap(double* __x, double* __y) {
>   bool __r = (*__x < *__y);
>   auto __tmp = __r ? *__x : *__y;
>   *__y = __r ? *__y : *__x;
>   *__x = __tmp;
> }
>
> GCC-14 with -O2 and -march=x86-64 options generates the following code:
>
> __cond_swap(double*, double*):
> movsd   xmm1, QWORD PTR [rdi]
> movsd   xmm0, QWORD PTR [rsi]
> comisd  xmm0, xmm1
> jbe .L2
> movq    rax, xmm1
> movapd  xmm1, xmm0
> movq    xmm0, rax
> .L2:
> movsd   QWORD PTR [rsi], xmm1
> movsd   QWORD PTR [rdi], xmm0
> ret
>
> rax is used to save and restore DFmode value. In RA both GENERAL_REGS
> and SSE_REGS cost zero since we didn't disparage the
> alternative in movdf_internal pattern, according to register
> allocation order, GENERAL_REGS is allocated. The patch adds ? for
> alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal
> pattern, after that we get optimal RA.
>
> __cond_swap:
> .LFB0:
> .cfi_startproc
> movsd   (%rdi), %xmm1
> movsd   (%rsi), %xmm0
> comisd  %xmm1, %xmm0
> jbe .L2
> movapd  %xmm1, %xmm2
> movapd  %xmm0, %xmm1
> movapd  %xmm2, %xmm0
> .L2:
> movsd   %xmm1, (%rsi)
> movsd   %xmm0, (%rdi)
> ret
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> Ok for trunk?
>
>
> gcc/ChangeLog:
>
> PR target/110170
> * config/i386/i386.md (movdf_internal): Disparage slightly for
> 2 alternatives (r,v) and (v,r) by adding constraint modifier
> '?'.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr110170-3.c: New test.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.md|  4 ++--
>  gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++
>  2 files changed, 13 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index a82cc353cfd..e47ced1bb70 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -3915,9 +3915,9 @@ (define_split
>  ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 
> 7.
>  (define_insn "*movdf_internal"
>[(set (match_operand:DF 0 "nonimmediate_operand"
> -"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r 
> ,v,r  ,o ,r  ,m")
> +"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m 
> ,?r,?v,r  ,o ,r  ,m")
> (match_operand:DF 1 "general_operand"
> -"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r 
> ,roF,rF,rmF,rC"))]
> +"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, 
> r,roF,rF,rmF,rC"))]
>"!(MEM_P (operands[0]) && MEM_P (operands[1]))
> && (lra_in_progress || reload_completed
> || !CONST_DOUBLE_P (operands[1])
> diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c 
> b/gcc/testsuite/gcc.target/i386/pr110170-3.c
> new file mode 100644
> index 000..70daa89e9aa
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */
> +/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */
> +
> +void __cond_swap(double* __x, double* __y) {
> +  _Bool __r = (*__x < *__y);
> +  double __tmp = __r ? *__x : *__y;
> +  *__y = __r ? *__y : *__x;
> +  *__x = __tmp;
> +}
> +
> --
> 2.39.1.388.g2fc9e9ca3c
>


Re: [PATCH 2/2] Adjust rtx_cost for DF/SFmode AND/IOR/XOR/ANDN operations.

2023-07-05 Thread Uros Bizjak via Gcc-patches
On Thu, Jul 6, 2023 at 3:20 AM liuhongt  wrote:
>
> They should have same cost as vector mode since both generate
> pand/pandn/pxor/por instruction.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> * config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for
> DF/SFmode AND/IOR/XOR/ANDN operations.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr110170-2.c: New test.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.cc|  6 --
>  gcc/testsuite/gcc.target/i386/pr110170-2.c | 16 
>  2 files changed, 20 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-2.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index d4ff56ee8dd..fe31acd7646 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -21153,7 +21153,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
> outer_code_i, int opno,
>
>  case IOR:
>  case XOR:
> -  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> +  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> + || SSE_FLOAT_MODE_P (mode))
> *total = ix86_vec_cost (mode, cost->sse_op);
>else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> *total = cost->add * 2;
> @@ -21167,7 +21168,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
> outer_code_i, int opno,
>   *total = cost->lea;
>   return true;
> }
> -  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> +  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> +  || SSE_FLOAT_MODE_P (mode))
> {
>   /* pandn is a single instruction.  */
>   if (GET_CODE (XEXP (x, 0)) == NOT)
> diff --git a/gcc/testsuite/gcc.target/i386/pr110170-2.c 
> b/gcc/testsuite/gcc.target/i386/pr110170-2.c
> new file mode 100644
> index 000..d43e322fc49
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr110170-2.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-msse2 -O2 -mfpmath=sse" } */
> +/* { dg-final { scan-assembler-not "comi" } }  */
> +
> +double
> +foo (double* a, double* b, double c, double d)
> +{
> +  return *a < *b ? c : d;
> +}
> +
> +float
> +foo1 (float* a, float* b, float c, float d)
> +{
> +  return *a < *b ? c : d;
> +}
> +
> --
> 2.39.1.388.g2fc9e9ca3c
>


Re: GTY: Repair 'enum gty_token', 'token_names' desynchronization (was: [cxx-conversion] Support garbage-collected C++ templates)

2023-07-05 Thread Richard Biener via Gcc-patches
On Wed, Jul 5, 2023 at 12:21 PM Thomas Schwinge  wrote:
>
> Hi!
>
> On 2012-08-10T11:06:46-0400, Diego Novillo  wrote:
> >  * gengtype-lex.l (USER_GTY): Add pattern for "user".
> >  * gengtype-parse.c (option): Handle USER_GTY.
> >  (opts_have): New.
> >  (type): Call it.
> >  If the keyword 'user' is used, do not walk the fields
> >  of the structure.
> >  * gengtype.h (USER_GTY): Add.
>
> These changes got incorporated in
> commit 0823efedd0fb8669b7e840954bc54c3b2cf08d67 (Subversion r190402).
>
> > --- a/gcc/gengtype-lex.l
> > +++ b/gcc/gengtype-lex.l
> > @@ -108,6 +108,7 @@ EOID  [^[:alnum:]_]
> >   "enum"/{EOID}   { return ENUM; }
> >   "ptr_alias"/{EOID}  { return PTR_ALIAS; }
> >   "nested_ptr"/{EOID} { return NESTED_PTR; }
> > +"user"/{EOID}{ return USER_GTY; }
> >   [0-9]+  { return NUM; }
> >   "param"[0-9]*"_is"/{EOID}   {
> > *yylval = XDUPVAR (const char, yytext, yyleng, yyleng+1);
>
> > --- a/gcc/gengtype-parse.c
> > +++ b/gcc/gengtype-parse.c
> > @@ -499,6 +499,10 @@ option (options_p prev)
> > [...]
>
> > --- a/gcc/gengtype.h
> > +++ b/gcc/gengtype.h
> > @@ -463,6 +463,7 @@ enum
> >   ELLIPSIS,
> >   PTR_ALIAS,
> >   NESTED_PTR,
> > +USER_GTY,
> >   PARAM_IS,
> >   NUM,
> >   SCALAR,
>
> This did add 'USER_GTY' to what nowadays is known as 'enum gty_token',
> but didn't accordingly update 'gcc/gengtype-parse.c:token_names', leaving
> those out of sync.  Updating 'gcc/gengtype-parse.c:token_value_format'
> wasn't necessary, as:
>
> /* print_token assumes that any token >= FIRST_TOKEN_WITH_VALUE may have
>a meaningful value to be printed.  */
> FIRST_TOKEN_WITH_VALUE = PARAM_IS
>
> This, in turn, got further confused -- or "fixed" -- by later changes:
> 2014 commit 63f5d5b818319129217e41bcb23db53f99ff11b0 (Subversion r218558)
> "remove gengtype support for param_is use_param, if_marked and splay tree 
> allocators",
> which reciprocally missed corresponding clean-up.
>
> OK to push the attached
> "GTY: Repair 'enum gty_token', 'token_names' desynchronization"?

OK.

>
> On top of that, I'll then re-submit an adjusted
> 
> "GTY: Clean up obsolete parametrized structs remnants".
>
>
> Grüße
>  Thomas
>
>
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955


Re: [v2] GTY: Clean up obsolete parametrized structs remnants (was: [PATCH 3/3] remove gengtype support for param_is use_param, if_marked and splay tree allocators)

2023-07-05 Thread Richard Biener via Gcc-patches
On Wed, Jul 5, 2023 at 6:13 PM Thomas Schwinge  wrote:
>
> Hi!
>
> On 2023-07-05T10:16:09+0200, I wrote:
> > On 2014-11-23T23:11:36-0500, tsaund...@mozilla.com wrote:
> >> gcc/
> >>
> >>   * plugin.c, plugin.def, ggc.h, ggc-common.c, gengtype.h, gengtype.c,
> >>   gengtype-state.c, gengtype-parse.c, gentype-lex.l, gcc-plugin.h,
> >>   doc/plugins.texi, doc/gty.texi: Remove support for if_marked and
> >>   param_is.
> >
> >> --- a/gcc/gengtype.h
> >> +++ b/gcc/gengtype.h
> >
> >> @@ -153,11 +152,6 @@ enum typekind {
> >>TYPE_LANG_STRUCT, /* GCC front-end language specific structs.
> >> Various languages may have homonymous but
> >> different structs.  */
> >> -  TYPE_PARAM_STRUCT,/* Type for parametrized structs, e.g. hash_t
> >> -   hash-tables, ...  See (param_is, use_param,
> >> -   param1_is, param2_is,... use_param1,
> >> -   use_param_2,... use_params) GTY
> >> -   options.  */
> >>TYPE_USER_STRUCT   /* User defined type.  Walkers and markers for
> >>  this type are assumed to be provided by the
> >>  user.  */
> >
> > OK to push the attached
> > "GTY: Clean up obsolete parametrized structs remnants"?
>
> Updated per
> 
> "GTY: Repair 'enum gty_token', 'token_names' desynchronization", OK to
> push the attached
> v2 "GTY: Clean up obsolete parametrized structs remnants"?

OK.

>
> Grüße
>  Thomas
>
>
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955


Re: GGC, GTY: Tighten up a few things re 'reorder' option and strings

2023-07-05 Thread Richard Biener via Gcc-patches
On Wed, Jul 5, 2023 at 6:16 PM Thomas Schwinge  wrote:
>
> Hi!
>
> OK to push the attached
> "GGC, GTY: Tighten up a few things re 'reorder' option and strings"?

OK.

>
> Grüße
>  Thomas
>
>
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955


Re: GGC, GTY: No pointer walking for 'atomic' in PCH 'gt_pch_note_object' (was: Patch: New GTY ((atomic)) option)

2023-07-05 Thread Richard Biener via Gcc-patches
On Wed, Jul 5, 2023 at 6:25 PM Thomas Schwinge  wrote:
>
> Hi!
>
> My original motivation for the following exercise was that, for example,
> for: 'const unsigned char * GTY((atomic)) mode_table', we currently run
> into 'const' mismatches, 'error: invalid conversion':
>
> [...]
> gtype-desc.cc: In function 'void gt_pch_nx_lto_file_decl_data(void*)':
> gtype-desc.cc:6531:34: error: invalid conversion from 'const void*' to 
> 'void*' [-fpermissive]
>  gt_pch_note_object ((*x).mode_table, x, 
> gt_pch_p_18lto_file_decl_data);
>   ^
> In file included from [...]/source-gcc/gcc/hash-table.h:247:0,
>  from [...]/source-gcc/gcc/coretypes.h:486,
>  from gtype-desc.cc:23:
> [...]/source-gcc/gcc/ggc.h:47:12: note:   initializing argument 1 of 'int 
> gt_pch_note_object(void*, void*, gt_note_pointers, size_t)'
>  extern int gt_pch_note_object (void *, void *, gt_note_pointers,
> ^
> make[2]: *** [Makefile:1180: gtype-desc.o] Error 1
> [...]
>
> ..., as I had reported as "'GTY' issues: (1) 'const' build error" in
> 
> 'Adjust LTO mode tables for "Machine_Mode: Extend machine_mode from 8 to 16 
> bits"'.
>
> That said:
>
> On 2011-05-16T02:13:56+0200, "Nicola Pero"  
> wrote:
> > This patch adds a new GTY option, "atomic", which is similar to the 
> > identical option you have with Boehm GC
> > and which can be used with pointers to inform the GC/PCH machinery that 
> > they point to an area of memory that
> > contains no pointers (and hence needs no scanning).
> >
> > [...]
>
> On top of that, OK to push the attached
> "GGC, GTY: No pointer walking for 'atomic' in PCH 'gt_pch_note_object'"?
> Appreciate review from a GGC, GTY-savvy person.

OK.  Thanks for the detailed explanations; they help even a person who
is not GGC/GTY-savvy to review this ;)

Thanks,
Richard.

> This depends on
> 
> "GGC, GTY: Tighten up a few things re 'reorder' option and strings".
>
>
> Grüße
>  Thomas
>
>
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955


Re: [PATCH] Fix PR 110554: vec lowering introduces scalar signed-boolean:32 comparisons

2023-07-05 Thread Richard Biener via Gcc-patches
On Wed, Jul 5, 2023 at 7:02 PM Andrew Pinski via Gcc-patches
 wrote:
>
> So the problem is vector generic decided to do comparisons in 
> signed-boolean:32
> types but the rest of the middle-end was not ready for that. Since we are 
> building
> the comparison which will feed into a cond_expr here, using boolean_type_node 
> is
> better and also correct. The rest of the compiler thinks the range of a
> comparison is always [0,1] too.
>
> Note this code does not currently lower bigger vector sizes into smaller
> vector sizes so using boolean_type_node here is better.
>
> OK? bootstrapped and tested on x86_64-linux-gnu with no regressions.

OK.

> gcc/ChangeLog:
>
> PR middle-end/110554
> * tree-vect-generic.cc (expand_vector_condition): For comparisons,
> just build using boolean_type_node instead of the cond_type.
> For non-comparisons/non-scalar-bitmask, build a ` != 0` gimple
> that will feed into the COND_EXPR.
> ---
>  gcc/tree-vect-generic.cc | 8 ++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index df04a0db68d..a7e6cb87a5e 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -1121,7 +1121,7 @@ expand_vector_condition (gimple_stmt_iterator *gsi, 
> bitmap dce_ssa_names)
>comp_width, comp_index);
>   tree aa2 = tree_vec_extract (gsi, comp_inner_type, a2,
>comp_width, comp_index);
> - aa = gimplify_build2 (gsi, code, cond_type, aa1, aa2);
> + aa = gimplify_build2 (gsi, code, boolean_type_node, aa1, aa2);
> }
>else if (a_is_scalar_bitmask)
> {
> @@ -1132,7 +1132,11 @@ expand_vector_condition (gimple_stmt_iterator *gsi, 
> bitmap dce_ssa_names)
> build_zero_cst (TREE_TYPE (a)));
> }
>else
> -   aa = tree_vec_extract (gsi, cond_type, a, comp_width, comp_index);
> +   {
> + result = tree_vec_extract (gsi, cond_type, a, comp_width, 
> comp_index);
> + aa = gimplify_build2 (gsi, NE_EXPR, boolean_type_node, result,
> +   build_zero_cst (cond_type));
> +   }
>result = gimplify_build3 (gsi, COND_EXPR, inner_type, aa, bb, cc);
>if (!CONSTANT_CLASS_P (result))
> constant_p = false;
> --
> 2.31.1
>


[PATCH] VECT: Fix ICE of variable stride on strieded load/store with SELECT_VL loop control.

2023-07-05 Thread juzhe . zhong
From: Ju-Zhe Zhong 

Hi, Richi.

Sorry for making a mistake on LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE
with SELECT_VL loop control.

Consider this following case:
#define TEST_LOOP(DATA_TYPE, BITS) \
  void __attribute__ ((noinline, noclone)) \
  f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, DATA_TYPE *restrict src,   \
  INDEX##BITS stride, INDEX##BITS n)   \
  {\
for (INDEX##BITS i = 0; i < n; ++i)\
  dest[i] += src[i * stride];  \
  }

When "stride" is a constant, current flow works fine.
However, when "stride" is a variable, it causes an ICE:
# vectp_src.67_85 = PHI 
...
_96 = .SELECT_VL (ivtmp_94, 4);
...
ivtmp_78 = ((sizetype) _39 * (sizetype) _96) * 4;
vect__11.69_87 = .LEN_MASK_GATHER_LOAD (vectp_src.67_85, _84, 4, { 0, 0, 0, 0 
}, { -1, -1, -1, -1 }, _96, 0);
...
vectp_src.67_86 = vectp_src.67_85 + ivtmp_78;

This is because of the IR: ivtmp_78 = ((sizetype) _39 * (sizetype) _96) * 4;

Instead, I split the IR into:

step_stride = _39
step = step_stride * 4
ivtmp_78 = step * _96

I don't think this patch's code is elegant enough; could you help me refine
it?

Thanks.

gcc/ChangeLog:

* tree-vect-stmts.cc (vect_get_strided_load_store_ops): Fix ICE.

---
 gcc/tree-vect-stmts.cc | 38 +-
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c10a4be60eb..12d1b0f1ac0 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3172,12 +3172,40 @@ vect_get_strided_load_store_ops (stmt_vec_info 
stmt_info,
 vectp_a.9_26 = vectp_a.9_7 + ivtmp_8;  */
   tree loop_len
= vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
-  tree tmp
-   = fold_build2 (MULT_EXPR, sizetype,
-  fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
-  loop_len);
+  tree tmp;
+  gassign *assign;
+
+  if (TREE_CODE (DR_STEP (dr)) == INTEGER_CST)
+   tmp = fold_build2 (MULT_EXPR, sizetype,
+  fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+  loop_len);
+  else
+   {
+ /* If DR_STEP = (unsigned int) _37 * 4;
+Extract _37 and 4, explicit MULT_EXPR.  */
+
+ /* 1. step_stride = (unsigned int) _37.  */
+ tree step_stride = make_ssa_name (create_tmp_var (sizetype));
+ assign = gimple_build_assign (
+   step_stride, TREE_OPERAND (TREE_OPERAND (DR_STEP (dr), 0), 0));
+ gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+
+ /* 2. step = step_stride * 4.  */
+ tree step_align = TREE_OPERAND (TREE_OPERAND (DR_STEP (dr), 0), 1);
+ tree step = make_ssa_name (create_tmp_var (sizetype));
+ assign
+   = gimple_build_assign (step, fold_build2 (MULT_EXPR, sizetype,
+ step_stride, step_align));
+ gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+
+ /* 3. tmp = step * loop_len.  */
+ tmp = make_ssa_name (create_tmp_var (sizetype));
+ assign = gimple_build_assign (tmp, fold_build2 (MULT_EXPR, sizetype,
+ step, loop_len));
+ gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+   }
   tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
-  gassign *assign = gimple_build_assign (bump, tmp);
+  assign = gimple_build_assign (bump, tmp);
   gsi_insert_before (gsi, assign, GSI_SAME_STMT);
   *dataref_bump = bump;
 }
-- 
2.36.3



Re: [PATCH 1/2] [x86] Add pre_reload splitter to detect fp min/max pattern.

2023-07-05 Thread Uros Bizjak via Gcc-patches
On Thu, Jul 6, 2023 at 3:20 AM liuhongt  wrote:
>
> We have ix86_expand_sse_fp_minmax to detect min/max semantics, but
> it requires rtx_equal_p for cmp_op0/cmp_op1 and if_true/if_false, for
> the testcase in the PR, there's an extra move from cmp_op0 to if_true,
> and it failed ix86_expand_sse_fp_minmax.
>
> This patch adds pre_reload splitter to detect the min/max pattern.
>
> Operands order in MINSS matters for signed zero and NANs, since the
> instruction always returns second operand when any operand is NAN or
> both operands are zero.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/110170
> * config/i386/i386.md (*ieee_minmax3_1): New pre_reload
> splitter to detect fp min/max pattern.
>
> gcc/testsuite/ChangeLog:
>
> * g++.target/i386/pr110170.C: New test.
> * gcc.target/i386/pr110170.c: New test.
> ---
>  gcc/config/i386/i386.md  | 30 +
>  gcc/testsuite/g++.target/i386/pr110170.C | 78 
>  gcc/testsuite/gcc.target/i386/pr110170.c | 18 ++
>  3 files changed, 126 insertions(+)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr110170.C
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr110170.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index e6ebc461e52..353bb21993d 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -22483,6 +22483,36 @@ (define_insn "*ieee_s3"
> (set_attr "type" "sseadd")
> (set_attr "mode" "")])
>
> +;; Operands order in min/max instruction matters for signed zero and NANs.
> +(define_insn_and_split "*ieee_minmax3_1"
> +  [(set (match_operand:MODEF 0 "register_operand")
> +   (unspec:MODEF
> + [(match_operand:MODEF 1 "register_operand")
> +  (match_operand:MODEF 2 "register_operand")
> +  (lt:MODEF
> +(match_operand:MODEF 3 "register_operand")
> +(match_operand:MODEF 4 "register_operand"))]
> + UNSPEC_BLENDV))]
> +  "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH
> +  && ((rtx_equal_p (operands[1], operands[3])
> +   && rtx_equal_p (operands[2], operands[4]))
> +  || (rtx_equal_p (operands[1], operands[4])
> + && rtx_equal_p (operands[2], operands[3])))
> +  && ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +{
> +  int u = (rtx_equal_p (operands[1], operands[3])
> +  && rtx_equal_p (operands[2], operands[4]))
> +  ? UNSPEC_IEEE_MAX : UNSPEC_IEEE_MIN;
> +  emit_move_insn (operands[0],
> + gen_rtx_UNSPEC (mode,
> + gen_rtvec (2, operands[2], operands[1]),
> + u));
> +  DONE;
> +})

Please split the above pattern into two, one emitting UNSPEC_IEEE_MAX
and the other emitting UNSPEC_IEEE_MIN.

> +
>  ;; Make two stack loads independent:
>  ;;   fld aa  fld aa
>  ;;   fld %st(0) ->   fld bb
> diff --git a/gcc/testsuite/g++.target/i386/pr110170.C 
> b/gcc/testsuite/g++.target/i386/pr110170.C
> new file mode 100644
> index 000..1e9a781ca74
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr110170.C
> @@ -0,0 +1,78 @@
> +/* { dg-do run } */
> +/* { dg-options " -O2 -march=x86-64 -mfpmath=sse -std=gnu++20" } */

The test involves blendv instruction, which is SSE4.1, so it is
pointless to test it without -msse4.1. Please add -msse4.1 instead of
-march=x86-64 and use sse4_runtime target selector, as is the case
with gcc.target/i386/pr90358.c.

> +#include <math.h>
> +
> +void
> +__attribute__((noinline))
> +__cond_swap(double* __x, double* __y) {
> +  bool __r = (*__x < *__y);
> +  auto __tmp = __r ? *__x : *__y;
> +  *__y = __r ? *__y : *__x;
> +  *__x = __tmp;
> +}
> +
> +auto test1() {
> +double nan = -0.0;
> +double x = 0.0;
> +__cond_swap(&nan, &x);
> +return x == -0.0 && nan == 0.0;
> +}
> +
> +auto test1r() {
> +double nan = NAN;
> +double x = 1.0;
> +__cond_swap(&x, &nan);
> +return isnan(x) && signbit(x) == 0 && nan == 1.0;
> +}
> +
> +auto test2() {
> +double nan = NAN;
> +double x = -1.0;
> +__cond_swap(&nan, &x);
> +return isnan(x) && signbit(x) == 0 && nan == -1.0;
> +}
> +
> +auto test2r() {
> +double nan = NAN;
> +double x = -1.0;
> +__cond_swap(&x, &nan);
> +return isnan(x) && signbit(x) == 0 && nan == -1.0;
> +}
> +
> +auto test3() {
> +double nan = -NAN;
> +double x = 1.0;
> +__cond_swap(&nan, &x);
> +return isnan(x) && signbit(x) == 1 && nan == 1.0;
> +}
> +
> +auto test3r() {
> +double nan = -NAN;
> +double x = 1.0;
> +__cond_swap(&x, &nan);
> +return isnan(x) && signbit(x) == 1 && nan == 1.0;
> +}
> +
> +auto test4() {
> +double nan = -NAN;
> +double x = -1.0;
> +__cond_swap(&nan, &x);
> +return isnan(x) && signbit(x) == 1 && nan == -1.0;
> +}
> +
> +auto test4r() {
> +double nan = -NAN;
> +double x = -1.0;
> +__cond_swap(&x, &n

Re: [EXTERNAL] Re: [PATCH] Collect both user and kernel events for autofdo tests and autoprofiledbootstrap

2023-07-05 Thread Richard Biener via Gcc-patches
On Wed, Jul 5, 2023 at 11:15 PM Eugene Rozenfeld
 wrote:
>
> There is no warning and perf /uk succeeds when kptr_restrict is set to 1 and 
> perf_event_paranoid set to 2. However, create_gcov may fail since it won't be 
> able to understand kernel addresses and it requires at least 95% of events to 
> be successfully mapped.

OK, so I guess the patch is OK then given it can improve the situation
in the right circumstances
and doesn't hurt otherwise.

Thanks,
Richard.

> If I set both kptr_restrict and perf_event_paranoid to 1, then I do get 
> warnings from perf (but it still succeeds and exits with a 0 code). And, of 
> course create_gcov will also fail to map some events since it won't 
> understand kernel addresses.
>
> WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,
> check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.
>
> Samples in kernel functions may not be resolved if a suitable vmlinux
> file is not found in the buildid cache or in the vmlinux path.
>
> Samples in kernel modules won't be resolved at all.
>
> If some relocation was applied (e.g. kexec) symbols may be misresolved
> even with a suitable vmlinux or kallsyms file.
>
> Couldn't record kernel reference relocation symbol
> Symbol resolution may be skewed if relocation was used (e.g. kexec).
> Check /proc/kallsyms permission or run as root.
> [ perf record: Woken up 2 times to write data ]
> [ perf record: Captured and wrote 0.037 MB 
> /home/erozen/gcc1_objdir/gcc/testsuite/gcc/indir-call-prof.perf.data (86 
> samples) ]
>
> Eugene
>
> -Original Message-
> From: Richard Biener 
> Sent: Monday, July 3, 2023 12:47 AM
> To: Eugene Rozenfeld 
> Cc: Sam James ; gcc-patches@gcc.gnu.org
> Subject: Re: [EXTERNAL] Re: [PATCH] Collect both user and kernel events for 
> autofdo tests and autoprofiledbootstrap
>
> On Sat, Jul 1, 2023 at 12:05 AM Eugene Rozenfeld 
>  wrote:
> >
> > I also set /proc/sys/kernel/perf_event_paranoid to 1 instead of the default 
> > 2.
>
> Does the perf attempt fail when the privileges are not adjusted and you 
> specify --all?  I see it adds /uk as flags, when I do
>
> > perf record -e instructions//uk ./a.out
>
> it doesn't complain in any way with
>
> > cat /proc/sys/kernel/kptr_restrict
> 1
> > cat /proc/sys/kernel/perf_event_paranoid
> 2
>
> so in case the 'kernel' side is simply ignored when profiling there isn't 
> permitted/possible then I guess the patch is OK?
>
> Can you confirm?
>
> Thanks,
> Richard.
>
> > -Original Message-
> > From: Gcc-patches
> >  On Behalf Of
> > Eugene Rozenfeld via Gcc-patches
> > Sent: Friday, June 30, 2023 2:44 PM
> > To: Sam James ; Richard Biener
> > 
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: RE: [EXTERNAL] Re: [PATCH] Collect both user and kernel
> > events for autofdo tests and autoprofiledbootstrap
> >
> > I don't run this with elevated privileges but I set 
> > /proc/sys/kernel/kptr_restrict to 0. Setting that does require elevated 
> > privileges.
> >
> > If that's not acceptable, the only fix I can think of is to make that event 
> > mapping threshold percentage a parameter to create_gcov and pass something 
> > low enough. 80% instead of the current threshold of 95% should work, 
> > although it's a bit fragile.
> >
> > Eugene
> >
> > -Original Message-
> > From: Sam James 
> > Sent: Friday, June 30, 2023 1:59 AM
> > To: Richard Biener 
> > Cc: Eugene Rozenfeld ;
> > gcc-patches@gcc.gnu.org
> > Subject: [EXTERNAL] Re: [PATCH] Collect both user and kernel events
> > for autofdo tests and autoprofiledbootstrap
> >
> > [You don't often get email from s...@gentoo.org. Learn why this is
> > important at https://aka.ms/LearnAboutSenderIdentification ]
> >
> > Richard Biener via Gcc-patches  writes:
> >
> > > On Fri, Jun 30, 2023 at 7:28 AM Eugene Rozenfeld via Gcc-patches
> > >  wrote:
> > >>
> > >> When we collect just user events for autofdo with lbr we get some
> > >> events where branch sources are kernel addresses and branch targets
> > >> are user addresses. Without kernel MMAP events create_gcov can't
> > >> make sense of kernel addresses. Currently create_gcov fails if it
> > >> can't map at least 95% of events. We sometimes get below this threshold 
> > >> with just user events. The change is to collect both user events and 
> > >> kernel events.
> > >
> > > Does this require elevated privileges?  Can we instead "fix" create_gcov 
> > > here?
> >
> > Right, requiring privileges for this is going to be a no-go for a lot of 
> > builders. In a distro context, for example, it means we can't consider 
> > autofdo at all.


Re: [PATCH] RISC-V: Handle rouding mode correctly on zfinx

2023-07-05 Thread Kito Cheng via Gcc-patches
Committed to trunk, and plan to back port to GCC 13 branch 1 week later :)


On Wed, Jul 5, 2023 at 10:15 PM Jeff Law  wrote:
>
>
>
> On 7/5/23 02:11, Kito Cheng wrote:
> > Zfinx provides fcsr like F, so the rounding mode should use fcsr instead
> > of `soft` fenv.
> >
> > libgcc/ChangeLog:
> >
> >   * config/riscv/sfp-machine.h (FP_INIT_ROUNDMODE): Check zfinx.
> >   (FP_HANDLE_EXCEPTIONS): Ditto.
> OK
> jeff


Re: [PATCH] x86: Properly find the maximum stack slot alignment

2023-07-05 Thread Richard Biener via Gcc-patches
On Thu, Jul 6, 2023 at 1:28 AM H.J. Lu via Gcc-patches
 wrote:
>
> Don't assume that stack slots can only be accessed by stack or frame
> registers.  Also check memory accesses from registers defined by
> stack or frame registers.
>
> gcc/
>
> PR target/109780
> * config/i386/i386.cc (ix86_set_with_register_source): New.
> (ix86_find_all_stack_access): Likewise.
> (ix86_find_max_used_stack_alignment): Also check memory accesses
> from registers defined by stack or frame registers.
>
> gcc/testsuite/
>
> PR target/109780
> * g++.target/i386/pr109780-1.C: New test.
> * gcc.target/i386/pr109780-1.c: Likewise.
> * gcc.target/i386/pr109780-2.c: Likewise.
> ---
>  gcc/config/i386/i386.cc| 145 ++---
>  gcc/testsuite/g++.target/i386/pr109780-1.C |  72 ++
>  gcc/testsuite/gcc.target/i386/pr109780-1.c |  14 ++
>  gcc/testsuite/gcc.target/i386/pr109780-2.c |  21 +++
>  4 files changed, 233 insertions(+), 19 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr109780-1.C
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-2.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index caca74d6dec..85dd8cb0581 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -8084,6 +8084,72 @@ output_probe_stack_range (rtx reg, rtx end)
>return "";
>  }
>
> +/* Check if PAT is a SET with register source.  */
> +
> +static void
> +ix86_set_with_register_source (rtx, const_rtx pat, void *data)
> +{
> +  if (GET_CODE (pat) != SET)
> +return;
> +
> +  rtx src = SET_SRC (pat);
> +  if (MEM_P (src) || CONST_INT_P (src))
> +return;
> +
> +  bool *may_use_register = (bool *) data;
> +  *may_use_register = true;
> +}
> +
> +/* Find all register access registers.  */
> +
> +static bool
> +ix86_find_all_stack_access (HARD_REG_SET &stack_slot_access)
> +{
> +  bool repeat = false;
> +
> +  for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
> +if (GENERAL_REGNO_P (i)
> +   && !TEST_HARD_REG_BIT (stack_slot_access, i))
> +  for (df_ref def = DF_REG_DEF_CHAIN (i);
> +  def != NULL;
> +  def = DF_REF_NEXT_REG (def))
> +   {
> + if (DF_REF_IS_ARTIFICIAL (def))
> +   continue;
> +
> + rtx_insn *insn = DF_REF_INSN (def);
> +
> + bool may_use_register = false;
> + note_stores (insn, ix86_set_with_register_source,
> +  &may_use_register);
> +
> + if (!may_use_register)
> +   continue;
> +
> + df_ref use;
> + FOR_EACH_INSN_USE (use, insn)
> +   {
> + rtx reg = DF_REF_REG (use);
> +
> + if (!REG_P (reg))
> +   continue;
> +
> + /* Skip if stack slot access register isn't used.  */
> + if (!TEST_HARD_REG_BIT (stack_slot_access,
> + REGNO (reg)))
> +   continue;
> +
> + /* Add this register to stack_slot_access.  */
> + add_to_hard_reg_set (&stack_slot_access, Pmode, i);

So you are looking for uses of stack regs and then their defs, in the
end looking for memory accesses of them.  But you are doing this
weirdly backwards?  I would have expected you to start marking
values dependent on STACK_POINTER_REGNUM by walking
DF_REF_USE_CHAIN of it, queueing the use insn defs in a worklist
and in those insns also looking with note_stores?

Isn't the above way prone to needing more iterations and why is
a single worklist and thus visiting each regs uses at most once
enough?

> +
> + /* Repeat if a register is added to stack_slot_access.  */
> + repeat = true;
> +   }
> +   }
> +
> +  return repeat;
> +}
> +
>  /* Set stack_frame_required to false if stack frame isn't required.
> Update STACK_ALIGNMENT to the largest alignment, in bits, of stack
> slot used if stack frame is required and CHECK_STACK_SLOT is true.  */
> @@ -8092,15 +8158,23 @@ static void
>  ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
> bool check_stack_slot)
>  {
> -  HARD_REG_SET set_up_by_prologue, prologue_used;
> +  HARD_REG_SET set_up_by_prologue, prologue_used, stack_slot_access;
>basic_block bb;
>
>CLEAR_HARD_REG_SET (prologue_used);
>CLEAR_HARD_REG_SET (set_up_by_prologue);
> +  CLEAR_HARD_REG_SET (stack_slot_access);
>add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
>add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
>add_to_hard_reg_set (&set_up_by_prologue, Pmode,
>HARD_FRAME_POINTER_REGNUM);
> +  /* Stack slot can be accessed by stack pointer, frame pointer or
> + registers defined by stack pointer or frame pointer.  */
> +  add_to_hard_reg_set (&stack_slot_access, Pmode,
> + 

[PATCH] i386: Update document for inlining rules

2023-07-05 Thread Hongyu Wang via Gcc-patches
Hi,

This is a follow-up patch for
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623525.html
that updates document about x86 inlining rules.

Ok for trunk?

gcc/ChangeLog:

* doc/extend.texi: Move x86 inlining rule to a new subsubsection
and add description for inlining of functions with arch and tune
attributes.
---
 gcc/doc/extend.texi | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index d1b018ee6d6..d701b4d1d41 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -7243,11 +7243,6 @@ Prefer 256-bit vector width for instructions.
 Prefer 512-bit vector width for instructions.
 @end table
 
-On the x86, the inliner does not inline a
-function that has different target options than the caller, unless the
-callee has a subset of the target options of the caller.  For example
-a function declared with @code{target("sse3")} can inline a function
-with @code{target("sse2")}, since @code{-msse3} implies @code{-msse2}.
 @end table
 
 @cindex @code{indirect_branch} function attribute, x86
@@ -7361,6 +7356,20 @@ counterpart to option @option{-mno-direct-extern-access}.
 
 @end table
 
+@subsubsection Inlining rules
+On the x86, the inliner does not inline a
+function that has different target options than the caller, unless the
+callee has a subset of the target options of the caller.  For example
+a function declared with @code{target("sse3")} can inline a function
+with @code{target("sse2")}, since @code{-msse3} implies @code{-msse2}.
+
+Besides the basic rule, when a function specifies the
+@code{target("arch=@var{ARCH}")} or @code{target("tune=@var{TUNE}")}
+attribute, the inlining rule is different.  It allows inlining of
+a function with the default @option{-march=x86-64} and
+@option{-mtune=generic} specified, or a function that has a subset
+of ISA features and is marked with @code{always_inline}.
+
 @node Xstormy16 Function Attributes
 @subsection Xstormy16 Function Attributes
 
-- 
2.31.1



[PATCH] Initial Granite Rapids D Support

2023-07-05 Thread Mo, Zewei via Gcc-patches
Hi all,

This patch adds initial support for Granite Rapids D to GCC.
The link of related information is listed below:
https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html

Also, the patch that removes AMX-COMPLEX from Granite Rapids will be
backported to GCC 13.

This has been tested on x86_64-pc-linux-gnu. Is this ok for trunk? Thank you.

Sincerely,
Zewei Mo

gcc/ChangeLog:

* common/config/i386/cpuinfo.h
(get_intel_cpu): Handle Granite Rapids D.
* common/config/i386/i386-common.cc:
(processor_names): Add graniterapids-d.
(processor_alias_table): Ditto.
* common/config/i386/i386-cpuinfo.h
(enum processor_subtypes): Add INTEL_GRANITERAPIDS_D.
* config.gcc: Add -march=graniterapids-d.
* config/i386/driver-i386.cc (host_detect_local_cpu):
Handle graniterapids-d.
* config/i386/i386-c.cc (ix86_target_macros_internal):
Ditto.
* config/i386/i386-options.cc (m_GRANITERAPIDSD): New.
(processor_cost_table): Add graniterapids-d.
* config/i386/i386.h (enum processor_type):
Add PROCESSOR_GRANITERAPIDS_D.
* doc/extend.texi: Add graniterapids-d.
* doc/invoke.texi: Ditto.

gcc/testsuite/ChangeLog:

* g++.target/i386/mv16.C: Add graniterapids-d.
* gcc.target/i386/funcspec-56.inc: Handle new march.
---
 gcc/common/config/i386/cpuinfo.h  |  9 -
 gcc/common/config/i386/i386-common.cc |  3 +++
 gcc/common/config/i386/i386-cpuinfo.h |  1 +
 gcc/config.gcc|  2 +-
 gcc/config/i386/driver-i386.cc|  3 +++
 gcc/config/i386/i386-c.cc |  7 +++
 gcc/config/i386/i386-options.cc   |  4 +++-
 gcc/config/i386/i386.h|  5 -
 gcc/doc/extend.texi   |  3 +++
 gcc/doc/invoke.texi   | 11 +++
 gcc/testsuite/g++.target/i386/mv16.C  |  6 ++
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |  1 +
 12 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index ae48bc17771..7c2565c1d93 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -565,7 +565,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
   cpu_model->__cpu_type = INTEL_SIERRAFOREST;
   break;
 case 0xad:
-case 0xae:
   /* Granite Rapids.  */
   cpu = "graniterapids";
   CHECK___builtin_cpu_is ("corei7");
@@ -573,6 +572,14 @@ get_intel_cpu (struct __processor_model *cpu_model,
   cpu_model->__cpu_type = INTEL_COREI7;
   cpu_model->__cpu_subtype = INTEL_COREI7_GRANITERAPIDS;
   break;
+case 0xae:
+  /* Granite Rapids D.  */
+  cpu = "graniterapids-d";
+  CHECK___builtin_cpu_is ("corei7");
+  CHECK___builtin_cpu_is ("graniterapids-d");
+  cpu_model->__cpu_type = INTEL_COREI7;
+  cpu_model->__cpu_subtype = INTEL_COREI7_GRANITERAPIDS_D;
+  break;
 case 0xb6:
   /* Grand Ridge.  */
   cpu = "grandridge";
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index bf126f14073..5a337c5b8be 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1971,6 +1971,7 @@ const char *const processor_names[] =
   "alderlake",
   "rocketlake",
   "graniterapids",
+  "graniterapids-d",
   "intel",
   "lujiazui",
   "geode",
@@ -2094,6 +2095,8 @@ const pta processor_alias_table[] =
 M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
   {"graniterapids", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS,
 M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS), P_PROC_AVX512F},
+  {"graniterapids-d", PROCESSOR_GRANITERAPIDS_D, CPU_HASWELL, PTA_GRANITERAPIDS_D,
+M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS_D), P_PROC_AVX512F},
   {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
 M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3},
   {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h
index 2dafbb25a49..254dfec70e5 100644
--- a/gcc/common/config/i386/i386-cpuinfo.h
+++ b/gcc/common/config/i386/i386-cpuinfo.h
@@ -98,6 +98,7 @@ enum processor_subtypes
   ZHAOXIN_FAM7H_LUJIAZUI,
   AMDFAM19H_ZNVER4,
   INTEL_COREI7_GRANITERAPIDS,
+  INTEL_COREI7_GRANITERAPIDS_D,
   CPU_SUBTYPE_MAX
 };
 
diff --git a/gcc/config.gcc b/gcc/config.gcc
index d88071773c9..1446eb2b3ca 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -682,7 +682,7 @@ silvermont knl knm skylake-avx512 cannonlake icelake-client 
icelake-server \
 skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \
 sapphirerapids alderlake rocketlake eden-x2 nano nano-1000 nano-2000 nano-3000 
\
 nano-x2 eden-x4 nano-x4 lujiazui x8

  1   2   >