Re: [PATCH v4, rs6000] Implemented f[min/max]_optab by xs[min/max]dp [PR103605]

2022-06-08 Thread Kewen.Lin via Gcc-patches
on 2022/6/8 11:28, HAO CHEN GUI wrote:
> Hi,
>   This patch implements the f[min/max] optabs using xs[min/max]dp on rs6000.
> Tests show that the outputs of xs[min/max]dp are consistent with the C99
> standard semantics of fmin/fmax.
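
For reference, the C99 semantics being tested here: fmin/fmax return the
other operand when exactly one operand is a NaN, which a plain smin/smax
pattern does not guarantee.  A minimal C sketch of that behaviour
(illustration only, not part of the patch):

#include <math.h>
#include <stdio.h>

int main (void)
{
  /* C99: if exactly one argument is a NaN, return the other one.  */
  printf ("%f\n", fmin (NAN, 2.0));   /* prints 2.000000, like xsmindp */
  printf ("%f\n", fmax (1.0, NAN));   /* prints 1.000000, like xsmaxdp */
  return 0;
}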
> 
>   This patch also binds __builtin_vsx_xs[min/max]dp to fmin/max instead
> of smin/max. So the builtins always generate xs[min/max]dp on all
> platforms.
> 
>   Compared with the previous version, the main change is to fix an indentation problem.
> 
>   Bootstrapped and tested on ppc64 Linux BE and LE with no regressions.
> Is this okay for trunk? Any recommendations? Thanks a lot.

OK, thanks!

BR,
Kewen

> 
> ChangeLog
> 2022-05-31 Haochen Gui 
> 
> gcc/
>   PR target/103605
>   * config/rs6000/rs6000.md (FMINMAX): New.
>   (minmax_op): New.
>   (f<minmax_op><mode>3): New pattern using UNSPEC_FMAX and UNSPEC_FMIN.
>   * config/rs6000/rs6000-builtins.def (__builtin_vsx_xsmaxdp): Set
>   pattern to fmaxdf3.
>   (__builtin_vsx_xsmindp): Set pattern to fmindf3.
> 
> gcc/testsuite/
>   PR target/103605
>   * gcc.target/powerpc/pr103605.c: New.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-builtins.def 
> b/gcc/config/rs6000/rs6000-builtins.def
> index f4a9f24bcc5..8b735493b40 100644
> --- a/gcc/config/rs6000/rs6000-builtins.def
> +++ b/gcc/config/rs6000/rs6000-builtins.def
> @@ -1613,10 +1613,10 @@
>  XSCVSPDP vsx_xscvspdp {}
> 
>const double __builtin_vsx_xsmaxdp (double, double);
> -XSMAXDP smaxdf3 {}
> +XSMAXDP fmaxdf3 {}
> 
>const double __builtin_vsx_xsmindp (double, double);
> -XSMINDP smindf3 {}
> +XSMINDP fmindf3 {}
> 
>const double __builtin_vsx_xsrdpi (double);
>  XSRDPI vsx_xsrdpi {}
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index bf85baa5370..42d3edf2eca 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -158,6 +158,8 @@ (define_c_enum "unspec"
> UNSPEC_HASHCHK
> UNSPEC_XXSPLTIDP_CONST
> UNSPEC_XXSPLTIW_CONST
> +   UNSPEC_FMAX
> +   UNSPEC_FMIN
>])
> 
>  ;;
> @@ -5341,6 +5343,22 @@ (define_insn_and_split "*s<minmax><mode>3_fpr"
>DONE;
>  })
> 
> +
> +(define_int_iterator FMINMAX [UNSPEC_FMAX UNSPEC_FMIN])
> +
> +(define_int_attr minmax_op [(UNSPEC_FMAX "max")
> +			     (UNSPEC_FMIN "min")])
> +
> +(define_insn "f<minmax_op><mode>3"
> +  [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa")
> +	(unspec:SFDF [(match_operand:SFDF 1 "vsx_register_operand" "wa")
> +		      (match_operand:SFDF 2 "vsx_register_operand" "wa")]
> +		     FMINMAX))]
> +  "TARGET_VSX"
> +  "xs<minmax_op>dp %x0,%x1,%x2"
> +  [(set_attr "type" "fp")]
> +)
> +
>  (define_expand "movcc"
> [(set (match_operand:GPR 0 "gpc_reg_operand")
>(if_then_else:GPR (match_operand 1 "comparison_operator")
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr103605.c 
> b/gcc/testsuite/gcc.target/powerpc/pr103605.c
> new file mode 100644
> index 000..e43ac40c2d1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr103605.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +/* { dg-options "-O1 -mvsx" } */
> +/* { dg-final { scan-assembler-times {\mxsmaxdp\M} 3 } } */
> +/* { dg-final { scan-assembler-times {\mxsmindp\M} 3 } } */
> +
> +#include <math.h>
> +
> +double test1 (double d0, double d1)
> +{
> +  return fmin (d0, d1);
> +}
> +
> +float test2 (float d0, float d1)
> +{
> +  return fmin (d0, d1);
> +}
> +
> +double test3 (double d0, double d1)
> +{
> +  return fmax (d0, d1);
> +}
> +
> +float test4 (float d0, float d1)
> +{
> +  return fmax (d0, d1);
> +}
> +
> +double test5 (double d0, double d1)
> +{
> +  return __builtin_vsx_xsmindp (d0, d1);
> +}
> +
> +double test6 (double d0, double d1)
> +{
> +  return __builtin_vsx_xsmaxdp (d0, d1);
> +}



[Patch] OpenMP: Fortran - fix ancestor's requires reverse_offload check

2022-06-08 Thread Tobias Burnus

The OpenMP requires directive may only be placed in the specification part of
a program unit (except when it comes in via the USE of a module).

But the target directive's check for an ancestor 'requires reverse_offload'
only looked at the current namespace.
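
For readers less familiar with the directive, a rough C analogue of the rule
(a sketch for illustration, not taken from the patch): 'requires
reverse_offload' must be visible before any target construct that uses the
ancestor device modifier.

#pragma omp requires reverse_offload

void f (void)
{
  #pragma omp target device (ancestor : 1)
  { }
}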

OK for mainline?

Tobias
OpenMP: Fortran - fix ancestor's requires reverse_offload check

gcc/fortran/

	* openmp.cc (gfc_match_omp_clauses): Check also parent namespace
	for 'requires reverse_offload'.

gcc/testsuite/

	* gfortran.dg/gomp/target-device-ancestor-5.f90: New test.

 gcc/fortran/openmp.cc  |  9 ++-
 .../gfortran.dg/gomp/target-device-ancestor-5.f90  | 69 ++
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/gcc/fortran/openmp.cc b/gcc/fortran/openmp.cc
index d12cec43d64..aeb8a43e12e 100644
--- a/gcc/fortran/openmp.cc
+++ b/gcc/fortran/openmp.cc
@@ -2014,8 +2014,15 @@ gfc_match_omp_clauses (gfc_omp_clauses **cp, const omp_mask mask,
 		}
 	  else if (gfc_match ("ancestor : ") == MATCH_YES)
 		{
+		  bool has_requires = false;
 		  c->ancestor = true;
-		  if (!(gfc_current_ns->omp_requires & OMP_REQ_REVERSE_OFFLOAD))
+		  for (gfc_namespace *ns = gfc_current_ns; ns; ns = ns->parent)
+		if (ns->omp_requires & OMP_REQ_REVERSE_OFFLOAD)
+		  {
+			has_requires = true;
+			break;
+		  }
+		  if (!has_requires)
 		{
 		  gfc_error ("% device modifier not "
  "preceded by % directive "
diff --git a/gcc/testsuite/gfortran.dg/gomp/target-device-ancestor-5.f90 b/gcc/testsuite/gfortran.dg/gomp/target-device-ancestor-5.f90
new file mode 100644
index 000..06a11eb5092
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/gomp/target-device-ancestor-5.f90
@@ -0,0 +1,69 @@
+! { dg-do compile }
+!
+! Check that a requires directive is still recognized
+! if it is in the associated parent namespace of the
+! target directive.
+!
+
+module m
+  !$omp requires reverse_offload  ! { dg-error "REQUIRES directive is not yet supported" }
+contains
+  subroutine foo()
+!$omp target device(ancestor:1)
+!$omp end target
+  end subroutine foo
+
+  subroutine bar()
+block
+  block
+block
+  !$omp target device(ancestor:1)
+  !$omp end target
+end block
+  end block
+end block
+  end subroutine bar
+end module m
+
+subroutine foo()
+  !$omp requires reverse_offload  ! { dg-error "REQUIRES directive is not yet supported" }
+  block
+block
+  block
+!$omp target device(ancestor:1)
+!$omp end target
+  end block
+end block
+  end block
+contains
+  subroutine bar()
+block
+  block
+block
+  !$omp target device(ancestor:1)
+  !$omp end target
+end block
+  end block
+end block
+  end subroutine bar
+end subroutine foo
+
+program main
+  !$omp requires reverse_offload  ! { dg-error "REQUIRES directive is not yet supported" }
+contains
+  subroutine foo()
+!$omp target device(ancestor:1)
+!$omp end target
+  end subroutine foo
+
+  subroutine bar()
+block
+  block
+block
+  !$omp target device(ancestor:1)
+  !$omp end target
+end block
+  end block
+end block
+  end subroutine bar
+end


Re: [PATCH-1 v2, rs6000] Replace shift and ior insns with one rotate and mask insn for bswap pattern [PR93453]

2022-06-08 Thread HAO CHEN GUI via Gcc-patches
Hi,

On 7/6/2022 11:59 PM, Segher Boessenkool wrote:
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr93453-1.c
>> @@ -0,0 +1,14 @@
>> +/* { dg-do compile { target lp64 } } */
>> +/* { dg-options "-mdejagnu-cpu=power6 -O2" } */
> It doesn't require -m64, only -mpowerpc64.  You can use has_arch_ppc64
> to test for the latter.

Tested it with 'target has_arch_ppc64'; it works with both -m32 and -m64.
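
That is, something along these lines (a sketch of the revised directives,
assuming the test keeps its original options):

/* { dg-do compile { target has_arch_ppc64 } } */
/* { dg-options "-mdejagnu-cpu=power6 -O2" } */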

Thanks.
Gui Haochen


Re: [Patch] OpenMP: Fortran - fix ancestor's requires reverse_offload check

2022-06-08 Thread Jakub Jelinek via Gcc-patches
On Wed, Jun 08, 2022 at 09:54:07AM +0200, Tobias Burnus wrote:
> The OpenMP requires directive may only be placed in the specification part of
> a program unit (except when it comes in via the USE of a module).
> 
> But the target directive's check for an ancestor 'requires reverse_offload'
> only looked at the current namespace.
> 
> OK for mainline?
> 
> Tobias

> OpenMP: Fortran - fix ancestor's requires reverse_offload check
> 
> gcc/fortran/
> 
>   * openmp.cc (gfc_match_omp_clauses): Check also parent namespace
>   for 'requires reverse_offload'.
> 
> gcc/testsuite/
> 
>   * gfortran.dg/gomp/target-device-ancestor-5.f90: New test.

LGTM, thanks.

Jakub



Re: [PATCH 2/3][ARM] STAR-MC1 CPU Support - arm: Add individual star-mc1 cost tables and cost functions

2022-06-08 Thread Chung-Ju Wu via Gcc-patches

Hi Kyrylo,

On 2022/06/06 22:18 UTC+8, Kyrylo Tkachov wrote:

I'd rather not duplicate those structures and functions in the master branch,
as they impose a maintenance burden on the community.
If some tuning parameters need to be modified in the future for better 
performance we can create star-mc1-specific structures on demand then.
Thus, I think we don't want this patch.
Thanks,
Kyrill


Thanks for the comment.

Indeed, considering the maintenance burden on the community, having those
duplicate structures in the master branch is not a good idea.

I am planning to contribute the star-mc1 pipeline machine description in the
future.  It may be better to propose new star-mc1-specific structures along
with the pipeline implementation then.

Thanks for the review.  I won't apply this 2/3 patch.

Regards,
jasonwucj


Re: [PATCH 1/3][ARM] STAR-MC1 CPU Support - arm: Add star-mc1 core

2022-06-08 Thread Chung-Ju Wu via Gcc-patches

Hi Kyrylo,

On 2022/06/06 22:10 UTC+8, Kyrylo Tkachov wrote:


Successfully bootstrapped and tested on arm-none-eabi.

Is it OK for trunk?


This is okay (together with the documentation additions in 3/3)
Thanks for the patch,


Thanks for the approval.

Patches 1/3 and 3/3 have been merged into a single patch
and committed as: https://gcc.gnu.org/g:ef5cc6bbb60b0ccbc10fb76b697ae02f28af18c0

Regards,
jasonwucj


Re: [PATCH] RISC-V: Compute default ABI from -mcpu or -march

2022-06-08 Thread pc.wang via Gcc-patches
Thanks for your opinion! I did this just because LLVM already does the same
thing and I wanted GCC to have the same behavior as LLVM. The only difference
is that LLVM has no handling for ilp32f and lp64f, and I have sent a patch to
add it (see https://reviews.llvm.org/D125947).
As for RISC-V specs, there are some descriptions in 
https://github.com/riscv-non-isa/riscv-toolchain-conventions#specifying-the-target-isa-with--march:
 
> A target -march which includes floating point instructions implies a 
> hardfloat calling convention, but can be overridden using the -mabi flag (see 
> the next section).
But I think we can make it clearer.
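
For illustration, the defaulting rule in compute_default_abi below boils down
to something like this C sketch (illustration only, not the actual GCC code;
the 32-bit "e" case matches the ilp32e test in mabi-1.c):

/* Pick the richest ABI that the arch string's extensions allow.  */
static const char *
default_abi (int xlen, int has_e, int has_f, int has_d)
{
  if (xlen == 64)
    return has_d ? "lp64d" : has_f ? "lp64f" : "lp64";
  if (has_e)
    return "ilp32e";
  return has_d ? "ilp32d" : has_f ? "ilp32f" : "ilp32";
}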
--
Sender:Palmer Dabbelt 
Sent At:2022 Jun. 8 (Wed.) 02:45
Recipient:gcc-patches 
Cc:pc.wang 
Subject:Re: [PATCH] RISC-V: Compute default ABI from -mcpu or -march

On Mon, 06 Jun 2022 19:51:20 PDT (-0700), gcc-patches@gcc.gnu.org wrote:
> If -mcpu or -march is specified and there is no -mabi, we will calculate
> default ABI from arch string provided by -march or defined in CPU info.

IMO this is generally a good idea and we've talked about it before, but 
just setting the ABI from the ISA isn't quite the right way to go.  IIRC 
we came up with something slightly more complicated, like picking the 
closest supported multilib.  That's probably more in line with what 
users are asking for, which IIUC is sort of just "I don't care that much 
about ABI, just make my stuff build".

Whatever we do here, we should document in the RISC-V specs as we'll 
want to make sure LLVM does the same thing.  We probably also want some 
sort of "-mabi=auto" argument, as it's always best to have an argument 
that changes back to the no-argument behavior.


>
> gcc/ChangeLog:
>
> * common/config/riscv/riscv-common.cc (compute_default_abi):
> Implementation to calculate -mabi from arch string.
> (riscv_expand_abi_from_arch): New spec function to calculate -mabi
> from arch string provided by -march option.
> (riscv_expand_abi_from_cpu): New spec function to find CPU info and
> calculate -mabi from arch string defined in CPU info.
> * config/riscv/riscv.h (EXTRA_SPEC_FUNCTIONS): Add above spec
> functions.
> (OPTION_DEFAULT_SPECS): Use new spec functions to calculate -mabi;
> -march has higher priority than -mcpu.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/mabi-1.c: ilp32e test.
> * gcc.target/riscv/mabi-2.c: ilp32 test.
> * gcc.target/riscv/mabi-3.c: ilp32f test.
> * gcc.target/riscv/mabi-4.c: ilp32d test.
> * gcc.target/riscv/mabi-5.c: lp64 test.
> * gcc.target/riscv/mabi-6.c: lp64f test.
> * gcc.target/riscv/mabi-7.c: lp64d test.
> * gcc.target/riscv/mabi-8.c: -march override -mcpu.
> ---
>  gcc/common/config/riscv/riscv-common.cc | 66 +
>  gcc/config/riscv/riscv.h| 15 --
>  gcc/testsuite/gcc.target/riscv/mabi-1.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-2.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-3.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-4.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-5.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-6.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-7.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-8.c |  7 +++
>  10 files changed, 134 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-2.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-3.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-4.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-5.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-6.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-7.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-8.c
>
> diff --git a/gcc/common/config/riscv/riscv-common.cc 
> b/gcc/common/config/riscv/riscv-common.cc
> index 0e5be2ce105..f8e40549d18 100644
> --- a/gcc/common/config/riscv/riscv-common.cc
> +++ b/gcc/common/config/riscv/riscv-common.cc
> @@ -1266,6 +1266,72 @@ riscv_default_mtune (int argc, const char **argv)
>  return default_mtune;
>  }
>
> +/* Compute default -mabi option from arch string.  */
> +
> +static const char *
> +compute_default_abi (const char* arch_str)
> +{
> +  location_t loc = UNKNOWN_LOCATION;
> +
> +  riscv_parse_arch_string (arch_str, NULL, loc);
> +
> +  if (current_subset_list->xlen () == 64)
> +{
> +  if (current_subset_list->lookup ("d", RISCV_DONT_CARE_VERSION,
> +  RISCV_DONT_CARE_VERSION))
> + return "lp64d";
> +  if (current_subset_list->lookup ("f", RISCV_DONT_CARE_VERSION,
> +  RISCV_DONT_CARE_VERSION))
> + return "lp64f";
> +  return "lp64";
> +}
> +  else
> +{
> +  if (current_subset_list->lookup ("e", RISCV_DONT_CARE_VERSION,
> + 

[Committed] Add -mno-avx512vl to recent gcc.target/i386/xop-pcmov3.c

2022-06-08 Thread Roger Sayle

Adding -march=cascadelake to the command line options of the recently
added xop-pcmov3.c test case causes problems as GCC then prefers to
use AVX512's vpternlogd instruction, instead of the XOP vpcmov that
the test is checking for.  This is easily solved by adding an explicit
-mno-avx512vl to the command line options.

Committed to mainline as obvious (in hindsight).


2022-06-08  Roger Sayle  

gcc/testsuite/ChangeLog
* gcc.target/i386/xop-pcmov3.c: Add -mno-avx512vl to dg-options.


Roger
--
> -Original Message-
> From: skpan...@sc.intel.com 
> Sent: 07 June 2022 20:32
> To: gcc-patches@gcc.gnu.org; gcc-regress...@gcc.gnu.org;
> ro...@nextmovesoftware.com
> Subject: [r13-998 Regression] FAIL: gcc.target/i386/xop-pcmov3.c scan-
> assembler vpcmov on Linux/x86_64
> 
> On Linux/x86_64,
> 
> c4320bde42c6497b701e2e6b8f1c5069bed19818 is the first bad commit commit
> c4320bde42c6497b701e2e6b8f1c5069bed19818
> Author: Roger Sayle 
> Date:   Tue Jun 7 07:49:40 2022 +0100
> 
> Recognize vpcmov in combine with -mxop on x86.
> 
> caused
> 
> FAIL: gcc.target/i386/xop-pcmov3.c scan-assembler vpcmov
> 
> with GCC configured with
> 
> ../../gcc/configure --prefix=/local/skpandey/gccwork/toolwork/gcc-bisect-
> master/master/r13-998/usr --enable-clocale=gnu --with-system-zlib --with-
> demangler-in-ld --with-fpmath=sse --enable-languages=c,c++,fortran
--enable-
> cet --without-isl --enable-libmpx x86_64-linux --disable-bootstrap
> 
> To reproduce:
> 
> $ cd {build_dir}/gcc && make check
> RUNTESTFLAGS="i386.exp=gcc.target/i386/xop-pcmov3.c --
> target_board='unix{-m32\ -march=cascadelake}'"
> $ cd {build_dir}/gcc && make check
> RUNTESTFLAGS="i386.exp=gcc.target/i386/xop-pcmov3.c --
> target_board='unix{-m64\ -march=cascadelake}'"
> 
> (Please do not reply to this email, for question about this report,
contact me at
> skpgkp2 at gmail dot com)
diff --git a/gcc/testsuite/gcc.target/i386/xop-pcmov3.c 
b/gcc/testsuite/gcc.target/i386/xop-pcmov3.c
index 6c40f33a541..ea03d55a2e6 100644
--- a/gcc/testsuite/gcc.target/i386/xop-pcmov3.c
+++ b/gcc/testsuite/gcc.target/i386/xop-pcmov3.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mxop" } */
+/* { dg-options "-O2 -mxop -mno-avx512vl" } */
 typedef int v4si __attribute__ ((vector_size (16)));
 
 v4si foo(v4si c, v4si t, v4si f)
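
The body of the test is truncated above.  For context, XOP's vpcmov performs
a per-bit select; a sketch of the idiom that the scan-assembler check expects
combine to recognize (not necessarily the exact test body):

typedef int v4si __attribute__ ((vector_size (16)));

/* Each result bit comes from t where c has a 1 bit, else from f.  */
v4si bitwise_select (v4si c, v4si t, v4si f)
{
  return (t & c) | (f & ~c);
}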


[PATCH] RISC-V/testsuite: Fix pr105666.c under rv32

2022-06-08 Thread jiawei
From: Jia-wei Chen 

In the rv32 regression test, this case reports an error:

"cc1: error: ABI requires '-march=rv32'"

Adding the '-mabi' option fixes this.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/pr105666.c: Add -mabi=lp64d to dg-options.

---
 gcc/testsuite/gcc.target/riscv/pr105666.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/riscv/pr105666.c 
b/gcc/testsuite/gcc.target/riscv/pr105666.c
index dd996eec8ef..752bbf1c017 100644
--- a/gcc/testsuite/gcc.target/riscv/pr105666.c
+++ b/gcc/testsuite/gcc.target/riscv/pr105666.c
@@ -7,7 +7,7 @@
 
 /* { dg-do compile } */
 /* { dg-require-effective-target hard_float } */
-/* { dg-options "-march=rv64g -ffast-math" } */
+/* { dg-options "-march=rv64g -mabi=lp64d -ffast-math" } */
 
 #define NITER 4
 #define NVARS 20
-- 
2.25.1



[PATCH]AArch64 relax predicate on load structure load instructions

2022-06-08 Thread Tamar Christina via Gcc-patches
Hi All,

At some point in time we started lowering the ld1r instructions in gimple.

That is:

uint8x8_t f1(const uint8_t *in) {
return vld1_dup_u8(&in[1]);
}

generates at gimple:

  _3 = MEM[(const uint8_t *)in_1(D) + 1B];
  _4 = {_3, _3, _3, _3, _3, _3, _3, _3};

Which is good, but we then generate:

f1:
ldr b0, [x0, 1]
dup v0.8b, v0.b[0]
ret

instead of ld1r.

The reason for this is that the load instructions have too restrictive a
predicate, which prevents combine from combining the instructions, as the
predicate only accepts simple addressing modes.

This patch relaxes the predicate to accept any memory operand and relies on
LRA to legitimize the address when it needs to as the constraint still only
allows the simple addressing mode.  Reload is always able to legitimize to
these.

Secondly, since we are now actually generating more ld1r instructions, it
became clear that the lane instructions suffer from a similar issue.

i.e.

float32x4_t f2(const float32_t *in, float32x4_t a) {
float32x4_t dup = vld1q_dup_f32(&in[1]);
return vfmaq_laneq_f32 (a, a, dup, 1);
}

would generate ld1r + vector fmla instead of ldr + lane fmla.

The reason for this is similar to the ld1r issue.  The predicate is too
restrictive in accepting only register operands and not memory.

This relaxes it to accept register and/or memory while leaving the constraint
to only accept registers.  This will have LRA generate a reload if needed
forcing the memory to registers using the standard patterns.

These two changes allow combine and reload to generate the right sequences.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (mul_lane3, mul_laneq3,
mul_n3, *aarch64_mul3_elt_to_64v2df, *aarch64_mla_elt,
*aarch64_mla_elt_, aarch64_mla_n,
*aarch64_mls_elt, *aarch64_mls_elt_,
aarch64_mls_n, *aarch64_fma4_elt,
*aarch64_fma4_elt_,
*aarch64_fma4_elt_from_dup, *aarch64_fma4_elt_to_64v2df,
*aarch64_fnma4_elt, *aarch64_fnma4_elt_,
*aarch64_fnma4_elt_from_dup, *aarch64_fnma4_elt_to_64v2df,
*aarch64_mulx_elt_,
*aarch64_mulx_elt, *aarch64_mulx_elt_from_dup,
*aarch64_vgetfmulx): Relax register_operand to
nonimmediate_operand.
(aarch64_simd_ld2, aarch64_simd_ld2r,
aarch64_vec_load_lanes_lane,
vec_load_lanes, aarch64_simd_st2,
aarch64_vec_store_lanes_lane,
vec_store_lanes, aarch64_simd_ld3,
aarch64_simd_ld3r,
aarch64_vec_load_lanes_lane,
vec_load_lanes, aarch64_simd_st3,
aarch64_vec_store_lanes_lane,
vec_store_lanes, aarch64_simd_ld4,
aarch64_simd_ld4r,
aarch64_vec_load_lanes_lane,
vec_load_lanes, aarch64_simd_st4,
aarch64_vec_store_lanes_lane,
vec_store_lanes, aarch64_ld1_x3_,
aarch64_ld1_x4_, aarch64_st1_x2_,
aarch64_st1_x3_, aarch64_st1_x4_,
aarch64_be_ld1, aarch64_be_st1,
aarch64_ld2_dreg, aarch64_ld2_dreg,
aarch64_ld3_dreg, aarch64_ld3_dreg,
aarch64_ld4_dreg, aarch64_ld4_dreg,
aarch64_st2_dreg, aarch64_st2_dreg,
aarch64_st3_dreg, aarch64_st3_dreg,
aarch64_st4_dreg, aarch64_st4_dreg,
*aarch64_simd_ld1r, aarch64_simd_ld1_x2): Relax
aarch64_simd_struct_operand to memory_operand.
* config/aarch64/predicates.md (aarch64_simd_struct_operand): Remove.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vld1r.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
be5c70bbb7520ae93d19c4a432ce34863e5b9a64..24e3274ddda2ea76c83571fada8ff4c953b752a1
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -712,7 +712,7 @@ (define_insn "mul_lane3"
(mult:VMULD
 (vec_duplicate:VMULD
   (vec_select:
-(match_operand: 2 "register_operand" "")
+(match_operand: 2 "nonimmediate_operand" "")
 (parallel [(match_operand:SI 3 "immediate_operand" "i")])))
 (match_operand:VMULD 1 "register_operand" "w")))]
   "TARGET_SIMD"
@@ -728,7 +728,7 @@ (define_insn "mul_laneq3"
  (mult:VMUL
(vec_duplicate:VMUL
  (vec_select:
-   (match_operand: 2 "register_operand" "")
+   (match_operand: 2 "nonimmediate_operand" "")
(parallel [(match_operand:SI 3 "immediate_operand")])))
   (match_operand:VMUL 1 "register_operand" "w")))]
   "TARGET_SIMD"
@@ -743,7 +743,7 @@ (define_insn "mul_n3"
  [(set (match_operand:VMUL 0 "register_operand" "=w")
(mult:VMUL
 (vec_duplicate:VMUL
-  (match_operand: 2 "register_operand" ""))
+  (match_operand: 2 "nonimmediate_operand" ""))
 (match_operand:VMUL 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "mul\t%0., %1., %2.[

Re: [PATCH] RISC-V: Compute default ABI from -mcpu or -march

2022-06-08 Thread Kito Cheng via Gcc-patches
I also prefer adding a -mabi=auto option rather than change existing behavior.

On Wed, Jun 8, 2022 at 5:06 PM pc.wang via Gcc-patches
 wrote:
>
> Thanks for your opinion! I did these just because LLVM has already done the 
> same thing and I wanted to make GCC with the same behavior of LLVM. The only 
> difference is that LLVM has no handling for ilp32f and lp64f and I have sent 
> a patch to do it (sees https://reviews.llvm.org/D125947).
> As for RISC-V specs, there are some descriptions in 
> https://github.com/riscv-non-isa/riscv-toolchain-conventions#specifying-the-target-isa-with--march:
> > A target -march which includes floating point instructions implies a 
> > hardfloat calling convention, but can be overridden using the -mabi flag 
> > (see the next section).
> But I think we can make it clearer.
> --
> Sender:Palmer Dabbelt 
> Sent At:2022 Jun. 8 (Wed.) 02:45
> Recipient:gcc-patches 
> Cc:pc.wang 
> Subject:Re: [PATCH] RISC-V: Compute default ABI from -mcpu or -march
>
> On Mon, 06 Jun 2022 19:51:20 PDT (-0700), gcc-patches@gcc.gnu.org wrote:
> > If -mcpu or -march is specified and there is no -mabi, we will calculate
> > default ABI from arch string provided by -march or defined in CPU info.
>
> IMO this is generally a good idea and we've talked about it before, but
> just setting the ABI from the ISA isn't quite the right way to go.  IIRC
> we came up with something slightly more complicated, like picking the
> closest supported multilib.  That's probably more in line with what
> users are asking for, which IIUC is sort of just "I don't care that much
> about ABI, just make my stuff build".
>
> Whatever we do here, we should document in the RISC-V specs as we'll
> want to make sure LLVM does the same thing.  We probably also want some
> sort of "-mabi=auto" argument, as it's always best to have an argument
> that changes back to the no-argument behavior.
>
>
> >
> > gcc/ChangeLog:
> >
> > * common/config/riscv/riscv-common.cc (compute_default_abi): 
> > Implementation
> > to calculate -mabi from arch string.
> > (riscv_expand_abi_from_arch): New spec function to calculate -mabi
> > from arch
> > string provided by -march option.
> > (riscv_expand_abi_from_cpu): New spec function to find CPU info and 
> > calculate
> > -mabi from arch string defined in CPU info.
> > * config/riscv/riscv.h (EXTRA_SPEC_FUNCTIONS): Add above spec 
> > functions.
> > (OPTION_DEFAULT_SPECS): Use new spec functions to calculate -mabi;
> > -march has higher priority than -mcpu.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/riscv/mabi-1.c: ilp32e test.
> > * gcc.target/riscv/mabi-2.c: ilp32 test.
> > * gcc.target/riscv/mabi-3.c: ilp32f test.
> > * gcc.target/riscv/mabi-4.c: ilp32d test.
> > * gcc.target/riscv/mabi-5.c: lp64 test.
> > * gcc.target/riscv/mabi-6.c: lp64f test.
> > * gcc.target/riscv/mabi-7.c: lp64d test.
> > * gcc.target/riscv/mabi-8.c: -march override -mcpu.
> > ---
> >  gcc/common/config/riscv/riscv-common.cc | 66 +
> >  gcc/config/riscv/riscv.h| 15 --
> >  gcc/testsuite/gcc.target/riscv/mabi-1.c |  7 +++
> >  gcc/testsuite/gcc.target/riscv/mabi-2.c |  7 +++
> >  gcc/testsuite/gcc.target/riscv/mabi-3.c |  7 +++
> >  gcc/testsuite/gcc.target/riscv/mabi-4.c |  7 +++
> >  gcc/testsuite/gcc.target/riscv/mabi-5.c |  7 +++
> >  gcc/testsuite/gcc.target/riscv/mabi-6.c |  7 +++
> >  gcc/testsuite/gcc.target/riscv/mabi-7.c |  7 +++
> >  gcc/testsuite/gcc.target/riscv/mabi-8.c |  7 +++
> >  10 files changed, 134 insertions(+), 3 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-4.c
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-5.c
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-6.c
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-7.c
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-8.c
> >
> > diff --git a/gcc/common/config/riscv/riscv-common.cc 
> > b/gcc/common/config/riscv/riscv-common.cc
> > index 0e5be2ce105..f8e40549d18 100644
> > --- a/gcc/common/config/riscv/riscv-common.cc
> > +++ b/gcc/common/config/riscv/riscv-common.cc
> > @@ -1266,6 +1266,72 @@ riscv_default_mtune (int argc, const char **argv)
> >  return default_mtune;
> >  }
> >
> > +/* Compute default -mabi option from arch string.  */
> > +
> > +static const char *
> > +compute_default_abi (const char* arch_str)
> > +{
> > +  location_t loc = UNKNOWN_LOCATION;
> > +
> > +  riscv_parse_arch_string (arch_str, NULL, loc);
> > +
> > +  if (current_subset_list->xlen () == 64)
> > +{
> > +  if (current_subset_list->lookup ("d

Re: [PATCH]AArch64 relax predicate on load structure load instructions

2022-06-08 Thread Richard Sandiford via Gcc-patches
Tamar Christina  writes:
> Hi All,
>
> At some point in time we started lowering the ld1r instructions in gimple.
>
> That is:
>
> uint8x8_t f1(const uint8_t *in) {
> return vld1_dup_u8(&in[1]);
> }
>
> generates at gimple:
>
>   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
>   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
>
> Which is good, but we then generate:
>
> f1:
>   ldr b0, [x0, 1]
>   dup v0.8b, v0.b[0]
>   ret
>
> instead of ld1r.
>
> The reason for this is that the load instructions have too restrictive a
> predicate, which prevents combine from combining the instructions, as the
> predicate only accepts simple addressing modes.
>
> This patch relaxes the predicate to accept any memory operand and relies on
> LRA to legitimize the address when it needs to as the constraint still only
> allows the simple addressing mode.  Reload is always able to legitimize to
> these.
>
> Secondly, since we are now actually generating more ld1r instructions, it
> became clear that the lane instructions suffer from a similar issue.
>
> i.e.
>
> float32x4_t f2(const float32_t *in, float32x4_t a) {
> float32x4_t dup = vld1q_dup_f32(&in[1]);
> return vfmaq_laneq_f32 (a, a, dup, 1);
> }
>
> would generate ld1r + vector fmla instead of ldr + lane fmla.
>
> The reason for this is similar to the ld1r issue.  The predicate is too
> restrictive in accepting only register operands and not memory.
>
> This relaxes it to accept register and/or memory while leaving the constraint
> to only accept registers.  This will have LRA generate a reload if needed
> forcing the memory to registers using the standard patterns.
>
> These two changes allow combine and reload to generate the right sequences.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

This is going against the general direction of travel, which is to make
the instruction's predicates and conditions enforce the constraints as
much as possible (making optimistic assumptions about pseudo registers).

The RA *can* deal with things like:

  (match_operand:M N "general_operand" "r")

but it's best avoided, for a few reasons:

(1) The fix-up will be done in LRA, so IRA will not see the temporary
registers.  This can make the allocation of those temporaries
suboptimal but (more importantly) it might require other
previously-allocated registers to be spilled late due to the
unexpected increase in register pressure.

(2) It ends up hiding instructions from the pre-RA optimisers.

(3) It can also prevent combine opportunities (as well as create them),
unless the loose predicates in an insn I are propagated to all
patterns that might result from combining I with something else.

It sounds like the first problem (not generating ld1r) could be fixed
by (a) combining aarch64_simd_dup<mode> and *aarch64_simd_ld1r<mode>,
so that the register and memory alternatives are in the same pattern
and (b) using the merged instruction(s) to implement the vec_duplicate
optab.  Target-independent code should then make the address satisfy
the predicate, simplifying the address where necessary.

I'm not sure whether fixing the ld1r problem that way will avoid the
vfmaq_laneq_f32 problem; let me know if not.

Thanks,
Richard

> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64-simd.md (mul_lane3, mul_laneq3,
>   mul_n3, *aarch64_mul3_elt_to_64v2df, *aarch64_mla_elt,
>   *aarch64_mla_elt_, aarch64_mla_n,
>   *aarch64_mls_elt, *aarch64_mls_elt_,
>   aarch64_mls_n, *aarch64_fma4_elt,
>   *aarch64_fma4_elt_,
>   *aarch64_fma4_elt_from_dup, *aarch64_fma4_elt_to_64v2df,
>   *aarch64_fnma4_elt, *aarch64_fnma4_elt_,
>   *aarch64_fnma4_elt_from_dup, *aarch64_fnma4_elt_to_64v2df,
>   *aarch64_mulx_elt_,
>   *aarch64_mulx_elt, *aarch64_mulx_elt_from_dup,
>   *aarch64_vgetfmulx): Relax register_operand to
>   nonimmediate_operand.
>   (aarch64_simd_ld2, aarch64_simd_ld2r,
>   aarch64_vec_load_lanes_lane,
>   vec_load_lanes, aarch64_simd_st2,
>   aarch64_vec_store_lanes_lane,
>   vec_store_lanes, aarch64_simd_ld3,
>   aarch64_simd_ld3r,
>   aarch64_vec_load_lanes_lane,
>   vec_load_lanes, aarch64_simd_st3,
>   aarch64_vec_store_lanes_lane,
>   vec_store_lanes, aarch64_simd_ld4,
>   aarch64_simd_ld4r,
>   aarch64_vec_load_lanes_lane,
>   vec_load_lanes, aarch64_simd_st4,
>   aarch64_vec_store_lanes_lane,
>   vec_store_lanes, aarch64_ld1_x3_,
>   aarch64_ld1_x4_, aarch64_st1_x2_,
>   aarch64_st1_x3_, aarch64_st1_x4_,
>   aarch64_be_ld1, aarch64_be_st1,
>   aarch64_ld2_dreg, aarch64_ld2_dreg,
>   aarch64_ld3_dreg, aarch64_ld3_dreg,
>   aarch64_ld4_dreg, aarch64_ld4_dreg,
>   aarch64_st2_dreg, aarch64_st2_dreg,
>   aarch64_st3_dreg, aarch64_st3_dreg,
>   aarch64_st4_dreg, aarch64_st4_dreg,
>   *aarch64_simd_ld1r, aarch64_simd_ld1_x2): Relax
>   aarch64_simd_struct

Document mailing list (was: GCC Rust git branch)

2022-06-08 Thread Thomas Schwinge
Hi!

On 2021-05-28T11:19:16+0100, Philip Herron  wrote:
> On 28/05/2021 04:22, Jason Merrill wrote:
>> On Mon, May 24, 2021 at 9:25 AM Philip Herron > > wrote:
>>> As some of you might know, I have been working on GCC Rust over on
>>> GitHub https://github.com/Rust-GCC/gccrs
>>> . As the project is moving
>>> forward and enforcing GCC copyright assignments for contributors, I
>>> would like to create a branch on the GCC git repo to show the
>>> intention
>>> to be upstream with GCC someday.
>>>
>>>  [snip]
>>>
>>> Separately, some contributors have expressed interest in
>>> maintaining the
>>> GCC style communications of using a mailing list and irc. Is it
>>> reasonable for this project to get a r...@gcc.gnu.org
>>> ?
>>
>> That makes sense to me; I think overseers@ can help set up a new
>> mailing list.
>
> Thanks For the info everyone i will reach out to overseers about the
> Mailing List idea.

A <gcc-rust@gcc.gnu.org> mailing list was thus set up a year ago;
it is now documented on <https://gcc.gnu.org/lists.html> per gcc-wwwdocs
commit 1c89cdccbebda5d4c2eeeb627b1461b8877bb27e
"Document <gcc-rust@gcc.gnu.org> mailing list", see attached.


Regards
 Thomas


From 1c89cdccbebda5d4c2eeeb627b1461b8877bb27e Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Wed, 8 Jun 2022 12:55:55 +0200
Subject: [PATCH] Document <gcc-rust@gcc.gnu.org> mailing list

---
 htdocs/lists.html | 5 +
 1 file changed, 5 insertions(+)

diff --git a/htdocs/lists.html b/htdocs/lists.html
index ad36a7e2..c32c505d 100644
--- a/htdocs/lists.html
+++ b/htdocs/lists.html
@@ -70,6 +70,11 @@ before subscribing and posting to these lists.
   <dd>is a moderate volume list where regression results for the GCC compilers
   are posted.</dd>
 
+  <dt><a href="https://gcc.gnu.org/ml/gcc-rust/">gcc-rust</a></dt>
+  <dd>is for discussing <a href="https://gcc.gnu.org/wiki/RustFrontEnd">Rust</a>
+  support in GCC.
+  Patches should be sent to both this list and gcc-patches.</dd>
+
   <dt><a href="https://gcc.gnu.org/ml/libstdc++/">libstdc++</a></dt>
   <dd>is the main discussion and development list for the standard C++
   library (libstdc++-v3).  Patches to libstdc++-v3 should be sent to
-- 
2.35.1



[PATCH] Fix PR target/104871 (macosx-version-min wrong for macOS >= Big Sur (darwin20))

2022-06-08 Thread Simon Wright
(resent with commit message format update)

This is the same sort of problem as in PR80204: at present, GCC 11 & 12 assume
that if the OS version is >= 20, the compiler should see
-mmacosx-version-min={major - 9}.{minor - 1}.0, e.g. for OS version 21.3.0 that
would be 12.2.0 (the linker sees -macosx-version-min, with the same arguments).

However, the native compiler clang treats 21.3.0 as 12.0.0: the compiler sees
  -triple x86_64-apple-macosx12.0.0
and the linker sees
  -platform_version macos 12.0.0 
the result of which is that linking an object file built with clang and one 
built with gcc gives e.g.

  ld: warning: object file (null.o) was built for newer macOS version (12.2) 
than being linked (12.0)
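
To make the two behaviours concrete, here is a sketch of the mappings
(illustration only, not GCC's actual implementation):

#include <stdio.h>

/* Map a Darwin kernel version (e.g. 21.3.0) to macosx-version-min.  */
static void
min_version (int kmajor, int kminor, int clang_style, char *buf, size_t len)
{
  if (kmajor >= 20)  /* macOS 11 (Big Sur) and later.  */
    snprintf (buf, len, "%d.%d.0",
              kmajor - 9, clang_style ? 0 : kminor - 1);
}

/* GCC 11/12 behaviour: min_version (21, 3, 0, ...) gives "12.2.0".
   clang's behaviour:   min_version (21, 3, 1, ...) gives "12.0.0".  */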

I propose the following patch, which works fine for me (darwin 21.3.0).

gcc/ChangeLog:

2022-06-02  Simon Wright  

	PR target/104871
	* config/darwin-driver.cc (darwin_find_version_from_kernel): If the
	OS version is 20 (macOS 11) or greater, report the minor version and
	the patch level as 0 to match Apple clang's behaviour.



pr104871.diff
Description: Binary data


Re: aarch64: Fix bitfield alignment in param passing [PR105549]

2022-06-08 Thread Christophe Lyon via Gcc-patches




On 6/7/22 19:44, Richard Sandiford wrote:

Christophe Lyon via Gcc-patches  writes:

While working on enabling DFP for AArch64, I noticed new failures in
gcc.dg/compat/struct-layout-1.exp (t028) which were not actually
caused by DFP types handling. These tests are generated during 'make
check' and enabling DFP made generation different (not sure if new
non-DFP tests are generated, or if existing ones are generated
differently, the tests in question are huge and difficult to compare).

Anyway, I reduced the problem to what I attach at the end of the new
gcc.target/aarch64/aapcs64/va_arg-17.c test and rewrote it in the same
scheme as other va_arg* AArch64 tests.  Richard Sandiford further
reduced this to a non-vararg function, added as a second testcase.

This is a tough case mixing bitfields and alignment, where
aarch64_function_arg_alignment did not follow what its descriptive
comment says: we want to use the natural alignment of the bitfield
type only if the user didn't override the alignment for the bitfield
itself.

The fix is thus very small, and this patch adds two new tests
(va_arg-17.c and pr105549.c). va_arg-17.c contains the reduced
offending testcase from struct-layout-1.exp for reference.

We also take the opportunity to fix the comment above
aarch64_function_arg_alignment, since the value of the abi_break
parameter was changed in a previous commit and no longer matched the
description.

2022-06-02  Christophe Lyon  

gcc/
PR target/105549
* config/aarch64/aarch64.cc (aarch64_function_arg_alignment):
Check DECL_USER_ALIGN for bitfield.

gcc/testsuite/
PR target/105549
* gcc.target/aarch64/aapcs64/va_arg-17.c: New.
* gcc.target/aarch64/pr105549.c: New.


### Attachment also inlined for ease of reply###


diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
40fc5e633992036a2c06867857a681792178ef00..2c6ccce7cb5dc32097d24514ee525729efb6b7ff
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -7262,9 +7262,9 @@ aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, 
machine_mode mode,
  /* Given MODE and TYPE of a function argument, return the alignment in
 bits.  The idea is to suppress any stronger alignment requested by
 the user and opt for the natural alignment (specified in AAPCS64 \S
-   4.1).  ABI_BREAK is set to true if the alignment was incorrectly
-   calculated in versions of GCC prior to GCC-9.  This is a helper
-   function for local use only.  */
+   4.1).  ABI_BREAK is set to the old alignment if the alignment was
+   incorrectly calculated in versions of GCC prior to GCC-9.  This is
+   a helper function for local use only.  */
  
  static unsigned int

  aarch64_function_arg_alignment (machine_mode mode, const_tree type,
@@ -7304,7 +7304,10 @@ aarch64_function_arg_alignment (machine_mode mode, 
const_tree type,
   "s" contains only one Fundamental Data Type (the int field)
   but gains 8-byte alignment and size thanks to "e".  */
alignment = std::max (alignment, DECL_ALIGN (field));
-   if (DECL_BIT_FIELD_TYPE (field))
+
+   /* Take bit-field type's alignment into account only if the
+  user didn't override this field's alignment.  */
+   if (DECL_BIT_FIELD_TYPE (field) && !DECL_USER_ALIGN (field))


I think we need to check DECL_PACKED instead.  On its own, an alignment
attribute on the field can only increase alignment, not decrease it.
In contrast, the packed attribute effectively forces the alignment to
1 byte, so has an effect even without an alignment attribute.  Adding an
explicit alignment on top can then increase the alignment from 1 to any
value (bigger or smaller than the original underlying type).


Right, but the comment before aarch64_function_arg_alignment says:

"The idea is to suppress any stronger alignment requested by the user 
and opt for the natural alignment (specified in AAPCS64 \S 4.1)"


When using DECL_PACKED, wouldn't we check the opposite of this (i.e. that
the user requested a smaller alignment)?  I mean we'd not "suppress
stronger alignment" since such cases do not have DECL_PACKED?



However I'm not sure which part of the ABI is mentioned in the comment, 
in my copy 4.1 is "Design Goals" and does not elaborate on bitfields and 
parameters.





E.g. for:

-
typedef unsigned long long ull __attribute__((aligned(ALIGN)));

#ifndef EXTRA
#define EXTRA unsigned long long x;
#endif

struct S1 { __attribute__((aligned(1))) ull i : 1; EXTRA };
struct S2 { __attribute__((aligned(2))) ull i : 1; EXTRA };
struct S4 { __attribute__((aligned(4))) ull i : 1; EXTRA };
struct S8 { __attribute__((aligned(8))) ull i : 1; EXTRA };
struct S16 { __attribute__((aligned(16))) ull i : 1; EXTRA };

struct Sp { ull i : 1; EXTRA }__attribute__((packed));
struct S1p { __attribute__((packed, aligned(1)))

Re: GCC Rust git branch

2022-06-08 Thread Thomas Schwinge
Hi!

This is about GCC/Rust, , now also having a
presence in GCC upstream Git sources; see also
 "GCC Git Branch".

On 2021-05-24T16:24:38+, Joseph Myers  wrote:
> On Mon, 24 May 2021, Philip Herron wrote:
>
>> remote: error: hook declined to update refs/heads/gccrs
>
> refs/heads/gccrs doesn't match the branch naming conventions as documented
> at https://gcc.gnu.org/git.html (where you'd use refs/heads/devel/* for
> shared development branches), so if you hadn't had commit message
> formatting issues, the push would have been rejected for bad branch naming
> as well.
>
>> The commit message here is poorly formatted. To move forward, should I
>> rebase the tree to fix this commit and force push to rewrite the
>> history? Or is there a way to relax the rule for a new branch? Any
>> advice would be welcome.
>
> If the community desires to relax the checks in a particular case, the way
> to do it would probably be to set hooks.no-precommit-check, naming the
> agreed branch name, temporarily in refs/meta/config:project.config, then
> revert that project.config change afterwards.  See
> https://github.com/AdaCore/git-hooks for the detailed documentation of
> hook configuration.

Thanks.  I've thus pushed to refs/meta/config branch
commit 15e03be6fc6406e41c75ff95a9de449663fc9f0e "Enable
'no-precommit-check' for GCC/Rust development branches, 'devel/rust/*'",
see attached.


I've further pushed to gcc-wwwdocs
commit 325020ef06c714fbfd508d57e3f0bda272470464
"Document Git branch devel/rust/master", see attached.


I've also set up GCC Bugzilla:

  - Add new component *rust*.
  - Add new version *rust/master*.


Regards
 Thomas


From 15e03be6fc6406e41c75ff95a9de449663fc9f0e Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Wed, 8 Jun 2022 12:00:04 +0200
Subject: [PATCH] Enable 'no-precommit-check' for GCC/Rust development
 branches, 'devel/rust/*'

---
 project.config | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/project.config b/project.config
index b7bdaf30b9e..5905fe26272 100644
--- a/project.config
+++ b/project.config
@@ -87,3 +87,9 @@
 	# Custom email formatter.  This inserts GCC monotonically
 	# increasing commit ids in the commit emails.
 	commit-email-formatter = /git/gcc.git/hooks-bin/commit_email_formatter
+
+	# For GCC/Rust development that happens outside of GCC proper,
+	# , the Git commit messages
+	# don't always adhere to standard GCC style; see
+	# .
+	no-precommit-check = refs/heads/devel/rust/.*
-- 
2.35.1

From 325020ef06c714fbfd508d57e3f0bda272470464 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Wed, 8 Jun 2022 14:21:06 +0200
Subject: [PATCH] Document Git branch devel/rust/master

---
 htdocs/git.html | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/htdocs/git.html b/htdocs/git.html
index 5fbd98bf..f9acea54 100644
--- a/htdocs/git.html
+++ b/htdocs/git.html
@@ -349,6 +349,12 @@ in Git.
 implementation of Fortran coarrays.  It is maintained by
 Nicolas König.
 
+  <dt><a href="https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;a=shortlog;h=refs/heads/devel/rust/master">devel/rust/master</a></dt>
+  <dd>
+    This branch is for development of
+    <a href="https://gcc.gnu.org/wiki/RustFrontEnd">Rust programming language</a>
+    support in GCC.
+  </dd>
 
 
 Distribution Branches
-- 
2.35.1



[PATCH] PR middle-end/105874: Use EXPAND_MEMORY to fix ada bootstrap.

2022-06-08 Thread Roger Sayle

Many thanks to Tamar Christina for filing PR middle-end/105874 indicating
that SPECcpu 2017's Leela is failing on x86_64 due to a miscompilation
of FastBoard::is_eye.  This function is much smaller and easier to work
with than my previous hunt for the cause of the Ada bootstrap failures
due to miscompilation somewhere in GCC (or one of the 131 places that
the problematic form of optimization triggers during an ada bootstrap).

It turns out the source of the miscompilation introduced by my recent
patch is the distinction (during RTL expansion) of l-values and r-values.
According to the documentation above expand_modifier, EXPAND_MEMORY
should be used for lvalues (when a memory is required), and EXPAND_NORMAL
for rvalues when a constant is permissible.  In what I'd like to consider
a latent bug, the recursive call to expand_expr_real on line 11188 of
expr.cc, in the case handling ARRAY_REF, COMPONENT_REF, BIT_FIELD_REF
and ARRARY_RANGE_REF was passing EXPAND_NORMAL when it really required
(the semantics of) EXPAND_MEMORY.  All the time that VAR_DECLs were
being returned as memory this was fine, but as soon as we're able to
optimize short arrays into immediate constants, bad things happen.

In the test case from Leela, we notice that the array s_eyemask
always has DImode constant value { 4, 64 }, which is useful as
an rvalue, but not when we need to index it as an lvalue, as in
s_eyemask[color].  This also explains why everything being accepted
by immediate_const_ctor_p (during an ada bootstrap) looks reasonable;
what's incorrect is that we don't know how these structs/arrays are
to be used.
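
A small C illustration of the distinction (hypothetical example, not the
Leela code):

static const int mask[2] = { 4, 64 };

/* lvalue use: the array must exist in memory to be indexed at run time.  */
int f (int color) { return mask[color]; }

/* rvalue use: can simply fold to the constant 4.  */
int g (void) { return mask[0]; }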

The fix is to ensure that we call expand_expr with EXPAND_MEMORY
when processing the VAR_DECLs returned by get_inner_reference.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check (with no new failures), but also with
--enable-languages="ada" where it allows the bootstrap to finish,
and with no unexpected failures in the acats and gnat testsuites.
Ok for mainline?


2022-06-08  Roger Sayle  

gcc/ChangeLog
PR middle-end/105874
* gcc/expr.cc (expand_expr_real_1) <normal_inner_ref>: New local
variable tem_modifier for calculating the expand_modifier enum to
use for expanding tem.  If tem is a VAR_DECL, use EXPAND_MEMORY.

gcc/testsuite/ChangeLog
PR middle-end/105874
* g++.dg/opt/pr105874.C: New test case.


Sorry again for the inconvenience/breakage.
Roger
--

diff --git a/gcc/expr.cc b/gcc/expr.cc
index fb062dc..a013650 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -11183,6 +11183,13 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode 
tmode,
   infinitely recurse.  */
gcc_assert (tem != exp);
 
+   /* If tem is a VAR_DECL, we need a memory reference.  */
+   enum expand_modifier tem_modifier = modifier;
+   if (tem_modifier == EXPAND_SUM)
+ tem_modifier = EXPAND_NORMAL;
+   if (TREE_CODE (tem) == VAR_DECL)
+ tem_modifier = EXPAND_MEMORY;
+
/* If TEM's type is a union of variable size, pass TARGET to the inner
   computation, since it will need a temporary and TARGET is known
   to have to do.  This occurs in unchecked conversion in Ada.  */
@@ -11194,9 +11201,7 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode 
tmode,
   != INTEGER_CST)
   && modifier != EXPAND_STACK_PARM
   ? target : NULL_RTX),
- VOIDmode,
- modifier == EXPAND_SUM ? EXPAND_NORMAL : modifier,
- NULL, true);
+ VOIDmode, tem_modifier, NULL, true);
 
/* If the field has a mode, we want to access it in the
   field's mode, not the computed mode.
diff --git a/gcc/testsuite/g++.dg/opt/pr105874.C 
b/gcc/testsuite/g++.dg/opt/pr105874.C
new file mode 100644
index 000..58699a6
--- /dev/null
+++ b/gcc/testsuite/g++.dg/opt/pr105874.C
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -std=c++11" } */
+#include <array>
+
+static constexpr int NBR_SHIFT = 4;
+
+static constexpr int MAXBOARDSIZE = 25;
+
+static constexpr int MAXSQ = ((MAXBOARDSIZE + 2) * (MAXBOARDSIZE + 2));
+
+enum square_t : char {
+BLACK = 0, WHITE = 1, EMPTY = 2, INVAL = 3
+};
+
+const std::array<int, 2> s_eyemask = {
+4 * (1 << (NBR_SHIFT * BLACK)),
+4 * (1 << (NBR_SHIFT * WHITE))
+};
+
+/* counts of neighboring stones */
+std::array<unsigned short, MAXSQ> m_neighbours;
+
+int is_eye(const int color, const int i) {
+/* check for 4 neighbors of the same color */
+int ownsurrounded = (m_neighbours[i] & s_eyemask[color]);
+
+return ownsurrounded;
+}
+
+/* { dg-final { scan-assembler "s_eyemask" } } */


Re: aarch64: Fix bitfield alignment in param passing [PR105549]

2022-06-08 Thread Richard Sandiford via Gcc-patches
Christophe Lyon  writes:
> On 6/7/22 19:44, Richard Sandiford wrote:
>> Christophe Lyon via Gcc-patches  writes:
>>> While working on enabling DFP for AArch64, I noticed new failures in
>>> gcc.dg/compat/struct-layout-1.exp (t028) which were not actually
>>> caused by DFP types handling. These tests are generated during 'make
>>> check' and enabling DFP made generation different (not sure if new
>>> non-DFP tests are generated, or if existing ones are generated
>>> differently, the tests in question are huge and difficult to compare).
>>>
>>> Anyway, I reduced the problem to what I attach at the end of the new
>>> gcc.target/aarch64/aapcs64/va_arg-17.c test and rewrote it in the same
>>> scheme as other va_arg* AArch64 tests.  Richard Sandiford further
>>> reduced this to a non-vararg function, added as a second testcase.
>>>
>>> This is a tough case mixing bitfields and alignment, where
>>> aarch64_function_arg_alignment did not follow what its descriptive
>>> comment says: we want to use the natural alignment of the bitfield
>>> type only if the user didn't override the alignment for the bitfield
>>> itself.
>>>
>>> The fix is thus very small, and this patch adds two new tests
>>> (va_arg-17.c and pr105549.c). va_arg-17.c contains the reduced
>>> offending testcase from struct-layout-1.exp for reference.
>>>
>>> We also take the opportunity to fix the comment above
>>> aarch64_function_arg_alignment, since the value of the abi_break
>>> parameter was changed in a previous commit and no longer matched the
>>> description.
>>>
>>> 2022-06-02  Christophe Lyon  
>>>
>>> gcc/
>>> PR target/105549
>>> * config/aarch64/aarch64.cc (aarch64_function_arg_alignment):
>>> Check DECL_USER_ALIGN for bitfield.
>>>
>>> gcc/testsuite/
>>> PR target/105549
>>> * gcc.target/aarch64/aapcs64/va_arg-17.c: New.
>>> * gcc.target/aarch64/pr105549.c: New.
>>>
>>>
>>> ### Attachment also inlined for ease of reply
>>> ###
>>>
>>>
>>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>>> index 
>>> 40fc5e633992036a2c06867857a681792178ef00..2c6ccce7cb5dc32097d24514ee525729efb6b7ff
>>>  100644
>>> --- a/gcc/config/aarch64/aarch64.cc
>>> +++ b/gcc/config/aarch64/aarch64.cc
>>> @@ -7262,9 +7262,9 @@ aarch64_vfp_is_call_candidate (cumulative_args_t 
>>> pcum_v, machine_mode mode,
>>>   /* Given MODE and TYPE of a function argument, return the alignment in
>>>  bits.  The idea is to suppress any stronger alignment requested by
>>>  the user and opt for the natural alignment (specified in AAPCS64 \S
>>> -   4.1).  ABI_BREAK is set to true if the alignment was incorrectly
>>> -   calculated in versions of GCC prior to GCC-9.  This is a helper
>>> -   function for local use only.  */
>>> +   4.1).  ABI_BREAK is set to the old alignment if the alignment was
>>> +   incorrectly calculated in versions of GCC prior to GCC-9.  This is
>>> +   a helper function for local use only.  */
>>>   
>>>   static unsigned int
>>>   aarch64_function_arg_alignment (machine_mode mode, const_tree type,
>>> @@ -7304,7 +7304,10 @@ aarch64_function_arg_alignment (machine_mode mode, 
>>> const_tree type,
>>>"s" contains only one Fundamental Data Type (the int field)
>>>but gains 8-byte alignment and size thanks to "e".  */
>>> alignment = std::max (alignment, DECL_ALIGN (field));
>>> -   if (DECL_BIT_FIELD_TYPE (field))
>>> +
>>> +   /* Take bit-field type's alignment into account only if the
>>> +  user didn't override this field's alignment.  */
>>> +   if (DECL_BIT_FIELD_TYPE (field) && !DECL_USER_ALIGN (field))
>> 
>> I think we need to check DECL_PACKED instead.  On its own, an alignment
>> attribute on the field can only increase alignment, not decrease it.
>> In contrast, the packed attribute effectively forces the alignment to
>> 1 byte, so has an effect even without an alignment attribute.  Adding an
>> explicit alignment on top can then increase the alignment from 1 to any
>> value (bigger or smaller than the original underlying type).
>
> Right, but the comment before aarch64_function_arg_alignment says:
>
> "The idea is to suppress any stronger alignment requested by the user 
> and opt for the natural alignment (specified in AAPCS64 \S 4.1)"
>
> When using DECL_PACKED, wouldn't we check the opposite of this (i.e. that
> the user requested a smaller alignment)?  I mean we'd not "suppress
> stronger alignment" since such cases do not have DECL_PACKED?

I think "stronger alignment" here means "greater alignment" rather
than "less alignment".  But in these examples we're dealing with
alignments of the fields.  I think that part is OK, and that the
intention is to ignore any greater alignment specified at the structure
level, independently of the fields.

In other words, if field list X occupies 16 bytes, then S1 and S2
below should be handled in the same way as far as register assignment
is concerned:

  struct S1 { X };
  struct S2 { X } 

Re: [PATCH v4, rs6000] Implemented f[min/max]_optab by xs[min/max]dp [PR103605]

2022-06-08 Thread Segher Boessenkool
On Wed, Jun 08, 2022 at 11:28:11AM +0800, HAO CHEN GUI wrote:
>   This patch implements the f[min/max] optabs using xs[min/max]dp on rs6000.
> Tests show that the outputs of xs[min/max]dp are consistent with the C99
> standard semantics of fmin/fmax.

But it regresses the code quality generated with -ffast-math (because
the new unspecs aren't optimised like standard rtl is).  This can be
follow-up work of course -- and the best direction is to make fmin/fmax
generic, even!  :-)


Segher


RE: [PATCH]AArch64 relax predicate on load structure load instructions

2022-06-08 Thread Tamar Christina via Gcc-patches
> -Original Message-
> From: Richard Sandiford 
> Sent: Wednesday, June 8, 2022 11:31 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> ; Marcus Shawcroft
> ; Kyrylo Tkachov 
> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
> instructions
> 
> Tamar Christina  writes:
> > Hi All,
> >
> > At some point in time we started lowering the ld1r instructions in gimple.
> >
> > That is:
> >
> > uint8x8_t f1(const uint8_t *in) {
> > return vld1_dup_u8(&in[1]);
> > }
> >
> > generates at gimple:
> >
> >   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
> >   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
> >
> > Which is good, but we then generate:
> >
> > f1:
> > ldr b0, [x0, 1]
> > dup v0.8b, v0.b[0]
> > ret
> >
> > instead of ld1r.
> >
> > The reason for this is that the load instructions have too restrictive
> > a predicate, which prevents combine from combining the instructions, as
> > the predicate only accepts simple addressing modes.
> >
> > This patch relaxes the predicate to accept any memory operand and
> > relies on LRA to legitimize the address when it needs to as the
> > constraint still only allows the simple addressing mode.  Reload is
> > always able to legitimize to these.
> >
> > Secondly, since we are now actually generating more ld1r instructions,
> > it became clear that the lane instructions suffer from a similar issue.
> >
> > i.e.
> >
> > float32x4_t f2(const float32_t *in, float32x4_t a) {
> > float32x4_t dup = vld1q_dup_f32(&in[1]);
> > return vfmaq_laneq_f32 (a, a, dup, 1); }
> >
> > would generate ld1r + vector fmla instead of ldr + lane fmla.
> >
> > The reason for this is similar to the ld1r issue.  The predicate is
> > too restrictive in accepting only register operands and not memory.
> >
> > This relaxes it to accept register and/or memory while leaving the
> > constraint to only accept registers.  This will have LRA generate a
> > reload if needed forcing the memory to registers using the standard
> patterns.
> >
> > These two changes allow combine and reload to generate the right
> sequences.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> This is going against the general direction of travel, which is to make the
> instruction's predicates and conditions enforce the constraints as much as
> possible (making optimistic assumptions about pseudo registers).
> 
> The RA *can* deal with things like:
> 
>   (match_operand:M N "general_operand" "r")
> 
> but it's best avoided, for a few reasons:
> 
> (1) The fix-up will be done in LRA, so IRA will not see the temporary
> registers.  This can make the allocation of those temporaries
> suboptimal but (more importantly) it might require other
> previously-allocated registers to be spilled late due to the
> unexpected increase in register pressure.
> 
> (2) It ends up hiding instructions from the pre-RA optimisers.
> 
> (3) It can also prevent combine opportunities (as well as create them),
> unless the loose predicates in an insn I are propagated to all
> patterns that might result from combining I with something else.
> 
> It sounds like the first problem (not generating ld1r) could be fixed by (a)
> combining aarch64_simd_dup and *aarch64_simd_ld1r, so
> that the register and memory alternatives are in the same pattern and (b)
> using the merged instruction(s) to implement the vec_duplicate optab.
> Target-independent code should then make the address satisfy the
> predicate, simplifying the address where necessary.
> 

I think I am likely missing something here. I would assume that you wanted
to use the optab to split the addressing off from the mem expression so the
combined insn matches.

But in that case, why do you need to combine the two instructions?
I've tried and it doesn't work since the vec_duplicate optab doesn't see the
mem as op1, because in gimple the mem is not part of the duplicate.

So you still just see:

>>> dbgrtx (ops[1].value)
(subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)

As the operand as the argument to the dup is just an SSA_NAME.

If not and you wanted the combined insn to accept

(set (reg:SI 92 [ _3 ])
(zero_extend:SI (mem:QI (plus:DI (reg:DI 97)
> (const_int 1 [0x1])) [0 MEM[(const uint8_tD.4561 *)in_1(D) + 1B]+0 S1 A8])))

Then that's also not possible without relaxing the combined predicates.  As
far as I can tell, if I'm not allowed to use LRA for this, then the only thing
that could work is an early split?

Or do I have to modify store_constructor to try a variant where it tries
pushing in the Decl of an SSA_NAME first?

I guess this also only really works for ld1r; whenever we lower ld2(r) etc.
we'll have the same issue again... But I suppose that's for the next person 😊

Thanks,
Tamar

> I'm not sure whether fixing the ld1r problem that way will avoid the
> vfmaq_laneq_f32 problem; let me know if not.
> 
> Thanks,
> Richard
> 
> > Ok for master?
> >
> >

Re: [PATCH] PR middle-end/105874: Use EXPAND_MEMORY to fix ada bootstrap.

2022-06-08 Thread Eric Botcazou via Gcc-patches
> The fix is to ensure that we call expand_expr with EXPAND_MEMORY
> when processing the VAR_DECL's returned by get_inner_reference.
> 
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check (with no new failures), but also with
> --enable-languages="ada" where it allows the bootstrap to finish,
> and with no unexpected failures in the acats and gnat testsuites.
> Ok for mainline?

Yes, thanks (modulo the nit in the ChangeLog).

> 2022-06-08  Roger Sayle  
> 
> gcc/ChangeLog
> PR middle-end/105874
> * gcc/expr.cc (expand_expr_real_1) :  New local
> variable tem_modifier for calculating the expand_modifier enum to
> use for expanding tem.  If tem is a VAR_DECL, use EXPAND_MEMORY.

gcc/ prefix to be stripped

-- 
Eric Botcazou




Re: [PATCH]AArch64 relax predicate on load structure load instructions

2022-06-08 Thread Richard Sandiford via Gcc-patches
Tamar Christina  writes:
>> -Original Message-
>> From: Richard Sandiford 
>> Sent: Wednesday, June 8, 2022 11:31 AM
>> To: Tamar Christina 
>> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
>> ; Marcus Shawcroft
>> ; Kyrylo Tkachov 
>> Subject: Re: [PATCH]AArch64 relax predicate on load structure load
>> instructions
>> 
>> Tamar Christina  writes:
>> > Hi All,
>> >
>> > At some point in time we started lowering the ld1r instructions in gimple.
>> >
>> > That is:
>> >
>> > uint8x8_t f1(const uint8_t *in) {
>> > return vld1_dup_u8(&in[1]);
>> > }
>> >
>> > generates at gimple:
>> >
>> >   _3 = MEM[(const uint8_t *)in_1(D) + 1B];
>> >   _4 = {_3, _3, _3, _3, _3, _3, _3, _3};
>> >
>> > Which is good, but we then generate:
>> >
>> > f1:
>> >    ldr b0, [x0, 1]
>> >    dup v0.8b, v0.b[0]
>> >    ret
>> >
>> > instead of ld1r.
>> >
>> > The reason for this is because the load instructions have a too
>> > restrictive predicate on them which causes combine not to be able to
>> > combine the instructions due to the predicate only accepting simple
>> addressing modes.
>> >
>> > This patch relaxes the predicate to accept any memory operand and
>> > relies on LRA to legitimize the address when it needs to as the
>> > constraint still only allows the simple addressing mode.  Reload is
>> > always able to legitimize to these.
>> >
>> > Secondly since we are now actually generating more ld1r it became
>> > clear that the lane instructions suffer from a similar issue.
>> >
>> > i.e.
>> >
>> > float32x4_t f2(const float32_t *in, float32x4_t a) {
>> > float32x4_t dup = vld1q_dup_f32(&in[1]);
>> > return vfmaq_laneq_f32 (a, a, dup, 1); }
>> >
>> > would generate ld1r + vector fmla instead of ldr + lane fmla.
>> >
>> > The reason for this is similar to the ld1r issue.  The predicate is
>> > too restrictive in only accepting register operands but not memory.
>> >
>> > This relaxes it to accept register and/or memory while leaving the
>> > constraint to only accept registers.  This will have LRA generate a
>> > reload if needed forcing the memory to registers using the standard
>> patterns.
>> >
>> > These two changes allow combine and reload to generate the right
>> sequences.
>> >
>> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> 
>> This is going against the general direction of travel, which is to make the
>> instruction's predicates and conditions enforce the constraints as much as
>> possible (making optimistic assumptions about pseudo registers).
>> 
>> The RA *can* deal with things like:
>> 
>>   (match_operand:M N "general_operand" "r")
>> 
>> but it's best avoided, for a few reasons:
>> 
>> (1) The fix-up will be done in LRA, so IRA will not see the temporary
>> registers.  This can make the allocation of those temporaries
>> suboptimal but (more importantly) it might require other
>> previously-allocated registers to be spilled late due to the
>> unexpected increase in register pressure.
>> 
>> (2) It ends up hiding instructions from the pre-RA optimisers.
>> 
>> (3) It can also prevent combine opportunities (as well as create them),
>> unless the loose predicates in an insn I are propagated to all
>> patterns that might result from combining I with something else.
>> 
>> It sounds like the first problem (not generating ld1r) could be fixed by (a)
>> combining aarch64_simd_dup and *aarch64_simd_ld1r, so
>> that the register and memory alternatives are in the same pattern and (b)
>> using the merged instruction(s) to implement the vec_duplicate optab.
>> Target-independent code should then make the address satisfy the
>> predicate, simplifying the address where necessary.
>> 
>
> I think I am likely missing something here. I would assume that you wanted
> to use the optab to split the addressing off from the mem expression so the
> combined insn matches.
>
> But in that case, why do you need to combine the two instructions?
> I've tried and it doesn't work since the vec_duplicate optab doesn't see the
> mem as op1, because in gimple the mem is not part of the duplicate.
>
> So you still just see:
>
> >>> dbgrtx (ops[1].value)
> (subreg/s/v:QI (reg:SI 92 [ _3 ]) 0)
>
> As the operand as the argument to the dup is just an SSA_NAME.

Ah, yeah, I'd forgotten that fixed-length vec_duplicates would
come from a constructor rather than a vec_duplicate_expr, so we don't
get the usual benefit of folding single-use mems during expand.

https://gcc.gnu.org/pipermail/gcc-patches/2022-May/595362.html
moves towards using vec_duplicate even for fixed-length vectors.
If we take that approach, then I suppose a plain constructor
should be folded to a vec_duplicate where possible.

(Alternatively, we could use an extended vec_perm_expr with
scalar inputs, as Richi suggested in that thread.)

If we don't do that, or don't do it yet, then…

> If not and you wanted the combined insn to accept
>
> (set (reg:SI 92 [ _3 ])
> (zero_extend:SI (mem:QI (plus:DI (reg:

Re: [PATCH 1/3] Disable generating store vector pair.

2022-06-08 Thread will schmidt via Gcc-patches
On Tue, 2022-06-07 at 23:16 -0400, Michael Meissner wrote:
> On Tue, Jun 07, 2022 at 07:59:34PM -0500, Peter Bergner wrote:
> > On 6/7/22 4:24 PM, Segher Boessenkool wrote:
> > > On Tue, Jun 07, 2022 at 04:17:04PM -0500, Peter Bergner wrote:
> > > > I think I mentioned this offline, but I'd prefer a negative target flag,
> > > > something like TARGET_NO_STORE_VECTOR_PAIR that defaults to off, 
> > > > meaning we'd
> > > > generate stxvp by default.
> > > 
> > > NAK.  All negatives should be -mno-xxx with -mxxx the corresponding
> > > positive.  All of them.
> > 
> > That's not what I was asking for.  I totally agree that 
> > -mno-store-vector-pair
> > should disable generating stxvp and that -mstore-vector-pair should enable
> > generating it.  What I asked for was that the internal flag we use to enable
> > and disable it should be a negative flag, where TARGET_NO_STORE_VECTOR_PAIR 
> > is
> > true when we use -mno-store-vector-pair and false when using 
> > -mstore-vector-pair.
> > That way we can add that flag to power10's rs6000-cpu.def entry and then 
> > we're
> > done.  What I don't want to have to do is that if/when power87 is released, 
> > we
> > still have to add TARGET_STORE_VECTOR_PAIR its rs6000-cpu.def entry just to
> > get stxvp insns generated.  That adds a cost to every cpu after power10 
> > since
> > we'd have to remember to add that flag to every follow-on cpu.
> 
> FWIW, I really dislike having negative flags like that (just talking about the
> option mask internals, not the user option).

I can't tell there is agreement in either direction, so I'll throw some
comments out and see if that helps make a decision.

I agree with avoiding the negative flags.  Whenever I run across a code
snippet reading  "if (! TARGET_NOT_FOO) ... " it's time to double-check 
everything.  :-)  
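
To make that concrete (illustrative only; the macro and helper names here
are invented):

  /* Positive flag: the use site reads naturally.  */
  if (TARGET_STORE_VECTOR_PAIR)
    emit_stxvp ();

  /* Negative flag: double negation at every use site.  */
  if (!TARGET_NO_STORE_VECTOR_PAIR)
    emit_stxvp ();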

If the proposal is to have "TARGET_NO_STORE_VECTOR_PAIR" set to "off",
I'd counter-propose whatever variation possible to drop the "NO" from
the string, i.e. "TARGET_STORE_VECTOR_PAIR" set to however it makes
sense to indicate enabled, or not.

All that said, I have a strong preference for the internal flags
matching the option flags as closely as possible.


> 
> I don't view the cost to add one postive flag to the next CPU as bad, as it
> will be a one time cost.  Presumably it would be set also next++ CPU.  This is
> like power8 is all of the power7 flags + new flags.  Power9 is all of the
> power8 flags + new flags.  I.e. in general it is cumulative.  Yes, I'm aware
> there are times when there are breaks, but hopefully those are rare.

This sounds reasonable.   Some weight could be added for which way to
bias the flag based on a guess of what the 'power87' release will
allow, but ultimately that shouldn't really matter. 

And no, power87 isn't real AFAIK... I'm just repeating the example
provided by Peter :-)

Thanks
-Will

> 
> Otherwise it is like the mess with -mpower8-fusion, where going from power8 to
> power9 we have to clear the fusion flag.  If store vector pair is a positive
> flag, then it isn't set in power10 flags, but it might be set in next cpu
> flags.  But if it is a negative flag, we have to explicitly clear it.
> 
> We can do it, but I just prefer to go with the positive flag approach.
> 



[PATCH 1/2]AArch64 Fix 128-bit sequential consistency atomic operations.

2022-06-08 Thread Tamar Christina via Gcc-patches
Hi All,

The AArch64 implementation of 128-bit atomics is broken.

For 128-bit atomics we rely on pthread barriers to correctly guard the address
in the pointer to get correct memory ordering.  However for 128-bit atomics the
address under the lock is different from the original pointer.

This means that one of the values under the atomic operation is not protected
properly, and so we fail when the user has requested sequential
consistency as there's no barrier to enforce this requirement.

As such users have resorted to adding an

#ifdef GCC

#endif

around the use of these atomics.

This corrects the issue by issuing a barrier only when __ATOMIC_SEQ_CST was
requested.  To remedy this performance hit I think we should revisit using a
similar approach to outline-atomics for the 128-bit atomics.

Note that I believe I need the empty file due to the include_next chain but
I am not entirely sure.  I have hand verified that the barriers are inserted
for atomic seq cst.
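
For reference, the kind of user code that exercises this path looks like the
following (my own example, not taken from the patch):

  __int128
  load_seq_cst (__int128 *p)
  {
    /* Lowers to a libatomic call on AArch64; with this fix the library
       issues a full fence when the requested model is __ATOMIC_SEQ_CST.  */
    return __atomic_load_n (p, __ATOMIC_SEQ_CST);
  }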

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master? and for backporting to GCC 12, 11 and 10?

Thanks,
Tamar

libatomic/ChangeLog:

PR target/102218
* config/aarch64/aarch64-config.h: New file.
* config/aarch64/host-config.h: New file.

--- inline copy of patch -- 
diff --git a/libatomic/config/aarch64/aarch64-config.h 
b/libatomic/config/aarch64/aarch64-config.h
new file mode 100644
index 
..d3474fa8ff80cb0c3ddbf8c48acd931d2339d33d
--- /dev/null
+++ b/libatomic/config/aarch64/aarch64-config.h
@@ -0,0 +1,23 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU Atomic Library (libatomic).
+
+   Libatomic is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   Libatomic is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
+
diff --git a/libatomic/config/aarch64/host-config.h 
b/libatomic/config/aarch64/host-config.h
new file mode 100644
index 
..f445a47d25ef5cc51cd2167069500245d07bf1bc
--- /dev/null
+++ b/libatomic/config/aarch64/host-config.h
@@ -0,0 +1,46 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU Atomic Library (libatomic).
+
+   Libatomic is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   Libatomic is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
+
+/* Avoiding the DMB (or kernel helper) can be a good thing.  */
+#define WANT_SPECIALCASE_RELAXED
+
+/* Glibc, at least, uses acq_rel in its pthread mutex
+   implementation.  If the user is asking for seq_cst,
+   this is insufficient.  */
+
+static inline void __attribute__((always_inline, artificial))
+pre_seq_barrier(int model)
+{
+  if (model == __ATOMIC_SEQ_CST)
+    __atomic_thread_fence (__ATOMIC_SEQ_CST);
+}
+
+static inline void __attribute__((always_inline, artificial))
+post_seq_barrier(int model)
+{
+  pre_seq_barrier(model);
+}
+
+#define pre_post_seq_barrier 1
+
+#include_next 





[PATCH 2/2][AArch32] Fix 128-bit sequential consistency atomic operations.

2022-06-08 Thread Tamar Christina via Gcc-patches
Hi All,

Similar to AArch64 the Arm implementation of 128-bit atomics is broken.

For 128-bit atomics we rely on pthread barriers to correctly guard the address
in the pointer to get correct memory ordering.  However for 128-bit atomics the
address under the lock is different from the original pointer.

This means that one of the values under the atomic operation is not protected
properly, and so we fail when the user has requested sequential
consistency as there's no barrier to enforce this requirement.

As such users have resorted to adding an

#ifdef GCC

#endif

around the use of these atomics.

This corrects the issue by issuing a barrier only when __ATOMIC_SEQ_CST was
requested.  I have hand verified that the barriers are inserted
for atomic seq cst.


Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.

Ok for master? and for backporting to GCC 12, 11 and 10?

Thanks,
Tamar

libatomic/ChangeLog:

PR target/102218
* config/arm/host-config.h (pre_seq_barrier, post_seq_barrier,
pre_post_seq_barrier): Require barrier on __ATOMIC_SEQ_CST.

--- inline copy of patch -- 
diff --git a/libatomic/config/arm/host-config.h 
b/libatomic/config/arm/host-config.h
index 
bbf4a3f84c3f3ae21fb2162aab68bdedf3fbdcb4..ef16fad2a35ec9055e918849e69a1a0e23b62838
 100644
--- a/libatomic/config/arm/host-config.h
+++ b/libatomic/config/arm/host-config.h
@@ -1,4 +1,23 @@
 /* Avoiding the DMB (or kernel helper) can be a good thing.  */
 #define WANT_SPECIALCASE_RELAXED
 
+/* Glibc, at least, uses acq_rel in its pthread mutex
+   implementation.  If the user is asking for seq_cst,
+   this is insufficient.  */
+
+static inline void __attribute__((always_inline, artificial))
+pre_seq_barrier(int model)
+{
+  if (model == __ATOMIC_SEQ_CST)
+    __atomic_thread_fence (__ATOMIC_SEQ_CST);
+}
+
+static inline void __attribute__((always_inline, artificial))
+post_seq_barrier(int model)
+{
+  pre_seq_barrier(model);
+}
+
+#define pre_post_seq_barrier 1
+
 #include_next 









Re: [PATCH 1/3] Disable generating store vector pair.

2022-06-08 Thread Peter Bergner via Gcc-patches
On 6/7/22 10:16 PM, Michael Meissner wrote:
> Otherwise it is like the mess with -mpower8-fusion, where going from power8 to
> power9 we have to clear the fusion flag.  If store vector pair is a positive
> flag, then it isn't set in power10 flags, but it might be set in next cpu
> flags.  But if it is a negative flag, we have to explicitly clear it.

Ok, I completely forgot about this specific issue and its negative effect on
inlining, so I agree it's a bad idea.  Request withdrawn. :-) 

Peter




Re: [PATCH v2 01/11] OpenMP 5.0: Clause ordering for OpenMP 5.0 (topological sorting by base pointer)

2022-06-08 Thread Julian Brown
Hi Jakub,

Thanks for review!

On Tue, 24 May 2022 15:03:07 +0200
Jakub Jelinek via Fortran  wrote:

> On Fri, Mar 18, 2022 at 09:24:51AM -0700, Julian Brown wrote:
> > 2021-11-23  Julian Brown  
> > 
> > gcc/
> > * gimplify.c (is_or_contains_p,
> > omp_target_reorder_clauses): Delete functions.
> > (omp_tsort_mark): Add enum.
> > (omp_mapping_group): Add struct.
> > (debug_mapping_group, omp_get_base_pointer,
> > omp_get_attachment, omp_group_last, omp_gather_mapping_groups,
> > omp_group_base, omp_index_mapping_groups, omp_containing_struct,
> > omp_tsort_mapping_groups_1, omp_tsort_mapping_groups,
> > omp_segregate_mapping_groups, omp_reorder_mapping_groups):
> > New functions.
> > (gimplify_scan_omp_clauses): Call above functions instead of
> > omp_target_reorder_clauses, unless we've seen an error.
> > * omp-low.c (scan_sharing_clauses): Avoid strict test if we
> > haven't sorted mapping groups.
> > 
> > gcc/testsuite/
> > * g++.dg/gomp/target-lambda-1.C: Adjust expected output.
> > * g++.dg/gomp/target-this-3.C: Likewise.
> > * g++.dg/gomp/target-this-4.C: Likewise.
> > +  
> 
> Wouldn't hurt to add a comment on the meanings of the enumerators.

Added.

> > +enum omp_tsort_mark {
> > +  UNVISITED,
> > +  TEMPORARY,
> > +  PERMANENT
> > +};
> > +
> > +struct omp_mapping_group {
> > +  tree *grp_start;
> > +  tree grp_end;
> > +  omp_tsort_mark mark;
> > +  struct omp_mapping_group *sibling;
> > +  struct omp_mapping_group *next;
> > +};
> > +
> > +__attribute__((used)) static void  
> 
> I'd use what is used elsewhere,
> DEBUG_FUNCTION void
> without static.

Fixed.

> > +static tree
> > +omp_get_base_pointer (tree expr)

> I must say I don't see advantages of just a single loop that
> looks through all ARRAY_REFs and all COMPONENT_REFs and then just
> checks if the expr it got in the end is a decl or INDIRECT_REF
> or MEM_REF with offset 0.
> 
> > +static tree
> > +omp_containing_struct (tree expr)
> Again?

I've simplified these loops, and removed the "needs improvement"
comment.

> > @@ -9063,11 +9820,29 @@ gimplify_scan_omp_clauses (tree *list_p,
> > gimple_seq *pre_p, break;
> >}
> >  
> > -  if (code == OMP_TARGET
> > -  || code == OMP_TARGET_DATA
> > -  || code == OMP_TARGET_ENTER_DATA
> > -  || code == OMP_TARGET_EXIT_DATA)
> > -omp_target_reorder_clauses (list_p);
> > +  /* Topological sorting may fail if we have duplicate nodes, which
> > + we should have detected and shown an error for already.  Skip
> > + sorting in that case.  */
> > +  if (!seen_error ()
> > +  && (code == OMP_TARGET
> > + || code == OMP_TARGET_DATA
> > + || code == OMP_TARGET_ENTER_DATA
> > + || code == OMP_TARGET_EXIT_DATA))
> > +{
> > +  vec *groups;
> > +  groups = omp_gather_mapping_groups (list_p);
> > +  if (groups)
> > +   {
> > + hash_map *grpmap;
> > + grpmap = omp_index_mapping_groups (groups);
> > + omp_mapping_group *outlist
> > +   = omp_tsort_mapping_groups (groups, grpmap);
> > + outlist = omp_segregate_mapping_groups (outlist);
> > + list_p = omp_reorder_mapping_groups (groups, outlist,
> > list_p);
> > + delete grpmap;
> > + delete groups;
> > +   }
> > +}  
> 
> I think big question is if we do want to do this map clause reordering
> before processing the  omp target etc. clauses, or after (during
> gimplify_adjust_omp_clauses, when clauses from the implicit mappings
> are added too and especially with the declare mapper expansions),
> or both before and after.

The existing code constrains us a bit here, unless we want to
completely rewrite it!

We can only do sorting on clauses before gimplification, otherwise the
"structural" matching of the parsed syntax of base pointers inside other
clauses on the directive, etc. will certainly fail.

(Semi-relatedly, I asked this on the omp-lang mailing list:

  "When we have mappings that represent base pointers, and other
  mappings that use those base pointers, the former must be ordered to
  take place before the latter -- but should we determine that relation
  purely syntactically? How about if we write e.g. "p->" on one vs.
  "(*p)." on the other?"

but no reply...)
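
For example (my own illustration of the ambiguity, not from the spec
discussion):

  #pragma omp target enter data map(to: p) map(to: p->x)
  #pragma omp target enter data map(to: p) map(to: (*p).x)

Both directives name the same base pointer p, but a purely syntactic matcher
treats "p->" and "(*p)." as unrelated expressions.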

So, this is fine for sorting explicit mapping clauses. When planning
the approach I've used for "declare mapper" support, I wrote this (in
an internal email):

"At the moment, gimplifying OMP workshare regions proceeds in three
phases:

 1. Clauses are processed (gimplify_scan_omp_clauses), creating
records of mapped variables in a splay tree, with associated flags.

 2. The body of the workshare region is processed (gimplified),
augmenting the same splay tree with information about variables
which are used implicitly (and maybe also modifying the "explicit"
mappings from the first step).

 3. The clauses are modified based on the results of the second stage
(gimplify_adjust_omp_clauses). E.g. clauses are removed that refer

c++: Reimplement static init/fini generation

2022-06-08 Thread Nathan Sidwell

Currently we generate static init/fini code by generating a set of
functions taking an 'initp' bool and an unsigned priority.  (There can
be more than one, as we repeat the end-of-compile loop.)  We then
generate a set of real init or fini functions for each needed
priority, calling the previous set of functions.  This is of course
very tangled, but excitingly the value-range-propagator is clever
enough to untangle it.  However, the current arrangement makes
generation awkward, particularly as to how to optimize the
module-global-init generation.

This reimplements the generation to generate a set of separate
init/fini functions for each needed priority, and then call them from
the real inits previously mentioned.  This replaces a splay tree,
recording which priority/init combos we needed, with a pair of hash
tables, mapping priority to init functions.  Much simpler. (For some 
reason simple_hash_map_traits<...> caused a bootstrap problem with GTY 
PCH generation.)
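
Roughly, the new scheme emits code of this shape (my sketch; the names are
invented and the priority is just an example):

  /* One partial init function per priority that has any work.  */
  static void __partial_init_65535 () { /* construct each such var */ }

  /* The real constructor, referenced from .init_array, just calls them.  */
  static void _GLOBAL__sub_I_file () { __partial_init_65535 (); }

whereas previously a single function took an (initp, priority) pair and
dispatched internally.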


While there, rename several of the functions as they are only dealing
with part of the init/fini generation, not the whole set.

nathan

--
Nathan Sidwell
From 90a6c3b6d69765ea9269ba7ae16ef02d5527e875 Mon Sep 17 00:00:00 2001
From: Nathan Sidwell 
Date: Tue, 31 May 2022 10:42:35 -0700
Subject: [PATCH] c++: Reimplement static init/fini generation

Currently we generate static init/fini code by generating a set of
functions taking an 'initp' bool and an unsigned priority.  (There can
be more than one, as we repeat the end-of-compile loop.)  We then
generate a set of real init or fini functions for each needed
priority, calling the previous set of functions.  This is of course
very tangled, but excitingly the value-range-propagator is clever
enough to untangle it.  However, the current arrangement makes
generation awkward, particularly as to how to optimize the
module-global-init generation.

This reimplements the generation to generate a set of separate
init/fini functions for each needed priority, and then call them from
the real inits previously mentioned.  This replaces a splay tree,
recording which priority/init combos we needed, with a pair of hash
tables, mapping priority to init functions.  Much simpler.

While there, rename several of the functions as they are only dealing
with part of the init/fini generation, not the whole set.

	gcc/cp/
	* decl2.cc (struct priority_info_s, priority_info): Delete.
	(priority_map_traits, priority_map_t): New.
	(static_init_fini_fns): New.
	(INITIALIZE_P_IDENTIFIER, PRIORITY_IDENTIFIER): Delete.
	(initialize_p_decl, priority_decl): Delete.
	(ssdf_decls, priority_info_map): Delete.
	(start_static_storage_duration_function): Rename to ...
	(start_partial_init_fini_fn): ... here. Create a void arg fn.
	Add it to the slot in the appropriate static_init_fini_fns
	hash table.
	(finish_static_storage_duration_function): Rename to ...
	(finish_partial_init_fini_fn): ... here.
	(get_priority_info): Delete.
	(one_static_initialization_or_destruction): Assert not
	trivial dtor.
	(do_static_initialization_or_destruction): Rename to ...
	(emit_partial_init_fini_fn) ... here.  Start & finish the fn.
	Simply init/fini each var.
	(partition_vars_for_init_fini): Partition vars according to
	priority and add to init and/or fini list.
	(generate_ctor_or_dtor_function): Start and finish the function.
	Do sanitizer calls here.
	(generate_ctor_and_dtor_functions_for_priority): Delete.
	(c_parse_final_cleanups): Reimplement global init/fini
	processing.

	gcc/testsuite/
	* g++.dg/init/static-cdtor1.C: New.
---
 gcc/cp/decl2.cc   | 482 +-
 gcc/testsuite/g++.dg/init/static-cdtor1.C |  17 +
 2 files changed, 202 insertions(+), 297 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/init/static-cdtor1.C

diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc
index 974afe798b6..bfb6a32e3b6 100644
--- a/gcc/cp/decl2.cc
+++ b/gcc/cp/decl2.cc
@@ -55,27 +55,14 @@ int raw_dump_id;
  
 extern cpp_reader *parse_in;
 
-/* This structure contains information about the initializations
-   and/or destructions required for a particular priority level.  */
-typedef struct priority_info_s {
-  /* Nonzero if there have been any initializations at this priority
- throughout the translation unit.  */
-  int initializations_p;
-  /* Nonzero if there have been any destructions at this priority
- throughout the translation unit.  */
-  int destructions_p;
-} *priority_info;
-
 static tree start_objects (bool, unsigned);
 static tree finish_objects (bool, unsigned, tree);
-static tree start_static_storage_duration_function (unsigned);
-static void finish_static_storage_duration_function (tree);
-static priority_info get_priority_info (int);
-static void do_static_initialization_or_destruction (bool, tree);
+static tree start_partial_init_fini_fn (bool, unsigned, unsigned);
+static void finish_partial_init_fini_fn (tree);
+static void emit_partial_init_fini_fn (bool, unsigned, tree,
+   unsigned

[COMMITTED] gcc: xtensa: fix PR target/105879

2022-06-08 Thread Max Filippov via Gcc-patches
split_double operates with the 'word that comes first in memory in the
target' terminology, while gen_lowpart operates with the 'value
representing some low-order bits of X' terminology. They are not
equivalent and must be dealt with differently on little- and big-endian
targets.
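
Concretely (my illustration), for the DImode constant 0x0011223344556677:

  split_double:  first  = the word at the lower address,
                 second = the word at the higher address
    big-endian:    first = 0x00112233 (high), second = 0x44556677 (low)
    little-endian: first = 0x44556677 (low),  second = 0x00112233 (high)

gen_lowpart (SImode, ...) always names the 0x44556677 half regardless of
endianness, so pairing 'first' with the lowpart is only correct on
little-endian targets.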

gcc/
PR target/105879
* config/xtensa/xtensa.md (movdi): Rename 'first' and 'second'
to 'lowpart' and 'highpart' so that they match 'gen_lowpart' and
'gen_highpart' bitwise semantics and fix order of highpart and
lowpart depending on target endianness.
---
 gcc/config/xtensa/xtensa.md | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 6f5cbc541d85..8a119038ba15 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -799,11 +799,14 @@
 because of offering further optimization opportunities.  */
   if (register_operand (operands[0], DImode))
{
- rtx first, second;
-
- split_double (operands[1], &first, &second);
- emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), first));
- emit_insn (gen_movsi (gen_highpart (SImode, operands[0]), second));
+ rtx lowpart, highpart;
+
+ if (TARGET_BIG_ENDIAN)
+   split_double (operands[1], &highpart, &lowpart);
+ else
+   split_double (operands[1], &lowpart, &highpart);
+ emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), lowpart));
+ emit_insn (gen_movsi (gen_highpart (SImode, operands[0]), highpart));
  DONE;
}
 
-- 
2.30.2



[PATCH] c++: optimize specialization of nested class templates

2022-06-08 Thread Patrick Palka via Gcc-patches
When substituting a class template specialization, tsubst_aggr_type
substitutes the TYPE_CONTEXT before passing it to lookup_template_class.
This appears to be unnecessary, however, because the initial value
of lookup_template_class's context parameter is unused outside of the
IDENTIFIER_NODE case, and l_t_c performs its own substitution of the
context, anyway.  So this patch removes the redundant substitution in
tsubst_aggr_type.  Doing so causes us to ICE on template/nested5.C
because during lookup_template_class for A::C::D with T=E and S=S,
we substitute and complete the context A::C with T=E, which in turn
registers the desired dependent specialization of D for us and we end up
trying to register it again.  This patch fixes this by checking the
specializations table again after completion of the context.

This patch also implements a couple of other optimizations:

  * In lookup_template_class, if the context of the partially
instantiated template is already non-dependent, then we could
reuse that instead of substituting the context of the most
general template.
  * When substituting the TYPE_DECL for an injected-class-name
in tsubst_decl, we can avoid substituting its TREE_TYPE and
DECL_TI_ARGS.

Together these optimizations improve memory usage for the range-v3
testcase test/view/split.cc by about 5%.  The improvement is probably
more significant when dealing with deeply nested class templates.
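
The savings come from code along these lines (my illustration, not from the
range-v3 testcase):

  template <class T> struct A {
    template <class U> struct B {
      template <class V> struct C { };
    };
  };
  A<int>::B<char>::C<long> x;  // each enclosing context no longer
                               // re-substituted at every level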

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk?

gcc/cp/ChangeLog:

* pt.cc (lookup_template_class): Remove dead stores to
context parameter.  Don't substitute the context of the
most general template if that of the partially instantiated
template is non-dependent.  Check the specializations table
again after completing the context of a nested dependent
specialization.
(tsubst_aggr_type) : Don't substitute
TYPE_CONTEXT or pass it to lookup_template_class.
(tsubst_decl) : Avoid substituting the
TREE_TYPE and DECL_TI_ARGS when DECL_SELF_REFERENCE_P.
---
 gcc/cp/pt.cc | 69 +++-
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 59b94317e88..28023d60684 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -9840,8 +9840,6 @@ lookup_template_class (tree d1, tree arglist, tree 
in_decl, tree context,
  if (context)
pop_decl_namespace ();
}
-  if (templ)
-   context = DECL_CONTEXT (templ);
 }
   else if (TREE_CODE (d1) == TYPE_DECL && MAYBE_CLASS_TYPE_P (TREE_TYPE (d1)))
 {
@@ -9868,7 +9866,6 @@ lookup_template_class (tree d1, tree arglist, tree 
in_decl, tree context,
 {
   templ = d1;
   d1 = DECL_NAME (templ);
-  context = DECL_CONTEXT (templ);
 }
   else if (DECL_TEMPLATE_TEMPLATE_PARM_P (d1))
 {
@@ -10059,8 +10056,25 @@ lookup_template_class (tree d1, tree arglist, tree 
in_decl, tree context,
   context = DECL_CONTEXT (gen_tmpl);
   if (context && TYPE_P (context))
{
- context = tsubst_aggr_type (context, arglist, complain, in_decl, 
true);
- context = complete_type (context);
+ if (!uses_template_parms (DECL_CONTEXT (templ)))
+   /* If the context of the partially instantiated template is
+  already non-dependent, then we might as well use it.  */
+   context = DECL_CONTEXT (templ);
+ else
+   {
+ context = tsubst_aggr_type (context, arglist, complain, in_decl, 
true);
+ context = complete_type (context);
+ if (is_dependent_type && arg_depth > 1)
+   {
+ /* If this is a dependent nested specialization such as
+A::B, then completion of A might have
+registered this specialization of B for us, so check
+the table again (33959).  */
+ entry = type_specializations->find_with_hash (&elt, hash);
+ if (entry)
+   return entry->spec;
+   }
+   }
}
   else
context = tsubst (context, arglist, complain, in_decl);
@@ -13711,25 +13725,12 @@ tsubst_aggr_type (tree t,
   if (TYPE_TEMPLATE_INFO (t) && uses_template_parms (t))
{
  tree argvec;
- tree context;
  tree r;
 
  /* In "sizeof(X)" we need to evaluate "I".  */
  cp_evaluated ev;
 
- /* First, determine the context for the type we are looking
-up.  */
- context = TYPE_CONTEXT (t);
- if (context && TYPE_P (context))
-   {
- context = tsubst_aggr_type (context, args, complain,
- in_decl, /*entering_scope=*/1);
- /* If context is a nested class inside a class template,
-it may still need to be instantiated (c++/33959).

[pushed] c++: redeclared hidden friend take 2 [PR105852]

2022-06-08 Thread Jason Merrill via Gcc-patches
My previous patch for 105761 avoided copying DECL_TEMPLATE_INFO from a
friend to a later definition, but in this testcase we have first a
non-friend declaration and then a definition, and we need to avoid copying
in that case as well.  But we do still want to set new_template_info to
avoid GC trouble.
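
The shape of the problem is roughly (my reduction, not necessarily the actual
friend74.C testcase):

  void foo ();                   // plain declaration seen first
  template <class T> struct A {
    friend void foo ();          // non-templated friend
  };
  A<int> a;
  void foo () {}                 // definition: must not pick up the
                                 // friend's template info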

With this change, the modules dump correctly identifies ::foo as a
non-template function in tpl-friend-2_a.C.

Along the way I noticed that the duplicate_decls handling of
DECL_UNIQUE_FRIEND_P was backwards for templates, where we don't clobber
DECL_LANG_SPECIFIC (olddecl) with DECL_LANG_SPECIFIC (newdecl) like we do
for non-templates.

Tested x86_64-pc-linux-gnu, applying to trunk.

PR c++/105852
PR c++/105761

gcc/cp/ChangeLog:

* decl.cc (duplicate_decls): Avoid copying template info
from non-templated friend even if newdecl isn't a definition.
Correct handling of DECL_UNIQUE_FRIEND_P on templates.
* pt.cc (non_templated_friend_p): New.
* cp-tree.h (non_templated_friend_p): Declare it.

gcc/testsuite/ChangeLog:

* g++.dg/modules/tpl-friend-2_a.C: Adjust expected dump.
* g++.dg/template/friend74.C: New test.
---
 gcc/cp/cp-tree.h  |  1 +
 gcc/cp/decl.cc| 16 --
 gcc/cp/pt.cc  | 29 ++-
 gcc/testsuite/g++.dg/modules/tpl-friend-2_a.C |  2 +-
 gcc/testsuite/g++.dg/template/friend74.C  |  8 +
 5 files changed, 44 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/template/friend74.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index cc13809f38a..3d8a08b8dd7 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -7395,6 +7395,7 @@ extern bool push_tinst_level_loc(tree, 
location_t);
 extern bool push_tinst_level_loc(tree, tree, location_t);
 extern void pop_tinst_level (void);
 extern struct tinst_level *outermost_tinst_level(void);
+extern bool non_templated_friend_p (tree);
 extern void init_template_processing   (void);
 extern void print_template_statistics  (void);
 bool template_template_parameter_p (const_tree);
diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index 7add82aa39e..3e869954ccb 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -2294,8 +2294,8 @@ duplicate_decls (tree newdecl, tree olddecl, bool hiding, 
bool was_hidden)
  merge_default_template_args (new_parms, old_parms,
   /*class_p=*/false);
}
- if (!DECL_UNIQUE_FRIEND_P (old_result))
-   DECL_UNIQUE_FRIEND_P (new_result) = false;
+ if (!DECL_UNIQUE_FRIEND_P (new_result))
+   DECL_UNIQUE_FRIEND_P (old_result) = false;
 
  check_default_args (newdecl);
 
@@ -2654,13 +2654,7 @@ duplicate_decls (tree newdecl, tree olddecl, bool 
hiding, bool was_hidden)
   if (LANG_DECL_HAS_MIN (newdecl))
{
  DECL_ACCESS (newdecl) = DECL_ACCESS (olddecl);
- if (new_defines_function
- && DECL_TEMPLATE_INFO (olddecl)
- && DECL_UNIQUE_FRIEND_P (DECL_TEMPLATE_RESULT
-  (DECL_TI_TEMPLATE (olddecl
-   /* Don't copy template info from a non-template friend declaration
-  in a class template (PR105761).  */;
- else if (DECL_TEMPLATE_INFO (newdecl))
+ if (DECL_TEMPLATE_INFO (newdecl))
{
  new_template_info = DECL_TEMPLATE_INFO (newdecl);
  if (DECL_TEMPLATE_INSTANTIATION (olddecl)
@@ -2668,8 +2662,10 @@ duplicate_decls (tree newdecl, tree olddecl, bool 
hiding, bool was_hidden)
/* Remember the presence of explicit specialization args.  */
TINFO_USED_TEMPLATE_ID (DECL_TEMPLATE_INFO (olddecl))
  = TINFO_USED_TEMPLATE_ID (new_template_info);
- DECL_TEMPLATE_INFO (newdecl) = DECL_TEMPLATE_INFO (olddecl);
}
+
+ if (non_templated_friend_p (olddecl))
+   /* Don't copy tinfo from a non-templated friend (PR105761).  */;
  else
DECL_TEMPLATE_INFO (newdecl) = DECL_TEMPLATE_INFO (olddecl);
}
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index dcacba56a1c..9c1b026857e 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -11165,6 +11165,33 @@ outermost_tinst_level (void)
   return level;
 }
 
+/* True iff T is a friend function declaration that is not itself a template
+   and is not defined in a class template.  */
+
+bool
+non_templated_friend_p (tree t)
+{
+  if (t && TREE_CODE (t) == FUNCTION_DECL
+  && DECL_UNIQUE_FRIEND_P (t))
+{
+  tree ti = DECL_TEMPLATE_INFO (t);
+  if (!ti)
+   return true;
+  /* DECL_FRIEND_CONTEXT is set for a friend defined in class.  */
+  if (DECL_FRIEND_CONTEXT (t))
+   return false;
+  /* Non-templated friends in a class template are still 

[pushed] c++: non-templated friends [PR105852]

2022-06-08 Thread Jason Merrill via Gcc-patches
The previous patch for 105852 avoids copying DECL_TEMPLATE_INFO from a
non-templated friend, but it really shouldn't have it in the first place.

Tested x86_64-pc-linux-gnu, applying to trunk.

PR c++/105852

gcc/cp/ChangeLog:

* decl.cc (duplicate_decls): Change non-templated friend
check to an assert.
* pt.cc (tsubst_function_decl): Don't set DECL_TEMPLATE_INFO
on non-templated friends.
(tsubst_friend_function): Adjust.
---
 gcc/cp/decl.cc |  9 +
 gcc/cp/pt.cc   | 14 ++
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index 3e869954ccb..7f3b3c3c588 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -2664,10 +2664,11 @@ duplicate_decls (tree newdecl, tree olddecl, bool 
hiding, bool was_hidden)
  = TINFO_USED_TEMPLATE_ID (new_template_info);
}
 
- if (non_templated_friend_p (olddecl))
-   /* Don't copy tinfo from a non-templated friend (PR105761).  */;
- else
-   DECL_TEMPLATE_INFO (newdecl) = DECL_TEMPLATE_INFO (olddecl);
+ /* We don't want to copy template info from a non-templated friend
+(PR105761), but these shouldn't have DECL_TEMPLATE_INFO now.  */
+ gcc_checking_assert (!DECL_TEMPLATE_INFO (olddecl)
+  || !non_templated_friend_p (olddecl));
+ DECL_TEMPLATE_INFO (newdecl) = DECL_TEMPLATE_INFO (olddecl);
}
 
   if (DECL_DECLARES_FUNCTION_P (newdecl))
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 9c1b026857e..3154186ac20 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -11298,9 +11298,10 @@ tsubst_friend_function (tree decl, tree args)
   tree new_friend_template_info = DECL_TEMPLATE_INFO (new_friend);
   tree new_friend_result_template_info = NULL_TREE;
   bool new_friend_is_defn =
-   (DECL_INITIAL (DECL_TEMPLATE_RESULT
-  (template_for_substitution (new_friend)))
-!= NULL_TREE);
+   (new_friend_template_info
+&& (DECL_INITIAL (DECL_TEMPLATE_RESULT
+  (template_for_substitution (new_friend)))
+!= NULL_TREE));
   tree not_tmpl = new_friend;
 
   if (TREE_CODE (new_friend) == TEMPLATE_DECL)
@@ -14084,6 +14085,10 @@ tsubst_function_decl (tree t, tree args, 
tsubst_flags_t complain,
  && !LAMBDA_FUNCTION_P (t))
return t;
 
+  /* A non-templated friend doesn't get DECL_TEMPLATE_INFO.  */
+  if (non_templated_friend_p (t))
+   goto friend_case;
+
   /* Calculate the most general template of which R is a
 specialization.  */
   gen_tmpl = most_general_template (DECL_TI_TEMPLATE (t));
@@ -14129,6 +14134,7 @@ tsubst_function_decl (tree t, tree args, tsubst_flags_t 
complain,
 tsubst_friend_function, and we want only to create a
 new decl (R) with appropriate types so that we can call
 determine_specialization.  */
+friend_case:
   gen_tmpl = NULL_TREE;
   argvec = NULL_TREE;
 }
@@ -14324,7 +14330,7 @@ tsubst_function_decl (tree t, tree args, tsubst_flags_t 
complain,
   /* If this is an instantiation of a member template, clone it.
 If it isn't, that'll be handled by
 clone_constructors_and_destructors.  */
-  if (PRIMARY_TEMPLATE_P (gen_tmpl))
+  if (gen_tmpl && PRIMARY_TEMPLATE_P (gen_tmpl))
clone_cdtor (r, /*update_methods=*/false);
 }
   else if ((complain & tf_error) != 0

base-commit: 7d87790a871482e9c5142a8e885b4a5f76d197c8
-- 
2.27.0



Re: [PATCH] RISC-V: Compute default ABI from -mcpu or -march

2022-06-08 Thread Andrew Pinski via Gcc-patches
On Mon, Jun 6, 2022 at 7:53 PM wangpc via Gcc-patches
 wrote:
>
> If -mcpu or -march is specified and there is no -mabi, we will calculate
> default ABI from arch string provided by -march or defined in CPU info.

This is 100% wrong and goes against what all other targets do. All
other targets have the following:
a base ABI is selected and then added to via -march/-mcpu, and only
for the vector ABI (the PowerPC backend used to not even do that and
required you to supply an option for the vector ABI, but that might
have changed in recent years).
Basically the ABI should default to something (via either when
configured with --with-abi= or just a default) and reject invalid
combinations. That is if the ABI specified requires FPU and the
arch/cpu does not have FPU reject it (same with 32bit vs 64bit). This
is how other targets work in GCC and really it should be the same for
RISC-V. Why make it any different and make it harder for people to
convert from one target to another.

Also I think the whole number of extensions of RISC-V makes life much
harder; it would be easier if there were levels of support like x86_64
and ARMv8/9 have moved to.
It is past time to require these kinds of levels really; otherwise you
end up with so much fragmentation that RISC-V becomes the way of MIPS.

Thanks,
Andrew Pinski


>
> gcc/ChangeLog:
>
> * common/config/riscv/riscv-common.cc (compute_default_abi): 
> Implementation
> to calculate -mabi from arch string.
> (riscv_expand_abi_from_arch): New spec function to calcalute -mabi 
> from arch
> string provided by -march option.
> (riscv_expand_abi_from_cpu): New spec function to find CPU info and 
> calculate
> -mabi from arch string defined in CPU info.
> * config/riscv/riscv.h (EXTRA_SPEC_FUNCTIONS): Add above spec 
> functions.
> (OPTION_DEFAULT_SPECS): Use new spec functions to calculate -mabi and 
> -march
> has higher priority than -mcpu.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/mabi-1.c: ilp32e test.
> * gcc.target/riscv/mabi-2.c: ilp32 test.
> * gcc.target/riscv/mabi-3.c: ilp32f test.
> * gcc.target/riscv/mabi-4.c: ilp32d test.
> * gcc.target/riscv/mabi-5.c: lp64 test.
> * gcc.target/riscv/mabi-6.c: lp64f test.
> * gcc.target/riscv/mabi-7.c: lp64d test.
> * gcc.target/riscv/mabi-8.c: -march override -mcpu.
> ---
>  gcc/common/config/riscv/riscv-common.cc | 66 +
>  gcc/config/riscv/riscv.h| 15 --
>  gcc/testsuite/gcc.target/riscv/mabi-1.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-2.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-3.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-4.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-5.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-6.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-7.c |  7 +++
>  gcc/testsuite/gcc.target/riscv/mabi-8.c |  7 +++
>  10 files changed, 134 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-2.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-3.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-4.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-5.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-6.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-7.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/mabi-8.c
>
> diff --git a/gcc/common/config/riscv/riscv-common.cc 
> b/gcc/common/config/riscv/riscv-common.cc
> index 0e5be2ce105..f8e40549d18 100644
> --- a/gcc/common/config/riscv/riscv-common.cc
> +++ b/gcc/common/config/riscv/riscv-common.cc
> @@ -1266,6 +1266,72 @@ riscv_default_mtune (int argc, const char **argv)
>  return default_mtune;
>  }
>
> +/* Compute default -mabi option from arch string.  */
> +
> +static const char *
> +compute_default_abi (const char* arch_str)
> +{
> +  location_t loc = UNKNOWN_LOCATION;
> +
> +  riscv_parse_arch_string (arch_str, NULL, loc);
> +
> +  if (current_subset_list->xlen () == 64)
> +{
> +  if (current_subset_list->lookup ("d", RISCV_DONT_CARE_VERSION,
> +RISCV_DONT_CARE_VERSION))
> +   return "lp64d";
> +  if (current_subset_list->lookup ("f", RISCV_DONT_CARE_VERSION,
> +RISCV_DONT_CARE_VERSION))
> +   return "lp64f";
> +  return "lp64";
> +}
> +  else
> +{
> +  if (current_subset_list->lookup ("e", RISCV_DONT_CARE_VERSION,
> +RISCV_DONT_CARE_VERSION))
> +   return "ilp32e";
> +  if (current_subset_list->lookup ("d", RISCV_DONT_CARE_VERSION,
> +RISCV_DONT_CARE_VERSION))
> +   return "ilp32d";
> +  if (current_subset_list->lookup ("f", RISCV_DONT_CARE_VERSION,
> +R

Re: [PATCH v4, rs6000] Implemented f[min/max]_optab by xs[min/max]dp [PR103605]

2022-06-08 Thread HAO CHEN GUI via Gcc-patches
Hi,

On 8/6/2022 下午 9:24, Segher Boessenkool wrote:
> But it regresses the code quality generated with -ffast-math (because
> the new unspecs arent't optimised like standard rtl is).  This can be
> follow-up work of course -- and the best direction is to make fmin/fmax
> generic, even!  :-)

fmin/fmax are folded to MIN/MAX_EXPR when -ffast-math is set, so the
behavior doesn't change in that case.
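
For example (my illustration):

  double f (double a, double b) { return __builtin_fmin (a, b); }

built with -ffast-math is folded to MIN_EXPR early in gimple, so expansion
never reaches the new unspec-based pattern on that path.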



[PATCH 1/2]middle-end Support optimized division by pow2 bitmask

2022-06-08 Thread Tamar Christina via Gcc-patches
Hi All,

In plenty of image and video processing code it's common to modify pixel values
by a widening operation and then scale them back into range by dividing by 255.

This patch adds an optab to allow us to emit an optimized sequence when doing
an unsigned division that is equivalent to:

   x = y / (2 ^ (bitsize (y) / 2) - 1)

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* internal-fn.def (DIV_POW2_BITMASK): New.
* optabs.def (udiv_pow2_bitmask_optab): New.
* doc/md.texi: Document it.
* tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize pattern.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-div-bitmask-1.c: New test.
* gcc.dg/vect/vect-div-bitmask-2.c: New test.
* gcc.dg/vect/vect-div-bitmask-3.c: New test.
* gcc.dg/vect/vect-div-bitmask.h: New file.

--- inline copy of patch -- 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 
f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7b4f6e945866c38
 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5588,6 +5588,18 @@ signed op0, op1;
 op0 = op1 / (1 << imm);
 @end smallexample
 
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@item @samp{udiv_pow2_bitmask@var{m2}}
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@itemx @samp{udiv_pow2_bitmask@var{m2}}
+Unsigned vector division by an immediate that is equivalent to
+@samp{2^(bitsize(m) / 2) - 1}.
+@smallexample
+unsigned short op0, op1;
+@dots{}
+op0 = op1 / 0xffU;
+@end smallexample
+
 @cindex @code{vec_shl_insert_@var{m}} instruction pattern
 @item @samp{vec_shl_insert_@var{m}}
 Shift the elements in vector input operand 1 left one element (i.e.@:
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 
d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898f916ed45de475f
 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | 
ECF_NOTHROW,
   vec_shl_insert, binary)
 
 DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary)
+DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST | ECF_NOTHROW,
+  udiv_pow2_bitmask, unary)
 
 DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
 DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 
801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f4eed71b0494e17f
 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
 OPTAB_D (umulhs_optab, "umulhs$a3")
 OPTAB_D (umulhrs_optab, "umulhrs$a3")
 OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
+OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
 OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
 OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
 OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 
..a7ea3cce4764239c5d281a8f0bead1f6a452de3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include 
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 
..009e16e1b36497e5724410d9843f1ce122b26dda
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include 
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
inde

[PATCH 2/2]AArch64 aarch64: Add implementation for pow2 bitmask division.

2022-06-08 Thread Tamar Christina via Gcc-patches
Hi All,

This adds an implementation for the new optab for unsigned pow2 bitmask for
AArch64.

The implementation rewrites:

   x = y / (2 ^ (bitsize (y) / 2) - 1)

into e.g. (for bytes)

   (x + ((x + 257) >> 8)) >> 8

where it's required that the additions be done in double the precision of x
such that we don't lose any bits to overflow.

Essentially the sequence decomposes the division into doing two smaller
divisions, one for each of the top and bottom parts of the number, and adds
the results back together.

To account for the fact that shift by 8 would be division by 256 we add 1 to
both parts of x such that when x is 255 we still get 1 as the answer.

Because the amount we shift by is half the width of the original datatype, we
can use the halving instructions the ISA provides to do the operation instead
of using actual shifts.
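
As a sanity check of the arithmetic, the decomposition can be verified
exhaustively for all 16-bit inputs with a standalone program (my own check,
not part of the patch):

  #include <stdint.h>
  #include <assert.h>

  int
  main (void)
  {
    for (uint32_t x = 0; x <= 0xffff; x++)
      {
        /* (x + ((x + 257) >> 8)) >> 8 must equal x / 255 exactly.  */
        uint32_t approx = (x + ((x + 257) >> 8)) >> 8;
        assert (approx == x / 0xff);
      }
    return 0;
  }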

For AArch64 this means we generate for:

void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
{
  for (int i = 0; i < (n & -16); i+=1)
pixel[i] = (pixel[i] * level) / 0xff;
}

the following:

movi    v3.16b, 0x1
umull2  v1.8h, v0.16b, v2.16b
umull   v0.8h, v0.8b, v2.8b
addhn   v5.8b, v1.8h, v3.8h
addhn   v4.8b, v0.8h, v3.8h
uaddw   v1.8h, v1.8h, v5.8b
uaddw   v0.8h, v0.8h, v4.8b
uzp2    v0.16b, v0.16b, v1.16b

instead of:

umull   v2.8h, v1.8b, v5.8b
umull2  v1.8h, v1.16b, v5.16b
umull   v0.4s, v2.4h, v3.4h
umull2  v2.4s, v2.8h, v3.8h
umull   v4.4s, v1.4h, v3.4h
umull2  v1.4s, v1.8h, v3.8h
uzp2    v0.8h, v0.8h, v2.8h
uzp2    v1.8h, v4.8h, v1.8h
shrn    v0.8b, v0.8h, 7
shrn2   v0.16b, v1.8h, 7

Which results in significantly faster code.

Thanks to Wilco for the concept.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (udiv_pow2_bitmask<mode>2): New.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/div-by-bitmask.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
18733428f3fb91d937346aa360f6d1fe13ca1eae..6b0405924a03a243949a6741f4c0e989d9ca2869
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4845,6 +4845,57 @@ (define_expand "aarch64_hn2"
   }
 )
 
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;;  short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halving instruction in AArch64, addhn, which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result, i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8
+;; to do the shift.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "udiv_pow2_bitmask2"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
 
 (define_insn "aarch64_pmul"
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 0000000000000000000000000000000000000000..c03aee695ef834fbe3533a21d54a218160b0007d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+

[PATCH] or1k: Add support for a little-endian target variant

2022-06-08 Thread Samuel Holland
While not officially sanctioned by the architecture spec, little-endian
or1k processors do exist in the wild, for example the Allwinner AR100.
Let's add native support for this, instead of hacks like using objcopy
to byteswap ELF file contents.
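
As a quick sanity check of a toolchain configured this way, a minimal sketch
(not part of the patch) that prints which byte order the compiler actually
targeted, when built with the new -mbig-endian/-mlittle-endian options:

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  /* The byte stored first in memory reveals the target's byte order.  */
  uint32_t probe = 0x01020304;
  uint8_t first = *(uint8_t *) &probe;
  puts (first == 0x04 ? "little-endian" : "big-endian");
  return 0;
}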

gcc/ChangeLog:

* config.gcc: Set TARGET_LITTLE_ENDIAN_DEFAULT from target.
* config/or1k/elf.h (LINK_SPEC): Pass -EB/-EL to the linker.
* config/or1k/linux.h (LINK_SPEC): Pass -EB/-EL to the linker.
* config/or1k/or1k.h (ENDIAN_SPEC): Set default endianness.
(DRIVER_SELF_SPECS): Set default endianness.
(ASM_SPEC): Pass -EB/-EL to the assembler.
(LINK_SPEC): Pass -EB/-EL to the linker.
(BYTES_BIG_ENDIAN): Make dependent on TARGET_BIG_ENDIAN.
(WORDS_BIG_ENDIAN): Likewise.
* config/or1k/or1k.opt: Add -mbig-endian/-mlittle-endian.

libgcc/ChangeLog:

* config.host (or1k): Generalize arch to or1k*.

Signed-off-by: Samuel Holland 
---
 gcc/config.gcc           |  5 +++++
 gcc/config/or1k/elf.h    |  1 +
 gcc/config/or1k/linux.h  |  1 +
 gcc/config/or1k/or1k.h   | 19 +++++++++++++++++--
 gcc/config/or1k/or1k.opt |  8 ++++++++
 libgcc/config.host       |  4 ++--
 6 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index c5064dd37666..0c3a09dfe810 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -2866,6 +2866,11 @@ or1k*-*-*)
done
TM_MULTILIB_CONFIG=`echo $TM_MULTILIB_CONFIG | sed 's/^,//'`
 
+   case ${target} in
+   or1k*le*-*)
+   tm_defines="${tm_defines} TARGET_LITTLE_ENDIAN_DEFAULT=1"
+   ;;
+   esac
case ${target} in
or1k*-*-linux*)
tm_file="${tm_file} gnu-user.h linux.h glibc-stdint.h"
diff --git a/gcc/config/or1k/elf.h b/gcc/config/or1k/elf.h
index d3d27028aa37..44c0a0687c7f 100644
--- a/gcc/config/or1k/elf.h
+++ b/gcc/config/or1k/elf.h
@@ -27,6 +27,7 @@
 
 #undef LINK_SPEC
 #define LINK_SPEC "%{h*}   \
+   %{mbig-endian:-EB} %{mlittle-endian:-EL}\
%{static:-Bstatic}  \
%{shared:-shared}   \
%{symbolic:-Bsymbolic}  \
diff --git a/gcc/config/or1k/linux.h b/gcc/config/or1k/linux.h
index 80f77c722322..8fe8b0168b11 100644
--- a/gcc/config/or1k/linux.h
+++ b/gcc/config/or1k/linux.h
@@ -36,6 +36,7 @@
 
 #undef LINK_SPEC
 #define LINK_SPEC "%{h*}   \
+   %{mbig-endian:-EB} %{mlittle-endian:-EL}\
%{static:-Bstatic}  \
%{shared:-shared}   \
%{symbolic:-Bsymbolic}  \
diff --git a/gcc/config/or1k/or1k.h b/gcc/config/or1k/or1k.h
index 18a526d386f1..b5e4a2181336 100644
--- a/gcc/config/or1k/or1k.h
+++ b/gcc/config/or1k/or1k.h
@@ -23,6 +23,21 @@
 
 #include "config/or1k/or1k-opts.h"
 
+#ifndef TARGET_LITTLE_ENDIAN_DEFAULT
+#define ENDIAN_SPEC "-mbig-endian"
+#else
+#define ENDIAN_SPEC "-mlittle-endian"
+#endif
+
+/* Force the default endianness and ABI flags onto the command line
+   in order to make the other specs easier to write.  */
+#undef DRIVER_SELF_SPECS
+#define DRIVER_SELF_SPECS \
+  " %{!mbig-endian:%{!mlittle-endian:" ENDIAN_SPEC "}}"
+
+#define ASM_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL}"
+#define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL}"
+
 /* Names to predefine in the preprocessor for this target machine.  */
 #define TARGET_CPU_CPP_BUILTINS()  \
   do   \
@@ -48,8 +63,8 @@
 
 #define DEFAULT_SIGNED_CHAR 1
 #define BITS_BIG_ENDIAN 0
-#define BYTES_BIG_ENDIAN 1
-#define WORDS_BIG_ENDIAN 1
+#define BYTES_BIG_ENDIAN (TARGET_BIG_ENDIAN)
+#define WORDS_BIG_ENDIAN (TARGET_BIG_ENDIAN)
 #define BITS_PER_WORD 32
 #define UNITS_PER_WORD 4
 #define POINTER_SIZE 32
diff --git a/gcc/config/or1k/or1k.opt b/gcc/config/or1k/or1k.opt
index 8a66832a99b1..497f259faae9 100644
--- a/gcc/config/or1k/or1k.opt
+++ b/gcc/config/or1k/or1k.opt
@@ -24,6 +24,14 @@
 HeaderInclude
 config/or1k/or1k-opts.h
 
+mbig-endian
+Target Report RejectNegative Mask(BIG_ENDIAN)
+Use big-endian byte order.
+
+mlittle-endian
+Target Report RejectNegative InverseMask(BIG_ENDIAN, LITTLE_ENDIAN)
+Use little-endian byte order.
+
 mhard-div
 Target RejectNegative InverseMask(SOFT_DIV)
 Enable generation of hardware divide (l.div, l.divu) instructions.  This is the
diff --git a/libgcc/config.host b/libgcc/config.host
index 8c56fcae5d2f..45c8d1a47d9a 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -1144,12 +1144,12 @@ nios2-*-*)
tmake_file="$tmake_file nios2/t-nios2 t-softfp-sfdf t-softfp-excl 
t-softfp"
extra_parts="$extra_parts crti.o crtn.o"
;;
-or1k-*-linux*)
+or1k*-*-linux*)
tmake_file="$tmake_file or1k/t-or1k or1k/t-crtstuff"
tmake_file="$tmake_file t-softfp-sfdf t-softfp"
md_unwind_header=or1k/linux-unwind.h
;;
-or1k-*-*)
+or1k*-*-*)
tmake_file=

[PATCH, OpenMP, v4] Implement uses_allocators clause for target regions

2022-06-08 Thread Chung-Lin Tang

Hi Jakub,
this is v4 of the uses_allocators patch.

On 2022/5/31 6:02 PM, Jakub Jelinek wrote:

The response I got on omp-lang is that it is intentional that in the new
syntax only a single allocator is allowed.
So I'd suggest to implement:
1) if has_modifiers (i.e. certainly new syntax), only allow a single
enumerator / identifier for a variable and no ()s after it
2) if !has_modifiers and there is exactly one allocator without ()s,
treat it like new syntax
3) otherwise, it is the old (5.1) syntax, which allows a list and that
list can contain ()s for traits, but in the light of the 5.2 wording,
I'd even for that case avoid diagnosing missing traits for non-predefined
allocators
4) omp_null_allocator should be diagnosed as invalid,
private (omp_null_allocator) is rejected...


I've adjusted the checking to enforce these rules, and updated the testcases.
Re-tested without regressions.
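
To make the accepted forms concrete, here is a minimal sketch of the two
syntaxes as classified by the rules above (the allocator my_alloc and trait
array my_traits are made-up names, not from the patch or testcases):

#include <omp.h>

const omp_alloctrait_t my_traits[] = { { omp_atk_alignment, 64 } };

void foo (void)
{
  omp_allocator_handle_t my_alloc;

  /* 5.1 list syntax: multiple allocators allowed, traits in ()s after a
     non-predefined allocator.  */
  #pragma omp target uses_allocators (omp_default_mem_alloc, my_alloc (my_traits))
    { /* my_alloc is usable here, e.g. with omp_alloc.  */ }

  /* 5.2 modifier syntax: modifiers before the ':', only a single
     allocator, and no ()s after it.  */
  #pragma omp target uses_allocators (traits (my_traits) : my_alloc)
    { }
}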


5) for C++, we should handle FIELD_DECLs, but it shouldn't be hard, just
look how it is handled for private too


As discussed in the other mail, private() for FIELD_DECLs on target constructs
seems not to work properly; filed PR105861 for this.

Currently uses_allocators (which also uses private) is still a sorry() for
FIELD_DECLs in this v4 patch. Will file another issue to track this after the
patch is committed.

(ChangeLog should be the same as before, so omitted here)

Thanks,
Chung-Lin

diff --git a/gcc/builtin-types.def b/gcc/builtin-types.def
index 3a7cecdf087..be3e6ff697e 100644
--- a/gcc/builtin-types.def
+++ b/gcc/builtin-types.def
@@ -283,6 +283,7 @@ DEF_FUNCTION_TYPE_1 (BT_FN_DFLOAT32_DFLOAT32, BT_DFLOAT32, BT_DFLOAT32)
 DEF_FUNCTION_TYPE_1 (BT_FN_DFLOAT64_DFLOAT64, BT_DFLOAT64, BT_DFLOAT64)
 DEF_FUNCTION_TYPE_1 (BT_FN_DFLOAT128_DFLOAT128, BT_DFLOAT128, BT_DFLOAT128)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_VPTR, BT_VOID, BT_VOLATILE_PTR)
+DEF_FUNCTION_TYPE_1 (BT_FN_VOID_PTRMODE, BT_VOID, BT_PTRMODE)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_PTRPTR, BT_VOID, BT_PTR_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_CONST_PTR, BT_VOID, BT_CONST_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_UINT, BT_UINT, BT_UINT)
@@ -641,6 +642,8 @@ DEF_FUNCTION_TYPE_3 (BT_FN_PTR_SIZE_SIZE_PTRMODE,
 BT_PTR, BT_SIZE, BT_SIZE, BT_PTRMODE)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_UINT8_PTRMODE, BT_VOID, BT_PTR, BT_UINT8,
 BT_PTRMODE)
+DEF_FUNCTION_TYPE_3 (BT_FN_PTRMODE_PTRMODE_INT_PTR, BT_PTRMODE, BT_PTRMODE,
+BT_INT, BT_PTR)
 
 DEF_FUNCTION_TYPE_4 (BT_FN_SIZE_CONST_PTR_SIZE_SIZE_FILEPTR,
 BT_SIZE, BT_CONST_PTR, BT_SIZE, BT_SIZE, BT_FILEPTR)
diff --git a/gcc/c-family/c-omp.cc b/gcc/c-family/c-omp.cc
index 66d17a2673d..50db6936728 100644
--- a/gcc/c-family/c-omp.cc
+++ b/gcc/c-family/c-omp.cc
@@ -1873,6 +1873,7 @@ c_omp_split_clauses (location_t loc, enum tree_code code,
case OMP_CLAUSE_HAS_DEVICE_ADDR:
case OMP_CLAUSE_DEFAULTMAP:
case OMP_CLAUSE_DEPEND:
+   case OMP_CLAUSE_USES_ALLOCATORS:
  s = C_OMP_CLAUSE_SPLIT_TARGET;
  break;
case OMP_CLAUSE_NUM_TEAMS:
diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h
index 54864c2ec41..7f8944f81d6 100644
--- a/gcc/c-family/c-pragma.h
+++ b/gcc/c-family/c-pragma.h
@@ -154,6 +154,7 @@ enum pragma_omp_clause {
   PRAGMA_OMP_CLAUSE_UNTIED,
   PRAGMA_OMP_CLAUSE_USE_DEVICE_PTR,
   PRAGMA_OMP_CLAUSE_USE_DEVICE_ADDR,
+  PRAGMA_OMP_CLAUSE_USES_ALLOCATORS,
 
   /* Clauses for OpenACC.  */
   PRAGMA_OACC_CLAUSE_ASYNC,
diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
index 492d995a281..0fe5b7ac2e4 100644
--- a/gcc/c/c-parser.cc
+++ b/gcc/c/c-parser.cc
@@ -12922,6 +12922,8 @@ c_parser_omp_clause_name (c_parser *parser)
result = PRAGMA_OMP_CLAUSE_USE_DEVICE_ADDR;
  else if (!strcmp ("use_device_ptr", p))
result = PRAGMA_OMP_CLAUSE_USE_DEVICE_PTR;
+ else if (!strcmp ("uses_allocators", p))
+   result = PRAGMA_OMP_CLAUSE_USES_ALLOCATORS;
  break;
case 'v':
  if (!strcmp ("vector", p))
@@ -15651,6 +15653,213 @@ c_parser_omp_clause_allocate (c_parser *parser, tree list)
   return nl;
 }
 
+/* OpenMP 5.0:
+   uses_allocators ( allocator-list )
+
+   allocator-list:
+   allocator
+   allocator , allocator-list
+   allocator ( traits-array )
+   allocator ( traits-array ) , allocator-list
+
+   OpenMP 5.2:
+
+   uses_allocators ( modifier : allocator-list )
+   uses_allocators ( modifier , modifier : allocator-list )
+
+   modifier:
+   traits ( traits-array )
+   memspace ( mem-space-handle )  */
+
+static tree
+c_parser_omp_clause_uses_allocators (c_parser *parser, tree list)
+{
+  location_t clause_loc = c_parser_peek_token (parser)->location;
+  tree t = NULL_TREE, nl = list;
+  matching_parens parens;
+  if (!parens.require_open (parser))
+return list;
+
+  tree memspace_expr = NULL_TREE;
+  tree traits_var = NULL_TREE;
+
+  struct item_tok
+  {
+location_t loc;
+tree id;
+item_tok (void) : loc (UNKN