[gcc r14-9333] aarch64: Define out-of-class static constants

2024-03-06 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:c7a9883663a888617b6e3584233aa756b30519f8

commit r14-9333-gc7a9883663a888617b6e3584233aa756b30519f8
Author: Richard Sandiford 
Date:   Wed Mar 6 10:04:56 2024 +0000

aarch64: Define out-of-class static constants

While reworking the aarch64 feature descriptions, I forgot
to add out-of-class definitions of some static constants.
This could lead to a build failure with some compilers.

This was seen with some WIP to increase the number of extensions
beyond 64.  It's latent on trunk though, and a regression from
before the rework.
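
For context: before C++17, an in-class initializer only declares a
static constexpr data member, and any odr-use still requires a
namespace-scope definition.  A minimal C++ sketch of the rule, using
hypothetical names (not part of the commit):

  struct info {
    static constexpr int flag = 1;   // declaration with initializer
  };
  // Without this out-of-class definition, odr-using info::flag
  // (e.g. binding it to a const int &) can fail to link before C++17:
  const int info::flag;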

gcc/
* config/aarch64/aarch64-feature-deps.h (feature_deps::info): Add
out-of-class definitions of static constants.

Diff:
---
 gcc/config/aarch64/aarch64-feature-deps.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-feature-deps.h b/gcc/config/aarch64/aarch64-feature-deps.h
index a1b81f9070b..3641badb82f 100644
--- a/gcc/config/aarch64/aarch64-feature-deps.h
+++ b/gcc/config/aarch64/aarch64-feature-deps.h
@@ -71,6 +71,9 @@ template struct info;
 static constexpr auto enable = flag | get_enable REQUIRES; \
 static constexpr auto explicit_on = enable | get_enable EXPLICIT_ON; \
   };   \
+  const aarch64_feature_flags info::flag;  \
+  const aarch64_feature_flags info::enable;\
+  const aarch64_feature_flags info::explicit_on; \
   constexpr info IDENT ()  \
   {\
 return info ();\


[gcc r15-3015] testsuite: Reduce cut-&-paste in scanltranstree.exp

2024-08-19 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:71059d268f567b664e74101c8be7b58441f15d29

commit r15-3015-g71059d268f567b664e74101c8be7b58441f15d29
Author: Richard Sandiford 
Date:   Mon Aug 19 09:40:33 2024 +0100

testsuite: Reduce cut-&-paste in scanltranstree.exp

scanltranstree.exp defines some LTO wrappers around standard
non-LTO scanners.  Four of them are cut-&-paste variants of
one another, so this patch generates them from a single template.
It also does the same for scan-ltrans-tree-dump-times, so that
other *-times scanners can be added easily in future.

The scanners seem to be lightly used.  gcc.dg/ipa/ipa-icf-38.c uses
scan-ltrans-tree-dump{,-not} and libgomp.c/declare-variant-1.c
uses scan-ltrans-tree-dump-{not,times}.  Nothing currently seems
to use scan-ltrans-tree-dump-dem*.

gcc/testsuite/
* lib/scanltranstree.exp: Redefine the routines using two
templates.

Diff:
---
 gcc/testsuite/lib/scanltranstree.exp | 186 ---
 1 file changed, 62 insertions(+), 124 deletions(-)

diff --git a/gcc/testsuite/lib/scanltranstree.exp b/gcc/testsuite/lib/scanltranstree.exp
index 79f05f0ffed..bc6e02dc369 100644
--- a/gcc/testsuite/lib/scanltranstree.exp
+++ b/gcc/testsuite/lib/scanltranstree.exp
@@ -19,130 +19,68 @@
 
 load_lib scandump.exp
 
-# Utility for scanning compiler result, invoked via dg-final.
-# Call pass if pattern is present, otherwise fail.
-#
-# Argument 0 is the regexp to match
-# Argument 1 is the name of the dumped tree pass
-# Argument 2 handles expected failures and the like
-proc scan-ltrans-tree-dump { args } {
-
-if { [llength $args] < 2 } {
-   error "scan-ltrans-tree-dump: too few arguments"
-   return
-}
-if { [llength $args] > 3 } {
-   error "scan-ltrans-tree-dump: too many arguments"
-   return
-}
-if { [llength $args] >= 3 } {
-   scan-dump "ltrans-tree" [lindex $args 0] \
- "\[0-9\]\[0-9\]\[0-9\]t.[lindex $args 1]" ".ltrans0.ltrans" \
- [lindex $args 2]
-} else {
-   scan-dump "ltrans-tree" [lindex $args 0] \
- "\[0-9\]\[0-9\]\[0-9\]t.[lindex $args 1]" ".ltrans0.ltrans"
-}
-}
-
-# Call pass if pattern is present given number of times, otherwise fail.
-# Argument 0 is the regexp to match
-# Argument 1 is number of times the regexp must be found
-# Argument 2 is the name of the dumped tree pass
-# Argument 3 handles expected failures and the like
-proc scan-ltrans-tree-dump-times { args } {
-
-if { [llength $args] < 3 } {
-   error "scan-ltrans-tree-dump-times: too few arguments"
-   return
-}
-if { [llength $args] > 4 } {
-   error "scan-ltrans-tree-dump-times: too many arguments"
-   return
-}
-if { [llength $args] >= 4 } {
-   scan-dump-times "ltrans-tree" [lindex $args 0] [lindex $args 1] \
-   "\[0-9\]\[0-9\]\[0-9\]t.[lindex $args 2]" \
-   ".ltrans0.ltrans" [lindex $args 3]
-} else {
-   scan-dump-times "ltrans-tree" [lindex $args 0] [lindex $args 1] \
-   "\[0-9\]\[0-9\]\[0-9\]t.[lindex $args 2]" 
".ltrans0.ltrans"
-}
+# The first item in the list is an LTO equivalent of the second item
+# in the list; see the documentation of the second item for details.
+foreach { name scan type suffix } {
+scan-ltrans-tree-dump scan-dump ltrans-tree t
+scan-ltrans-tree-dump-not scan-dump-not ltrans-tree t
+scan-ltrans-tree-dump-dem scan-dump-dem ltrans-tree t
+scan-ltrans-tree-dump-dem-not scan-dump-dem-not ltrans-tree t
+} {
+eval [string map [list @NAME@ $name \
+  @SCAN@ $scan \
+  @TYPE@ $type \
+  @SUFFIX@ $suffix] {
+proc @NAME@ { args } {
+   if { [llength $args] < 2 } {
+   error "@NAME@: too few arguments"
+   return
+   }
+   if { [llength $args] > 3 } {
+   error "@NAME@: too many arguments"
+   return
+   }
+   if { [llength $args] >= 3 } {
+   @SCAN@ @TYPE@ [lindex $args 0] \
+   "\[0-9\]\[0-9\]\[0-9\]@SUFFIX@.[lindex $args 1]" \
+   ".ltrans0.ltrans" \
+   [lindex $args 2]
+   } else {
+   @SCAN@ @TYPE@ [lindex $args 0] \
+   "\[0-9\]\[0-9\]\[0-9\]@SUFFIX@.[lindex $args 1]" \
+   ".ltrans0.ltrans"
+   }
+}
+}]
 }
 
-# Call pass if pattern is not present, otherwise fail.
-#
-# Argument 0 is the regexp to match
-# Argument 1 is the name of the dumped tree pass
-# Argument 2 handles expected failures and the like
-proc scan-ltrans-tree-dump-not { args } {
-
-if { [llength $args] < 2 } {
-   error "scan-ltrans-tree-dump-not: too few arguments"
-   return
-}
-if { [llength $args] > 3 } {
-   error "scan-ltrans-tree-dump-not: too many arguments"

[gcc r15-3073] aarch64: Fix caller saves of VNx2QI [PR116238]

2024-08-21 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:ec9d6d45191f639482344362d048294e74587ca3

commit r15-3073-gec9d6d45191f639482344362d048294e74587ca3
Author: Richard Sandiford 
Date:   Wed Aug 21 17:35:47 2024 +0100

aarch64: Fix caller saves of VNx2QI [PR116238]

The testcase contains a VNx2QImode pseudo that is live across a call
and that cannot be allocated a call-preserved register.  LRA quite
reasonably tried to save it before the call and restore it afterwards.
Unfortunately, the target told it to do that in SImode, even though
punning between SImode and VNx2QImode is disallowed by both
TARGET_CAN_CHANGE_MODE_CLASS and TARGET_MODES_TIEABLE_P.

The natural class to use for SImode is GENERAL_REGS, so this led
to an unsalvageable situation in which we had:

  (set (subreg:VNx2QI (reg:SI A) 0) (reg:VNx2QI B))

where A needed GENERAL_REGS and B needed FP_REGS.  We therefore ended
up in a reload loop.

The hooks above should ensure that this situation can never occur
for incoming subregs.  It only happened here because the target
explicitly forced it.

The decision to use SImode for modes smaller than 4 bytes dates
back to the beginning of the port, before 16-bit floating-point
modes existed.  I'm not sure whether promoting to SImode really
makes sense for any FPR, but that's a separate performance/QoI
discussion.  For now, this patch just disallows using SImode
when it is wrong for correctness reasons, since that should be
safer to backport.

gcc/
PR testsuite/116238
* config/aarch64/aarch64.cc (aarch64_hard_regno_caller_save_mode):
Only return SImode if we can convert to and from it.

gcc/testsuite/
PR testsuite/116238
* gcc.target/aarch64/sve/pr116238.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.cc   |  7 ---
 gcc/testsuite/gcc.target/aarch64/sve/pr116238.c | 13 +
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index bfd7bcdef7cb..4e312c435769 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2521,10 +2521,11 @@ aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
  unnecessarily significant.  */
   if (PR_REGNUM_P (regno))
 return mode;
-  if (known_ge (GET_MODE_SIZE (mode), 4))
-return mode;
-  else
+  if (known_lt (GET_MODE_SIZE (mode), 4)
+  && REG_CAN_CHANGE_MODE_P (regno, mode, SImode)
+  && REG_CAN_CHANGE_MODE_P (regno, SImode, mode))
 return SImode;
+  return mode;
 }
 
 /* Return true if I's bits are consecutive ones from the MSB.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr116238.c b/gcc/testsuite/gcc.target/aarch64/sve/pr116238.c
new file mode 100644
index ..fe66b198107f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr116238.c
@@ -0,0 +1,13 @@
+/* { dg-additional-options "-O2 -msve-vector-bits=128" } */
+
+void foo();
+typedef unsigned char v2qi __attribute__((vector_size(2)));
+void f(v2qi *ptr)
+{
+  v2qi x = *ptr;
+  asm volatile ("" :: "w" (x));
+  asm volatile ("" ::: "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15");
+  foo();
+  asm volatile ("" :: "w" (x));
+  *ptr = x;
+}


[gcc r15-3212] lra: Don't apply eliminations to allocated registers [PR116321]

2024-08-27 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:9db997e5ac4a206b9428eb2447fcdc90e37725f4

commit r15-3212-g9db997e5ac4a206b9428eb2447fcdc90e37725f4
Author: Richard Sandiford 
Date:   Tue Aug 27 09:48:28 2024 +0100

lra: Don't apply eliminations to allocated registers [PR116321]

The sequence of events in this PR is that:

- the function has many addresses in which only a single hard base
  register is acceptable.  Let's call the hard register H.

- IRA allocates that register to one of the pseudo base registers.
  Let's call the pseudo register P.

- Some of the other addresses that require H occur when P is still live.

- LRA therefore has to spill P.

- When it reallocates P, LRA chooses to use FRAME_POINTER_REGNUM,
  which has been eliminated to the stack pointer.  (This is ok,
  since the frame register is free.)

- Spilling P causes LRA to reprocess the instruction that uses P.

- When reprocessing the address that has P as its base, LRA first
  applies the new allocation, to get FRAME_POINTER_REGNUM,
  and then applies the elimination, to get the stack pointer.

The last step seems wrong: the elimination should only apply to
pre-existing uses of FRAME_POINTER_REGNUM, not to uses that result
from allocating pseudos.  Applying both means that we get the wrong
register number, and therefore the wrong class.

The PR is about an existing testcase that fails with LRA on m68k.

gcc/
PR middle-end/116321
* lra-constraints.cc (get_hard_regno): Only apply eliminations
to existing hard registers.
(get_reg_class): Likewise.

Diff:
---
 gcc/lra-constraints.cc | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc
index 90cbe6c012b7..fdcc07764a2e 100644
--- a/gcc/lra-constraints.cc
+++ b/gcc/lra-constraints.cc
@@ -200,12 +200,13 @@ get_hard_regno (rtx x)
 reg = SUBREG_REG (x);
   if (! REG_P (reg))
 return -1;
-  if (! HARD_REGISTER_NUM_P (hard_regno = REGNO (reg)))
-hard_regno = lra_get_regno_hard_regno (hard_regno);
+  int regno = REGNO (reg);
+  if (HARD_REGISTER_NUM_P (regno))
+hard_regno = lra_get_elimination_hard_regno (regno);
+  else
+hard_regno = lra_get_regno_hard_regno (regno);
   if (hard_regno < 0)
 return -1;
-  if (HARD_REGISTER_NUM_P (REGNO (reg)))
-hard_regno = lra_get_elimination_hard_regno (hard_regno);
   if (SUBREG_P (x))
 hard_regno += subreg_regno_offset (hard_regno, GET_MODE (reg),
   SUBREG_BYTE (x),  GET_MODE (x));
@@ -221,13 +222,12 @@ get_reg_class (int regno)
 {
   int hard_regno;
 
-  if (! HARD_REGISTER_NUM_P (hard_regno = regno))
+  if (HARD_REGISTER_NUM_P (regno))
+hard_regno = lra_get_elimination_hard_regno (regno);
+  else
 hard_regno = lra_get_regno_hard_regno (regno);
   if (hard_regno >= 0)
-{
-  hard_regno = lra_get_elimination_hard_regno (hard_regno);
-  return REGNO_REG_CLASS (hard_regno);
-}
+return REGNO_REG_CLASS (hard_regno);
   if (regno >= new_regno_start)
 return lra_get_allocno_class (regno);
   return NO_REGS;


[gcc r15-3213] Handle arithmetic on eliminated address indices [PR116413]

2024-08-27 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:708ee71808ea61758e73d0e36274b4194b28576a

commit r15-3213-g708ee71808ea61758e73d0e36274b4194b28576a
Author: Richard Sandiford 
Date:   Tue Aug 27 09:48:28 2024 +0100

Handle arithmetic on eliminated address indices [PR116413]

This patch fixes gcc.c-torture/compile/opout.c for m68k with LRA
enabled.  The test has:

...
z (a, b)
{
  return (int) &a + (int) &b + (int) x + (int) z;
}

so it adds the address of two incoming arguments.  This ends up
being treated as an LEA in which the "index" is the incoming
argument pointer, which the LEA multiplies by 2.  The incoming
argument pointer is then eliminated, leading to:

(plus:SI (plus:SI (ashift:SI (plus:SI (reg/f:SI 24 %argptr)
                                      (const_int -4 [0xfffffffc]))
                             (const_int 1 [0x1]))
                  (reg/f:SI 41 [ _6 ]))
         (const_int 20 [0x14]))

In the address_info scheme, the innermost plus has to be treated
as the index "term", since that's the thing that's subject to
index_reg_class.

gcc/
PR middle-end/116413
* rtl.h (address_info): Update commentary.
* rtlanal.cc (valid_base_or_index_term_p): New function, split
out from...
(get_base_term, get_index_term): ...here.  Handle elimination PLUSes.

Diff:
---
 gcc/rtl.h  | 14 --
 gcc/rtlanal.cc | 29 +
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/gcc/rtl.h b/gcc/rtl.h
index 2370d6081614..1ef6432fd9c1 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -2225,11 +2225,21 @@ struct address_info {
reloading.
 
  - *BASE is a variable expression representing a base address.
-   It contains exactly one REG, SUBREG or MEM, pointed to by BASE_TERM.
+   It contains exactly one "term", pointed to by BASE_TERM.
+   This term can be one of the following:
+
+   (1) a REG, or a SUBREG of a REG
+   (2) an eliminated REG (a PLUS of (1) and a constant)
+   (3) a MEM, or a SUBREG of a MEM
+   (4) a SCRATCH
+
+   This term is the one that base_reg_class constrains.
 
  - *INDEX is a variable expression representing an index value.
It may be a scaled expression, such as a MULT.  It has exactly
-   one REG, SUBREG or MEM, pointed to by INDEX_TERM.
+   one "term", pointed to by INDEX_TERM.  The possible terms are
+   the same as for BASE.  This term is the one that index_reg_class
+   constrains.
 
  - *DISP is a constant, possibly mutated.  DISP_TERM points to the
unmutated RTX_CONST_OBJ.  */
diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc
index 71207ee4f417..8afbb32f2206 100644
--- a/gcc/rtlanal.cc
+++ b/gcc/rtlanal.cc
@@ -6494,6 +6494,25 @@ binary_scale_code_p (enum rtx_code code)
   || code == ROTATERT);
 }
 
+/* Return true if X appears to be a valid base or index term.  */
+static bool
+valid_base_or_index_term_p (rtx x)
+{
+  if (GET_CODE (x) == SCRATCH)
+return true;
+  /* Handle what appear to be eliminated forms of a register.  If we reach
+ here, the elimination occurs outside of the outermost PLUS tree,
+ and so the elimination offset cannot be treated as a displacement
+ of the main address.  Instead, we need to treat the whole PLUS as
+ the base or index term.  The address can only be made legitimate by
+ reloading the PLUS.  */
+  if (GET_CODE (x) == PLUS && CONST_SCALAR_INT_P (XEXP (x, 1)))
+x = XEXP (x, 0);
+  if (GET_CODE (x) == SUBREG)
+x = SUBREG_REG (x);
+  return REG_P (x) || MEM_P (x);
+}
+
 /* If *INNER can be interpreted as a base, return a pointer to the inner term
(see address_info).  Return null otherwise.  */
 
@@ -6502,10 +6521,7 @@ get_base_term (rtx *inner)
 {
   if (GET_CODE (*inner) == LO_SUM)
 inner = strip_address_mutations (&XEXP (*inner, 0));
-  if (REG_P (*inner)
-  || MEM_P (*inner)
-  || GET_CODE (*inner) == SUBREG
-  || GET_CODE (*inner) == SCRATCH)
+  if (valid_base_or_index_term_p (*inner))
 return inner;
   return 0;
 }
@@ -6519,10 +6535,7 @@ get_index_term (rtx *inner)
   /* At present, only constant scales are allowed.  */
   if (binary_scale_code_p (GET_CODE (*inner)) && CONSTANT_P (XEXP (*inner, 1)))
 inner = strip_address_mutations (&XEXP (*inner, 0));
-  if (REG_P (*inner)
-  || MEM_P (*inner)
-  || GET_CODE (*inner) == SUBREG
-  || GET_CODE (*inner) == SCRATCH)
+  if (valid_base_or_index_term_p (*inner))
 return inner;
   return 0;
 }


[gcc r15-3258] Tweak documentation of ASM_INPUT_P

2024-08-28 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:3c9338b532fd609f5cc1c50d6a4e77e0e1ab3bc5

commit r15-3258-g3c9338b532fd609f5cc1c50d6a4e77e0e1ab3bc5
Author: Richard Sandiford 
Date:   Wed Aug 28 16:37:53 2024 +0100

Tweak documentation of ASM_INPUT_P

The documentation of ASM_INPUT_P implied that the flag has no
effect on ASM_EXPRs that have operands (and which therefore must be
extended asms).  In fact we require ASM_INPUT_P to be false for all
extended asms.
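
For reference, the distinction in source terms (a sketch; the flag is
set on the ASM_EXPR built for each statement):

  asm ("nop");                              /* basic asm: ASM_INPUT_P is true */
  asm ("add %0, %1" : "+r" (x) : "r" (y));  /* extended asm: ASM_INPUT_P is false */
  asm ("" ::: "memory");                    /* also extended: it has clobbers */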

gcc/
* tree.h (ASM_INPUT_P): Fix documentation.

Diff:
---
 gcc/tree.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gcc/tree.h b/gcc/tree.h
index 5dcbb2fb5dd6..c501019717f0 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -1428,8 +1428,9 @@ class auto_suppress_location_wrappers
 #define ASM_INPUTS(NODE)   TREE_OPERAND (ASM_EXPR_CHECK (NODE), 2)
 #define ASM_CLOBBERS(NODE)  TREE_OPERAND (ASM_EXPR_CHECK (NODE), 3)
 #define ASM_LABELS(NODE)   TREE_OPERAND (ASM_EXPR_CHECK (NODE), 4)
-/* Nonzero if we want to create an ASM_INPUT instead of an
-   ASM_OPERAND with no operands.  */
+/* Nonzero if the asm is a basic asm, zero if it is an extended asm.
+   Basic asms use a plain ASM_INPUT insn pattern whereas extended asms
+   use an ASM_OPERANDS insn pattern.  */
 #define ASM_INPUT_P(NODE) (ASM_EXPR_CHECK (NODE)->base.static_flag)
 #define ASM_VOLATILE_P(NODE) (ASM_EXPR_CHECK (NODE)->base.public_flag)
 /* Nonzero if we want to consider this asm as minimum length and cost


[gcc r15-3259] aarch64: Add a test for zeroing <64bits>x2_t structures

2024-08-28 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:035c196cb9be2f1aee142240d506dde474cbe64e

commit r15-3259-g035c196cb9be2f1aee142240d506dde474cbe64e
Author: Richard Sandiford 
Date:   Wed Aug 28 16:41:08 2024 +0100

aarch64: Add a test for zeroing <64bits>x2_t structures

g:8d6c6fbc5271dde433998c09407b30e2cf195420 improved the code
generated for functions like:

  void test_s8 (int8x8x2_t *ptr) { *ptr = (int8x8x2_t) {}; }

Previously we would load zero from the constant pool, whereas
now we just use "stp xzr, xzr".  This patch adds a test for
this improvement.

gcc/testsuite/
* gcc.target/aarch64/struct_zero.c: New test.

Diff:
---
 gcc/testsuite/gcc.target/aarch64/struct_zero.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/gcc/testsuite/gcc.target/aarch64/struct_zero.c b/gcc/testsuite/gcc.target/aarch64/struct_zero.c
new file mode 100644
index ..13f7236a4d27
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/struct_zero.c
@@ -0,0 +1,21 @@
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+void test_s8 (int8x8x2_t *ptr) { *ptr = (int8x8x2_t) {}; }
+void test_u8 (uint8x8x2_t *ptr) { *ptr = (uint8x8x2_t) {}; }
+void test_p8 (poly8x8x2_t *ptr) { *ptr = (poly8x8x2_t) {}; }
+void test_s16 (int16x4x2_t *ptr) { *ptr = (int16x4x2_t) {}; }
+void test_u16 (uint16x4x2_t *ptr) { *ptr = (uint16x4x2_t) {}; }
+void test_p16 (poly16x4x2_t *ptr) { *ptr = (poly16x4x2_t) {}; }
+void test_bf16 (bfloat16x4x2_t *ptr) { *ptr = (bfloat16x4x2_t) {}; }
+void test_f16 (float16x4x2_t *ptr) { *ptr = (float16x4x2_t) {}; }
+void test_s32 (int32x2x2_t *ptr) { *ptr = (int32x2x2_t) {}; }
+void test_u32 (uint32x2x2_t *ptr) { *ptr = (uint32x2x2_t) {}; }
+void test_f32 (float32x2x2_t *ptr) { *ptr = (float32x2x2_t) {}; }
+void test_s64 (int64x1x2_t *ptr) { *ptr = (int64x1x2_t) {}; }
+void test_u64 (uint64x1x2_t *ptr) { *ptr = (uint64x1x2_t) {}; }
+void test_p64 (poly64x1x2_t *ptr) { *ptr = (poly64x1x2_t) {}; }
+void test_f64 (float64x1x2_t *ptr) { *ptr = (float64x1x2_t) {}; }
+
+/* { dg-final { scan-assembler-times {\tstp\txzr, xzr, \[x0\]\n} 15 } } */


[gcc r15-3260] aarch64: Fix gather x32/x64 selection

2024-08-28 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:3e27ea26fedf00c2662d8460cdf6aca05d0d64aa

commit r15-3260-g3e27ea26fedf00c2662d8460cdf6aca05d0d64aa
Author: Richard Sandiford 
Date:   Wed Aug 28 16:41:08 2024 +0100

aarch64: Fix gather x32/x64 selection

The SVE gather and scatter costs are classified based on whether
they do 4 loads per 128 bits (x32) or 2 loads per 128 bits (x64).
The number after the "x" refers to the number of bits in each
"container".

However, the test for which to use was based on the element size
rather than the container size.  This meant that we'd use the
overly conservative x32 costs for VNx2SI gathers.  VNx2SI gathers
are really .D gathers in which the upper half of each extension
result is ignored.

This patch is necessary to switch -mtune=generic over to the
"new" vector costs.

gcc/
* config/aarch64/aarch64.cc (aarch64_detect_vector_stmt_subtype)
(aarch64_vector_costs::add_stmt_cost): Use the x64 cost rather
than x32 cost for all VNx2 modes.

Diff:
---
 gcc/config/aarch64/aarch64.cc | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 40dacfcf2e78..033ea61d3a8e 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16819,7 +16819,8 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
   && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
 {
   unsigned int nunits = vect_nunits_for_cost (vectype);
-  if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
+  /* Test for VNx2 modes, which have 64-bit containers.  */
+  if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)), aarch64_sve_vg))
return { sve_costs->gather_load_x64_cost, nunits };
   return { sve_costs->gather_load_x32_cost, nunits };
 }
@@ -17309,7 +17310,9 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
  const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
  if (sve_costs)
{
- if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
+ /* Test for VNx2 modes, which have 64-bit containers.  */
+ if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)),
+   aarch64_sve_vg))
m_sve_gather_scatter_init_cost
  += sve_costs->gather_load_x64_init_cost;
  else


[gcc r15-3261] aarch64: Assume zero gather/scatter set-up cost for -mtune=generic

2024-08-28 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:abeeccef92892fe519cc417b30ae22ce9da2d5e6

commit r15-3261-gabeeccef92892fe519cc417b30ae22ce9da2d5e6
Author: Richard Sandiford 
Date:   Wed Aug 28 16:41:09 2024 +0100

aarch64: Assume zero gather/scatter set-up cost for -mtune=generic

generic_vector_cost is not currently used by any SVE target
by default; it has to be specifically selected by -mtune=generic.
Its SVE costing has historically been somewhat idealised, since
it predated any actual SVE cores.  This seems like a useful
tradition to continue, at least for testing purposes.

The ideal case is that gathers and scatters do not induce a specific
one-off overhead.  This patch therefore sets the gather/scatter init
costs to zero.

This patch is necessary to switch -mtune=generic over to the
"new" vector costs.

gcc/
* config/aarch64/tuning_models/generic.h (generic_sve_vector_cost):
Set gather_load_x32_init_cost and gather_load_x64_init_cost to 0.

Diff:
---
 gcc/config/aarch64/tuning_models/generic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h
index 101969bdbb9c..ee2f3ff42663 100644
--- a/gcc/config/aarch64/tuning_models/generic.h
+++ b/gcc/config/aarch64/tuning_models/generic.h
@@ -105,8 +105,8 @@ static const sve_vec_cost generic_sve_vector_cost =
   2, /* fadda_f64_cost  */
   4, /* gather_load_x32_cost  */
   2, /* gather_load_x64_cost  */
-  12, /* gather_load_x32_init_cost  */
-  4, /* gather_load_x64_init_cost  */
+  0, /* gather_load_x32_init_cost  */
+  0, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };


[gcc r15-3287] Make some smallest_int_mode_for_size calls cope with failure

2024-08-29 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:00ec6bd805924b2d7d72cf03b200b3b4b7831835

commit r15-3287-g00ec6bd805924b2d7d72cf03b200b3b4b7831835
Author: Richard Sandiford 
Date:   Thu Aug 29 14:00:23 2024 +0100

Make some smallest_int_mode_for_size calls cope with failure

smallest_int_mode_for_size now returns an optional mode rather
than aborting on failure.  This patch adjusts a couple of callers
so that they fail gracefully when no mode exists.

There should be no behavioural change, since anything that triggers
the new return paths would previously have aborted.  I just think
this is how the code would have been written if the option had been
available earlier.
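
Schematically, the adjusted callers follow the usual opt_mode pattern;
a simplified sketch of the code in the patch below, where "bits" stands
in for the required size:

  scalar_int_mode new_mode;
  if (!smallest_int_mode_for_size (bits).exists (&new_mode))
    return false;  /* no integer mode is wide enough: fail gracefully */
  /* ... use new_mode ... */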

gcc/
* dse.cc (find_shift_sequence): Allow smallest_int_mode_for_size
to fail.
* optabs.cc (expand_twoval_binop_libfunc): Likewise.

Diff:
---
 gcc/dse.cc| 16 
 gcc/optabs.cc |  6 --
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/gcc/dse.cc b/gcc/dse.cc
index c3feff06f864..75825a44cb98 100644
--- a/gcc/dse.cc
+++ b/gcc/dse.cc
@@ -1717,12 +1717,12 @@ dump_insn_info (const char * start, insn_info_t insn_info)
line up, we need to extract the value from lower part of the rhs of
the store, shift it, and then put it into a form that can be shoved
into the read_insn.  This function generates a right SHIFT of a
-   value that is at least ACCESS_SIZE bytes wide of READ_MODE.  The
+   value that is at least ACCESS_BYTES bytes wide of READ_MODE.  The
shift sequence is returned or NULL if we failed to find a
shift.  */
 
 static rtx
-find_shift_sequence (poly_int64 access_size,
+find_shift_sequence (poly_int64 access_bytes,
 store_info *store_info,
 machine_mode read_mode,
 poly_int64 shift, bool speed, bool require_cst)
@@ -1734,11 +1734,11 @@ find_shift_sequence (poly_int64 access_size,
   /* If a constant was stored into memory, try to simplify it here,
  otherwise the cost of the shift might preclude this optimization
  e.g. at -Os, even when no actual shift will be needed.  */
+  auto access_bits = access_bytes * BITS_PER_UNIT;
   if (store_info->const_rhs
-  && known_le (access_size, GET_MODE_SIZE (MAX_MODE_INT)))
+  && known_le (access_bytes, GET_MODE_SIZE (MAX_MODE_INT))
+  && smallest_int_mode_for_size (access_bits).exists (&new_mode))
 {
-  auto new_mode = smallest_int_mode_for_size
-   (access_size * BITS_PER_UNIT).require ();
   auto byte = subreg_lowpart_offset (new_mode, store_mode);
   rtx ret
= simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
@@ -1810,7 +1810,7 @@ find_shift_sequence (poly_int64 access_size,
}
}
 
-  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+  if (maybe_lt (GET_MODE_SIZE (new_mode), access_bytes))
continue;
 
   new_reg = gen_reg_rtx (new_mode);
@@ -1839,8 +1839,8 @@ find_shift_sequence (poly_int64 access_size,
 of the arguments and could be precomputed.  It may
 not be worth doing so.  We could precompute if
 worthwhile or at least cache the results.  The result
-technically depends on both SHIFT and ACCESS_SIZE,
-but in practice the answer will depend only on ACCESS_SIZE.  */
+technically depends on both SHIFT and ACCESS_BYTES,
+but in practice the answer will depend only on ACCESS_BYTES.  */
 
   if (cost > COSTS_N_INSNS (1))
continue;
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index ded9cc3d947a..2bcb3f7b47ae 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -2551,8 +2551,10 @@ expand_twoval_binop_libfunc (optab binoptab, rtx op0, rtx op1,
 
   /* The value returned by the library function will have twice as
  many bits as the nominal MODE.  */
-  libval_mode
-= smallest_int_mode_for_size (2 * GET_MODE_BITSIZE (mode)).require ();
+  auto return_size = 2 * GET_MODE_BITSIZE (mode);
+  if (!smallest_int_mode_for_size (return_size).exists (&libval_mode))
+return false;
+
   start_sequence ();
   libval = emit_library_call_value (libfunc, NULL_RTX, LCT_CONST,
libval_mode,


[gcc r15-3288] Allow subregs around constant displacements [PR116516]

2024-08-29 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:ac6d433b02ce26a646b2a7254b1d87fcc06b0beb

commit r15-3288-gac6d433b02ce26a646b2a7254b1d87fcc06b0beb
Author: Richard Sandiford 
Date:   Thu Aug 29 14:00:23 2024 +0100

Allow subregs around constant displacements [PR116516]

This patch fixes a regression introduced by g:708ee71808ea61758e73.
x86_64 allows addresses of the form:

  (zero_extend:DI (subreg:SI (symbol_ref:DI "foo") 0))

Before the previous patch, a lax SUBREG check meant that we would
treat the subreg as a base and reload it into a base register.
But that wasn't what the target was expecting.  Instead we should
treat "foo" as a constant displacement, to match:

leal foo, 

After the patch, we recognised that "foo" isn't a base register,
but ICEd on it rather than handling it as a displacement.

With or without the recent patches, if the address had instead been:

  (zero_extend:DI
(subreg:SI (plus:DI (reg:DI R) (symbol_ref:DI "foo") 0)))

then we would have treated "foo" as the displacement and R as the base
or index, as expected.  The problem was that the code that does this was
rejecting all subregs of objects, rather than just subregs of variable
objects.

gcc/
PR middle-end/116516
* rtlanal.cc (strip_address_mutations): Allow subregs around
constant displacements.

gcc/testsuite/
PR middle-end/116516
* gcc.c-torture/compile/pr116516.c: New test.

Diff:
---
 gcc/rtlanal.cc | 28 ++
 gcc/testsuite/gcc.c-torture/compile/pr116516.c | 10 +
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc
index 8afbb32f2206..cb0c0c0d7197 100644
--- a/gcc/rtlanal.cc
+++ b/gcc/rtlanal.cc
@@ -6467,10 +6467,30 @@ strip_address_mutations (rtx *loc, enum rtx_code *outer_code)
/* (and ... (const_int -X)) is used to align to X bytes.  */
loc = &XEXP (*loc, 0);
   else if (code == SUBREG
-   && !OBJECT_P (SUBREG_REG (*loc))
-   && subreg_lowpart_p (*loc))
-   /* (subreg (operator ...) ...) inside and is used for mode
-  conversion too.  */
+  && (!OBJECT_P (SUBREG_REG (*loc))
+  || CONSTANT_P (SUBREG_REG (*loc)))
+  && subreg_lowpart_p (*loc))
+   /* (subreg (operator ...) ...) inside AND is used for mode
+  conversion too.  It is also used for load-address operations
+  in which an extension can be done for free, such as:
+
+(zero_extend:DI
+  (subreg:SI (plus:DI (reg:DI R) (symbol_ref:DI "foo") 0)))
+
+  The latter usage also covers subregs of plain "displacements",
+  such as:
+
+(zero_extend:DI (subreg:SI (symbol_ref:DI "foo") 0))
+
+  The inner address should then be the symbol_ref, not the subreg,
+  similarly to the plus case above.
+
+  In contrast, the subreg in:
+
+(zero_extend:DI (subreg:SI (reg:DI R) 0))
+
+  should be treated as the base, since it should be replaced by
+  an SImode hard register during register allocation.  */
loc = &SUBREG_REG (*loc);
   else
return loc;
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr116516.c b/gcc/testsuite/gcc.c-torture/compile/pr116516.c
new file mode 100644
index ..c423ebfef5c0
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr116516.c
@@ -0,0 +1,10 @@
+extern void my_func (int);
+typedef struct {
+  int var;
+} info_t;
+extern void *_data_offs;
+void test()
+{
+  info_t *info = (info_t *) ((void *)((void *)1) + ((unsigned int)&_data_offs));
+  my_func(info->var == 0);
+}


[gcc r15-3376] Rename ASM_INPUT_P to ASM_BASIC_P

2024-09-02 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:a4b6c6ab0ba04a4fa409608a860067770317d0de

commit r15-3376-ga4b6c6ab0ba04a4fa409608a860067770317d0de
Author: Richard Sandiford 
Date:   Mon Sep 2 09:56:56 2024 +0100

Rename ASM_INPUT_P to ASM_BASIC_P

ASM_INPUT_P is so named because it causes the eventual rtl insn
pattern to be a top-level ASM_INPUT rather than an ASM_OPERANDS.
However, this name has caused confusion, partly due to earlier
documentation.  The name also sounds related to ASM_INPUTS but
is for a different piece of state.

This patch renames it to ASM_BASIC_P, with the inverse meaning
an extended asm.  ("Basic asm" is the term used in extend.texi.)

gcc/
* doc/generic.texi (ASM_BASIC_P): Document.
* tree.h (ASM_INPUT_P): Rename to...
(ASM_BASIC_P): ...this.
(ASM_VOLATILE_P, ASM_INLINE_P): Reindent.
* gimplify.cc (gimplify_asm_expr): Update after above renaming.
* tree-core.h (tree_base): Likewise.

gcc/c/
* c-typeck.cc (build_asm_expr): Rename ASM_INPUT_P to ASM_BASIC_P.

gcc/cp/
* pt.cc (tsubst_stmt): Rename ASM_INPUT_P to ASM_BASIC_P.
* parser.cc (cp_parser_asm_definition): Likewise.

gcc/d/
* toir.cc (IRVisitor): Rename ASM_INPUT_P to ASM_BASIC_P.

gcc/jit/
* jit-playback.cc (playback::block::add_extended_asm):  Rename
ASM_INPUT_P to ASM_BASIC_P.

gcc/m2/
* gm2-gcc/m2block.cc (flush_pending_note): Rename ASM_INPUT_P
to ASM_BASIC_P.
* gm2-gcc/m2statement.cc (m2statement_BuildAsm): Likewise.

Diff:
---
 gcc/c/c-typeck.cc |  2 +-
 gcc/cp/parser.cc  |  2 +-
 gcc/cp/pt.cc  |  2 +-
 gcc/d/toir.cc |  5 ++---
 gcc/doc/generic.texi  | 16 +++-
 gcc/gimplify.cc   |  2 +-
 gcc/jit/jit-playback.cc   |  2 +-
 gcc/m2/gm2-gcc/m2block.cc |  2 +-
 gcc/m2/gm2-gcc/m2statement.cc |  2 +-
 gcc/tree-core.h   |  2 +-
 gcc/tree.h|  6 +++---
 11 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/gcc/c/c-typeck.cc b/gcc/c/c-typeck.cc
index 094e41fa2021..58b2724b39e3 100644
--- a/gcc/c/c-typeck.cc
+++ b/gcc/c/c-typeck.cc
@@ -11672,7 +11672,7 @@ build_asm_expr (location_t loc, tree string, tree outputs, tree inputs,
 
   /* asm statements without outputs, including simple ones, are treated
  as volatile.  */
-  ASM_INPUT_P (args) = simple;
+  ASM_BASIC_P (args) = simple;
   ASM_VOLATILE_P (args) = (noutputs == 0);
   ASM_INLINE_P (args) = is_inline;
 
diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index 632d3dc5ecf4..edfa5a494405 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -23143,7 +23143,7 @@ cp_parser_asm_definition (cp_parser* parser)
  if (TREE_CODE (temp) == CLEANUP_POINT_EXPR)
temp = TREE_OPERAND (temp, 0);
 
- ASM_INPUT_P (temp) = 1;
+ ASM_BASIC_P (temp) = 1;
}
}
   else
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 9e0f0486ffbc..024fa8a55290 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -18930,7 +18930,7 @@ tsubst_stmt (tree t, tree args, tsubst_flags_t complain, tree in_decl)
tree asm_expr = tmp;
if (TREE_CODE (asm_expr) == CLEANUP_POINT_EXPR)
  asm_expr = TREE_OPERAND (asm_expr, 0);
-   ASM_INPUT_P (asm_expr) = ASM_INPUT_P (t);
+   ASM_BASIC_P (asm_expr) = ASM_BASIC_P (t);
   }
   break;
 
diff --git a/gcc/d/toir.cc b/gcc/d/toir.cc
index 9f5531ce5cdf..a6848f2ffa2c 100644
--- a/gcc/d/toir.cc
+++ b/gcc/d/toir.cc
@@ -1491,10 +1491,9 @@ public:
   outputs, inputs, clobbers, labels);
 SET_EXPR_LOCATION (exp, make_location_t (s->loc));
 
-/* If the extended syntax was not used, mark the ASM_EXPR as being an
-   ASM_INPUT expression instead of an ASM_OPERAND with no operands.  */
+/* Record whether the basic rather than extended syntax was used.  */
 if (s->args == NULL && s->clobbers == NULL)
-  ASM_INPUT_P (exp) = 1;
+  ASM_BASIC_P (exp) = 1;
 
 /* All asm statements are assumed to have a side effect.  As a future
optimization, this could be unset when building in release mode.  */
diff --git a/gcc/doc/generic.texi b/gcc/doc/generic.texi
index c596b7d44b21..3de394fd6e0a 100644
--- a/gcc/doc/generic.texi
+++ b/gcc/doc/generic.texi
@@ -2095,11 +2095,17 @@ asm ("fsinx %1,%0" : "=f" (result) : "f" (angle));
 @end smallexample
 The first string is the @code{ASM_STRING}, containing the instruction
 template.  The next two strings are the output and inputs, respectively;
-this statement has no clobbers.  As this example indicates, ``plain''
-assembly statements are merely a special case of extended assembly
-statements; they have no cv-qualifiers, outputs, inputs, or clobbers.
-All of the strings will be @code{NUL}-terminated, and will contain no
-embed

[gcc r15-3377] Rename gimple_asm_input_p to gimple_asm_basic_p

2024-09-02 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:2865719efb16e9f199b332fcf06d69c98928738e

commit r15-3377-g2865719efb16e9f199b332fcf06d69c98928738e
Author: Richard Sandiford 
Date:   Mon Sep 2 09:56:56 2024 +0100

Rename gimple_asm_input_p to gimple_asm_basic_p

Following on from the earlier tree rename, this patch renames
gimple_asm_input_p to gimple_asm_basic_p, and similarly for
related names.

gcc/
* doc/gimple.texi (gimple_asm_basic_p): Document.
(gimple_asm_set_basic): Likewise.
* gimple.h (GF_ASM_INPUT): Rename to...
(GF_ASM_BASIC): ...this.
(gimple_asm_set_input): Rename to...
(gimple_asm_set_basic): ...this.
(gimple_asm_input_p): Rename to...
(gimple_asm_basic_p): ...this.
* cfgexpand.cc (expand_asm_stmt): Update after above renaming.
* gimple.cc (gimple_asm_clobbers_memory_p): Likewise.
* gimplify.cc (gimplify_asm_expr): Likewise.
* ipa-icf-gimple.cc (func_checker::compare_gimple_asm): Likewise.
* tree-cfg.cc (stmt_can_terminate_bb_p): Likewise.

Diff:
---
 gcc/cfgexpand.cc  |  2 +-
 gcc/doc/gimple.texi   |  9 +
 gcc/gimple.cc |  2 +-
 gcc/gimple.h  | 19 ++-
 gcc/gimplify.cc   |  2 +-
 gcc/ipa-icf-gimple.cc |  2 +-
 gcc/tree-cfg.cc   |  2 +-
 7 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 13f8c08d295a..f32cf1b20c9a 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -3121,7 +3121,7 @@ expand_asm_stmt (gasm *stmt)
 
   location_t locus = gimple_location (stmt);
 
-  if (gimple_asm_input_p (stmt))
+  if (gimple_asm_basic_p (stmt))
 {
   const char *s = gimple_asm_string (stmt);
   tree string = build_string (strlen (s), s);
diff --git a/gcc/doc/gimple.texi b/gcc/doc/gimple.texi
index 5f241b1c64f4..d8aaca260493 100644
--- a/gcc/doc/gimple.texi
+++ b/gcc/doc/gimple.texi
@@ -1112,6 +1112,15 @@ Return the string representing the assembly instruction in
 @code{GIMPLE_ASM} @code{G}.
 @end deftypefn
 
+@deftypefn {GIMPLE function} bool gimple_asm_basic_p (const gasm *g)
+Return true if @code{G} is a basic asm rather than an extended asm.
+@end deftypefn
+
+@deftypefn {GIMPLE function} void gimple_asm_set_basic (gasm *g, bool basic_p)
+Mark asm statement @code{G} as a basic asm or an extended asm based on
+@code{BASIC_P}.
+@end deftypefn
+
 @deftypefn {GIMPLE function} bool gimple_asm_volatile_p (const gasm *g)
 Return true if @code{G} is an asm statement marked volatile.
 @end deftypefn
diff --git a/gcc/gimple.cc b/gcc/gimple.cc
index a9f968cb0389..6e28cf291e16 100644
--- a/gcc/gimple.cc
+++ b/gcc/gimple.cc
@@ -2944,7 +2944,7 @@ gimple_asm_clobbers_memory_p (const gasm *stmt)
 }
 
   /* Non-empty basic ASM implicitly clobbers memory.  */
-  if (gimple_asm_input_p (stmt) && strlen (gimple_asm_string (stmt)) != 0)
+  if (gimple_asm_basic_p (stmt) && strlen (gimple_asm_string (stmt)) != 0)
 return true;
 
   return false;
diff --git a/gcc/gimple.h b/gcc/gimple.h
index bd315ffc2dd4..ee986eaf1539 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -135,7 +135,7 @@ enum gimple_rhs_class
 
Keep this list sorted.  */
 enum gf_mask {
-GF_ASM_INPUT   = 1 << 0,
+GF_ASM_BASIC   = 1 << 0,
 GF_ASM_VOLATILE= 1 << 1,
 GF_ASM_INLINE  = 1 << 2,
 GF_CALL_FROM_THUNK = 1 << 0,
@@ -4227,24 +4227,25 @@ gimple_asm_set_inline (gasm *asm_stmt, bool inline_p)
 }
 
 
-/* If INPUT_P is true, mark asm ASM_STMT as an ASM_INPUT.  */
+/* Mark whether asm ASM_STMT is a basic asm or an extended asm, based on
+   BASIC_P.  */
 
 inline void
-gimple_asm_set_input (gasm *asm_stmt, bool input_p)
+gimple_asm_set_basic (gasm *asm_stmt, bool basic_p)
 {
-  if (input_p)
-asm_stmt->subcode |= GF_ASM_INPUT;
+  if (basic_p)
+asm_stmt->subcode |= GF_ASM_BASIC;
   else
-asm_stmt->subcode &= ~GF_ASM_INPUT;
+asm_stmt->subcode &= ~GF_ASM_BASIC;
 }
 
 
-/* Return true if asm ASM_STMT is an ASM_INPUT.  */
+/* Return true if asm ASM_STMT is a basic asm rather than an extended asm.  */
 
 inline bool
-gimple_asm_input_p (const gasm *asm_stmt)
+gimple_asm_basic_p (const gasm *asm_stmt)
 {
-  return (asm_stmt->subcode & GF_ASM_INPUT) != 0;
+  return (asm_stmt->subcode & GF_ASM_BASIC) != 0;
 }
 
 
diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index f0edb4f7edd9..9300138aa0c7 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -7352,7 +7352,7 @@ gimplify_asm_expr (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p)
   ASM_VOLATILE_P (expr)
   || noutputs == 0
   || labels);
-  gimple_asm_set_input (stmt, ASM_BASIC_P (expr));
+  gimple_asm_set_basic (stmt, ASM_BASIC_P (expr));
   gimple_asm_set_inline (stmt, ASM_INLINE_P (expr));
 
   gimplify_seq_add_stmt (pre_p, stmt);
diff --git

[gcc r15-1606] Revert one of the force_subreg changes

2024-06-25 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:b694bf417cdd7d0a4d78e9927bab6bc202b7df6c

commit r15-1606-gb694bf417cdd7d0a4d78e9927bab6bc202b7df6c
Author: Richard Sandiford 
Date:   Tue Jun 25 09:41:21 2024 +0100

Revert one of the force_subreg changes

One of the changes in g:d4047da6a070175aae7121c739d1cad6b08ff4b2
caused a regression in ft32-elf; see:

https://gcc.gnu.org/pipermail/gcc-patches/2024-June/655418.html

for details.  This change was different from the others in that the
original call was to simplify_subreg rather than simplify_lowpart_subreg.
The old code would therefore go on to do the force_reg for more cases
than the new code would.

gcc/
* expmed.cc (store_bit_field_using_insv): Revert earlier change
to use force_subreg instead of simplify_gen_subreg.

Diff:
---
 gcc/expmed.cc | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 3b9475f5aa0..8bbbc94a98c 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -695,7 +695,13 @@ store_bit_field_using_insv (const extraction_insn *insv, rtx op0,
 if we must narrow it, be sure we do it correctly.  */
 
  if (GET_MODE_SIZE (value_mode) < GET_MODE_SIZE (op_mode))
-   tmp = force_subreg (op_mode, value1, value_mode, 0);
+   {
+ tmp = simplify_subreg (op_mode, value1, value_mode, 0);
+ if (! tmp)
+   tmp = simplify_gen_subreg (op_mode,
+  force_reg (value_mode, value1),
+  value_mode, 0);
+   }
  else
{
  if (targetm.mode_rep_extended (op_mode, value_mode) != UNKNOWN)


[gcc r15-1610] Add a debug counter for late-combine

2024-06-25 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:b6215065a5b14317a342176d5304ecaea3163639

commit r15-1610-gb6215065a5b14317a342176d5304ecaea3163639
Author: Richard Sandiford 
Date:   Tue Jun 25 12:58:12 2024 +0100

Add a debug counter for late-combine

This should help to diagnose problems like PR115631.
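
As with other debug counters, this allows the optimisation to be
bisected from the command line, e.g. (illustrative range; see the
-fdbg-cnt documentation for the exact syntax):

  gcc -O2 -flate-combine-instructions -fdbg-cnt=late_combine:1-10 test.c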

gcc/
* dbgcnt.def (late_combine): New debug counter.
* late-combine.cc (insn_combination::run): Use it.

Diff:
---
 gcc/dbgcnt.def  | 1 +
 gcc/late-combine.cc | 6 ++
 2 files changed, 7 insertions(+)

diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
index ed9f062eac2..e0b9b1b2a76 100644
--- a/gcc/dbgcnt.def
+++ b/gcc/dbgcnt.def
@@ -186,6 +186,7 @@ DEBUG_COUNTER (ipa_sra_params)
 DEBUG_COUNTER (ipa_sra_retvalues)
 DEBUG_COUNTER (ira_move)
 DEBUG_COUNTER (ivopts_loop)
+DEBUG_COUNTER (late_combine)
 DEBUG_COUNTER (lim)
 DEBUG_COUNTER (local_alloc_for_sched)
 DEBUG_COUNTER (loop_unswitch)
diff --git a/gcc/late-combine.cc b/gcc/late-combine.cc
index 22a1d81d38e..fc75d1c56d7 100644
--- a/gcc/late-combine.cc
+++ b/gcc/late-combine.cc
@@ -41,6 +41,7 @@
 #include "tree-pass.h"
 #include "cfgcleanup.h"
 #include "target.h"
+#include "dbgcnt.h"
 
 using namespace rtl_ssa;
 
@@ -428,6 +429,11 @@ insn_combination::run ()
   || !crtl->ssa->verify_insn_changes (m_nondebug_changes))
 return false;
 
+  // We've now decided that the optimization is valid and profitable.
+  // Allow it to be suppressed for bisection purposes.
+  if (!dbg_cnt (::late_combine))
+return false;
+
   substitute_optional_uses (m_def);
 
   confirm_change_group ();


[gcc r15-1616] late-combine: Honor targetm.cannot_copy_insn_p

2024-06-25 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:b87e19afa349691fdc91173bcf7a9afc7b3b0cb1

commit r15-1616-gb87e19afa349691fdc91173bcf7a9afc7b3b0cb1
Author: Richard Sandiford 
Date:   Tue Jun 25 18:02:35 2024 +0100

late-combine: Honor targetm.cannot_copy_insn_p

late-combine was failing to take targetm.cannot_copy_insn_p into
account, which led to multiple definitions of PIC symbols on
arm*-*-* targets.

gcc/
* late-combine.cc (insn_combination::substitute_nondebug_use):
Reject second and subsequent uses if targetm.cannot_copy_insn_p
disallows copying.

Diff:
---
 gcc/late-combine.cc | 12 
 1 file changed, 12 insertions(+)

diff --git a/gcc/late-combine.cc b/gcc/late-combine.cc
index fc75d1c56d7..b7c0bc07a8b 100644
--- a/gcc/late-combine.cc
+++ b/gcc/late-combine.cc
@@ -179,6 +179,18 @@ insn_combination::substitute_nondebug_use (use_info *use)
   if (dump_file && (dump_flags & TDF_DETAILS))
 dump_insn_slim (dump_file, use->insn ()->rtl ());
 
+  // Reject second and subsequent uses if the target does not allow
+  // the defining instruction to be copied.
+  if (targetm.cannot_copy_insn_p
+  && m_nondebug_changes.length () >= 2
+  && targetm.cannot_copy_insn_p (m_def_insn->rtl ()))
+{
+  if (dump_file && (dump_flags & TDF_DETAILS))
+   fprintf (dump_file, "-- The target does not allow multiple"
+" copies of insn %d\n", m_def_insn->uid ());
+  return false;
+}
+
   // Check that we can change the instruction pattern.  Leave recognition
   // of the result till later.
   insn_propagation prop (use_rtl, m_dest, m_src);


[gcc r15-1696] Disable late-combine for -O0 [PR115677]

2024-06-27 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:f6081ee665fd5e4e7d37e02c69d16df0d3eead10

commit r15-1696-gf6081ee665fd5e4e7d37e02c69d16df0d3eead10
Author: Richard Sandiford 
Date:   Thu Jun 27 14:51:37 2024 +0100

Disable late-combine for -O0 [PR115677]

late-combine relies on df, which for -O0 is only initialised late
(pass_df_initialize_no_opt, after split1).  Other df-based passes
cope with this by requiring optimize > 0, so this patch does the
same for late-combine.

gcc/
PR rtl-optimization/115677
* late-combine.cc (pass_late_combine::gate): New function.

Diff:
---
 gcc/late-combine.cc | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gcc/late-combine.cc b/gcc/late-combine.cc
index b7c0bc07a8b..789d734692a 100644
--- a/gcc/late-combine.cc
+++ b/gcc/late-combine.cc
@@ -744,10 +744,16 @@ public:
 
   // opt_pass methods:
   opt_pass *clone () override { return new pass_late_combine (m_ctxt); }
-  bool gate (function *) override { return flag_late_combine_instructions; }
+  bool gate (function *) override;
   unsigned int execute (function *) override;
 };
 
+bool
+pass_late_combine::gate (function *)
+{
+  return optimize > 0 && flag_late_combine_instructions;
+}
+
 unsigned int
 pass_late_combine::execute (function *fn)
 {


[gcc r15-1807] Give fast DCE a separate dirty flag

2024-07-03 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:47ea6bddd15a568cedc5d7026d2cc9d5599e6e01

commit r15-1807-g47ea6bddd15a568cedc5d7026d2cc9d5599e6e01
Author: Richard Sandiford 
Date:   Wed Jul 3 09:17:42 2024 +0100

Give fast DCE a separate dirty flag

Thomas pointed out that we sometimes failed to eliminate some dead code
(specifically clobbers of otherwise unused registers) on nvptx when
late-combine is enabled.  This happens because:

- combine is able to optimise the function in a way that exposes dead code.
  This leaves the df information in a "dirty" state.

- late_combine calls df_analyze without DF_LR_RUN_DCE run set.
  This updates the df information and clears the "dirty" state.

- late_combine doesn't find any extra optimisations, and so leaves
  the df information up-to-date.

- if_after_combine (ce2) calls df_analyze with DF_LR_RUN_DCE set.
  Because the df information is already up-to-date, fast DCE is
  not run.

The upshot is that running late-combine has the effect of suppressing
a DCE opportunity that would have been noticed without late_combine.

I think this shows that we should track the state of the DCE separately
from the LR problem.  Every pass updates the latter, but not all passes
update the former.

gcc/
* df.h (DF_LR_DCE): New df_problem_id.
(df_lr_dce): New macro.
* df-core.cc (rest_of_handle_df_finish): Check for a null free_fun.
* df-problems.cc (df_lr_finalize): Split out fast DCE handling to...
(df_lr_dce_finalize): ...this new function.
(problem_LR_DCE): New df_problem.
(df_lr_add_problem): Register LR_DCE rather than LR itself.
* dce.cc (fast_dce): Clear df_lr_dce->solutions_dirty.

Diff:
---
 gcc/dce.cc |  3 ++
 gcc/df-core.cc |  3 +-
 gcc/df-problems.cc | 96 +-
 gcc/df.h   |  2 ++
 4 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/gcc/dce.cc b/gcc/dce.cc
index be1a2a87732..04e8d98818d 100644
--- a/gcc/dce.cc
+++ b/gcc/dce.cc
@@ -1182,6 +1182,9 @@ fast_dce (bool word_level)
   BITMAP_FREE (processed);
   BITMAP_FREE (redo_out);
   BITMAP_FREE (all_blocks);
+
+  /* Both forms of DCE should make further DCE unnecessary.  */
+  df_lr_dce->solutions_dirty = false;
 }
 
 
diff --git a/gcc/df-core.cc b/gcc/df-core.cc
index b0e8a88d433..8fd778a8618 100644
--- a/gcc/df-core.cc
+++ b/gcc/df-core.cc
@@ -806,7 +806,8 @@ rest_of_handle_df_finish (void)
   for (i = 0; i < df->num_problems_defined; i++)
 {
   struct dataflow *dflow = df->problems_in_order[i];
-  dflow->problem->free_fun ();
+  if (dflow->problem->free_fun)
+   dflow->problem->free_fun ();
 }
 
   free (df->postorder);
diff --git a/gcc/df-problems.cc b/gcc/df-problems.cc
index 88ee0dd67fc..bfd24bd1e86 100644
--- a/gcc/df-problems.cc
+++ b/gcc/df-problems.cc
@@ -1054,37 +1054,10 @@ df_lr_transfer_function (int bb_index)
 }
 
 
-/* Run the fast dce as a side effect of building LR.  */
-
 static void
-df_lr_finalize (bitmap all_blocks)
+df_lr_finalize (bitmap)
 {
   df_lr->solutions_dirty = false;
-  if (df->changeable_flags & DF_LR_RUN_DCE)
-{
-  run_fast_df_dce ();
-
-  /* If dce deletes some instructions, we need to recompute the lr
-solution before proceeding further.  The problem is that fast
-dce is a pessimestic dataflow algorithm.  In the case where
-it deletes a statement S inside of a loop, the uses inside of
-S may not be deleted from the dataflow solution because they
-were carried around the loop.  While it is conservatively
-correct to leave these extra bits, the standards of df
-require that we maintain the best possible (least fixed
-point) solution.  The only way to do that is to redo the
-iteration from the beginning.  See PR35805 for an
-example.  */
-  if (df_lr->solutions_dirty)
-   {
- df_clear_flags (DF_LR_RUN_DCE);
- df_lr_alloc (all_blocks);
- df_lr_local_compute (all_blocks);
- df_worklist_dataflow (df_lr, all_blocks, df->postorder, df->n_blocks);
- df_lr_finalize (all_blocks);
- df_set_flags (DF_LR_RUN_DCE);
-   }
-}
 }
 
 
@@ -1266,6 +1239,69 @@ static const struct df_problem problem_LR =
  false   /* Reset blocks on dropping out of blocks_to_analyze.  */
 };
 
+/* Run the fast DCE after building LR.  This is a separate problem so that
+   the "dirty" flag is only cleared after a DCE pass is actually run.  */
+
+static void
+df_lr_dce_finalize (bitmap all_blocks)
+{
+  if (!(df->changeable_flags & DF_LR_RUN_DCE))
+return;
+
+  /* Also clears df_lr_dce->solutions_dirty.  */
+  run_fast_df_dce ();
+
+  /* If dce deletes some instructions, we need to recompute the lr
+ solution before proceeding further.  The problem is that fast
+ 

[gcc r15-1944] rtl-ssa: Add replace_nondebug_insn [PR115785]

2024-07-10 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:e08ebd7d77a216ee2313b585c370333c66497b53

commit r15-1944-ge08ebd7d77a216ee2313b585c370333c66497b53
Author: Richard Sandiford 
Date:   Wed Jul 10 17:01:29 2024 +0100

rtl-ssa: Add replace_nondebug_insn [PR115785]

change_insns is used to change multiple instructions at once, so that
the IR on return is valid & self-consistent.  These changes can involve
moving instructions, and the new position for one instruction might
be expressed in terms of the old position of another instruction
that is changing at the same time.

change_insns therefore adds placeholder instructions to mark each
new instruction position, then replaces each placeholder with the
corresponding real instruction.  This replacement was done in two
steps: removing the old placeholder instruction and inserting the new
real instruction.  But it's more convenient for the upcoming fix for
PR115785 if we do the operation as a single step.  That should also
be slightly more efficient, since e.g. no splay tree operations are
needed.

This operation happens purely on the rtl-ssa instruction chain.
The placeholders are never represented in rtl.

gcc/
PR rtl-optimization/115785
* rtl-ssa/functions.h (function_info::replace_nondebug_insn): Declare.
* rtl-ssa/insns.h (insn_info::order_node::set_uid): New function.
(insn_info::remove_note): Declare.
* rtl-ssa/insns.cc (insn_info::remove_note): New function.
(function_info::replace_nondebug_insn): Likewise.
* rtl-ssa/changes.cc (function_info::change_insns): Use
replace_nondebug_insn instead of remove_insn + add_insn.

Diff:
---
 gcc/rtl-ssa/changes.cc  |  5 +
 gcc/rtl-ssa/functions.h |  1 +
 gcc/rtl-ssa/insns.cc| 42 ++
 gcc/rtl-ssa/insns.h |  4 
 4 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/gcc/rtl-ssa/changes.cc b/gcc/rtl-ssa/changes.cc
index bc80d7da8296..6b6f7cd5d3ab 100644
--- a/gcc/rtl-ssa/changes.cc
+++ b/gcc/rtl-ssa/changes.cc
@@ -874,14 +874,11 @@ function_info::change_insns (array_slice<insn_change *> changes)
}
  else
{
- // Remove the placeholder first so that we have a wider range of
- // program points when inserting INSN.
  insn_info *after = placeholder->prev_any_insn ();
  if (!insn->is_temporary ())
remove_insn (insn);
- remove_insn (placeholder);
+ replace_nondebug_insn (placeholder, insn);
  insn->set_bb (after->bb ());
- add_insn_after (insn, after);
}
}
 }
diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h
index e21346217235..8be04f1aa969 100644
--- a/gcc/rtl-ssa/functions.h
+++ b/gcc/rtl-ssa/functions.h
@@ -274,6 +274,7 @@ private:
   insn_info::order_node *need_order_node (insn_info *);
 
   void add_insn_after (insn_info *, insn_info *);
+  void replace_nondebug_insn (insn_info *, insn_info *);
   void append_insn (insn_info *);
   void remove_insn (insn_info *);
 
diff --git a/gcc/rtl-ssa/insns.cc b/gcc/rtl-ssa/insns.cc
index 68365e323ec6..7e26bfd978fe 100644
--- a/gcc/rtl-ssa/insns.cc
+++ b/gcc/rtl-ssa/insns.cc
@@ -70,6 +70,16 @@ insn_info::add_note (insn_note *note)
   *ptr = note;
 }
 
+// Remove NOTE from the instruction's notes.
+void
+insn_info::remove_note (insn_note *note)
+{
+  insn_note **ptr = &m_first_note;
+  while (*ptr != note)
+ptr = &(*ptr)->m_next_note;
+  *ptr = note->m_next_note;
+}
+
 // Implement compare_with for the case in which this insn and OTHER
 // have the same program point.
 int
@@ -346,6 +346,38 @@ function_info::add_insn_after (insn_info *insn, insn_info *after)
 }
 }
 
+// Replace non-debug instruction OLD_INSN with non-debug instruction NEW_INSN.
+// NEW_INSN is not currently linked.
+void
+function_info::replace_nondebug_insn (insn_info *old_insn, insn_info *new_insn)
+{
+  gcc_assert (!old_insn->is_debug_insn ()
+ && !new_insn->is_debug_insn ()
+ && !new_insn->has_insn_links ());
+
+  insn_info *prev = old_insn->prev_any_insn ();
+  insn_info *next_nondebug = old_insn->next_nondebug_insn ();
+
+  // We should never remove the entry or exit block's instructions.
+  gcc_checking_assert (prev && next_nondebug);
+
+  new_insn->copy_prev_from (old_insn);
+  new_insn->copy_next_from (old_insn);
+
+  prev->set_next_any_insn (new_insn);
+  next_nondebug->set_prev_sametype_insn (new_insn);
+
+  new_insn->set_point (old_insn->point ());
+  if (insn_info::order_node *order = old_insn->get_order_node ())
+{
+  order->set_uid (new_insn->uid ());
+  old_insn->remove_note (order);
+  new_insn->add_note (order);
+}
+
+  old_insn->clear_insn_links ();
+}
+
 // Remove INSN from the function's list of instructions.
 void
 function_info::remove_insn (insn_info *insn)
diff --g

[gcc r15-1945] recog: Handle some mode-changing hardreg propagations

2024-07-10 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:9d20529d94b23275885f380d155fe8671ab5353a

commit r15-1945-g9d20529d94b23275885f380d155fe8671ab5353a
Author: Richard Sandiford 
Date:   Wed Jul 10 17:01:29 2024 +0100

recog: Handle some mode-changing hardreg propagations

insn_propagation would previously only replace (reg:M H) with X
for some hard register H if the uses of H were also in mode M.
This patch extends it to handle simple mode punning too.

The original motivation was to try to get rid of the execution
frequency test in aarch64_split_simd_shift_p, but doing that is
follow-up work.

I tried this on at least one target per CPU directory (as for
the late-combine patches) and it seems to be a small win for
all of them.

The patch includes a couple of updates to the ia32 results.
In pr105033.c, foo3 replaced:

   vmovq   8(%esp), %xmm1
   vpunpcklqdq %xmm1, %xmm0, %xmm0

with:

   vmovhps 8(%esp), %xmm0, %xmm0

In vect-bfloat16-2b.c, 5 of the vec_extract_v32bf_* routines
(specifically the ones with nonzero even indices) replaced
things like:

   movl28(%esp), %eax
   vmovd   %eax, %xmm0

with:

   vpinsrw $0, 28(%esp), %xmm0, %xmm0

(These functions return a bf16, and so only the low 16 bits matter.)

gcc/
* recog.cc (insn_propagation::apply_to_rvalue_1): Handle simple
cases of hardreg propagation in which the register is set and
used in different modes.

gcc/testsuite/
* gcc.target/i386/pr105033.c: Expect vmovhps for the ia32 version
of foo.
* gcc.target/i386/vect-bfloat16-2b.c: Expect more vpinsrws.

Diff:
---
 gcc/recog.cc | 31 +++-
 gcc/testsuite/gcc.target/i386/pr105033.c |  4 ++-
 gcc/testsuite/gcc.target/i386/vect-bfloat16-2b.c |  2 +-
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/gcc/recog.cc b/gcc/recog.cc
index 56370e40e01f..36507f3f57ce 100644
--- a/gcc/recog.cc
+++ b/gcc/recog.cc
@@ -1055,7 +1055,11 @@ insn_propagation::apply_to_rvalue_1 (rtx *loc)
   machine_mode mode = GET_MODE (x);
 
   auto old_num_changes = num_validated_changes ();
-  if (from && GET_CODE (x) == GET_CODE (from) && rtx_equal_p (x, from))
+  if (from
+  && GET_CODE (x) == GET_CODE (from)
+  && (REG_P (x)
+ ? REGNO (x) == REGNO (from)
+ : rtx_equal_p (x, from)))
 {
   /* Don't replace register asms in asm statements; we mustn't
 change the user's register allocation.  */
@@ -1065,11 +1069,26 @@ insn_propagation::apply_to_rvalue_1 (rtx *loc)
  && asm_noperands (PATTERN (insn)) > 0)
return false;
 
+  rtx newval = to;
+  if (GET_MODE (x) != GET_MODE (from))
+   {
+ gcc_assert (REG_P (x) && HARD_REGISTER_P (x));
+ if (REG_NREGS (x) != REG_NREGS (from)
+ || !REG_CAN_CHANGE_MODE_P (REGNO (x), GET_MODE (from),
+GET_MODE (x)))
+   return false;
+ newval = simplify_subreg (GET_MODE (x), to, GET_MODE (from),
+   subreg_lowpart_offset (GET_MODE (x),
+  GET_MODE (from)));
+ if (!newval)
+   return false;
+   }
+
   if (should_unshare)
-   validate_unshare_change (insn, loc, to, 1);
+   validate_unshare_change (insn, loc, newval, 1);
   else
-   validate_change (insn, loc, to, 1);
-  if (mem_depth && !REG_P (to) && !CONSTANT_P (to))
+   validate_change (insn, loc, newval, 1);
+  if (mem_depth && !REG_P (newval) && !CONSTANT_P (newval))
{
  /* We're substituting into an address, but TO will have the
 form expected outside an address.  Canonicalize it if
@@ -1083,9 +1102,9 @@ insn_propagation::apply_to_rvalue_1 (rtx *loc)
{
  /* TO is owned by someone else, so create a copy and
 return TO to its original form.  */
- rtx to = copy_rtx (*loc);
+ newval = copy_rtx (*loc);
  cancel_changes (old_num_changes);
- validate_change (insn, loc, to, 1);
+ validate_change (insn, loc, newval, 1);
}
}
   num_replacements += 1;
diff --git a/gcc/testsuite/gcc.target/i386/pr105033.c b/gcc/testsuite/gcc.target/i386/pr105033.c
index ab05e3b3bc85..10e39783464d 100644
--- a/gcc/testsuite/gcc.target/i386/pr105033.c
+++ b/gcc/testsuite/gcc.target/i386/pr105033.c
@@ -1,6 +1,8 @@
 /* { dg-do compile } */
 /* { dg-options "-march=sapphirerapids -O2" } */
-/* { dg-final { scan-assembler-times {vpunpcklqdq[ \t]+} 3 } } */
+/* { dg-final { scan-assembler-times {vpunpcklqdq[ \t]+} 3 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times {vpunpcklqdq[ \t]+} 2 { target ia32 } } } */
+/* { dg-final

[gcc r15-1947] internal-fn: Reuse SUBREG_PROMOTED_VAR_P handling

2024-07-10 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:5686d3b8ae16d9aeea8d39a56ec6f8ecee661e01

commit r15-1947-g5686d3b8ae16d9aeea8d39a56ec6f8ecee661e01
Author: Richard Sandiford 
Date:   Wed Jul 10 17:37:58 2024 +0100

internal-fn: Reuse SUBREG_PROMOTED_VAR_P handling

expand_fn_using_insn has code to handle SUBREG_PROMOTED_VAR_P
destinations.  Specifically, for:

  (subreg/v:M1 (reg:M2 R) ...)

it creates a new temporary register T, uses it for the output
operand, then sign- or zero-extends the M1 lowpart of T to M2,
storing the result in R.

This patch splits this handling out into helper routines and
uses them for other instances of:

  if (!rtx_equal_p (target, ops[0].value))
    emit_move_insn (target, ops[0].value);

It's quite probable that this doesn't help any of the other cases;
in particular, it shouldn't affect vectors.  But I think it could
be useful for the CRC work.
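
For reference, the usage pattern in the converted expanders looks
roughly like this (a hedged sketch; ICODE and RHS_RTX stand in for
caller-specific values):

    expand_operand ops[2];
    create_call_lhs_operand (&ops[0], lhs_rtx,
                             insn_data[icode].operand[0].mode);
    create_input_operand (&ops[1], rhs_rtx,
                          insn_data[icode].operand[1].mode);
    expand_insn (icode, 2, ops);
    assign_call_lhs (lhs, lhs_rtx, &ops[0]);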

gcc/
* internal-fn.cc (create_call_lhs_operand, assign_call_lhs): New
functions, split out from...
(expand_fn_using_insn): ...here.
(expand_load_lanes_optab_fn): Use them.
(expand_GOMP_SIMT_ENTER_ALLOC): Likewise.
(expand_GOMP_SIMT_LAST_LANE): Likewise.
(expand_GOMP_SIMT_ORDERED_PRED): Likewise.
(expand_GOMP_SIMT_VOTE_ANY): Likewise.
(expand_GOMP_SIMT_XCHG_BFLY): Likewise.
(expand_GOMP_SIMT_XCHG_IDX): Likewise.
(expand_partial_load_optab_fn): Likewise.
(expand_vec_cond_optab_fn): Likewise.
(expand_vec_cond_mask_optab_fn): Likewise.
(expand_RAWMEMCHR): Likewise.
(expand_gather_load_optab_fn): Likewise.
(expand_while_optab_fn): Likewise.
(expand_SPACESHIP): Likewise.

Diff:
---
 gcc/internal-fn.cc | 162 +++--
 1 file changed, 84 insertions(+), 78 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 4948b48bde81..95946bfd6839 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -199,6 +199,58 @@ const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = {
   not_direct
 };
 
+/* Like create_output_operand, but for callers that will use
+   assign_call_lhs afterwards.  */
+
+static void
+create_call_lhs_operand (expand_operand *op, rtx lhs_rtx, machine_mode mode)
+{
+  /* Do not assign directly to a promoted subreg, since there is no
+ guarantee that the instruction will leave the upper bits of the
+ register in the state required by SUBREG_PROMOTED_SIGN.  */
+  rtx dest = lhs_rtx;
+  if (dest && GET_CODE (dest) == SUBREG && SUBREG_PROMOTED_VAR_P (dest))
+dest = NULL_RTX;
+  create_output_operand (op, dest, mode);
+}
+
+/* Move the result of an expanded instruction into the lhs of a gimple call.
+   LHS is the lhs of the call, LHS_RTX is its expanded form, and OP is the
+   result of the expanded instruction.  OP should have been set up by
+   create_call_lhs_operand.  */
+
+static void
+assign_call_lhs (tree lhs, rtx lhs_rtx, expand_operand *op)
+{
+  if (rtx_equal_p (lhs_rtx, op->value))
+return;
+
+  /* If the return value has an integral type, convert the instruction
+ result to that type.  This is useful for things that return an
+ int regardless of the size of the input.  If the instruction result
+ is smaller than required, assume that it is signed.
+
+ If the return value has a nonintegral type, its mode must match
+ the instruction result.  */
+  if (GET_CODE (lhs_rtx) == SUBREG && SUBREG_PROMOTED_VAR_P (lhs_rtx))
+{
+  /* If this is a scalar in a register that is stored in a wider
+mode than the declared mode, compute the result into its
+declared mode and then convert to the wider mode.  */
+  gcc_checking_assert (INTEGRAL_TYPE_P (TREE_TYPE (lhs)));
+  rtx tmp = convert_to_mode (GET_MODE (lhs_rtx), op->value, 0);
+  convert_move (SUBREG_REG (lhs_rtx), tmp,
+   SUBREG_PROMOTED_SIGN (lhs_rtx));
+}
+  else if (GET_MODE (lhs_rtx) == GET_MODE (op->value))
+emit_move_insn (lhs_rtx, op->value);
+  else
+{
+  gcc_checking_assert (INTEGRAL_TYPE_P (TREE_TYPE (lhs)));
+  convert_move (lhs_rtx, op->value, 0);
+}
+}
+
 /* Expand STMT using instruction ICODE.  The instruction has NOUTPUTS
output operands and NINPUTS input operands, where NOUTPUTS is either
0 or 1.  The output operand (if any) comes first, followed by the
@@ -220,15 +272,8 @@ expand_fn_using_insn (gcall *stmt, insn_code icode, unsigned int noutputs,
   gcc_assert (noutputs == 1);
   if (lhs)
lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
-
-  /* Do not assign directly to a promoted subreg, since there is no
-guarantee that the instruction will leave the upper bits of the
-register in the state required by SUBREG_PROMOTED_SIGN.  */
-  rtx dest = lhs_rtx;

[gcc r15-1972] recog: Avoid validate_change shortcut for groups [PR115782]

2024-07-11 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:44fc801e97a8dc626a4806ff4124439003420b20

commit r15-1972-g44fc801e97a8dc626a4806ff4124439003420b20
Author: Richard Sandiford 
Date:   Thu Jul 11 14:44:11 2024 +0100

recog: Avoid validate_change shortcut for groups [PR115782]

In this PR, due to the -f flags, we ended up with:

bb1:  r10=r10
...
bb2:  r10=r10
...
bb3:  ...=r10

with bb1->bb2 and bb1->bb3.

late-combine successfully combined the bb1->bb2 def-use and set
the insn code to NOOP_MOVE_INSN_CODE.  The bb1->bb3 combination
then failed for... reasons.  At this point, everything should have
been rewound to its original state.

However, substituting r10=r10 into r10=r10 gives r10=r10, and
validate_change had an early-out for no-op rtl changes.  This meant
that validate_change did not register a change for the bb2 insn and
so did not save its old insn code.  The NOOP_MOVE_INSN_CODE therefore
persisted even after the attempt had been rewound.

IMO it'd be too cumbersome and error-prone to expect all users of
validate_change to be aware of this possibility.  If code is using
validate_change with in_group=1, I think it has a reasonable expectation
that a change will be registered and that the insn code will be saved
(and restored on cancel).  This patch therefore limits the shortcut
to the !in_group case.
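
For context, the in_group protocol that relies on this looks roughly
as follows (a hedged sketch of the recog API; INSN, LOC* and X* are
stand-ins):

    int old_num_changes = num_validated_changes ();
    validate_change (insn, loc1, x1, 1);   /* queued, not yet tested */
    validate_change (insn, loc2, x2, 1);   /* possibly a no-op change */
    INSN_CODE (insn) = new_code;           /* old code was saved above */
    if (!apply_change_group ())
      /* Restores *LOC1, *LOC2 and INSN_CODE, but only for changes
         that were actually registered.  */
      cancel_changes (old_num_changes);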

gcc/
PR rtl-optimization/115782
* recog.cc (validate_change_1): Suppress early exit for no-op
changes that are part of a group.

gcc/testsuite/
PR rtl-optimization/115782
* gcc.dg/pr115782.c: New test.

Diff:
---
 gcc/recog.cc|  7 ++-
 gcc/testsuite/gcc.dg/pr115782.c | 23 +++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/gcc/recog.cc b/gcc/recog.cc
index 36507f3f57ce..7710c55b7452 100644
--- a/gcc/recog.cc
+++ b/gcc/recog.cc
@@ -230,7 +230,12 @@ validate_change_1 (rtx object, rtx *loc, rtx new_rtx, bool in_group,
   new_len = -1;
 }
 
-  if ((old == new_rtx || rtx_equal_p (old, new_rtx))
+  /* When a change is part of a group, callers expect to be able to change
+ INSN_CODE after making the change and have the code reset to its old
+ value by a later cancel_changes.  We therefore need to register group
+ changes even if they're no-ops.  */
+  if (!in_group
+  && (old == new_rtx || rtx_equal_p (old, new_rtx))
   && (new_len < 0 || XVECLEN (new_rtx, 0) == new_len))
 return true;
 
diff --git a/gcc/testsuite/gcc.dg/pr115782.c b/gcc/testsuite/gcc.dg/pr115782.c
new file mode 100644
index ..f4d11cc6d0f9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr115782.c
@@ -0,0 +1,23 @@
+// { dg-require-effective-target lp64 }
+// { dg-options "-O2 -fno-guess-branch-probability -fgcse-sm -fno-expensive-optimizations -fno-gcse" }
+
+int printf(const char *, ...);
+int a, b, c, d, e, f, g, i, j, m, h;
+long k, l, n, o;
+int main() {
+  int p = e, r = i << a, q = r & b;
+  k = 4073709551613;
+  l = m = c = -(c >> j);
+  d = g ^ h ^ 4073709551613;
+  n = q - h;
+  o = ~d;
+  f = c * 4073709551613 / 409725 ^ r;
+  if ((n && m) || (q && j) || a)
+return 0;
+  d = o | p;
+  if (g)
+printf("0");
+  d = p;
+  c++;
+  return 0;
+}


[gcc r15-1998] aarch64: Avoid alloca in target attribute parsing

2024-07-12 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:7bcef7532b10040bb82567136a208d0c4560767d

commit r15-1998-g7bcef7532b10040bb82567136a208d0c4560767d
Author: Richard Sandiford 
Date:   Fri Jul 12 10:30:22 2024 +0100

aarch64: Avoid alloca in target attribute parsing

The handling of the target attribute used alloca to allocate
a copy of unverified user input, which could exhaust the stack
if the input is too long.  This patch converts it to auto_vecs
instead.

I wondered about converting it to use std::string, which we
already use elsewhere, but that would be more invasive and
controversial.

gcc/
* config/aarch64/aarch64.cc (aarch64_process_one_target_attr)
(aarch64_process_target_attr): Avoid alloca.

Diff:
---
 gcc/config/aarch64/aarch64.cc | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 7f0cc47d0f07..0d41a193ec18 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -19405,8 +19405,10 @@ aarch64_process_one_target_attr (char *arg_str)
   return false;
 }
 
-  char *str_to_check = (char *) alloca (len + 1);
-  strcpy (str_to_check, arg_str);
+  auto_vec<char> buffer;
+  buffer.safe_grow (len + 1);
+  char *str_to_check = buffer.address ();
+  memcpy (str_to_check, arg_str, len + 1);
 
   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
  It is easier to detect and handle it explicitly here rather than going
@@ -19569,8 +19571,10 @@ aarch64_process_target_attr (tree args)
 }
 
   size_t len = strlen (TREE_STRING_POINTER (args));
-  char *str_to_check = (char *) alloca (len + 1);
-  strcpy (str_to_check, TREE_STRING_POINTER (args));
+  auto_vec<char> buffer;
+  buffer.safe_grow (len + 1);
+  char *str_to_check = buffer.address ();
+  memcpy (str_to_check, TREE_STRING_POINTER (args), len + 1);
 
   if (len == 0)
 {


[gcc r15-2008] rtl-ssa: Fix prev_any_insn [PR115785]

2024-07-12 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:6e7053a641393211f52c176e540c8922288ab8db

commit r15-2008-g6e7053a641393211f52c176e540c8922288ab8db
Author: Richard Sandiford 
Date:   Fri Jul 12 15:50:36 2024 +0100

rtl-ssa: Fix prev_any_insn [PR115785]

Bit of a brown paper bag issue, but: due to the representation
of the insn chain, insn_info::prev_any_insn would sometimes skip
over instructions.  This led to an invalid update in the PR when
adding and removing instructions.

I think one of the reasons I failed to spot this when checking
the code is that m_prev_insn_or_last_debug_insn is misnamed:
it's the previous instruction *of the same type* or the last
debug instruction in a group.  The patch therefore renames it to
m_prev_sametype_or_last_debug_insn (with the term prev_sametype
already being used in some accessors).

The reason this didn't show up earlier is that (a) prev_any_insn
is rarely used directly, (b) no instructions were lost from the
def-use chains, and (c) only consecutive debug instructions were
skipped when walking the insn chain.

The chaining scheme makes prev_any_insn more complicated than
next_any_insn, prev_nondebug_insn and next_nondebug_insn, but the
object code produced is still relatively simple.
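
A toy model of the backward hop may help (field names invented; the
real code packs these links into tagged pointer_mux fields).  To find
the instruction before a nondebug insn N that is preceded by a debug
run D1 ... Dn, the walk goes N -> P -> D1 -> Dn:

    struct toy_insn {
      struct toy_insn *next;          /* next insn of either type */
      struct toy_insn *prev_sametype; /* previous insn of the same type */
      struct toy_insn *last_debug;    /* on D1 only: last debug in the run */
      int is_debug;
    };

    static struct toy_insn *
    toy_prev_any (struct toy_insn *n)  /* N is a nondebug insn */
    {
      struct toy_insn *p = n->prev_sametype;  /* P, previous nondebug */
      if (!p || p->next == n)
        return p;                    /* no debug run in between */
      return p->next->last_debug;    /* P->next is D1; D1 links to Dn */
    }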

gcc/
PR rtl-optimization/115785
* rtl-ssa/insns.h (insn_info::prev_insn_or_last_debug_insn)
(insn_info::next_nondebug_or_debug_insn): Remove typedefs.
(insn_info::m_prev_insn_or_last_debug_insn): Rename to...
(insn_info::m_prev_sametype_or_last_debug_insn): ...this.
* rtl-ssa/internals.inl (insn_info::insn_info): Update after
above renaming.
(insn_info::copy_prev_from): Likewise.
(insn_info::set_prev_sametype_insn): Likewise.
(insn_info::set_last_debug_insn): Likewise.
(insn_info::clear_insn_links): Likewise.
(insn_info::has_insn_links): Likewise.
* rtl-ssa/member-fns.inl (insn_info::prev_nondebug_insn): Likewise.
(insn_info::prev_any_insn): Fix moves from non-debug to debug insns.

gcc/testsuite/
PR rtl-optimization/115785
* g++.dg/torture/pr115785.C: New test.

Diff:
---
 gcc/rtl-ssa/insns.h |  54 ++-
 gcc/rtl-ssa/internals.inl   |  13 +-
 gcc/rtl-ssa/member-fns.inl  |  25 +-
 gcc/testsuite/g++.dg/torture/pr115785.C | 696 
 4 files changed, 747 insertions(+), 41 deletions(-)

diff --git a/gcc/rtl-ssa/insns.h b/gcc/rtl-ssa/insns.h
index 80eae5eaa1ec..1304b18e085c 100644
--- a/gcc/rtl-ssa/insns.h
+++ b/gcc/rtl-ssa/insns.h
@@ -339,32 +339,6 @@ private:
   };
   using order_splay_tree = default_rootless_splay_tree<order_node *>;
 
-  // prev_insn_or_last_debug_insn represents a choice between two things:
-  //
-  // (1) A pointer to the previous instruction in the list that has the
-  // same is_debug_insn () value, or null if no such instruction exists.
-  //
-  // (2) A pointer to the end of a sublist of debug instructions.
-  //
-  // (2) is used if this instruction is a debug instruction and the
-  // previous instruction is not.  (1) is used otherwise.
-  //
-  // next_nondebug_or_debug_insn points to the next instruction but also
-  // records whether that next instruction is a debug instruction or a
-  // nondebug instruction.
-  //
-  // Thus the list is chained as follows:
-  //
-  //            ---->           ---->    ---->    ---->            ---->
-  // NONDEBUG         NONDEBUG        DEBUG    DEBUG    DEBUG  NONDEBUG ...
-  //            <----        ^   +--  <----    <----        ^  +--
-  //                         |   |                          |  |
-  //                         |   +--------------------------+  |
-  //                         |                                 |
-  //                         +---------------------------------+
-  using prev_insn_or_last_debug_insn = pointer_mux<insn_info>;
-  using next_nondebug_or_debug_insn = pointer_mux<insn_info>;
-
   insn_info (bb_info *bb, rtx_insn *rtl, int cost_or_uid);
 
   static void print_uid (pretty_printer *, int);
@@ -395,9 +369,33 @@ private:
   void clear_insn_links ();
   bool has_insn_links ();
 
+  // m_prev_sametype_or_last_debug_insn represents a choice between two things:
+  //
+  // (1) A pointer to the previous instruction in the list that has the
+  // same is_debug_insn () value, or null if no such instruction exists.
+  //
+  // (2) A pointer to the end of a sublist of debug instructions.
+  //
+  // (2) is used if this instruction is a debug instruction and the
+  // previous instruction is not.  (1) is used otherwise.
+  //
+  // m_next_nondebug_or_debug_insn points to the next instruction but also
+  // records whether that next instruction is a debug instruction or a
+  // nondebug instruction.
+  //
+  // Thus the list is chained as follows:
+  //
+  //            ---->           ---->    ---->    ---->            ---->
+ 

[gcc r15-2016] Add gcc.gnu.org account names to MAINTAINERS

2024-07-13 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:6fc24a022218c9017e0ee2a9f2913ef85609c265

commit r15-2016-g6fc24a022218c9017e0ee2a9f2913ef85609c265
Author: Richard Sandiford 
Date:   Sat Jul 13 16:22:58 2024 +0100

Add gcc.gnu.org account names to MAINTAINERS

As discussed in the thread starting at:

  https://gcc.gnu.org/pipermail/gcc/2024-June/244199.html

it would be useful to have the @gcc.gnu.org bugzilla account names
in MAINTAINERS.  This is because:

(a) Not every n...@gcc.gnu.org email listed in MAINTAINERS is registered
as a bugzilla user.

(b) Only @gcc.gnu.org accounts tend to have full rights to modify tickets.

(c) A maintainer's name and email address aren't always enough to guess
the bugzilla account name.

(d) The users list on bugzilla has many blank entries for "real name".

However, including @gcc.gnu.org in the account name might encourage
people to use it for ordinary email, rather than just for bugzilla.
This patch goes for the compromise of using the unqualified account
name, with some text near the top of the file to explain its usage.

There isn't room in the area maintainer sections for a new column,
so it seemed better to have the account name only in the Write
After Approval section.  It's then necessary to list all maintainers
there, even if they have more specific roles as well.

Also, there were some entries that didn't line up with the
prevailing columns (they had one tab too many or one tab too few).
It seemed easier to check for and report this, and other things,
if the file used spaces rather than tabs.

There was one instance of an email address without the trailing ">".
The updates to check-MAINTAINERS.py includes a test for that.

The account names in the file were taken from a trawl of the
gcc-cvs archives, with a very small number of manual edits for
ambiguities.  There are a handful of names that I couldn't find;
the new column has "-" for those.  The names were then filtered
against the bugzilla @gcc.gnu.org user list, with those not
present again being blanked out with "-".

ChangeLog:
* MAINTAINERS: Replace tabs with spaces.  Add a bugzilla account
name column to the Write After Approval section.  Line up the
email column and fix an entry that was missing the trailing ">".

contrib/ChangeLog:
* check-MAINTAINERS.py (sort_by_surname): Replace with...
(get_surname): ...this.
(has_tab, is_empty): Delete.
(check_group): Take a list of column positions as argument.
Check that lines conform to these column numbers.  Check that the
final column is an email in angle brackets.  Record surnames on
the fly.
(top level): Reject tabs.  Use paragraph counts to identify which
groups of lines should be checked.  Report missing sections.

Diff:
---
 MAINTAINERS  | 1640 +++---
 contrib/check-MAINTAINERS.py |  120 ++--
 2 files changed, 969 insertions(+), 791 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index d27640708c52..200a223b431f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15,8 +15,13 @@ To report problems in GCC, please visit:
 
   http://gcc.gnu.org/bugs/
 
-Note: when adding someone to a more specific section please remove any
-corresponding entry from the Write After Approval list.
+If you'd like to CC a maintainer in bugzilla, please add @gcc.gnu.org
+to the account name given in the Write After Approval section below.
+Please use the email address given in <...> for direct email communication.
+
+Note: when adding someone who has commit access to a more specific section,
+please also ensure that there is a corresponding entry in the Write After
+Approval list, since that list contains the gcc.gnu.org account name.
 
 Note: please verify that sorting is correct with:
 ./contrib/check-MAINTAINERS.py MAINTAINERS
@@ -24,21 +29,21 @@ Note: please verify that sorting is correct with:
 Maintainers
 ===
 
-   Global Reviewers
-
-Richard Biener 
-Richard Earnshaw   
-Jakub Jelinek  
-Richard Kenner 
-Jeff Law   
-Michael Meissner   
-Jason Merrill  
-David S. Miller
-Joseph Myers   
-Richard Sandiford  
-Bernd Schmidt  
-Ian Lance Taylor   
-Jim Wilson 
+Global Reviewers
+
+Richard Biener  
+Richard Earnshaw  

[gcc r15-2069] rtl-ssa: Enforce earlyclobbers on hard-coded clobbers [PR115891]

2024-07-16 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:9f9faebb8ebfc0103461641cc49ba0b21877b2b1

commit r15-2069-g9f9faebb8ebfc0103461641cc49ba0b21877b2b1
Author: Richard Sandiford 
Date:   Tue Jul 16 15:31:17 2024 +0100

rtl-ssa: Enforce earlyclobbers on hard-coded clobbers [PR115891]

The asm in the testcase has a memory operand and also clobbers ax.
The clobber means that ax cannot be used to hold inputs, which
extends to the address of the memory.

I think I had an implicit assumption that constrain_operands
would enforce this, but in hindsight, that clearly wasn't going
to be true.  constrain_operands only looks at constraints, and
these clobbers are by definition outside the constraint system.
(And that's why they have to be handled conservatively, since there's
no way to distinguish the earlyclobber and non-earlyclobber cases.)

The semantics of hard-coded clobbers are generic enough that I think
they should be handled directly by rtl-ssa, rather than by consumers.
And in the context of rtl-ssa, the easiest way to check for a clash is
to walk the list of input registers, which we already have to hand.
It therefore seemed better not to push this down to a more generic
rtl helper.

The patch detects hard-coded clobbers in the same way as regrename:
by temporarily stubbing out the operands with pc_rtx.
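
The shape of the problem reduces to something like the following
(an illustrative example, not the committed testcase): the address
of the memory operand must never be allocated to eax, because the
asm clobbers eax, yet nothing in the constraints says so.

    void
    f (int *p)
    {
      asm ("" : "+m" (*p) : : "eax");
    }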

gcc/
PR rtl-optimization/115891
* rtl-ssa/changes.cc (find_clobbered_access): New function.
(recog_level2): Use it to check for overlap between input
registers and hard-coded clobbers.  Conditionally reset
recog_data.insn after changing the insn code.

gcc/testsuite/
PR rtl-optimization/115891
* gcc.target/i386/pr115891.c: New test.

Diff:
---
 gcc/rtl-ssa/changes.cc   | 60 +++-
 gcc/testsuite/gcc.target/i386/pr115891.c | 10 ++
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/gcc/rtl-ssa/changes.cc b/gcc/rtl-ssa/changes.cc
index 6b6f7cd5d3ab..43c7b8e1e605 100644
--- a/gcc/rtl-ssa/changes.cc
+++ b/gcc/rtl-ssa/changes.cc
@@ -944,6 +944,25 @@ add_clobber (insn_change &change, add_regno_clobber_fn add_regno_clobber,
   return true;
 }
 
+// See if PARALLEL pattern PAT clobbers any of the registers in ACCESSES.
+// Return one such access if so, otherwise return null.
+static access_info *
+find_clobbered_access (access_array accesses, rtx pat)
+{
+  rtx subpat;
+  for (int i = 0; i < XVECLEN (pat, 0); ++i)
+if (GET_CODE (subpat = XVECEXP (pat, 0, i)) == CLOBBER)
+  {
+   rtx x = XEXP (subpat, 0);
+   if (REG_P (x))
+ for (auto *access : accesses)
+   if (access->regno () >= REGNO (x)
+   && access->regno () < END_REGNO (x))
+ return access;
+  }
+  return nullptr;
+}
+
 // Try to recognize the new form of the insn associated with CHANGE,
 // adding any clobbers that are necessary to make the instruction match
 // an .md pattern.  Return true on success.
@@ -1035,9 +1054,48 @@ recog_level2 (insn_change &change, add_regno_clobber_fn add_regno_clobber)
   pat = newpat;
 }
 
+  INSN_CODE (rtl) = icode;
+  if (recog_data.insn == rtl)
+recog_data.insn = nullptr;
+
+  // See if the pattern contains any hard-coded clobbers of registers
+  // that are also inputs to the instruction.  The standard rtl semantics
+  // treat such clobbers as earlyclobbers, since there is no way of proving
+  // which clobbers conflict with the inputs and which don't.
+  //
+  // (Non-hard-coded clobbers are handled by constraint satisfaction instead.)
+  rtx subpat;
+  if (GET_CODE (pat) == PARALLEL)
+for (int i = 0; i < XVECLEN (pat, 0); ++i)
+  if (GET_CODE (subpat = XVECEXP (pat, 0, i)) == CLOBBER
+ && REG_P (XEXP (subpat, 0)))
+   {
+ // Stub out all operands, so that we can tell which registers
+ // are hard-coded.
+ extract_insn (rtl);
+ for (int j = 0; j < recog_data.n_operands; ++j)
+   *recog_data.operand_loc[j] = pc_rtx;
+
+ auto *use = find_clobbered_access (change.new_uses, pat);
+
+ // Restore the operands.
+ for (int j = 0; j < recog_data.n_operands; ++j)
+   *recog_data.operand_loc[j] = recog_data.operand[j];
+
+ if (use)
+   {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+   {
+ fprintf (dump_file, "register %d is both clobbered"
+  " and used as an input:\n", use->regno ());
+ print_rtl_single (dump_file, pat);
+   }
+ return false;
+   }
+   }
+
   // check_asm_operands checks the constraints after RA, so we don't
   // need to do it again.
-  INSN_CODE (rtl) = icode;
   if (reload_completed && !asm_p)
 {
   extract_insn (rtl);
diff --git a/gcc/testsuite/gcc.target/i386/pr115891.c 
b/gcc/testsuite/gcc.t

[gcc r15-2070] recog: restrict paradoxical mode punning in insn_propagation [PR115901]

2024-07-16 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:851ec9960b084ad37556ec627e6931e985e41a24

commit r15-2070-g851ec9960b084ad37556ec627e6931e985e41a24
Author: Richard Sandiford 
Date:   Tue Jul 16 15:31:17 2024 +0100

recog: restrict paradoxical mode punning in insn_propagation [PR115901]

In g:9d20529d94b23275885f380d155fe8671ab5353a I'd extended
insn_propagation to handle simple cases of hard-reg mode punning.
One of the checks was that the new use mode occupied the same
number of registers as the original definition mode.  However,
as PR115901 shows, we need to avoid increasing the size of any
registers in the punned "to" expression as well.

Specifically, the test includes a DImode move from GPR x0 to
a vector register, followed by a V2DI use of the vector register.
The simplification would then create a V2DI spanning x0 and x1,
manufacturing a new, unwanted use of x1.

Checking for that kind of thing directly seems too cumbersome,
and is not related to the original motivation (which was to improve
handling of shared vector zeros on aarch64).  This patch therefore
restricts the paradoxical case to constants.
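
In RTL terms, the problematic propagation looked like this (register
numbers illustrative):

    (set (reg:DI v0) (reg:DI x0))    ;; DImode move from GPR to FPR
    ... (reg:V2DI v0) ...            ;; V2DI use of the FPR

Punning (reg:DI x0) to V2DI would produce (reg:V2DI x0), which spans
both x0 and x1 and therefore manufactures the unwanted use of x1.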

gcc/
PR rtl-optimization/115901
* recog.cc (insn_propagation::apply_to_rvalue_1): Restrict
paradoxical mode punning to cases where "to" is constant.

gcc/testsuite/
PR rtl-optimization/115901
* gcc.dg/torture/pr115901.c: New test.

Diff:
---
 gcc/recog.cc|  8 
 gcc/testsuite/gcc.dg/torture/pr115901.c | 14 ++
 2 files changed, 22 insertions(+)

diff --git a/gcc/recog.cc b/gcc/recog.cc
index 7710c55b7452..54b317126c29 100644
--- a/gcc/recog.cc
+++ b/gcc/recog.cc
@@ -1082,6 +1082,14 @@ insn_propagation::apply_to_rvalue_1 (rtx *loc)
  || !REG_CAN_CHANGE_MODE_P (REGNO (x), GET_MODE (from),
 GET_MODE (x)))
return false;
+ /* If the reference is paradoxical and the replacement
+value contains registers, we would need to check that the
+simplification below does not increase REG_NREGS for those
+registers either.  It seems simpler to punt on nonconstant
+values instead.  */
+ if (paradoxical_subreg_p (GET_MODE (x), GET_MODE (from))
+ && !CONSTANT_P (to))
+   return false;
  newval = simplify_subreg (GET_MODE (x), to, GET_MODE (from),
subreg_lowpart_offset (GET_MODE (x),
   GET_MODE (from)));
diff --git a/gcc/testsuite/gcc.dg/torture/pr115901.c b/gcc/testsuite/gcc.dg/torture/pr115901.c
new file mode 100644
index ..244af857d887
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115901.c
@@ -0,0 +1,14 @@
+/* { dg-additional-options "-ftrivial-auto-var-init=zero" } */
+
+int p;
+void g(long);
+#define vec16 __attribute__((vector_size(16)))
+
+void l(vec16 long *);
+void h()
+{
+  long inv1;
+  vec16 long  inv = {p, inv1};
+  g (p);
+  l(&inv);
+}


[gcc r15-2071] rtl-ssa: Fix removal of order_nodes [PR115929]

2024-07-16 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:fec38d7987dd6d68b234b0076b57ac66a30a3a1d

commit r15-2071-gfec38d7987dd6d68b234b0076b57ac66a30a3a1d
Author: Richard Sandiford 
Date:   Tue Jul 16 15:33:23 2024 +0100

rtl-ssa: Fix removal of order_nodes [PR115929]

order_nodes are used to implement ordered comparisons between
two insns with the same program point number.  remove_insn would
remove an order_node from its splay tree, but didn't remove it
from the insn.  This caused confusion if the insn was later
reinserted somewhere else that also needed an order_node.

gcc/
PR rtl-optimization/115929
* rtl-ssa/insns.cc (function_info::remove_insn): Remove an
order_node from the instruction as well as from the splay tree.

gcc/testsuite/
PR rtl-optimization/115929
* gcc.dg/torture/pr115929-1.c: New test.

Diff:
---
 gcc/rtl-ssa/insns.cc  |  5 +++-
 gcc/testsuite/gcc.dg/torture/pr115929-1.c | 45 +++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/gcc/rtl-ssa/insns.cc b/gcc/rtl-ssa/insns.cc
index 7e26bfd978fe..bc30734df89f 100644
--- a/gcc/rtl-ssa/insns.cc
+++ b/gcc/rtl-ssa/insns.cc
@@ -393,7 +393,10 @@ void
 function_info::remove_insn (insn_info *insn)
 {
   if (insn_info::order_node *order = insn->get_order_node ())
-insn_info::order_splay_tree::remove_node (order);
+{
+  insn_info::order_splay_tree::remove_node (order);
+  insn->remove_note (order);
+}
 
   if (auto *note = insn->find_note ())
 {
diff --git a/gcc/testsuite/gcc.dg/torture/pr115929-1.c b/gcc/testsuite/gcc.dg/torture/pr115929-1.c
new file mode 100644
index ..19b831ab99ef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115929-1.c
@@ -0,0 +1,45 @@
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-fno-gcse -fschedule-insns -fno-guess-branch-probability 
-fno-tree-fre -fno-tree-ch" } */
+
+int printf(const char *, ...);
+int a[6], b, c;
+char d, l;
+struct {
+  char e;
+  int f;
+  int : 8;
+  long g;
+  long h;
+} i[1][9] = {0};
+unsigned j;
+void n(char p) { b = b >> 8 ^ a[b ^ p]; }
+int main() {
+  int k, o;
+  while (b) {
+k = 0;
+for (; k < 9; k++) {
+  b = b ^ a[l];
+  n(j);
+  if (o)
+printf(&d);
+  long m = i[c][k].f;
+  b = b >> 8 ^ a[l];
+  n(m >> 32);
+  n(m);
+  if (o)
+printf("%d", d);
+  b = b >> 8 ^ l;
+  n(2);
+  n(0);
+  if (o)
+printf(&d);
+  b = b ^ a[l];
+  n(i[c][k].g >> 2);
+  n(i[c][k].g);
+  if (o)
+printf(&d);
+  printf("%d", i[c][k].f);
+}
+  }
+  return 0;
+}


[gcc r15-2109] genattrtab: Drop enum tags, consolidate type names

2024-07-17 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:b19906a029a059fc5015046bae60e3287d842bba

commit r15-2109-gb19906a029a059fc5015046bae60e3287d842bba
Author: Richard Sandiford 
Date:   Wed Jul 17 19:34:46 2024 +0100

genattrtab: Drop enum tags, consolidate type names

genattrtab printed an "enum" tag before references to attribute
enums, but that's redundant in C++.  Removing it means that each
attribute type becomes a single token and can be easily stored
in the attr_desc structure.
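
The effect on the generated headers is simply to drop the tag from
declarations such as (illustrative sketch):

    /* Before: */ extern enum attr_type get_attr_type (rtx_insn *);
    /* After:  */ extern attr_type get_attr_type (rtx_insn *);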

gcc/
* genattrtab.cc (attr_desc::cxx_type): New field.
(write_attr_get, write_attr_value): Use it.
(gen_attr, find_attr, make_internal_attr): Initialize it,
dropping enum tags.

Diff:
---
 gcc/genattrtab.cc | 37 ++---
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/gcc/genattrtab.cc b/gcc/genattrtab.cc
index 03c7d6c74a3b..2a51549ddd43 100644
--- a/gcc/genattrtab.cc
+++ b/gcc/genattrtab.cc
@@ -175,6 +175,7 @@ class attr_desc
 public:
   char *name;  /* Name of attribute.  */
   const char *enum_name;   /* Enum name for DEFINE_ENUM_NAME.  */
+  const char *cxx_type;/* The associated C++ type.  */
   class attr_desc *next;   /* Next attribute.  */
   struct attr_value *first_value; /* First value of this attribute.  */
   struct attr_value *default_val; /* Default value for this attribute.  */
@@ -3083,6 +3084,7 @@ gen_attr (md_rtx_info *info)
   if (GET_CODE (def) == DEFINE_ENUM_ATTR)
 {
   attr->enum_name = XSTR (def, 1);
+  attr->cxx_type = attr->enum_name;
   et = rtx_reader_ptr->lookup_enum_type (XSTR (def, 1));
   if (!et || !et->md_p)
error_at (info->loc, "No define_enum called `%s' defined",
@@ -3092,9 +3094,13 @@ gen_attr (md_rtx_info *info)
  add_attr_value (attr, ev->name);
 }
   else if (*XSTR (def, 1) == '\0')
-attr->is_numeric = 1;
+{
+  attr->is_numeric = 1;
+  attr->cxx_type = "int";
+}
   else
 {
+  attr->cxx_type = concat ("attr_", attr->name, nullptr);
   name_ptr = XSTR (def, 1);
   while ((p = next_comma_elt (&name_ptr)) != NULL)
add_attr_value (attr, p);
@@ -4052,12 +4058,7 @@ write_attr_get (FILE *outf, class attr_desc *attr)
 
   /* Write out start of function, then all values with explicit `case' lines,
  then a `default', then the value with the most uses.  */
-  if (attr->enum_name)
-fprintf (outf, "enum %s\n", attr->enum_name);
-  else if (!attr->is_numeric)
-fprintf (outf, "enum attr_%s\n", attr->name);
-  else
-fprintf (outf, "int\n");
+  fprintf (outf, "%s\n", attr->cxx_type);
 
   /* If the attribute name starts with a star, the remainder is the name of
  the subroutine to use, instead of `get_attr_...'.  */
@@ -4103,13 +4104,8 @@ write_attr_get (FILE *outf, class attr_desc *attr)
  cached_attrs[j] = name;
cached_attr = find_attr (&name, 0);
gcc_assert (cached_attr && cached_attr->is_const == 0);
-   if (cached_attr->enum_name)
- fprintf (outf, "  enum %s", cached_attr->enum_name);
-   else if (!cached_attr->is_numeric)
- fprintf (outf, "  enum attr_%s", cached_attr->name);
-   else
- fprintf (outf, "  int");
-   fprintf (outf, " cached_%s ATTRIBUTE_UNUSED;\n", name);
+   fprintf (outf, "  %s cached_%s ATTRIBUTE_UNUSED;\n",
+cached_attr->cxx_type, name);
j++;
   }
   cached_attr_count = j;
@@ -4395,14 +4391,7 @@ write_attr_value (FILE *outf, class attr_desc *attr, rtx value)
 case ATTR:
   {
class attr_desc *attr2 = find_attr (&XSTR (value, 0), 0);
-   if (attr->enum_name)
- fprintf (outf, "(enum %s)", attr->enum_name);
-   else if (!attr->is_numeric)
- fprintf (outf, "(enum attr_%s)", attr->name);
-   else if (!attr2->is_numeric)
- fprintf (outf, "(int)");
-
-   fprintf (outf, "get_attr_%s (%s)", attr2->name,
+   fprintf (outf, "(%s) get_attr_%s (%s)", attr->cxx_type, attr2->name,
 (attr2->is_const ? "" : "insn"));
   }
   break;
@@ -4672,7 +4661,8 @@ find_attr (const char **name_p, int create)
 
   attr = oballoc (class attr_desc);
   attr->name = DEF_ATTR_STRING (name);
-  attr->enum_name = 0;
+  attr->enum_name = nullptr;
+  attr->cxx_type = nullptr;
   attr->first_value = attr->default_val = NULL;
   attr->is_numeric = attr->is_const = attr->is_special = 0;
   attr->next = attrs[index];
@@ -4693,6 +4683,7 @@ make_internal_attr (const char *name, rtx value, int 
special)
   attr = find_attr (&name, 1);
   gcc_assert (!attr->default_val);
 
+  attr->cxx_type = "int";
   attr->is_numeric = 1;
   attr->is_const = 0;
   attr->is_special = (special & ATTR_SPECIAL) != 0;


[gcc r15-2110] rtl-ssa: Fix split_clobber_group [PR115928]

2024-07-17 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:71b31690a7c52413496e91bcc5ee4c68af2f366f

commit r15-2110-g71b31690a7c52413496e91bcc5ee4c68af2f366f
Author: Richard Sandiford 
Date:   Wed Jul 17 19:38:11 2024 +0100

rtl-ssa: Fix split_clobber_group [PR115928]

One of the goals of the rtl-ssa representation was to allow a
group of consecutive clobbers to be skipped in constant time,
with amortised sublinear insertion and deletion.  This involves
putting consecutive clobbers in groups.  Splitting or joining
groups would be linear if we had to update every clobber on
each update, so the operation to query a clobber's group is
lazy and (again) amortised sublinear.

This means that, when splitting a group into two, we cannot
reuse the old group for one side.  We have to invalidate it,
so that the lazy clobber_info::group query can tell that something
has changed.  The ICE in the PR came from failing to do that.

gcc/
PR rtl-optimization/115928
* rtl-ssa/accesses.h (clobber_group): Add a new constructor that
takes the first, last and root clobbers.
* rtl-ssa/internals.inl (clobber_group::clobber_group): Define it.
* rtl-ssa/accesses.cc (function_info::split_clobber_group): Use it.
Allocate a new group for both sides and invalidate the previous 
group.
(function_info::add_def): After calling split_clobber_group,
remove the old group from the splay tree.

gcc/testsuite/
PR rtl-optimization/115928
* gcc.dg/torture/pr115928.c: New test.

Diff:
---
 gcc/rtl-ssa/accesses.cc | 37 ++---
 gcc/rtl-ssa/accesses.h  |  3 ++-
 gcc/rtl-ssa/internals.inl   | 14 +
 gcc/testsuite/gcc.dg/torture/pr115928.c | 23 
 4 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/gcc/rtl-ssa/accesses.cc b/gcc/rtl-ssa/accesses.cc
index 3f1304fc5bff..5cc05cb4be7f 100644
--- a/gcc/rtl-ssa/accesses.cc
+++ b/gcc/rtl-ssa/accesses.cc
@@ -792,11 +792,11 @@ function_info::merge_clobber_groups (clobber_info *clobber1,
 }
 
 // GROUP spans INSN, and INSN now sets the resource that GROUP clobbers.
-// Split GROUP around INSN and return the clobber that comes immediately
-// before INSN.
+// Split GROUP around INSN, to form two new groups, and return the clobber
+// that comes immediately before INSN.
 //
 // The resource that GROUP clobbers is known to have an associated
-// splay tree.
+// splay tree.  The caller must remove GROUP from the tree on return.
 clobber_info *
 function_info::split_clobber_group (clobber_group *group, insn_info *insn)
 {
@@ -827,27 +827,20 @@ function_info::split_clobber_group (clobber_group *group, insn_info *insn)
   prev = as_a (next->prev_def ());
 }
 
-  // Use GROUP to hold PREV and earlier clobbers.  Create a new group for
-  // NEXT onwards.
+  // Create a new group for each side of the split.  We need to invalidate
+  // the old group so that clobber_info::group can tell whether a lazy
+  // update is needed.
+  clobber_info *first_clobber = group->first_clobber ();
   clobber_info *last_clobber = group->last_clobber ();
-  clobber_group *group1 = group;
-  clobber_group *group2 = allocate<clobber_group> (next);
-
-  // Finish setting up GROUP1, making sure that the roots and extremities
-  // have a correct group pointer.  Leave the rest to be updated lazily.
-  group1->set_last_clobber (prev);
-  tree1->set_group (group1);
-  prev->set_group (group1);
-
-  // Finish setting up GROUP2, with the same approach as for GROUP1.
-  group2->set_first_clobber (next);
-  group2->set_last_clobber (last_clobber);
-  next->set_group (group2);
-  tree2->set_group (group2);
-  last_clobber->set_group (group2);
+  auto *group1 = allocate<clobber_group> (first_clobber, prev, tree1.root ());
+  auto *group2 = allocate<clobber_group> (next, last_clobber, tree2.root ());
 
   // Insert GROUP2 into the splay tree as an immediate successor of GROUP1.
-  def_splay_tree::insert_child (group1, 1, group2);
+  def_splay_tree::insert_child (group, 1, group2);
+  def_splay_tree::insert_child (group, 1, group1);
+
+  // Invalidate the old group.
+  group->set_last_clobber (nullptr);
 
   return prev;
 }
@@ -952,6 +945,8 @@ function_info::add_def (def_info *def)
}
  prev = split_clobber_group (group, insn);
  next = prev->next_def ();
+ tree.remove_root ();
+ last->set_splay_root (tree.root ());
}
   // COMPARISON is < 0 if DEF comes before ROOT or > 0 if DEF comes
   // after ROOT.
diff --git a/gcc/rtl-ssa/accesses.h b/gcc/rtl-ssa/accesses.h
index 7d2916d00c28..27810a02063f 100644
--- a/gcc/rtl-ssa/accesses.h
+++ b/gcc/rtl-ssa/accesses.h
@@ -937,7 +937,8 @@ public:
   void print (pretty_printer *pp) const;
 
 private:
-  clobber_group (clobber_info *clobber);
+  clobber_group (clobber_info *);
+  clobber_group (clobber_info *, clobber_info *, clob

[gcc r15-2111] rtl-ssa: Fix move range canonicalisation [PR115929]

2024-07-17 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:43a7ece873eba47a11c0b21b0068eee53740551a

commit r15-2111-g43a7ece873eba47a11c0b21b0068eee53740551a
Author: Richard Sandiford 
Date:   Wed Jul 17 19:38:12 2024 +0100

rtl-ssa: Fix move range canonicalisation [PR115929]

In this PR, canonicalize_move_range walked off the end of a list
and triggered a null dereference.  There are multiple ways of fixing
that, but I think the approach taken in the patch should be
relatively efficient.

gcc/
PR rtl-optimization/115929
* rtl-ssa/movement.h (canonicalize_move_range): Check for null prev
and next insns and create an invalid move range for them.

gcc/testsuite/
PR rtl-optimization/115929
* gcc.dg/torture/pr115929-2.c: New test.

Diff:
---
 gcc/rtl-ssa/movement.h| 20 ++--
 gcc/testsuite/gcc.dg/torture/pr115929-2.c | 22 ++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/gcc/rtl-ssa/movement.h b/gcc/rtl-ssa/movement.h
index 17d31e0b5cbe..ea1f788df49e 100644
--- a/gcc/rtl-ssa/movement.h
+++ b/gcc/rtl-ssa/movement.h
@@ -76,9 +76,25 @@ inline bool
 canonicalize_move_range (insn_range_info &move_range, insn_info *insn)
 {
   while (move_range.first != insn && !can_insert_after (move_range.first))
-move_range.first = move_range.first->next_nondebug_insn ();
+if (auto *next = move_range.first->next_nondebug_insn ())
+  move_range.first = next;
+else
+  {
+   // Invalidate the range.  prev_nondebug_insn is always nonnull
+   // if next_nondebug_insn is null.
+   move_range.last = move_range.first->prev_nondebug_insn ();
+   return false;
+  }
   while (move_range.last != insn && !can_insert_after (move_range.last))
-move_range.last = move_range.last->prev_nondebug_insn ();
+if (auto *prev = move_range.last->prev_nondebug_insn ())
+  move_range.last = prev;
+else
+  {
+   // Invalidate the range.  next_nondebug_insn is always nonnull
+   // if prev_nondebug_insn is null.
+   move_range.first = move_range.last->next_nondebug_insn ();
+   return false;
+  }
   return bool (move_range);
 }
 
diff --git a/gcc/testsuite/gcc.dg/torture/pr115929-2.c b/gcc/testsuite/gcc.dg/torture/pr115929-2.c
new file mode 100644
index ..c8473a74da6c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115929-2.c
@@ -0,0 +1,22 @@
+/* { dg-additional-options "-fschedule-insns" } */
+
+int a, b, c, d, e, f;
+int main() {
+  if (e && f)
+while (1)
+  while (a)
+a = 0;
+  if (c) {
+if (b)
+  goto g;
+int h = a;
+  i:
+b = ~((b ^ h) | 1 % b);
+if (a)
+g:
+  b = 0;
+  }
+  if (d)
+goto i;
+  return 0;
+}


[gcc r15-2160] arm: Update fp16-aapcs-[24].c after insn_propagation patch

2024-07-19 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:ebdad26ed9902c04704409b729d896a646188634

commit r15-2160-gebdad26ed9902c04704409b729d896a646188634
Author: Richard Sandiford 
Date:   Fri Jul 19 19:09:37 2024 +0100

arm: Update fp16-aapcs-[24].c after insn_propagation patch

These tests used to generate:

bl  swap
ldr r2, [sp, #4]
mov r0, r2  @ __fp16

but g:9d20529d94b23275885f380d155fe8671ab5353a means that we can
load directly into r0:

bl  swap
ldrh r0, [sp, #4]  @ __fp16

This patch updates the tests to "defend" this change.

While there, the scans include:

{mov\tr1, r[03]}

But if the spill of r2 occurs first, there's no real reason why
r2 couldn't be used as the temporary, instead of r3.

The patch tries to update the scans while preserving the spirit
of the originals.

gcc/testsuite/
* gcc.target/arm/fp16-aapcs-2.c: Expect the return value to be
loaded directly from the stack.  Test that the swap generates
two moves out of r0/r1 and two moves in.
* gcc.target/arm/fp16-aapcs-4.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.target/arm/fp16-aapcs-2.c | 8 +---
 gcc/testsuite/gcc.target/arm/fp16-aapcs-4.c | 8 +---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.target/arm/fp16-aapcs-2.c b/gcc/testsuite/gcc.target/arm/fp16-aapcs-2.c
index c34387f57828..12d20560f535 100644
--- a/gcc/testsuite/gcc.target/arm/fp16-aapcs-2.c
+++ b/gcc/testsuite/gcc.target/arm/fp16-aapcs-2.c
@@ -16,6 +16,8 @@ F (__fp16 a, __fp16 b, __fp16 c)
   return c;
 }
 
-/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[0-2]} 3 } }  */
-/* { dg-final { scan-assembler-times {mov\tr1, r[03]} 1 } }  */
-/* { dg-final { scan-assembler-times {mov\tr0, r[0-9]+} 2 } }  */
+/* The swap must include two moves out of r0/r1 and two moves in.  */
+/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[01]} 2 } }  */
+/* { dg-final { scan-assembler-times {mov\tr[01], r[0-9]+} 2 } }  */
+/* c should be spilled around the call.  */
+/* { dg-final { scan-assembler {str\tr2, ([^\n]*).*ldrh\tr0, \1} { target arm_little_endian } } } */
diff --git a/gcc/testsuite/gcc.target/arm/fp16-aapcs-4.c b/gcc/testsuite/gcc.target/arm/fp16-aapcs-4.c
index daac29137aeb..09fa64aa4946 100644
--- a/gcc/testsuite/gcc.target/arm/fp16-aapcs-4.c
+++ b/gcc/testsuite/gcc.target/arm/fp16-aapcs-4.c
@@ -16,6 +16,8 @@ F (__fp16 a, __fp16 b, __fp16 c)
   return c;
 }
 
-/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[0-2]} 3 } }  */
-/* { dg-final { scan-assembler-times {mov\tr1, r[03]} 1 } }  */
-/* { dg-final { scan-assembler-times {mov\tr0, r[0-9]+} 2 } }  */
+/* The swap must include two moves out of r0/r1 and two moves in.  */
+/* { dg-final { scan-assembler-times {mov\tr[0-9]+, r[01]} 2 } }  */
+/* { dg-final { scan-assembler-times {mov\tr[01], r[0-9]+} 2 } }  */
+/* c should be spilled around the call.  */
+/* { dg-final { scan-assembler {str\tr2, ([^\n]*).*ldrh\tr0, \1} { target arm_little_endian } } } */


[gcc r15-2161] Treat boolean vector elements as 0/-1 [PR115406]

2024-07-19 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:348d890c287a7ec4c88d3082ae6105537bd39398

commit r15-2161-g348d890c287a7ec4c88d3082ae6105537bd39398
Author: Richard Sandiford 
Date:   Fri Jul 19 19:09:37 2024 +0100

Treat boolean vector elements as 0/-1 [PR115406]

Previously we built vector boolean constants using 1 for true
elements and 0 for false elements.  This matches the predicates
produced by SVE's PTRUE instruction, but leads to a miscompilation
on AVX, where all bits of a boolean element should be set.

One option for RTL would be to make this target-configurable.
But that isn't really possible at the tree level, where vectors
should work in a more target-independent way.  (There is currently
no way to create a "generic" packed boolean vector, but never say
never :))  And, if we were going to pick a generic behaviour,
it would make sense to use 0/-1 rather than 0/1, for consistency
with integer vectors.

Both behaviours should work with SVE on read, since SVE ignores
the upper bits in each predicate element.  And the choice shouldn't
make much difference for RTL, since all SVE predicate modes are
expressed as vectors of BI, rather than of multi-bit booleans.

I suspect there might be some fallout from this change on SVE.
But I think we should at least give it a go, and see whether any
fallout provides a strong counterargument against the approach.
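
A worked example of the new encoding (a self-contained sketch that
mirrors the loop in native_encode_vector_part; the element width and
values are hypothetical):

    /* Encode {true, false, true} with 2-bit boolean elements.
       The old scheme set only bit 0 of each element (0b010001);
       the new one sets the whole element mask (0b110011).  */
    unsigned int
    encode_bool_vec (const int *elts, unsigned int n, unsigned int elt_bits)
    {
      unsigned int elt_mask = (1u << elt_bits) - 1;
      unsigned int image = 0;
      for (unsigned int i = 0; i < n; ++i)
        if (elts[i])
          image |= elt_mask << (i * elt_bits);
      return image;
    }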

gcc/
PR middle-end/115406
* fold-const.cc (native_encode_vector_part): For vector booleans,
check whether an element is nonzero and, if so, set all of the
corresponding bits in the target image.
* simplify-rtx.cc (native_encode_rtx): Likewise.

gcc/testsuite/
PR middle-end/115406
* gcc.dg/torture/pr115406.c: New test.

Diff:
---
 gcc/fold-const.cc   |  5 +++--
 gcc/simplify-rtx.cc |  3 ++-
 gcc/testsuite/gcc.dg/torture/pr115406.c | 18 ++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 6179a09f9c0a..83c32dd10d4a 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -8100,16 +8100,17 @@ native_encode_vector_part (const_tree expr, unsigned char *ptr, int len,
   unsigned int elts_per_byte = BITS_PER_UNIT / elt_bits;
   unsigned int first_elt = off * elts_per_byte;
   unsigned int extract_elts = extract_bytes * elts_per_byte;
+  unsigned int elt_mask = (1 << elt_bits) - 1;
   for (unsigned int i = 0; i < extract_elts; ++i)
{
  tree elt = VECTOR_CST_ELT (expr, first_elt + i);
  if (TREE_CODE (elt) != INTEGER_CST)
return 0;
 
- if (ptr && wi::extract_uhwi (wi::to_wide (elt), 0, 1))
+ if (ptr && integer_nonzerop (elt))
{
  unsigned int bit = i * elt_bits;
- ptr[bit / BITS_PER_UNIT] |= 1 << (bit % BITS_PER_UNIT);
+ ptr[bit / BITS_PER_UNIT] |= elt_mask << (bit % BITS_PER_UNIT);
}
}
   return extract_bytes;
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 35ba54c62921..a49eefb34d43 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -7232,7 +7232,8 @@ native_encode_rtx (machine_mode mode, rtx x, vec<target_unit> &bytes,
  target_unit value = 0;
  for (unsigned int j = 0; j < BITS_PER_UNIT; j += elt_bits)
{
- value |= (INTVAL (CONST_VECTOR_ELT (x, elt)) & mask) << j;
+ if (INTVAL (CONST_VECTOR_ELT (x, elt)))
+   value |= mask << j;
  elt += 1;
}
  bytes.quick_push (value);
diff --git a/gcc/testsuite/gcc.dg/torture/pr115406.c b/gcc/testsuite/gcc.dg/torture/pr115406.c
new file mode 100644
index ..800ef2f8317e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115406.c
@@ -0,0 +1,18 @@
+// { dg-do run }
+// { dg-additional-options "-mavx512f" { target avx512f_runtime } }
+
+typedef __attribute__((__vector_size__ (1))) signed char V;
+
+signed char
+foo (V v)
+{
+  return ((V) v == v)[0];
+}
+
+int
+main ()
+{
+  signed char x = foo ((V) { });
+  if (x != -1)
+__builtin_abort ();
+}


[gcc r15-2197] aarch64: Tighten aarch64_simd_mem_operand_p [PR115969]

2024-07-22 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:ebde0cc101a3b26bc8c188e0d2f79b649bacc43a

commit r15-2197-gebde0cc101a3b26bc8c188e0d2f79b649bacc43a
Author: Richard Sandiford 
Date:   Mon Jul 22 16:42:15 2024 +0100

aarch64: Tighten aarch64_simd_mem_operand_p [PR115969]

aarch64_simd_mem_operand_p checked for a memory with a POST_INC
or REG address, but it didn't check what kind of register was
being used.  This meant that it allowed DImode FPRs as well as GPRs.

I wondered about rewriting it to use aarch64_classify_address,
but this one-line fix seemed simpler.  The structure then mirrors
the existing early exit in aarch64_classify_address itself:

  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
 REG addressing.  */
  if (advsimd_struct_p
  && TARGET_SIMD
  && !BYTES_BIG_ENDIAN
  && (code != POST_INC && code != REG))
return false;

gcc/
PR target/115969
* config/aarch64/aarch64.cc (aarch64_simd_mem_operand_p): Require
the operand to be a legitimate memory_operand.

gcc/testsuite/
PR target/115969
* gcc.target/aarch64/pr115969.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.cc   | 5 +++--
 gcc/testsuite/gcc.target/aarch64/pr115969.c | 8 
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 89eb66348f77..9e51236ce9fa 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23377,8 +23377,9 @@ aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
 bool
 aarch64_simd_mem_operand_p (rtx op)
 {
-  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
-   || REG_P (XEXP (op, 0)));
+  return (MEM_P (op)
+ && (GET_CODE (XEXP (op, 0)) == POST_INC || REG_P (XEXP (op, 0)))
+ && memory_operand (op, VOIDmode));
 }
 
 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr115969.c b/gcc/testsuite/gcc.target/aarch64/pr115969.c
new file mode 100644
index ..ea46626e617c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr115969.c
@@ -0,0 +1,8 @@
+/* { dg-options "-O2" } */
+
+#define vec8 __attribute__((vector_size(8)))
+vec8 int f(int *a)
+{
+asm("":"+w"(a));
+return (vec8 int){a[0], a[0]};
+}


[gcc r15-2198] rtl-ssa: Add debug routines for def_splay_tree

2024-07-22 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:e62988b77757c6019f0a538492e9851cda689c2e

commit r15-2198-ge62988b77757c6019f0a538492e9851cda689c2e
Author: Richard Sandiford 
Date:   Mon Jul 22 16:42:16 2024 +0100

rtl-ssa: Add debug routines for def_splay_tree

This patch adds debug routines for def_splay_tree, which I found
useful while debugging PR116009.

gcc/
* rtl-ssa/accesses.h (rtl_ssa::pp_def_splay_tree): Declare.
(dump, debug): Add overloads for def_splay_tree.
* rtl-ssa/accesses.cc (rtl_ssa::pp_def_splay_tree): New function.
(dump, debug): Add overloads for def_splay_tree.

Diff:
---
 gcc/rtl-ssa/accesses.cc | 15 +++
 gcc/rtl-ssa/accesses.h  |  3 +++
 2 files changed, 18 insertions(+)

diff --git a/gcc/rtl-ssa/accesses.cc b/gcc/rtl-ssa/accesses.cc
index 5cc05cb4be7f..c77a1ff7ea76 100644
--- a/gcc/rtl-ssa/accesses.cc
+++ b/gcc/rtl-ssa/accesses.cc
@@ -1745,6 +1745,13 @@ rtl_ssa::pp_def_lookup (pretty_printer *pp, def_lookup dl)
   pp_def_mux (pp, dl.mux);
 }
 
+// Print TREE to PP.
+void
+rtl_ssa::pp_def_splay_tree (pretty_printer *pp, def_splay_tree tree)
+{
+  tree.print (pp, pp_def_node);
+}
+
 // Dump RESOURCE to FILE.
 void
 dump (FILE *file, resource_info resource)
@@ -1787,6 +1794,13 @@ dump (FILE *file, def_lookup result)
   dump_using (file, pp_def_lookup, result);
 }
 
+// Print TREE to FILE.
+void
+dump (FILE *file, def_splay_tree tree)
+{
+  dump_using (file, pp_def_splay_tree, tree);
+}
+
 // Debug interfaces to the dump routines above.
 void debug (const resource_info &x) { dump (stderr, x); }
 void debug (const access_info *x) { dump (stderr, x); }
@@ -1794,3 +1808,4 @@ void debug (const access_array &x) { dump (stderr, x); }
 void debug (const def_node *x) { dump (stderr, x); }
 void debug (const def_mux &x) { dump (stderr, x); }
 void debug (const def_lookup &x) { dump (stderr, x); }
+void debug (const def_splay_tree &x) { dump (stderr, x); }
diff --git a/gcc/rtl-ssa/accesses.h b/gcc/rtl-ssa/accesses.h
index 27810a02063f..7d0d7bcfb500 100644
--- a/gcc/rtl-ssa/accesses.h
+++ b/gcc/rtl-ssa/accesses.h
@@ -1052,6 +1052,7 @@ void pp_accesses (pretty_printer *, access_array,
 void pp_def_node (pretty_printer *, const def_node *);
 void pp_def_mux (pretty_printer *, def_mux);
 void pp_def_lookup (pretty_printer *, def_lookup);
+void pp_def_splay_tree (pretty_printer *, def_splay_tree);
 
 }
 
@@ -1063,6 +1064,7 @@ void dump (FILE *, rtl_ssa::access_array,
 void dump (FILE *, const rtl_ssa::def_node *);
 void dump (FILE *, rtl_ssa::def_mux);
 void dump (FILE *, rtl_ssa::def_lookup);
+void dump (FILE *, rtl_ssa::def_splay_tree);
 
 void DEBUG_FUNCTION debug (const rtl_ssa::resource_info *);
 void DEBUG_FUNCTION debug (const rtl_ssa::access_info *);
@@ -1070,3 +1072,4 @@ void DEBUG_FUNCTION debug (const rtl_ssa::access_array);
 void DEBUG_FUNCTION debug (const rtl_ssa::def_node *);
 void DEBUG_FUNCTION debug (const rtl_ssa::def_mux &);
 void DEBUG_FUNCTION debug (const rtl_ssa::def_lookup &);
+void DEBUG_FUNCTION debug (const rtl_ssa::def_splay_tree &);


[gcc r15-2199] rtl-ssa: Avoid using a stale splay tree root [PR116009]

2024-07-22 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:34f33ea801563e2eabb348e8d3e9344a91abfd48

commit r15-2199-g34f33ea801563e2eabb348e8d3e9344a91abfd48
Author: Richard Sandiford 
Date:   Mon Jul 22 16:42:16 2024 +0100

rtl-ssa: Avoid using a stale splay tree root [PR116009]

In the fix for PR115928, I'd failed to notice that "root" was used
later in the function, so needed to be updated.

gcc/
PR rtl-optimization/116009
* rtl-ssa/accesses.cc (function_info::add_def): Set the root
local variable after removing the old clobber group.

gcc/testsuite/
PR rtl-optimization/116009
* gcc.c-torture/compile/pr116009.c: New test.

Diff:
---
 gcc/rtl-ssa/accesses.cc|  3 ++-
 gcc/testsuite/gcc.c-torture/compile/pr116009.c | 23 +++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/gcc/rtl-ssa/accesses.cc b/gcc/rtl-ssa/accesses.cc
index c77a1ff7ea76..0bba8391b002 100644
--- a/gcc/rtl-ssa/accesses.cc
+++ b/gcc/rtl-ssa/accesses.cc
@@ -946,7 +946,8 @@ function_info::add_def (def_info *def)
  prev = split_clobber_group (group, insn);
  next = prev->next_def ();
  tree.remove_root ();
- last->set_splay_root (tree.root ());
+ root = tree.root ();
+ last->set_splay_root (root);
}
   // COMPARISON is < 0 if DEF comes before ROOT or > 0 if DEF comes
   // after ROOT.
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr116009.c 
b/gcc/testsuite/gcc.c-torture/compile/pr116009.c
new file mode 100644
index ..6a888d450f4c
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr116009.c
@@ -0,0 +1,23 @@
+int tt, tt1;
+int y6;
+void ff(void);
+int ttt;
+void g(int var) {
+  do  {
+int t1 = var == 45 || var == 3434;
+if (tt != 0)
+if (t1)
+ff();
+if (tt < 0)
+break;
+if (t1)
+  ff();
+if (tt < 0)
+break;
+ff();
+if (tt1)
+var = y6;
+if (t1)
+  ff();
+} while(1);
+}
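
The underlying pitfall is general; a minimal sketch of the bug pattern,
with a hypothetical splay-tree API standing in for rtl-ssa's:

  node *root = tree.root ();
  tree.remove_root ();        // may promote a different node to the root
  use (root);                 // BUG: still points at the removed node
  root = tree.root ();        // fix: re-query after the structural update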


[gcc r15-2298] rtl-ssa: Fix split_clobber_group tree insertion [PR116044]

2024-07-25 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:72fbd3b2b2a497dbbe6599239bd61c5624203ed0

commit r15-2298-g72fbd3b2b2a497dbbe6599239bd61c5624203ed0
Author: Richard Sandiford 
Date:   Thu Jul 25 08:54:22 2024 +0100

rtl-ssa: Fix split_clobber_group tree insertion [PR116044]

PR116044 is a regression in the testsuite on AMD GCN caused (again)
by the split_clobber_group code.  The first patch in this area
(g:71b31690a7c52413496e91bcc5ee4c68af2f366f) fixed a bug caused
by carrying the old group over as one of the split ones.  That
patch instead:

- created two new groups
- inserted them in the splay tree as neighbours of the old group
- removed the old group, and
- invalidated the old group (to force lazy recomputation when
  a clobber's parent group is queried)

However, this left add_def trying to insert the new definition
relative to a stale splay tree root.  The second patch
(g:34f33ea801563e2eabb348e8d3e9344a91abfd48) attempted to fix
that by inserting it relative to the new root.  But that's not
always correct either.  We specifically want to insert it after
the first of the two new groups, whether that group is the root
or not.

This patch does that, and tries to refactor the code to make
it a bit less brittle.

gcc/
PR rtl-optimization/116044
* rtl-ssa/functions.h (function_info::split_clobber_group): Return
an array of two clobber_groups.
* rtl-ssa/accesses.cc (function_info::split_clobber_group): Return
the new clobber groups.  Don't modify the splay tree here.
(function_info::add_def): Update call accordingly.  Generalize
the splay tree insertion code so that the new definition can be
inserted as a child of any existing node, not just the root.
Fix the insertion used after calling split_clobber_group.

Diff:
---
 gcc/rtl-ssa/accesses.cc | 66 +++--
 gcc/rtl-ssa/functions.h |  3 ++-
 2 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/gcc/rtl-ssa/accesses.cc b/gcc/rtl-ssa/accesses.cc
index 0bba8391b002..5450ea118d1b 100644
--- a/gcc/rtl-ssa/accesses.cc
+++ b/gcc/rtl-ssa/accesses.cc
@@ -792,12 +792,12 @@ function_info::merge_clobber_groups (clobber_info 
*clobber1,
 }
 
 // GROUP spans INSN, and INSN now sets the resource that GROUP clobbers.
-// Split GROUP around INSN, to form two new groups, and return the clobber
-// that comes immediately before INSN.
+// Split GROUP around INSN, to form two new groups.  The first of the
+// returned groups comes before INSN and the second comes after INSN.
 //
-// The resource that GROUP clobbers is known to have an associated
-// splay tree.  The caller must remove GROUP from the tree on return.
-clobber_info *
+// The caller is responsible for updating the def_splay_tree and chaining
+// the defs together.
+std::array
 function_info::split_clobber_group (clobber_group *group, insn_info *insn)
 {
   // Search for either the previous or next clobber in the group.
@@ -835,14 +835,10 @@ function_info::split_clobber_group (clobber_group *group, 
insn_info *insn)
   auto *group1 = allocate (first_clobber, prev, tree1.root ());
   auto *group2 = allocate (next, last_clobber, tree2.root ());
 
-  // Insert GROUP2 into the splay tree as an immediate successor of GROUP1.
-  def_splay_tree::insert_child (group, 1, group2);
-  def_splay_tree::insert_child (group, 1, group1);
-
   // Invalidate the old group.
   group->set_last_clobber (nullptr);
 
-  return prev;
+  return { group1, group2 };
 }
 
 // Add DEF to the end of the function's list of definitions of
@@ -899,7 +895,7 @@ function_info::add_def (def_info *def)
   insn_info *insn = def->insn ();
 
   int comparison;
-  def_node *root = nullptr;
+  def_node *neighbor = nullptr;
   def_info *prev = nullptr;
   def_info *next = nullptr;
   if (*insn > *last->insn ())
@@ -909,8 +905,8 @@ function_info::add_def (def_info *def)
   if (def_splay_tree tree = last->splay_root ())
{
  tree.splay_max_node ();
- root = tree.root ();
- last->set_splay_root (root);
+ last->set_splay_root (tree.root ());
+ neighbor = tree.root ();
}
   prev = last;
 }
@@ -921,8 +917,8 @@ function_info::add_def (def_info *def)
   if (def_splay_tree tree = last->splay_root ())
{
  tree.splay_min_node ();
- root = tree.root ();
- last->set_splay_root (root);
+ last->set_splay_root (tree.root ());
+ neighbor = tree.root ();
}
   next = first;
 }
@@ -931,8 +927,8 @@ function_info::add_def (def_info *def)
   // Search the splay tree for an insertion point.
   def_splay_tree tree = need_def_splay_tree (last);
   comparison = lookup_def (tree, insn);
-  root = tree.root ();
-  last->set_splay_root (root);
+  last->set_splay_root (tree.root ());
+  neighbor
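
To summarise the corrected flow, here is a hedged sketch (the names
follow the patch, but the exact calls in the final add_def are
assumptions based on the removed lines above):

  // split_clobber_group now only builds the two halves...
  auto groups = split_clobber_group (group, insn);    // {before, after}
  // ...and the caller inserts the new definition as the immediate
  // successor (child 1) of the first half, regardless of whether that
  // node happens to be the current splay root.
  def_splay_tree::insert_child (groups[0], 1, new_def_node);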

[gcc r14-9678] aarch64: Use constexpr for out-of-line statics

2024-03-26 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:5be2313bceea7b482c17ee730efe604b910800bd

commit r14-9678-g5be2313bceea7b482c17ee730efe604b910800bd
Author: Richard Sandiford 
Date:   Tue Mar 26 17:27:56 2024 +

aarch64: Use constexpr for out-of-line statics

GCC 4.8 complained about the use of const rather than constexpr
for out-of-line static constexprs.

gcc/
* config/aarch64/aarch64-feature-deps.h: Use constexpr for
out-of-line statics.

Diff:
---
 gcc/config/aarch64/aarch64-feature-deps.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-feature-deps.h 
b/gcc/config/aarch64/aarch64-feature-deps.h
index 3641badb82f..79126db8825 100644
--- a/gcc/config/aarch64/aarch64-feature-deps.h
+++ b/gcc/config/aarch64/aarch64-feature-deps.h
@@ -71,9 +71,9 @@ template struct info;
 static constexpr auto enable = flag | get_enable REQUIRES; \
 static constexpr auto explicit_on = enable | get_enable EXPLICIT_ON; \
   };   \
-  const aarch64_feature_flags info::flag;  \
-  const aarch64_feature_flags info::enable;\
-  const aarch64_feature_flags info::explicit_on; \
+  constexpr aarch64_feature_flags info::flag;  \
+  constexpr aarch64_feature_flags info::enable;
\
+  constexpr aarch64_feature_flags info::explicit_on; \
   constexpr info IDENT ()  \
   {\
 return info ();\
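
For background, the C++11/14 rule at play, in a reduced example that is
unrelated to the aarch64 code itself:

  struct info {
    static constexpr int flag = 1;   // in-class declaration + initialiser
  };
  constexpr int info::flag;          // out-of-line definition, no initialiser

C++17 makes constexpr static data members implicitly inline, so the
out-of-line definitions become redundant; they were still needed here
because GCC can be bootstrapped with much older compilers.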


[gcc r13-8501] asan: Handle poly-int sizes in ASAN_MARK [PR97696]

2024-03-27 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:86b80b049167d28a9ef43aebdfbb80ae5deb0888

commit r13-8501-g86b80b049167d28a9ef43aebdfbb80ae5deb0888
Author: Richard Sandiford 
Date:   Wed Mar 27 15:30:19 2024 +

asan: Handle poly-int sizes in ASAN_MARK [PR97696]

This patch makes the expansion of IFN_ASAN_MARK let through
poly-int-sized objects.  The expansion itself was already generic
enough, but the tests for the fast path were too strict.

gcc/
PR sanitizer/97696
* asan.cc (asan_expand_mark_ifn): Allow the length to be a poly_int.

gcc/testsuite/
PR sanitizer/97696
* gcc.target/aarch64/sve/pr97696.c: New test.

(cherry picked from commit fca6f6fddb22b8665e840f455a7d0318d4575227)

Diff:
---
 gcc/asan.cc|  9 
 gcc/testsuite/gcc.target/aarch64/sve/pr97696.c | 29 ++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/gcc/asan.cc b/gcc/asan.cc
index df732c02150..1a443afedc0 100644
--- a/gcc/asan.cc
+++ b/gcc/asan.cc
@@ -3801,9 +3801,7 @@ asan_expand_mark_ifn (gimple_stmt_iterator *iter)
 }
   tree len = gimple_call_arg (g, 2);
 
-  gcc_assert (tree_fits_shwi_p (len));
-  unsigned HOST_WIDE_INT size_in_bytes = tree_to_shwi (len);
-  gcc_assert (size_in_bytes);
+  gcc_assert (poly_int_tree_p (len));
 
   g = gimple_build_assign (make_ssa_name (pointer_sized_int_node),
   NOP_EXPR, base);
@@ -3812,9 +3810,10 @@ asan_expand_mark_ifn (gimple_stmt_iterator *iter)
   tree base_addr = gimple_assign_lhs (g);
 
   /* Generate direct emission if size_in_bytes is small.  */
-  if (size_in_bytes
-  <= (unsigned)param_use_after_scope_direct_emission_threshold)
+  unsigned threshold = param_use_after_scope_direct_emission_threshold;
+  if (tree_fits_uhwi_p (len) && tree_to_uhwi (len) <= threshold)
 {
+  unsigned HOST_WIDE_INT size_in_bytes = tree_to_uhwi (len);
   const unsigned HOST_WIDE_INT shadow_size
= shadow_mem_size (size_in_bytes);
   const unsigned int shadow_align
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr97696.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr97696.c
new file mode 100644
index 000..8b7de18a07d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr97696.c
@@ -0,0 +1,29 @@
+/* { dg-skip-if "" { no_fsanitize_address } } */
+/* { dg-options "-fsanitize=address -fsanitize-address-use-after-scope" } */
+
+#include 
+
+__attribute__((noinline, noclone)) int
+foo (char *a)
+{
+  int i, j = 0;
+  asm volatile ("" : "+r" (a) : : "memory");
+  for (i = 0; i < 12; i++)
+j += a[i];
+  return j;
+}
+
+int
+main ()
+{
+  int i, j = 0;
+  for (i = 0; i < 4; i++)
+{
+  char a[12];
+  __SVInt8_t freq;
+  __builtin_bcmp (&freq, a, 10);
+  __builtin_memset (a, 0, sizeof (a));
+  j += foo (a);
+}
+  return j;
+}
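
The lengths that now flow through are POLY_INT_CSTs, such as the size
of the __SVInt8_t local in the test above.  As a hedged restatement of
the new gate (paraphrasing the hunk, not adding behaviour):

  /* Constant lengths small enough take the inline shadow-poking path;
     variable (poly_int) lengths fall through to the runtime call, whose
     expansion was already size-agnostic.  */
  if (tree_fits_uhwi_p (len) && tree_to_uhwi (len) <= threshold)
    /* ...emit shadow stores directly...  */;
  else
    /* ...emit a call to __asan_(un)poison_stack_memory...  */;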


[gcc r12-10296] asan: Handle poly-int sizes in ASAN_MARK [PR97696]

2024-03-27 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:51e1629bc11f0ae4b8050712b26521036ed360aa

commit r12-10296-g51e1629bc11f0ae4b8050712b26521036ed360aa
Author: Richard Sandiford 
Date:   Wed Mar 27 17:38:09 2024 +

asan: Handle poly-int sizes in ASAN_MARK [PR97696]

This patch makes the expansion of IFN_ASAN_MARK let through
poly-int-sized objects.  The expansion itself was already generic
enough, but the tests for the fast path were too strict.

gcc/
PR sanitizer/97696
* asan.cc (asan_expand_mark_ifn): Allow the length to be a poly_int.

gcc/testsuite/
PR sanitizer/97696
* gcc.target/aarch64/sve/pr97696.c: New test.

(cherry picked from commit fca6f6fddb22b8665e840f455a7d0318d4575227)

Diff:
---
 gcc/asan.cc|  9 
 gcc/testsuite/gcc.target/aarch64/sve/pr97696.c | 29 ++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/gcc/asan.cc b/gcc/asan.cc
index 20e5ef9d378..72d1ef28be8 100644
--- a/gcc/asan.cc
+++ b/gcc/asan.cc
@@ -3746,9 +3746,7 @@ asan_expand_mark_ifn (gimple_stmt_iterator *iter)
 }
   tree len = gimple_call_arg (g, 2);
 
-  gcc_assert (tree_fits_shwi_p (len));
-  unsigned HOST_WIDE_INT size_in_bytes = tree_to_shwi (len);
-  gcc_assert (size_in_bytes);
+  gcc_assert (poly_int_tree_p (len));
 
   g = gimple_build_assign (make_ssa_name (pointer_sized_int_node),
   NOP_EXPR, base);
@@ -3757,9 +3755,10 @@ asan_expand_mark_ifn (gimple_stmt_iterator *iter)
   tree base_addr = gimple_assign_lhs (g);
 
   /* Generate direct emission if size_in_bytes is small.  */
-  if (size_in_bytes
-  <= (unsigned)param_use_after_scope_direct_emission_threshold)
+  unsigned threshold = param_use_after_scope_direct_emission_threshold;
+  if (tree_fits_uhwi_p (len) && tree_to_uhwi (len) <= threshold)
 {
+  unsigned HOST_WIDE_INT size_in_bytes = tree_to_uhwi (len);
   const unsigned HOST_WIDE_INT shadow_size
= shadow_mem_size (size_in_bytes);
   const unsigned int shadow_align
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr97696.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr97696.c
new file mode 100644
index 000..8b7de18a07d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr97696.c
@@ -0,0 +1,29 @@
+/* { dg-skip-if "" { no_fsanitize_address } } */
+/* { dg-options "-fsanitize=address -fsanitize-address-use-after-scope" } */
+
+#include 
+
+__attribute__((noinline, noclone)) int
+foo (char *a)
+{
+  int i, j = 0;
+  asm volatile ("" : "+r" (a) : : "memory");
+  for (i = 0; i < 12; i++)
+j += a[i];
+  return j;
+}
+
+int
+main ()
+{
+  int i, j = 0;
+  for (i = 0; i < 4; i++)
+{
+  char a[12];
+  __SVInt8_t freq;
+  __builtin_bcmp (&freq, a, 10);
+  __builtin_memset (a, 0, sizeof (a));
+  j += foo (a);
+}
+  return j;
+}


[gcc r11-11295] aarch64: Fix vld1/st1_x4 intrinsic definitions

2024-03-27 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:daee0409d195d346562e423da783d5d1cf8ea175

commit r11-11295-gdaee0409d195d346562e423da783d5d1cf8ea175
Author: Richard Sandiford 
Date:   Wed Mar 27 19:26:56 2024 +

aarch64: Fix vld1/st1_x4 intrinsic definitions

The vld1_x4 and vst1_x4 patterns use XI registers for both 64-bit and
128-bit vectors.  This has the nice property that each individual
vector is within a separate 16-byte subreg of the XI, which should
reduce the number of memory spills needed.  However, it means that the
64-bit vector forms must convert between the native 4x64-bit structure
layout and the padded 4x128-bit XI layout.

The vld4 and vst4 functions did this correctly.  But the vld1x4 and
vst1x4 functions used a union between the native and padded layouts,
even though the layouts are different sizes.

This patch makes vld1x4 and vst1x4 use the same approach as vld4
and vst4.  It also fixes some uses of variables in the user namespace.

gcc/
* config/aarch64/arm_neon.h (vld1_s8_x4, vld1_s16_x4, vld1_s32_x4):
(vld1_u8_x4, vld1_u16_x4, vld1_u32_x4, vld1_f16_x4, vld1_f32_x4):
(vld1_p8_x4, vld1_p16_x4, vld1_s64_x4, vld1_u64_x4, vld1_p64_x4):
(vld1_f64_x4): Avoid using a union of a 256-bit structure and 
512-bit
XImode integer.  Instead use the same approach as the vld4 
intrinsics.
(vst1_s8_x4, vst1_s16_x4, vst1_s32_x4, vst1_u8_x4, vst1_u16_x4):
(vst1_u32_x4, vst1_f16_x4, vst1_f32_x4, vst1_p8_x4, vst1_p16_x4):
(vst1_s64_x4, vst1_u64_x4, vst1_p64_x4, vst1_f64_x4, vld1_bf16_x4):
(vst1_bf16_x4): Likewise for stores.
(vst1q_s8_x4, vst1q_s16_x4, vst1q_s32_x4, vst1q_u8_x4, 
vst1q_u16_x4):
(vst1q_u32_x4, vst1q_f16_x4, vst1q_f32_x4, vst1q_p8_x4, 
vst1q_p16_x4):
(vst1q_s64_x4, vst1q_u64_x4, vst1q_p64_x4, vst1q_f64_x4)
(vst1q_bf16_x4): Rename val parameter to __val.

Diff:
---
 gcc/config/aarch64/arm_neon.h | 469 ++
 1 file changed, 334 insertions(+), 135 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index baa30bd5a9d..8f53f4e1559 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -16498,10 +16498,14 @@ __extension__ extern __inline int8x8x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld1_s8_x4 (const int8_t *__a)
 {
-  union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au;
-  __au.__o
-= __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a);
-  return __au.__i;
+  int8x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0);
+  ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1);
+  ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2);
+  ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3);
+  return ret;
 }
 
 __extension__ extern __inline int8x16x4_t
@@ -16518,10 +16522,14 @@ __extension__ extern __inline int16x4x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld1_s16_x4 (const int16_t *__a)
 {
-  union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au;
-  __au.__o
-= __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a);
-  return __au.__i;
+  int16x4x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0);
+  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1);
+  ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2);
+  ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3);
+  return ret;
 }
 
 __extension__ extern __inline int16x8x4_t
@@ -16538,10 +16546,14 @@ __extension__ extern __inline int32x2x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld1_s32_x4 (const int32_t *__a)
 {
-  union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au;
-  __au.__o
-  = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a);
-  return __au.__i;
+  int32x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0);
+  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1);
+  ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2);
+  ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3);
+  return ret;
 }
 
 __extension__ extern __inline int32x4x4_t
@@ -16558,10 +16570,14 @@ __extension__ extern __inline uint8x8x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vld1_u8_x4 (const uint8_t *__a)
 {
-  union
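
The size mismatch is easy to state concretely.  A hedged illustration
of the removed idiom, with sizes taken from the commit message:

  /* int8x8x4_t is 4 x 8 bytes = 32 bytes, but __builtin_aarch64_simd_xi
     is a 512-bit (64-byte) value with each vector in its own 16-byte
     lane.  Reading __i after writing __o therefore picks up vectors 1-3
     from the wrong offsets.  */
  union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au;

Extracting each D register explicitly with
__builtin_aarch64_get_dregxiv8qi, as the replacement code does, avoids
relying on any particular layout correspondence.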

[gcc r11-11296] asan: Handle poly-int sizes in ASAN_MARK [PR97696]

2024-03-27 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:d98467091bfc23522fefd32f1253e1c9e80331d3

commit r11-11296-gd98467091bfc23522fefd32f1253e1c9e80331d3
Author: Richard Sandiford 
Date:   Wed Mar 27 19:26:57 2024 +

asan: Handle poly-int sizes in ASAN_MARK [PR97696]

This patch makes the expansion of IFN_ASAN_MARK let through
poly-int-sized objects.  The expansion itself was already generic
enough, but the tests for the fast path were too strict.

gcc/
PR sanitizer/97696
* asan.c (asan_expand_mark_ifn): Allow the length to be a poly_int.

gcc/testsuite/
PR sanitizer/97696
* gcc.target/aarch64/sve/pr97696.c: New test.

(cherry picked from commit fca6f6fddb22b8665e840f455a7d0318d4575227)

Diff:
---
 gcc/asan.c |  9 
 gcc/testsuite/gcc.target/aarch64/sve/pr97696.c | 29 ++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/gcc/asan.c b/gcc/asan.c
index ca3020f463c..2aa2be13bf6 100644
--- a/gcc/asan.c
+++ b/gcc/asan.c
@@ -3723,9 +3723,7 @@ asan_expand_mark_ifn (gimple_stmt_iterator *iter)
 }
   tree len = gimple_call_arg (g, 2);
 
-  gcc_assert (tree_fits_shwi_p (len));
-  unsigned HOST_WIDE_INT size_in_bytes = tree_to_shwi (len);
-  gcc_assert (size_in_bytes);
+  gcc_assert (poly_int_tree_p (len));
 
   g = gimple_build_assign (make_ssa_name (pointer_sized_int_node),
   NOP_EXPR, base);
@@ -3734,9 +3732,10 @@ asan_expand_mark_ifn (gimple_stmt_iterator *iter)
   tree base_addr = gimple_assign_lhs (g);
 
   /* Generate direct emission if size_in_bytes is small.  */
-  if (size_in_bytes
-  <= (unsigned)param_use_after_scope_direct_emission_threshold)
+  unsigned threshold = param_use_after_scope_direct_emission_threshold;
+  if (tree_fits_uhwi_p (len) && tree_to_uhwi (len) <= threshold)
 {
+  unsigned HOST_WIDE_INT size_in_bytes = tree_to_uhwi (len);
   const unsigned HOST_WIDE_INT shadow_size
= shadow_mem_size (size_in_bytes);
   const unsigned int shadow_align
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr97696.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr97696.c
new file mode 100644
index 000..8b7de18a07d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr97696.c
@@ -0,0 +1,29 @@
+/* { dg-skip-if "" { no_fsanitize_address } } */
+/* { dg-options "-fsanitize=address -fsanitize-address-use-after-scope" } */
+
+#include 
+
+__attribute__((noinline, noclone)) int
+foo (char *a)
+{
+  int i, j = 0;
+  asm volatile ("" : "+r" (a) : : "memory");
+  for (i = 0; i < 12; i++)
+j += a[i];
+  return j;
+}
+
+int
+main ()
+{
+  int i, j = 0;
+  for (i = 0; i < 4; i++)
+{
+  char a[12];
+  __SVInt8_t freq;
+  __builtin_bcmp (&freq, a, 10);
+  __builtin_memset (a, 0, sizeof (a));
+  j += foo (a);
+}
+  return j;
+}


[gcc r14-9787] aarch64: Recognise svundef idiom [PR114577]

2024-04-04 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:86dce005a1d440154dbf585dde5a2dd4cfac7a05

commit r14-9787-g86dce005a1d440154dbf585dde5a2dd4cfac7a05
Author: Richard Sandiford 
Date:   Thu Apr 4 14:15:49 2024 +0100

aarch64: Recognise svundef idiom [PR114577]

GCC 14 adds the header file arm_neon_sve_bridge.h to help interface
SVE and Advanced SIMD code.  One of the defined idioms is:

  svset_neonq (svundef_TYPE (), advsimd_vector)

which simply reinterprets advsimd_vector as an SVE vector without
regard for what's in the upper bits.

GCC was failing to recognise this idiom, which was likely to
significantly hamper adoption.

There is (AFAIK) no good way of representing an extension with
undefined bits in gimple.  We could add an internal-only builtin
to represent it, but the current framework makes that somewhat
awkward.  It also doesn't seem very forward-looking.

This patch instead goes for the simpler approach of recognising
undefined arguments at expansion time.

gcc/
PR target/114577
* config/aarch64/aarch64-sve-builtins.h 
(aarch64_sve::lookup_fndecl):
Declare.
* config/aarch64/aarch64-sve-builtins.cc 
(aarch64_sve::lookup_fndecl):
New function.
* config/aarch64/aarch64-sve-builtins-base.cc (is_undef): Likewise.
(svset_neonq_impl::expand): Optimise expansions whose first argument
is undefined.

gcc/testsuite/
PR target/114577
* gcc.target/aarch64/sve/acle/general/pr114577_1.c: New test.
* gcc.target/aarch64/sve/acle/general/pr114577_2.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc| 27 +++
 gcc/config/aarch64/aarch64-sve-builtins.cc | 16 
 gcc/config/aarch64/aarch64-sve-builtins.h  |  1 +
 .../aarch64/sve/acle/general/pr114577_1.c  | 94 ++
 .../aarch64/sve/acle/general/pr114577_2.c  | 46 +++
 5 files changed, 184 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index a8c3f84a70b..257ca5bf6ad 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -47,11 +47,31 @@
 #include "aarch64-builtins.h"
 #include "ssa.h"
 #include "gimple-fold.h"
+#include "tree-ssa.h"
 
 using namespace aarch64_sve;
 
 namespace {
 
+/* Return true if VAL is an undefined value.  */
+static bool
+is_undef (tree val)
+{
+  if (TREE_CODE (val) == SSA_NAME)
+{
+  if (ssa_undefined_value_p (val, false))
+   return true;
+
+  gimple *def = SSA_NAME_DEF_STMT (val);
+  if (gcall *call = dyn_cast (def))
+   if (tree fndecl = gimple_call_fndecl (call))
+ if (const function_instance *instance = lookup_fndecl (fndecl))
+   if (instance->base == functions::svundef)
+ return true;
+}
+  return false;
+}
+
 /* Return the UNSPEC_CMLA* unspec for rotation amount ROT.  */
 static int
 unspec_cmla (int rot)
@@ -1142,6 +1162,13 @@ public:
   expand (function_expander &e) const override
   {
 machine_mode mode = e.vector_mode (0);
+
+/* If the SVE argument is undefined, we just need to reinterpret the
+   Advanced SIMD argument as an SVE vector.  */
+if (!BYTES_BIG_ENDIAN
+   && is_undef (CALL_EXPR_ARG (e.call_expr, 0)))
+  return simplify_gen_subreg (mode, e.args[1], GET_MODE (e.args[1]), 0);
+
 rtx_vector_builder builder (VNx16BImode, 16, 2);
 for (unsigned int i = 0; i < 16; i++)
   builder.quick_push (CONST1_RTX (BImode));
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 11f5c5c500c..e124d1f90a5 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -1055,6 +1055,22 @@ get_vector_type (sve_type type)
   return acle_vector_types[type.num_vectors - 1][vector_type];
 }
 
+/* If FNDECL is an SVE builtin, return its function instance, otherwise
+   return null.  */
+const function_instance *
+lookup_fndecl (tree fndecl)
+{
+  if (!fndecl_built_in_p (fndecl, BUILT_IN_MD))
+return nullptr;
+
+  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
+  if ((code & AARCH64_BUILTIN_CLASS) != AARCH64_BUILTIN_SVE)
+return nullptr;
+
+  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
+  return &(*registered_functions)[subcode]->instance;
+}
+
 /* Report an error against LOCATION that the user has tried to use
function FNDECL when extension EXTENSION is disabled.  */
 static void
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h 
b/gcc/config/aarch64/aarch64-sve-builtins.h
index e66729ed635..053006776a9 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins.h
@@ -810,6 +810,7 @@ extern tree acle_svprfop;
 
 bool vector_cst_all_same (tree, unsigned int);
 bool i
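
For reference, here is the idiom as a user would write it; this is a
hedged sketch (the function name is illustrative, and it needs SVE plus
the Advanced SIMD/SVE bridge enabled):

  #include <arm_neon.h>
  #include <arm_sve.h>
  #include <arm_neon_sve_bridge.h>

  svuint8_t
  as_sve (uint8x16_t v)
  {
    /* Reinterpret V as an SVE vector; the upper bits are explicitly
       left undefined, so after this patch no instructions are needed
       on little-endian targets.  */
    return svset_neonq (svundef_u8 (), v);
  }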

[gcc r14-9811] aarch64: Fix bogus cnot optimisation [PR114603]

2024-04-05 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:67cbb1c638d6ab3a9cb77e674541e2b291fb67df

commit r14-9811-g67cbb1c638d6ab3a9cb77e674541e2b291fb67df
Author: Richard Sandiford 
Date:   Fri Apr 5 14:47:15 2024 +0100

aarch64: Fix bogus cnot optimisation [PR114603]

aarch64-sve.md had a pattern that combined:

cmpeq   pb.T, pa/z, zc.T, #0
mov zd.T, pb/z, #1

into:

cnotzd.T, pa/m, zc.T

But this is only valid if pa.T is a ptrue.  In other cases, the
original would set inactive elements of zd.T to 0, whereas the
combined form would copy elements from zc.T.

gcc/
PR target/114603
* config/aarch64/aarch64-sve.md (@aarch64_pred_cnot): Replace
with...
(@aarch64_ptrue_cnot): ...this, requiring operand 1 to be
a ptrue.
(*cnot): Require operand 1 to be a ptrue.
* config/aarch64/aarch64-sve-builtins-base.cc (svcnot_impl::expand):
Use aarch64_ptrue_cnot for _x operations that are predicated
with a ptrue.  Represent other _x operations as fully-defined _m
operations.

gcc/testsuite/
PR target/114603
* gcc.target/aarch64/sve/acle/general/cnot_1.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc| 25 ++
 gcc/config/aarch64/aarch64-sve.md  | 22 +--
 .../gcc.target/aarch64/sve/acle/general/cnot_1.c   | 23 
 3 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 257ca5bf6ad..5be2315a3c6 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -517,15 +517,22 @@ public:
   expand (function_expander &e) const override
   {
 machine_mode mode = e.vector_mode (0);
-if (e.pred == PRED_x)
-  {
-   /* The pattern for CNOT includes an UNSPEC_PRED_Z, so needs
-  a ptrue hint.  */
-   e.add_ptrue_hint (0, e.gp_mode (0));
-   return e.use_pred_x_insn (code_for_aarch64_pred_cnot (mode));
-  }
-
-return e.use_cond_insn (code_for_cond_cnot (mode), 0);
+machine_mode pred_mode = e.gp_mode (0);
+/* The underlying _x pattern is effectively:
+
+dst = src == 0 ? 1 : 0
+
+   rather than an UNSPEC_PRED_X.  Using this form allows autovec
+   constructs to be matched by combine, but it means that the
+   predicate on the src == 0 comparison must be all-true.
+
+   For simplicity, represent other _x operations as fully-defined _m
+   operations rather than using a separate bespoke pattern.  */
+if (e.pred == PRED_x
+   && gen_lowpart (pred_mode, e.args[0]) == CONSTM1_RTX (pred_mode))
+  return e.use_pred_x_insn (code_for_aarch64_ptrue_cnot (mode));
+return e.use_cond_insn (code_for_cond_cnot (mode),
+   e.pred == PRED_x ? 1 : 0);
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index eca8623e587..0434358122d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3363,24 +3363,24 @@
 ;; - CNOT
 ;; -
 
-;; Predicated logical inverse.
-(define_expand "@aarch64_pred_cnot"
+;; Logical inverse, predicated with a ptrue.
+(define_expand "@aarch64_ptrue_cnot"
   [(set (match_operand:SVE_FULL_I 0 "register_operand")
(unspec:SVE_FULL_I
  [(unspec:
 [(match_operand: 1 "register_operand")
- (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (const_int SVE_KNOWN_PTRUE)
  (eq:
-   (match_operand:SVE_FULL_I 3 "register_operand")
-   (match_dup 4))]
+   (match_operand:SVE_FULL_I 2 "register_operand")
+   (match_dup 3))]
 UNSPEC_PRED_Z)
-  (match_dup 5)
-  (match_dup 4)]
+  (match_dup 4)
+  (match_dup 3)]
  UNSPEC_SEL))]
   "TARGET_SVE"
   {
-operands[4] = CONST0_RTX (mode);
-operands[5] = CONST1_RTX (mode);
+operands[3] = CONST0_RTX (mode);
+operands[4] = CONST1_RTX (mode);
   }
 )
 
@@ -3389,7 +3389,7 @@
(unspec:SVE_I
  [(unspec:
 [(match_operand: 1 "register_operand")
- (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (const_int SVE_KNOWN_PTRUE)
  (eq:
(match_operand:SVE_I 2 "register_operand")
(match_operand:SVE_I 3 "aarch64_simd_imm_zero"))]
@@ -11001,4 +11001,4 @@
   GET_MODE (operands[2]));
 return "sel\t%0., %3, %2., %1.";
   }
-)
\ No newline at end of file
+)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnot_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnot_1.c
new file mode 1
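
The semantic distinction is easiest to see from the intrinsics side; a
hedged illustration (not the testcase from the patch):

  #include <arm_sve.h>

  svint32_t
  f (svbool_t pa, svint32_t zc)
  {
    /* With a general predicate PA, inactive lanes of the result must
       be zero, matching the original cmpeq + predicated-mov sequence.
       A merging CNOT would instead copy them from ZC, which is why the
       combined pattern now requires PA to be a ptrue.  */
    return svcnot_z (pa, zc);
  }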

[gcc r14-9833] aarch64: Fix vld1/st1_x4 intrinsic test

2024-04-08 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:278cad85077509b73b1faf32d36f3889c2a5524b

commit r14-9833-g278cad85077509b73b1faf32d36f3889c2a5524b
Author: Swinney, Jonathan 
Date:   Mon Apr 8 14:02:33 2024 +0100

aarch64: Fix vld1/st1_x4 intrinsic test

The test for this intrinsic was failing silently, so it never
reported the bug described in PR 114521.  This patch modifies the test
to report the result.

Bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114521

Signed-off-by: Jonathan Swinney 

gcc/testsuite/
* gcc.target/aarch64/advsimd-intrinsics/vld1x4.c: Exit with a 
nonzero
code if the test fails.

Diff:
---
 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c
index 89b289bb21d..17db262a31a 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c
@@ -3,6 +3,7 @@
 /* { dg-skip-if "unimplemented" { arm*-*-* } } */
 /* { dg-options "-O3" } */
 
+#include <stdbool.h>
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 
@@ -71,13 +72,16 @@ VARIANT (float64, 2, q_f64)
 VARIANTS (TESTMETH)
 
 #define CHECKS(BASE, ELTS, SUFFIX) \
-  if (test_vld1##SUFFIX##_x4 () != 0)  \
-fprintf (stderr, "test_vld1##SUFFIX##_x4");
+  if (test_vld1##SUFFIX##_x4 () != 0) {\
+fprintf (stderr, "test_vld1" #SUFFIX "_x4 failed\n"); \
+failed = true; \
+  }
 
 int
 main (int argc, char **argv)
 {
+  bool failed = false;
   VARIANTS (CHECKS)
 
-  return 0;
+  return (failed) ? 1 : 0;
 }
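
The underlying C preprocessor point: `##' has no effect inside a string
literal, so the old message printed the macro text verbatim, and the
nonzero result was otherwise ignored.  A minimal illustration (the
macro names are illustrative only):

  #define BAD(S)  fprintf (stderr, "test##S")    /* prints "test##S" */
  #define GOOD(S) fprintf (stderr, "test" #S)    /* stringizes S */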


[gcc r14-9836] aarch64: Fix expansion of svsudot [PR114607]

2024-04-08 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:2c1c2485a4b1aca746ac693041e51ea6da5c64ca

commit r14-9836-g2c1c2485a4b1aca746ac693041e51ea6da5c64ca
Author: Richard Sandiford 
Date:   Mon Apr 8 16:53:32 2024 +0100

aarch64: Fix expansion of svsudot [PR114607]

Not sure how this happened, but: svsudot is supposed to be expanded
as USDOT with the operands swapped.  However, a thinko in the
expansion of svsudot meant that the arguments weren't in fact
swapped; the attempted swap was just a no-op.  And the testcases
blithely accepted that.

gcc/
PR target/114607
* config/aarch64/aarch64-sve-builtins-base.cc
(svusdot_impl::expand): Fix botched attempt to swap the operands
for svsudot.

gcc/testsuite/
PR target/114607
* gcc.target/aarch64/sve/acle/asm/sudot_s32.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc   | 2 +-
 gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c | 8 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 5be2315a3c6..0d2edf3f19e 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2809,7 +2809,7 @@ public:
version) is through the USDOT instruction but with the second and third
inputs swapped.  */
 if (m_su)
-  e.rotate_inputs_left (1, 2);
+  e.rotate_inputs_left (1, 3);
 /* The ACLE function has the same order requirements as for svdot.
While there's no requirement for the RTL pattern to have the same sort
of order as that for dot_prod, it's easier to read.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c
index 4b452619eee..e06b69affab 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c
@@ -6,7 +6,7 @@
 
 /*
 ** sudot_s32_tied1:
-** usdot   z0\.s, z2\.b, z4\.b
+** usdot   z0\.s, z4\.b, z2\.b
 ** ret
 */
 TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, svuint8_t,
@@ -17,7 +17,7 @@ TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, 
svuint8_t,
 ** sudot_s32_tied2:
 ** mov (z[0-9]+)\.d, z0\.d
 ** movprfx z0, z4
-** usdot   z0\.s, z2\.b, \1\.b
+** usdot   z0\.s, \1\.b, z2\.b
 ** ret
 */
 TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, svuint8_t,
@@ -27,7 +27,7 @@ TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, 
svuint8_t,
 /*
 ** sudot_w0_s32_tied:
 ** mov (z[0-9]+\.b), w0
-** usdot   z0\.s, z2\.b, \1
+** usdot   z0\.s, \1, z2\.b
 ** ret
 */
 TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, uint8_t,
@@ -37,7 +37,7 @@ TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, 
uint8_t,
 /*
 ** sudot_9_s32_tied:
 ** mov (z[0-9]+\.b), #9
-** usdot   z0\.s, z2\.b, \1
+** usdot   z0\.s, \1, z2\.b
 ** ret
 */
 TEST_TRIPLE_Z (sudot_9_s32_tied, svint32_t, svint8_t, uint8_t,
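
A hedged reading of the interface involved (the exact signature is an
assumption inferred from the fix): rotate_inputs_left appears to take a
half-open window of argument indices, so the buggy call named a
one-element window.

  e.rotate_inputs_left (1, 2);   // window [1, 2): one element, a no-op
  e.rotate_inputs_left (1, 3);   // window [1, 3): swaps args 1 and 2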


[gcc r14-9925] aarch64: Fix _BitInt testcases

2024-04-11 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:b87ba79200f2a727aa5c523abcc5c03fa11fc007

commit r14-9925-gb87ba79200f2a727aa5c523abcc5c03fa11fc007
Author: Andre Vieira (lists) 
Date:   Thu Apr 11 17:54:37 2024 +0100

aarch64: Fix _BitInt testcases

This patch fixes some testisms introduced by:

commit 5aa3fec38cc6f52285168b161bab1a869d864b44
Author: Andre Vieira 
Date:   Wed Apr 10 16:29:46 2024 +0100

 aarch64: Add support for _BitInt

The testcases were relying on an unnecessary sign-extend that is no longer
generated.

The tested version was just slightly behind top of trunk when the patch
was committed, and the codegen had changed, for the better, by then.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/bitfield-bitint-abi-align16.c (g1, g8, g16, 
g1p, g8p,
g16p): Remove unnecessary sbfx.
* gcc.target/aarch64/bitfield-bitint-abi-align8.c (g1, g8, g16, 
g1p, g8p,
g16p): Likewise.

Diff:
---
 .../aarch64/bitfield-bitint-abi-align16.c  | 30 +-
 .../aarch64/bitfield-bitint-abi-align8.c   | 30 +-
 2 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c 
b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
index 3f292a45f95..4a228b0a1ce 100644
--- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
+++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
@@ -55,9 +55,8 @@
 ** g1:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f1
 */
@@ -66,9 +65,8 @@
 ** g8:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f8
 */
@@ -76,9 +74,8 @@
 ** g16:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f16
 */
@@ -107,9 +104,8 @@
 /*
 ** g1p:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f1p
@@ -117,9 +113,8 @@
 /*
 ** g8p:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f8p
@@ -128,9 +123,8 @@
 ** g16p:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f16p
 */
diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c 
b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
index da3c23550ba..e7f773640f0 100644
--- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
+++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
@@ -54,9 +54,8 @@
 /*
 ** g1:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f1
@@ -65,9 +64,8 @@
 /*
 ** g8:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f8
@@ -76,9 +74,8 @@
 ** g16:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f16
 */
@@ -107,9 +104,8 @@
 /*
 ** g1p:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f1p
@@ -117,9 +113,8 @@
 /*
 ** g8p:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and 

[gcc r15-2313] rtl-ssa: Define INCLUDE_ARRAY

2024-07-25 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:d6849aa926665cbee8bf87822401ca44f881753f

commit r15-2313-gd6849aa926665cbee8bf87822401ca44f881753f
Author: Richard Sandiford 
Date:   Thu Jul 25 13:25:32 2024 +0100

rtl-ssa: Define INCLUDE_ARRAY

g:72fbd3b2b2a497dbbe6599239bd61c5624203ed0 added a use of std::array
without explicitly forcing <array> to be included.  That didn't cause
problems in my local builds but understandably did for some people.

gcc/
* doc/rtl.texi: Document the need to define INCLUDE_ARRAY before
including rtl-ssa.h.
* rtl-ssa.h: Likewise (in comment).
* config/aarch64/aarch64-cc-fusion.cc: Add INCLUDE_ARRAY.
* config/aarch64/aarch64-early-ra.cc: Likewise.
* config/riscv/riscv-avlprop.cc: Likewise.
* config/riscv/riscv-vsetvl.cc: Likewise.
* fwprop.cc: Likewise.
* late-combine.cc: Likewise.
* pair-fusion.cc: Likewise.
* rtl-ssa/accesses.cc: Likewise.
* rtl-ssa/blocks.cc: Likewise.
* rtl-ssa/changes.cc: Likewise.
* rtl-ssa/functions.cc: Likewise.
* rtl-ssa/insns.cc: Likewise.
* rtl-ssa/movement.cc: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-cc-fusion.cc | 1 +
 gcc/config/aarch64/aarch64-early-ra.cc  | 1 +
 gcc/config/riscv/riscv-avlprop.cc   | 1 +
 gcc/config/riscv/riscv-vsetvl.cc| 1 +
 gcc/doc/rtl.texi| 1 +
 gcc/fwprop.cc   | 1 +
 gcc/late-combine.cc | 1 +
 gcc/pair-fusion.cc  | 1 +
 gcc/rtl-ssa.h   | 1 +
 gcc/rtl-ssa/accesses.cc | 1 +
 gcc/rtl-ssa/blocks.cc   | 1 +
 gcc/rtl-ssa/changes.cc  | 1 +
 gcc/rtl-ssa/functions.cc| 1 +
 gcc/rtl-ssa/insns.cc| 1 +
 gcc/rtl-ssa/movement.cc | 1 +
 15 files changed, 15 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc 
b/gcc/config/aarch64/aarch64-cc-fusion.cc
index e97c26682d07..3af8c00d8462 100644
--- a/gcc/config/aarch64/aarch64-cc-fusion.cc
+++ b/gcc/config/aarch64/aarch64-cc-fusion.cc
@@ -63,6 +63,7 @@
 
 #define INCLUDE_ALGORITHM
 #define INCLUDE_FUNCTIONAL
+#define INCLUDE_ARRAY
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
diff --git a/gcc/config/aarch64/aarch64-early-ra.cc 
b/gcc/config/aarch64/aarch64-early-ra.cc
index 99324423ee5a..5f269d029b45 100644
--- a/gcc/config/aarch64/aarch64-early-ra.cc
+++ b/gcc/config/aarch64/aarch64-early-ra.cc
@@ -40,6 +40,7 @@
 
 #define INCLUDE_ALGORITHM
 #define INCLUDE_FUNCTIONAL
+#define INCLUDE_ARRAY
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
diff --git a/gcc/config/riscv/riscv-avlprop.cc 
b/gcc/config/riscv/riscv-avlprop.cc
index 71d6f6a04957..caf5a93b234e 100644
--- a/gcc/config/riscv/riscv-avlprop.cc
+++ b/gcc/config/riscv/riscv-avlprop.cc
@@ -65,6 +65,7 @@ along with GCC; see the file COPYING3.  If not see
 #define IN_TARGET_CODE 1
 #define INCLUDE_ALGORITHM
 #define INCLUDE_FUNCTIONAL
+#define INCLUDE_ARRAY
 
 #include "config.h"
 #include "system.h"
diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index bbea2b5fd4f3..017efa8bc17e 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -63,6 +63,7 @@ along with GCC; see the file COPYING3.  If not see
 #define IN_TARGET_CODE 1
 #define INCLUDE_ALGORITHM
 #define INCLUDE_FUNCTIONAL
+#define INCLUDE_ARRAY
 
 #include "config.h"
 #include "system.h"
diff --git a/gcc/doc/rtl.texi b/gcc/doc/rtl.texi
index a1ede418c21e..0cb36aae09bd 100644
--- a/gcc/doc/rtl.texi
+++ b/gcc/doc/rtl.texi
@@ -4405,6 +4405,7 @@ A pass that wants to use the RTL SSA form should start 
with the following:
 @smallexample
 #define INCLUDE_ALGORITHM
 #define INCLUDE_FUNCTIONAL
+#define INCLUDE_ARRAY
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
diff --git a/gcc/fwprop.cc b/gcc/fwprop.cc
index bfdc7a1b7492..2ebb2f146cc6 100644
--- a/gcc/fwprop.cc
+++ b/gcc/fwprop.cc
@@ -20,6 +20,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #define INCLUDE_ALGORITHM
 #define INCLUDE_FUNCTIONAL
+#define INCLUDE_ARRAY
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
diff --git a/gcc/late-combine.cc b/gcc/late-combine.cc
index 789d734692a8..2b62e2956ede 100644
--- a/gcc/late-combine.cc
+++ b/gcc/late-combine.cc
@@ -30,6 +30,7 @@
 
 #define INCLUDE_ALGORITHM
 #define INCLUDE_FUNCTIONAL
+#define INCLUDE_ARRAY
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
index 31d2c21c88f9..cb0374f426b0 100644
--- a/gcc/pair-fusion.cc
+++ b/gcc/pair-fusion.cc
@@ -21,6 +21,7 @@
 #define INCLUDE_FUNCTIONAL
 #define INCLUDE_LIST
 #define INCLUDE_TYPE_TRAITS
+#define INCLUDE_ARRAY
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
diff --git a/gcc/rtl-ssa.h 
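
The INCLUDE_* convention exists because gcc/system.h owns all
standard-header inclusion and later poisons various identifiers, after
which no system header may be included.  A simplified sketch of the
gating, as an assumption about system.h's structure rather than a
verbatim quote:

  #ifdef INCLUDE_ALGORITHM
  # include <algorithm>
  #endif
  #ifdef INCLUDE_ARRAY
  # include <array>
  #endif
  /* ...followed eventually by #pragma GCC poison for identifiers that
     passes must not use directly.  */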

[gcc r15-2429] recog: Disallow subregs in mode-punned value [PR115881]

2024-07-31 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:d63b6d8b494483b0049370ff0dfeee0e1d10e54b

commit r15-2429-gd63b6d8b494483b0049370ff0dfeee0e1d10e54b
Author: Richard Sandiford 
Date:   Wed Jul 31 09:23:35 2024 +0100

recog: Disallow subregs in mode-punned value [PR115881]

In g:9d20529d94b23275885f380d155fe8671ab5353a, I'd extended
insn_propagation to handle simple cases of hard-reg mode punning.
The punned "to" value was created using simplify_subreg rather
than simplify_gen_subreg, on the basis that hard-coded subregs
aren't generally useful after RA (where hard-reg propagation is
expected to happen).

This PR is about a case where the subreg gets pushed into the
operands of a plus, but the subreg on one of the operands
cannot be simplified.  Specifically, we have to generate
(subreg:SI (reg:DI sp) 0) rather than (reg:SI sp), since all
references to the stack pointer must be via stack_pointer_rtx.

However, code in x86 (reasonably) expects no subregs of registers
to appear after RA, except for special cases like strict_low_part.
This leads to an awkward situation where we can't ban subregs of sp
(because of the strict_low_part use), can't allow direct references
to sp in other modes (because of the stack_pointer_rtx requirement),
and can't allow rvalue uses of the subreg (because of the "no subregs
after RA" assumption).  It all seems a bit of a mess...

I sat on this for a while in the hope that a clean solution might
become apparent, but in the end, I think we'll just have to check
manually for nested subregs and punt on them.

gcc/
PR rtl-optimization/115881
* recog.cc: Include rtl-iter.h.
(insn_propagation::apply_to_rvalue_1): Check that the result
of simplify_subreg does not include nested subregs.

gcc/testsuite/
PR rtl-optimization/115881
* gcc.c-torture/compile/pr115881.c: New test.

Diff:
---
 gcc/recog.cc   | 21 +
 gcc/testsuite/gcc.c-torture/compile/pr115881.c | 16 
 2 files changed, 37 insertions(+)

diff --git a/gcc/recog.cc b/gcc/recog.cc
index 54b317126c29..23e4820180f8 100644
--- a/gcc/recog.cc
+++ b/gcc/recog.cc
@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "reload.h"
 #include "tree-pass.h"
 #include "function-abi.h"
+#include "rtl-iter.h"
 
 #ifndef STACK_POP_CODE
 #if STACK_GROWS_DOWNWARD
@@ -1082,6 +1083,7 @@ insn_propagation::apply_to_rvalue_1 (rtx *loc)
  || !REG_CAN_CHANGE_MODE_P (REGNO (x), GET_MODE (from),
 GET_MODE (x)))
return false;
+
  /* If the reference is paradoxical and the replacement
 value contains registers, we would need to check that the
 simplification below does not increase REG_NREGS for those
@@ -1090,11 +1092,30 @@ insn_propagation::apply_to_rvalue_1 (rtx *loc)
  if (paradoxical_subreg_p (GET_MODE (x), GET_MODE (from))
  && !CONSTANT_P (to))
return false;
+
  newval = simplify_subreg (GET_MODE (x), to, GET_MODE (from),
subreg_lowpart_offset (GET_MODE (x),
   GET_MODE (from)));
  if (!newval)
return false;
+
+ /* Check that the simplification didn't just push an explicit
+subreg down into subexpressions.  In particular, for a register
+R that has a fixed mode, such as the stack pointer, a subreg of:
+
+  (plus:M (reg:M R) (const_int C))
+
+would be:
+
+  (plus:N (subreg:N (reg:M R) ...) (const_int C'))
+
+But targets can legitimately assume that subregs of hard registers
+will not be created after RA (except in special circumstances,
+such as strict_low_part).  */
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, newval, NONCONST)
+   if (GET_CODE (*iter) == SUBREG)
+ return false;
}
 
   if (should_unshare)
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr115881.c 
b/gcc/testsuite/gcc.c-torture/compile/pr115881.c
new file mode 100644
index ..8379704c4c8b
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr115881.c
@@ -0,0 +1,16 @@
+typedef unsigned u32;
+int list_is_head();
+void tu102_acr_wpr_build_acr_0_0_0(int, long, u32);
+void tu102_acr_wpr_build() {
+  u32 offset = 0;
+  for (; list_is_head();) {
+int hdr;
+u32 _addr = offset, _size = sizeof(hdr), *_data = &hdr;
+while (_size--) {
+  tu102_acr_wpr_build_acr_0_0_0(0, _addr, *_data++);
+  _addr += 4;
+}
+offset += sizeof(hdr);
+  }
+  tu102_acr_wpr_build_acr_0_0_0(0, offset, 0);
+}
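
The punt uses the generic rtx walker.  Factored out, the check reads
like the hedged sketch below; contains_subreg_p is an illustrative
name, not an existing GCC function:

  static bool
  contains_subreg_p (rtx x)
  {
    subrtx_iterator::array_type array;
    FOR_EACH_SUBRTX (iter, array, x, NONCONST)
      if (GET_CODE (*iter) == SUBREG)
        return true;
    return false;
  }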


[gcc r15-2436] aarch64: Add march flags for +fp8 arch extensions

2024-07-31 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:72ebbc3b2bb9bb3649f1222f731a9b4d0197499e

commit r15-2436-g72ebbc3b2bb9bb3649f1222f731a9b4d0197499e
Author: Claudio Bantaloukas 
Date:   Wed Jul 31 14:42:39 2024 +0100

aarch64: Add march flags for +fp8 arch extensions

This introduces the relevant flags to enable access to the fpmr register 
and fp8 intrinsics, which will be added subsequently.

gcc/ChangeLog:

* config/aarch64/aarch64-option-extensions.def (fp8): New.
* config/aarch64/aarch64.h (TARGET_FP8): Likewise.
* doc/invoke.texi (AArch64 Options): Document new -march flags
and extensions.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/acle/fp8.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-option-extensions.def |  2 ++
 gcc/config/aarch64/aarch64.h |  3 +++
 gcc/doc/invoke.texi  |  2 ++
 gcc/testsuite/gcc.target/aarch64/acle/fp8.c  | 20 
 4 files changed, 27 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-option-extensions.def 
b/gcc/config/aarch64/aarch64-option-extensions.def
index 42ec0eec31e2..6998627f3774 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -232,6 +232,8 @@ AARCH64_OPT_EXTENSION("the", THE, (), (), (), "the")
 
 AARCH64_OPT_EXTENSION("gcs", GCS, (), (), (), "gcs")
 
+AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8")
+
 #undef AARCH64_OPT_FMV_EXTENSION
 #undef AARCH64_OPT_EXTENSION
 #undef AARCH64_FMV_FEATURE
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index b7e330438d9b..2e75c6b81e20 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -463,6 +463,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
 && (aarch64_tune_params.extra_tuning_flags \
 & AARCH64_EXTRA_TUNE_AVOID_PRED_RMW))
 
+/* fp8 instructions are enabled through +fp8.  */
+#define TARGET_FP8 AARCH64_HAVE_ISA (FP8)
+
 /* Standard register usage.  */
 
 /* 31 64-bit general purpose registers R0-R30:
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 86f9b5d1fe5e..ef2213b4e841 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -21849,6 +21849,8 @@ Enable support for Armv9.4-a Guarded Control Stack 
extension.
 Enable support for Armv8.9-a/9.4-a translation hardening extension.
 @item rcpc3
 Enable the RCpc3 (Release Consistency) extension.
+@item fp8
+Enable the fp8 (8-bit floating point) extension.
 
 @end table
 
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c 
b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
new file mode 100644
index ..459442be1557
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
@@ -0,0 +1,20 @@
+/* Test the fp8 ACLE intrinsics family.  */
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=armv8-a" } */
+
+#include <arm_acle.h>
+
+#ifdef __ARM_FEATURE_FP8
+#error "__ARM_FEATURE_FP8 feature macro defined."
+#endif
+
+#pragma GCC push_options
+#pragma GCC target("arch=armv9.4-a+fp8")
+
+/* We do not define __ARM_FEATURE_FP8 until all
+   relevant features have been added. */
+#ifdef __ARM_FEATURE_FP8
+#error "__ARM_FEATURE_FP8 feature macro defined."
+#endif
+
+#pragma GCC pop_options
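
As with other architecture extensions, the flag composes with -march on
the command line or per function.  A hedged usage sketch (the function
name is illustrative):

  /* Roughly equivalent to compiling this function with
     -march=armv9.4-a+fp8.  */
  __attribute__ ((target ("arch=armv9.4-a+fp8")))
  void
  uses_fp8 (void)
  {
    /* fp8 intrinsics will slot in here once later patches add them.  */
  }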


[gcc r15-2437] aarch64: Add support for moving fpm system register

2024-07-31 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:6d43c3669a6bd9e84f6d3941e19cc025de59ece0

commit r15-2437-g6d43c3669a6bd9e84f6d3941e19cc025de59ece0
Author: Claudio Bantaloukas 
Date:   Wed Jul 31 14:42:40 2024 +0100

aarch64: Add support for moving fpm system register

Unlike most system registers, fpmr can be heavily written to in code that
exercises the fp8 functionality. That is because every fp8 intrinsic call
can potentially change the value of fpmr.
Rather than just use an unspec, we treat the fpmr system register like
all other registers and use a move operation to read and write to it.

We introduce a new class of moveable system registers that, currently,
only accepts fpmr and a new constraint, Umv, that allows us to
selectively use mrs and msr instructions when expanding rtl for them.
Given that there is code that depends on "real" registers coming before
"fake" ones, we introduce a new constant FPM_REGNUM that uses an
existing value and renumber registers below that.
This requires us to update the bitmaps that describe which registers
belong to each register class.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_hard_regno_nregs): Add
support for MOVEABLE_SYSREGS class.
(aarch64_hard_regno_mode_ok): Allow reads and writes to fpmr.
(aarch64_regno_regclass): Support MOVEABLE_SYSREGS class.
(aarch64_class_max_nregs): Likewise.
* config/aarch64/aarch64.h (FIXED_REGISTERS): add fpmr.
(CALL_REALLY_USED_REGISTERS): Likewise.
(REGISTER_NAMES): Likewise.
(enum reg_class): Add MOVEABLE_SYSREGS class.
(REG_CLASS_NAMES): Likewise.
(REG_CLASS_CONTENTS): Update class bitmaps to deal with fpmr,
the new MOVEABLE_REGS class and renumbering of registers.
* config/aarch64/aarch64.md: (FPM_REGNUM): added new register
number, reusing old value.
(FFR_REGNUM): Renumber.
(FFRT_REGNUM): Likewise.
(LOWERING_REGNUM): Likewise.
(TPIDR2_BLOCK_REGNUM): Likewise.
(SME_STATE_REGNUM): Likewise.
(TPIDR2_SETUP_REGNUM): Likewise.
(ZA_FREE_REGNUM): Likewise.
(ZA_SAVED_REGNUM): Likewise.
(ZA_REGNUM): Likewise.
(ZT0_REGNUM): Likewise.
(*mov_aarch64): Add support for moveable sysregs.
(*movsi_aarch64): Likewise.
(*movdi_aarch64): Likewise.
* config/aarch64/constraints.md (MOVEABLE_SYSREGS): New constraint.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/acle/fp8.c: New tests.

Diff:
---
 gcc/config/aarch64/aarch64.cc   |   8 +++
 gcc/config/aarch64/aarch64.h|  14 ++--
 gcc/config/aarch64/aarch64.md   |  30 ++---
 gcc/config/aarch64/constraints.md   |   3 +
 gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 101 
 5 files changed, 142 insertions(+), 14 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e0cf382998c7..9810f2c03900 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2018,6 +2018,7 @@ aarch64_hard_regno_nregs (unsigned regno, machine_mode 
mode)
 case PR_HI_REGS:
   return mode == VNx32BImode ? 2 : 1;
 
+case MOVEABLE_SYSREGS:
 case FFR_REGS:
 case PR_AND_FFR_REGS:
 case FAKE_REGS:
@@ -2045,6 +2046,9 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode 
mode)
 /* This must have the same size as _Unwind_Word.  */
 return mode == DImode;
 
+  if (regno == FPM_REGNUM)
+return mode == QImode || mode == HImode || mode == SImode || mode == 
DImode;
+
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
   if (vec_flags == VEC_SVE_PRED)
 return pr_or_ffr_regnum_p (regno);
@@ -12680,6 +12684,9 @@ aarch64_regno_regclass (unsigned regno)
   if (PR_REGNUM_P (regno))
 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
 
+  if (regno == FPM_REGNUM)
+return MOVEABLE_SYSREGS;
+
   if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
 return FFR_REGS;
 
@@ -13068,6 +13075,7 @@ aarch64_class_max_nregs (reg_class_t regclass, 
machine_mode mode)
 case PR_HI_REGS:
   return mode == VNx32BImode ? 2 : 1;
 
+case MOVEABLE_SYSREGS:
 case STACK_REG:
 case FFR_REGS:
 case PR_AND_FFR_REGS:
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2e75c6b81e20..2dfb999bea53 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -523,6 +523,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
 1, 1, 1, 1,/* SFP, AP, CC, VG */   \
 0, 0, 0, 0,   0, 0, 0, 0,   /* P0 - P7 */   \
 0, 0, 0, 0,   0, 0, 0, 0,   /* P8 - P15 */  \
+1, /* FPMR */  \
 1, 1,   

[gcc r15-2438] aarch64: Add fpm register helper functions.

2024-07-31 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:cfe2b6756c691c92aa29337c6973e3b3361de5c9

commit r15-2438-gcfe2b6756c691c92aa29337c6973e3b3361de5c9
Author: Claudio Bantaloukas 
Date:   Wed Jul 31 14:42:41 2024 +0100

aarch64: Add fpm register helper functions.

The ACLE declares several helper types and functions to facilitate
construction of `fpm` arguments.  These are available when one of the
arm_neon.h, arm_sve.h, or arm_sme.h headers is included.  These helpers
don't map to specific FP8 instructions and there's no expectation that
they will produce a given code sequence; they're just an abstraction
and an aid to the programmer.  Thus they are implemented in a new
header file, arm_private_fp8.h.  Users are not expected to include this
file directly, as it is a mere implementation detail, subject to
change.  A check is included to guard against direct inclusion.

gcc/ChangeLog:

* config.gcc (extra_headers): Install arm_private_fp8.h.
* config/aarch64/arm_neon.h: Include arm_private_fp8.h.
* config/aarch64/arm_sve.h: Likewise.
* config/aarch64/arm_private_fp8.h: New file
(fpm_t): New type representing fpmr values.
(enum __ARM_FPM_FORMAT): New enum representing valid fp8 formats.
(enum __ARM_FPM_OVERFLOW): New enum representing how some fp8
calculations work.
(__arm_fpm_init): New.
(__arm_set_fpm_src1_format): Likewise.
(__arm_set_fpm_src2_format): Likewise.
(__arm_set_fpm_dst_format): Likewise.
(__arm_set_fpm_overflow_cvt): Likewise.
(__arm_set_fpm_overflow_mul): Likewise.
(__arm_set_fpm_lscale): Likewise.
(__arm_set_fpm_lscale2): Likewise.
(__arm_set_fpm_nscale): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/acle/fp8-helpers-neon.c: New test of fpmr 
helper
functions.
* gcc.target/aarch64/acle/fp8-helpers-sve.c: New test of fpmr helper
functions presence.
* gcc.target/aarch64/acle/fp8-helpers-sme.c: New test of fpmr helper
functions presence.

Diff:
---
 gcc/config.gcc |  2 +-
 gcc/config/aarch64/arm_neon.h  |  1 +
 gcc/config/aarch64/arm_private_fp8.h   | 80 ++
 gcc/config/aarch64/arm_sve.h   |  1 +
 .../gcc.target/aarch64/acle/fp8-helpers-neon.c | 53 ++
 .../gcc.target/aarch64/acle/fp8-helpers-sme.c  | 12 
 .../gcc.target/aarch64/acle/fp8-helpers-sve.c  | 12 
 7 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 7453ade07826..a36dd1bcbc66 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -347,7 +347,7 @@ m32c*-*-*)
 ;;
 aarch64*-*-*)
cpu_type=aarch64
-	extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h arm_sme.h arm_neon_sve_bridge.h"
+	extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h arm_sme.h arm_neon_sve_bridge.h arm_private_fp8.h"
c_target_objs="aarch64-c.o"
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c4a09528ffd8..e376685489da 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -30,6 +30,7 @@
 #pragma GCC push_options
 #pragma GCC target ("+nothing+simd")
 
+#include <arm_private_fp8.h>
 #pragma GCC aarch64 "arm_neon.h"
 
 #include 
diff --git a/gcc/config/aarch64/arm_private_fp8.h b/gcc/config/aarch64/arm_private_fp8.h
new file mode 100644
index ..5668cc24c99b
--- /dev/null
+++ b/gcc/config/aarch64/arm_private_fp8.h
@@ -0,0 +1,80 @@
+/* AArch64 FP8 helper functions.
+   Do not include this file directly. Use one of arm_neon.h
+   arm_sme.h arm_sve.h instead.
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   Contributed by ARM Ltd.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
+
+#

[gcc r15-2696] Make may_trap_p_1 return false for constant pool references [PR116145]

2024-08-02 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:ba730fd10934e4ca004251aa3748bf9da4d35e62

commit r15-2696-gba730fd10934e4ca004251aa3748bf9da4d35e62
Author: Richard Sandiford 
Date:   Fri Aug 2 15:58:31 2024 +0100

Make may_trap_p_1 return false for constant pool references [PR116145]

The testcase contains the constant:

  arr2 = svreinterpret_u8(svdup_u32(0x0a0d5c3f));

which was initially hoisted by hand, but which gimple optimisers later
propagated to each use (as expected).  The constant was then expanded
as a load-and-duplicate from the constant pool.  Normally that load
should then be hoisted back out of the loop, but may_trap_or_fault_p
stopped that from happening in this case.

The code responsible was:

  if (/* MEM_NOTRAP_P only relates to the actual position of the memory
 reference; moving it out of context such as when moving code
 when optimizing, might cause its address to become invalid.  */
  code_changed
  || !MEM_NOTRAP_P (x))
{
  poly_int64 size = MEM_SIZE_KNOWN_P (x) ? MEM_SIZE (x) : -1;
  return rtx_addr_can_trap_p_1 (XEXP (x, 0), 0, size,
GET_MODE (x), code_changed);
}

where code_changed is true.  (Arguably it doesn't need to be true in
this case, if we inserted invariants on the preheader edge, but it
would still need to be true for conditionally executed loads.)

Normally this wouldn't be a problem, since rtx_addr_can_trap_p_1
would recognise that the address refers to the constant pool.
However, the SVE load-and-replicate instructions have a limited
offset range, so it isn't possible for them to have a LO_SUM address.
All we have is a plain pseudo base register.

MEM_READONLY_P is defined as:

  /* 1 if RTX is a mem that is statically allocated in read-only memory.  */
  #define MEM_READONLY_P(RTX) \
(RTL_FLAG_CHECK1 ("MEM_READONLY_P", (RTX), MEM)->unchanging)

and so I think it should be safe to move memory references if both
MEM_READONLY_P and MEM_NOTRAP_P are true.
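
As an illustrative sketch (not RTL taken from the patch), the
load-and-duplicate ends up looking something like:

    ;; Constant-pool load with a plain pseudo base register;
    ;; both MEM_READONLY_P and MEM_NOTRAP_P are set on the mem.
    (set (reg:VNx4SI 98)
         (vec_duplicate:VNx4SI (mem:SI (reg:DI 99))))

Because the address is just (reg:DI 99), rtx_addr_can_trap_p_1 cannot
tell that it points at the constant pool, so the MEM_READONLY_P check
is what allows the hoist.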

The testcase isn't a minimal reproducer, but I think it's good
to have a realistic full routine in the testsuite.

gcc/
PR rtl-optimization/116145
* rtlanal.cc (may_trap_p_1): Trust MEM_NOTRAP_P even for code
movement if MEM_READONLY_P is also true.

gcc/testsuite/
PR rtl-optimization/116145
* gcc.target/aarch64/sve/acle/general/pr116145.c: New test.

Diff:
---
 gcc/rtlanal.cc | 14 +--
 .../gcc.target/aarch64/sve/acle/general/pr116145.c | 46 ++
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc
index 4158a531bdd7..893a6afbbc53 100644
--- a/gcc/rtlanal.cc
+++ b/gcc/rtlanal.cc
@@ -3152,10 +3152,16 @@ may_trap_p_1 (const_rtx x, unsigned flags)
  && MEM_VOLATILE_P (x)
  && XEXP (x, 0) == stack_pointer_rtx)
return true;
-  if (/* MEM_NOTRAP_P only relates to the actual position of the memory
-reference; moving it out of context such as when moving code
-when optimizing, might cause its address to become invalid.  */
- code_changed
+  if (/* MEM_READONLY_P means that the memory is both statically
+allocated and readonly, so MEM_NOTRAP_P should remain true
+even if the memory reference is moved.  This is certainly
+true for the important case of force_const_mem.
+
+Otherwise, MEM_NOTRAP_P only relates to the actual position
+of the memory reference; moving it out of context such as
+when moving code when optimizing, might cause its address
+to become invalid.  */
+ (code_changed && !MEM_READONLY_P (x))
  || !MEM_NOTRAP_P (x))
{
  poly_int64 size = MEM_SIZE_KNOWN_P (x) ? MEM_SIZE (x) : -1;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr116145.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr116145.c
new file mode 100644
index ..a3d93d3e1c84
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr116145.c
@@ -0,0 +1,46 @@
+// { dg-options "-O2" }
+
+#include <stddef.h>
+#include <arm_sve.h>
+
+#pragma GCC target "+sve2"
+
+typedef unsigned char uchar;
+
+const uchar *
+search_line_fast (const uchar *s, const uchar *end)
+{
+  size_t VL = svcntb();
+  svuint8_t arr1, arr2;
+  svbool_t pc, pg = svptrue_b8();
+
+  // This should not be loaded inside the loop every time.
+  arr2 = svreinterpret_u8(svdup_u32(0x0a0d5c3f));
+
+  for (; s+VL <= end; s += VL) {
+arr1 = svld1_u8(pg, s);
+pc = svmatch_u8(pg, arr1, arr2);
+
+if (svptest_any(pg, pc)) {
+  pc = svbrkb_z(pg, pc);
+  return s+svcntp_b8(pg, pc);
+}
+  }
+
+  // Handle remainder.
+  if (s < 

[gcc r15-2697] AArch64: Fuse CMP+CSEL and CMP+CSET for -mcpu=neoverse-v2

2024-08-02 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:884846351c74dc79ab143a06c25f00fc7c9e3cfb

commit r15-2697-g884846351c74dc79ab143a06c25f00fc7c9e3cfb
Author: Jennifer Schmitz 
Date:   Fri Aug 2 15:58:32 2024 +0100

AArch64: Fuse CMP+CSEL and CMP+CSET for -mcpu=neoverse-v2

According to the Neoverse V2 Software Optimization Guide (section 4.14), the
instruction pairs CMP+CSEL and CMP+CSET can be fused, which had not been
implemented so far. This patch implements and tests the two fusion pairs.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
There was also no impact beyond noise on the SPEC CPU2017 benchmarks.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/

* config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Implement
fusion logic.
* config/aarch64/aarch64-fusion-pairs.def (cmp+csel): New entry.
(cmp+cset): Likewise.
* config/aarch64/tuning_models/neoversev2.h: Enable logic in
field fusible_ops.

gcc/testsuite/

* gcc.target/aarch64/fuse_cmp_csel.c: New test.
* gcc.target/aarch64/fuse_cmp_cset.c: Likewise.
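
For illustration, a minimal (hypothetical) source pattern that should
now keep the fused pair adjacent at -O2 -mcpu=neoverse-v2:

    int
    sel (int a, int b, int c, int d)
    {
      return a > b ? c : d;   /* cmp wN, wM; csel wD, wC, wE, gt */
    }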

Diff:
---
 gcc/config/aarch64/aarch64-fusion-pairs.def  |  2 ++
 gcc/config/aarch64/aarch64.cc| 20 ++
 gcc/config/aarch64/tuning_models/neoversev2.h|  5 +++-
 gcc/testsuite/gcc.target/aarch64/fuse_cmp_csel.c | 33 
 gcc/testsuite/gcc.target/aarch64/fuse_cmp_cset.c | 31 ++
 5 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
index 9a43b0c80657..bf5e85ba8fe1 100644
--- a/gcc/config/aarch64/aarch64-fusion-pairs.def
+++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -37,5 +37,7 @@ AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
 AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
 AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ)
 AARCH64_FUSION_PAIR ("addsub_2reg_const1", ADDSUB_2REG_CONST1)
+AARCH64_FUSION_PAIR ("cmp+csel", CMP_CSEL)
+AARCH64_FUSION_PAIR ("cmp+cset", CMP_CSET)
 
 #undef AARCH64_FUSION_PAIR
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 113ebb45cfda..9e12bd9711cd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27357,6 +27357,26 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
   && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
 return true;
 
+  /* Fuse CMP and CSEL/CSET.  */
+  if (prev_set && curr_set
+  && GET_CODE (SET_SRC (prev_set)) == COMPARE
+  && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
+  && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
+{
+  enum attr_type prev_type = get_attr_type (prev);
+  if ((prev_type == TYPE_ALUS_SREG || prev_type == TYPE_ALUS_IMM)
+ && ((aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSEL)
+  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
+  && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set), 1), VOIDmode)
+  && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set), 2), VOIDmode)
+	   && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (curr_set), 1))))
+ || (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSET)
+ && GET_RTX_CLASS (GET_CODE (SET_SRC (curr_set)))
+== RTX_COMPARE
+	      && REG_P (SET_DEST (curr_set)))))
+   return true;
+}
+
   /* Fuse flag-setting ALU instructions and conditional branch.  */
   if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
   && any_condjump_p (curr))
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index c9c3019dd01a..bd259a37e9c9 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -221,7 +221,10 @@ static const struct tune_params neoversev2_tunings =
 2 /* store_pred.  */
   }, /* memmov_cost.  */
   5, /* issue_rate  */
-  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
+  (AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_CMP_BRANCH
+   | AARCH64_FUSE_CMP_CSEL
+   | AARCH64_FUSE_CMP_CSET), /* fusible_ops  */
   "32:16", /* function_align.  */
   "4", /* jump_align.  */
   "32:16", /* loop_align.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/fuse_cmp_csel.c b/gcc/testsuite/gcc.target/aarch64/fuse_cmp_csel.c
new file mode 100644
index ..85f302bab983
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fuse_cmp_csel.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=neoverse-v2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/*
+** f1:
+** ...
+** cmp w[0-9]+, w[0-9]+
** csel w[0-9]+, w[0-9]+, w[0-9]+, le
+** ret
+*/
+int f1 (int a, int b, int c)
+{
+  int cmp = a > b;
+  int add1 = c + 3;
+  int

[gcc r15-2718] Revert "Make may_trap_p_1 return false for constant pool references [PR116145]" [PR116200]

2024-08-05 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:162a1ed70303a031c81b0aaac499aaf394560390

commit r15-2718-g162a1ed70303a031c81b0aaac499aaf394560390
Author: Richard Sandiford 
Date:   Mon Aug 5 10:02:45 2024 +0100

Revert "Make may_trap_p_1 return false for constant pool references 
[PR116145]" [PR116200]

This reverts commit ba730fd10934e4ca004251aa3748bf9da4d35e62.

Diff:
---
 gcc/rtlanal.cc | 14 ++-
 .../gcc.target/aarch64/sve/acle/general/pr116145.c | 46 --
 2 files changed, 4 insertions(+), 56 deletions(-)

diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc
index 893a6afbbc53..4158a531bdd7 100644
--- a/gcc/rtlanal.cc
+++ b/gcc/rtlanal.cc
@@ -3152,16 +3152,10 @@ may_trap_p_1 (const_rtx x, unsigned flags)
  && MEM_VOLATILE_P (x)
  && XEXP (x, 0) == stack_pointer_rtx)
return true;
-  if (/* MEM_READONLY_P means that the memory is both statically
-allocated and readonly, so MEM_NOTRAP_P should remain true
-even if the memory reference is moved.  This is certainly
-true for the important case of force_const_mem.
-
-Otherwise, MEM_NOTRAP_P only relates to the actual position
-of the memory reference; moving it out of context such as
-when moving code when optimizing, might cause its address
-to become invalid.  */
- (code_changed && !MEM_READONLY_P (x))
+  if (/* MEM_NOTRAP_P only relates to the actual position of the memory
+reference; moving it out of context such as when moving code
+when optimizing, might cause its address to become invalid.  */
+ code_changed
  || !MEM_NOTRAP_P (x))
{
  poly_int64 size = MEM_SIZE_KNOWN_P (x) ? MEM_SIZE (x) : -1;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr116145.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr116145.c
deleted file mode 100644
index a3d93d3e1c84..
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr116145.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// { dg-options "-O2" }
-
-#include <stddef.h>
-#include <arm_sve.h>
-
-#pragma GCC target "+sve2"
-
-typedef unsigned char uchar;
-
-const uchar *
-search_line_fast (const uchar *s, const uchar *end)
-{
-  size_t VL = svcntb();
-  svuint8_t arr1, arr2;
-  svbool_t pc, pg = svptrue_b8();
-
-  // This should not be loaded inside the loop every time.
-  arr2 = svreinterpret_u8(svdup_u32(0x0a0d5c3f));
-
-  for (; s+VL <= end; s += VL) {
-arr1 = svld1_u8(pg, s);
-pc = svmatch_u8(pg, arr1, arr2);
-
-if (svptest_any(pg, pc)) {
-  pc = svbrkb_z(pg, pc);
-  return s+svcntp_b8(pg, pc);
-}
-  }
-
-  // Handle remainder.
-  if (s < end) {
-pg = svwhilelt_b8((size_t)s, (size_t)end);
-
-arr1 = svld1_u8(pg, s);
-pc = svmatch_u8(pg, arr1, arr2);
-
-if (svptest_any(pg, pc)) {
-  pc = svbrkb_z(pg, pc);
-  return s+svcntp_b8(pg, pc);
-}
-  }
-
-  return end;
-}
-
-// { dg-final { scan-assembler {:\n\tld1b\t[^\n]*\n\tmatch\t[^\n]*\n\tb\.} } }


[gcc r15-2884] Use splay-tree-utils.h in tree-ssa-sccvn [PR30920]

2024-08-12 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:9ab8681db6c7736357a8713afec7c7b09080cba9

commit r15-2884-g9ab8681db6c7736357a8713afec7c7b09080cba9
Author: Richard Sandiford 
Date:   Mon Aug 12 10:52:29 2024 +0100

Use splay-tree-utils.h in tree-ssa-sccvn [PR30920]

This patch is an attempt to gauge opinion on one way of fixing PR30920.

The PR points out that the libiberty splay tree implementation does
not implement the algorithm described by Sleator and Tarjan and has
unclear complexity bounds.  (It's also somewhat dangerous in that
splay_tree_min and splay_tree_max walk the tree without splaying,
meaning that they are fully linear in the worst case, rather than
amortised logarithmic.)  These properties have been carried over
to typed-splay-tree.h.

We could fix those problems directly in the existing implementations,
and probably should for libiberty.  But when I added rtl-ssa, I also
added a third(!) splay tree implementation: splay-tree-utils.h.
In response to Jeff's understandable unease about having three
implementations, I was supposed to go back during the next stage 1
and reduce it to no more than two.  I never did that. :-(

splay-tree-utils.h is so called because rtl-ssa uses splay trees
in structures that are relatively small and very size-sensitive.
I therefore wanted to be able to embed the splay tree links directly
in the structures, rather than pay the penalty of using separate
nodes with one-way or two-way links between them.  There were also
operations for which it was convenient to treat the splay tree root
as an explicitly managed cursor, rather than treating the tree as
a pure ADT.  The interface is therefore a bit more low-level than
for the other implementations.

I wondered whether the same trade-offs might apply to users of
the libiberty splay trees.  The first one I looked at in detail
was SCC value numbering, which seemed like it would benefit from
using splay-tree-utils.h directly.

The patch does that.  It also adds a couple of new helper routines
to splay-tree-utils.h.

I don't expect this approach to be the right one for every use
of splay trees.  E.g. splay tree used for omp gimplification would
certainly need separate nodes.
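
As a sketch of the new insert_relative interface (abbreviated from
the rtl-ssa change below, so the details are illustrative):

    // Splay the neighbouring node to the root and get the comparison
    // result for the new key (negative, zero or positive).
    int comparison = lookup_use (def->m_use_tree, insn);
    // Insert the new node as the root's left or right child,
    // according to the sign of COMPARISON.
    auto *use_node = allocate<splay_tree_node<use_info *>> (use);
    def->m_use_tree.insert_relative (comparison, use_node);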

gcc/
PR other/30920
* splay-tree-utils.h (rooted_splay_tree::insert_relative)
(rooted_splay_tree::lookup_le): New functions.
(rooted_splay_tree::remove_root_and_splay_next): Likewise.
* splay-tree-utils.tcc (rooted_splay_tree::insert_relative): New
function, extracted from...
(rooted_splay_tree::insert): ...here.
(rooted_splay_tree::lookup_le): New function.
(rooted_splay_tree::remove_root_and_splay_next): Likewise.
* tree-ssa-sccvn.cc (pd_range::m_children): New member variable.
(vn_walk_cb_data::vn_walk_cb_data): Initialize first_range.
(vn_walk_cb_data::known_ranges): Use a default_splay_tree.
(vn_walk_cb_data::~vn_walk_cb_data): Remove freeing of known_ranges.
(pd_range_compare, pd_range_alloc, pd_range_dealloc): Delete.
(vn_walk_cb_data::push_partial_def): Rewrite splay tree operations
to use splay-tree-utils.h.
* rtl-ssa/accesses.cc (function_info::add_use): Use insert_relative.

Diff:
---
 gcc/rtl-ssa/accesses.cc  |   8 ++--
 gcc/splay-tree-utils.h   |  29 +
 gcc/splay-tree-utils.tcc |  69 +++---
 gcc/tree-ssa-sccvn.cc| 106 ---
 4 files changed, 131 insertions(+), 81 deletions(-)

diff --git a/gcc/rtl-ssa/accesses.cc b/gcc/rtl-ssa/accesses.cc
index 5e9077545a81..ef99759871aa 100644
--- a/gcc/rtl-ssa/accesses.cc
+++ b/gcc/rtl-ssa/accesses.cc
@@ -1232,16 +1232,16 @@ function_info::add_use (use_info *use)
   need_use_splay_tree (def);
   int comparison = lookup_use (def->m_use_tree, insn);
   gcc_checking_assert (comparison != 0);
-  splay_tree_node *neighbor = def->m_use_tree.root ();
+  use_info *neighbor = def->m_use_tree.root ()->value ();
 
   // If USE comes before NEIGHBOR, insert USE to NEIGHBOR's left,
   // otherwise insert USE to NEIGHBOR's right.
   auto *use_node = allocate<splay_tree_node<use_info *>> (use);
-  def->m_use_tree.insert_child (neighbor, comparison > 0, use_node);
+  def->m_use_tree.insert_relative (comparison, use_node);
   if (comparison > 0)
-insert_use_after (use, neighbor->value ());
+insert_use_after (use, neighbor);
   else
-insert_use_before (use, neighbor->value ());
+insert_use_before (use, neighbor);
 }
 
 void
diff --git a/gcc/splay-tree-utils.h b/gcc/splay-tree-utils.h
index 8344808f6d19..9526e0ba3363 100644
--- a/gcc/splay-tree-utils.h
+++ b/gcc/splay-tree-utils.h
@@ -185,6 +185,21 @@ public:
   template<typename Comparator>
   bool insert (node_type new_node, Comparator compare);
 
+  // Insert NEW_NODE into the spl

[gcc r15-2929] aarch64: Rename svpext to svpext_lane [PR116371]

2024-08-15 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:cc2d29e5f4434a3fd4e0dd93ea4f9857a0309201

commit r15-2929-gcc2d29e5f4434a3fd4e0dd93ea4f9857a0309201
Author: Richard Sandiford 
Date:   Thu Aug 15 10:10:12 2024 +0100

aarch64: Rename svpext to svpext_lane [PR116371]

When implementing the SME2 ACLE, I somehow missed off the _lane
suffix on svpext.

gcc/
PR target/116371
* config/aarch64/aarch64-sve-builtins-sve2.h (svpext): Rename to...
(svpext_lane): ...this.
* config/aarch64/aarch64-sve-builtins-sve2.cc (svpext_impl): Rename
to...
(svpext_lane_impl): ...this and update instantiation accordingly.
* config/aarch64/aarch64-sve-builtins-sve2.def (svpext): Rename 
to...
(svpext_lane): ...this.

gcc/testsuite/
PR target/116371
* gcc.target/aarch64/sme2/acle-asm/pext_c16.c,
gcc.target/aarch64/sme2/acle-asm/pext_c16_x2.c,
gcc.target/aarch64/sme2/acle-asm/pext_c32.c,
gcc.target/aarch64/sme2/acle-asm/pext_c32_x2.c,
gcc.target/aarch64/sme2/acle-asm/pext_c64.c,
gcc.target/aarch64/sme2/acle-asm/pext_c64_x2.c,
gcc.target/aarch64/sme2/acle-asm/pext_c8.c,
gcc.target/aarch64/sme2/acle-asm/pext_c8_x2.c: Replace with...
* gcc.target/aarch64/sme2/acle-asm/pext_lane_c16.c,
gcc.target/aarch64/sme2/acle-asm/pext_lane_c16_x2.c,
gcc.target/aarch64/sme2/acle-asm/pext_lane_c32.c,
gcc.target/aarch64/sme2/acle-asm/pext_lane_c32_x2.c,
gcc.target/aarch64/sme2/acle-asm/pext_lane_c64.c,
gcc.target/aarch64/sme2/acle-asm/pext_lane_c64_x2.c,
gcc.target/aarch64/sme2/acle-asm/pext_lane_c8.c,
gcc.target/aarch64/sme2/acle-asm/pext_lane_c8_x2.c: ...these new 
tests,
testing for svpext_lane instead of svpext.
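
For reference, a hypothetical use of the renamed intrinsic in SME2
streaming code (the exact signature is an assumption based on the
ACLE, not taken from this commit):

    #include <arm_sme.h>

    svbool_t
    first_part (svcount_t pn) __arm_streaming
    {
      /* Extract predicate lane 0 from the predicate-as-counter PN.  */
      return svpext_lane_c8 (pn, 0);
    }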

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-sve2.cc|  4 +-
 gcc/config/aarch64/aarch64-sve-builtins-sve2.def   |  2 +-
 gcc/config/aarch64/aarch64-sve-builtins-sve2.h |  2 +-
 .../gcc.target/aarch64/sme2/acle-asm/pext_c16.c| 50 
 .../gcc.target/aarch64/sme2/acle-asm/pext_c16_x2.c | 54 --
 .../gcc.target/aarch64/sme2/acle-asm/pext_c32.c| 50 
 .../gcc.target/aarch64/sme2/acle-asm/pext_c32_x2.c | 54 --
 .../gcc.target/aarch64/sme2/acle-asm/pext_c64.c| 50 
 .../gcc.target/aarch64/sme2/acle-asm/pext_c64_x2.c | 54 --
 .../gcc.target/aarch64/sme2/acle-asm/pext_c8.c | 50 
 .../gcc.target/aarch64/sme2/acle-asm/pext_c8_x2.c  | 54 --
 .../aarch64/sme2/acle-asm/pext_lane_c16.c  | 50 
 .../aarch64/sme2/acle-asm/pext_lane_c16_x2.c   | 54 ++
 .../aarch64/sme2/acle-asm/pext_lane_c32.c  | 50 
 .../aarch64/sme2/acle-asm/pext_lane_c32_x2.c   | 54 ++
 .../aarch64/sme2/acle-asm/pext_lane_c64.c  | 50 
 .../aarch64/sme2/acle-asm/pext_lane_c64_x2.c   | 54 ++
 .../aarch64/sme2/acle-asm/pext_lane_c8.c   | 50 
 .../aarch64/sme2/acle-asm/pext_lane_c8_x2.c| 54 ++
 19 files changed, 420 insertions(+), 420 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
index dc591551682..146a5459930 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
@@ -221,7 +221,7 @@ public:
   }
 };
 
-class svpext_impl : public function_base
+class svpext_lane_impl : public function_base
 {
 public:
   rtx
@@ -619,7 +619,7 @@ FUNCTION (svmullt_lane, unspec_based_lane_function, (UNSPEC_SMULLT,
 UNSPEC_UMULLT, -1))
 FUNCTION (svnbsl, CODE_FOR_MODE0 (aarch64_sve2_nbsl),)
 FUNCTION (svnmatch, svmatch_svnmatch_impl, (UNSPEC_NMATCH))
-FUNCTION (svpext, svpext_impl,)
+FUNCTION (svpext_lane, svpext_lane_impl,)
 FUNCTION (svpmul, CODE_FOR_MODE0 (aarch64_sve2_pmul),)
 FUNCTION (svpmullb, unspec_based_function, (-1, UNSPEC_PMULLB, -1))
 FUNCTION (svpmullb_pair, unspec_based_function, (-1, UNSPEC_PMULLB_PAIR, -1))
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
index 4366925a971..4543402f836 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
@@ -263,7 +263,7 @@ DEF_SVE_FUNCTION_GS (svmax, binary_opt_single_n, all_arith, x24, none)
 DEF_SVE_FUNCTION_GS (svmaxnm, binary_opt_single_n, all_float, x24, none)
 DEF_SVE_FUNCTION_GS (svmin, binary_opt_single_n, all_arith, x24, none)
 DEF_SVE_FUNCTION_GS (svminnm, binary_opt_single_n, all_float, x24, none)
-DEF_SVE_FUNCTION_GS (svpext, extract_

[gcc r15-2936] late-combine: Preserve INSN_CODE when modifying notes [PR116343]

2024-08-15 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:70ae0daeb76f28a3135f4a74d6e440fb1d9821fa

commit r15-2936-g70ae0daeb76f28a3135f4a74d6e440fb1d9821fa
Author: Richard Sandiford 
Date:   Thu Aug 15 16:54:02 2024 +0100

late-combine: Preserve INSN_CODE when modifying notes [PR116343]

When it removes a definition, late-combine tries to update all
uses in notes.  It does this using the same insn_propagation class
that it uses for patterns.

However, insn_propagation uses validate_change, which in turn
resets the INSN_CODE.  This is inefficient in the best case,
since it forces the pattern to be rerecognised even though
changing a note can't affect the INSN_CODE.  But in the PR
it's a correctness problem: resetting INSN_CODE means we lose
the NOOP_INSN_MOVE_CODE, which in turn means that rtl-ssa doesn't
queue it for deletion.

This patch adds a routine specifically for propagating into notes.
A belt-and-braces fix would be to rerecognise noop moves in
function_info::change_insns, but I can't think of a good reason
why that would be necessary, and it could paper over latent bugs.
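
The intended contract, as a sketch (not code from the patch):

    insn_propagation prop (insn, dest, src);
    int old_code = INSN_CODE (insn);
    bool ok = prop.apply_to_note (&XEXP (note, 0));
    /* Substituting into a note never changes the pattern, so the
       insn code -- including NOOP_MOVE_INSN_CODE -- is preserved.  */
    gcc_checking_assert (INSN_CODE (insn) == old_code);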

gcc/
PR testsuite/116343
* recog.h (insn_propagation::apply_to_note): Declare.
* recog.cc (insn_propagation::apply_to_note): New function.
* late-combine.cc (insn_combination::substitute_note): Use
apply_to_note instead of apply_to_rvalue.
* rtl-ssa/changes.cc (rtl_ssa::changes_are_worthwhile): Improve
dumping of costs for noop moves.

gcc/testsuite/
PR testsuite/116343
* gcc.dg/torture/pr116343.c: New test.

Diff:
---
 gcc/late-combine.cc |  2 +-
 gcc/recog.cc| 13 +
 gcc/recog.h |  1 +
 gcc/rtl-ssa/changes.cc  |  5 -
 gcc/testsuite/gcc.dg/torture/pr116343.c | 18 ++
 5 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/gcc/late-combine.cc b/gcc/late-combine.cc
index 2b62e2956ed..1d81b386c3d 100644
--- a/gcc/late-combine.cc
+++ b/gcc/late-combine.cc
@@ -338,7 +338,7 @@ insn_combination::substitute_note (insn_info *use_insn, rtx note,
   || REG_NOTE_KIND (note) == REG_EQUIV)
 {
   insn_propagation prop (use_insn->rtl (), m_dest, m_src);
-  return (prop.apply_to_rvalue (&XEXP (note, 0))
+  return (prop.apply_to_note (&XEXP (note, 0))
  && (can_propagate || prop.num_replacements == 0));
 }
   return true;
diff --git a/gcc/recog.cc b/gcc/recog.cc
index 23e4820180f..615aaabc551 100644
--- a/gcc/recog.cc
+++ b/gcc/recog.cc
@@ -1469,6 +1469,19 @@ insn_propagation::apply_to_rvalue (rtx *loc)
   return res;
 }
 
+/* Like apply_to_rvalue, but specifically for the case where *LOC is in
+   a note.  This never changes the INSN_CODE.  */
+
+bool
+insn_propagation::apply_to_note (rtx *loc)
+{
+  auto old_code = INSN_CODE (insn);
+  bool res = apply_to_rvalue (loc);
+  if (INSN_CODE (insn) != old_code)
+INSN_CODE (insn) = old_code;
+  return res;
+}
+
 /* Check whether INSN matches a specific alternative of an .md pattern.  */
 
 bool
diff --git a/gcc/recog.h b/gcc/recog.h
index 87a5803dec0..1dccce78ba4 100644
--- a/gcc/recog.h
+++ b/gcc/recog.h
@@ -121,6 +121,7 @@ public:
   insn_propagation (rtx_insn *, rtx, rtx, bool = true);
   bool apply_to_pattern (rtx *);
   bool apply_to_rvalue (rtx *);
+  bool apply_to_note (rtx *);
 
   /* Return true if we should accept a substitution into the address of
  memory expression MEM.  Undoing changes OLD_NUM_CHANGES and up restores
diff --git a/gcc/rtl-ssa/changes.cc b/gcc/rtl-ssa/changes.cc
index a30f000191e..0476296607b 100644
--- a/gcc/rtl-ssa/changes.cc
+++ b/gcc/rtl-ssa/changes.cc
@@ -228,7 +228,10 @@ rtl_ssa::changes_are_worthwhile (array_slice changes,
   for (const insn_change *change : changes)
if (!change->is_deletion ())
  {
-   fprintf (dump_file, " %c %d", sep, change->new_cost);
+   if (INSN_CODE (change->rtl ()) == NOOP_MOVE_INSN_CODE)
+ fprintf (dump_file, " %c nop", sep);
+   else
+ fprintf (dump_file, " %c %d", sep, change->new_cost);
sep = '+';
  }
   if (weighted_new_cost != 0)
diff --git a/gcc/testsuite/gcc.dg/torture/pr116343.c b/gcc/testsuite/gcc.dg/torture/pr116343.c
new file mode 100644
index 000..ad13f0fc21c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr116343.c
@@ -0,0 +1,18 @@
+// { dg-additional-options "-fschedule-insns -fno-thread-jumps -fno-dce" }
+
+int a, b, c;
+volatile int d;
+int e(int f, int g) { return g > 1 ? 1 : f >> g; }
+int main() {
+  int *i = &a;
+  long j[1];
+  if (a)
+while (1) {
+  a ^= 1;
+  if (*i)
+while (1)
+  ;
+  b = c && e((d, 1) >= 1, j[0]);
+}
+  return 0;
+}


[gcc r15-2937] Tweak base/index disambiguation in decompose_normal_address [PR116236]

2024-08-15 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:3673b7054ec268c445620b9c52d25e65bc9a7f96

commit r15-2937-g3673b7054ec268c445620b9c52d25e65bc9a7f96
Author: Richard Sandiford 
Date:   Thu Aug 15 16:54:03 2024 +0100

Tweak base/index disambiguation in decompose_normal_address [PR116236]

The PR points out that, for an address like:

  (plus (zero_extend X) Y)

decompose_normal_address doesn't establish a strong preference
between treating X as the base or Y as the base.  As the comment
in the patch says, zero_extend isn't enough on its own to assume
an index, at least not on POINTERS_EXTEND_UNSIGNED targets.
But in a construct like the one above, X and Y have different modes,
and it seems reasonable to assume that the one with the expected
address mode is the base.

This matters on targets like m68k that support index extension
and that require different classes for bases and indices.
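
A concrete (illustrative) example, assuming an SImode address mode:

    (plus:SI (zero_extend:SI (reg:HI 100))
             (reg:SI 101))

Only the second operand has the expected address mode, so after this
change (reg:SI 101) is treated as the base and the zero-extended
sub-word register as the index, without consulting the baseness
heuristic.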

gcc/
PR middle-end/116236
* rtlanal.cc (decompose_normal_address): Try to distinguish
bases and indices based on mode, before resorting to "baseness".

Diff:
---
 gcc/rtlanal.cc | 40 
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc
index 4158a531bdd..71207ee4f41 100644
--- a/gcc/rtlanal.cc
+++ b/gcc/rtlanal.cc
@@ -6724,20 +6724,36 @@ decompose_normal_address (struct address_info *info)
 }
   else if (out == 2)
 {
+  auto address_mode = targetm.addr_space.address_mode (info->as);
+  rtx inner_op0 = *inner_ops[0];
+  rtx inner_op1 = *inner_ops[1];
+  int base;
+  /* If one inner operand has the expected mode for a base and the other
+doesn't, assume that the other one is the index.  This is useful
+for addresses such as:
+
+  (plus (zero_extend X) Y)
+
+zero_extend is not in itself enough to assume an index, since bases
+can be zero-extended on POINTERS_EXTEND_UNSIGNED targets.  But if
+Y has address mode and X doesn't, there should be little doubt that
+Y is the base.  */
+  if (GET_MODE (inner_op0) == address_mode
+ && GET_MODE (inner_op1) != address_mode)
+   base = 0;
+  else if (GET_MODE (inner_op1) == address_mode
+  && GET_MODE (inner_op0) != address_mode)
+   base = 1;
   /* In the event of a tie, assume the base comes first.  */
-  if (baseness (*inner_ops[0], info->mode, info->as, PLUS,
-   GET_CODE (*ops[1]))
- >= baseness (*inner_ops[1], info->mode, info->as, PLUS,
-  GET_CODE (*ops[0])))
-   {
- set_address_base (info, ops[0], inner_ops[0]);
- set_address_index (info, ops[1], inner_ops[1]);
-   }
+  else if (baseness (inner_op0, info->mode, info->as, PLUS,
+GET_CODE (*ops[1]))
+  >= baseness (inner_op1, info->mode, info->as, PLUS,
+   GET_CODE (*ops[0])))
+   base = 0;
   else
-   {
- set_address_base (info, ops[1], inner_ops[1]);
- set_address_index (info, ops[0], inner_ops[0]);
-   }
+   base = 1;
+  set_address_base (info, ops[base], inner_ops[base]);
+  set_address_index (info, ops[1 - base], inner_ops[1 - base]);
 }
   else
 gcc_assert (out == 0);


[gcc r14-10589] aarch64: Fix invalid nested subregs [PR115464]

2024-08-15 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:32b21292adb6ad6b5e1d60d923a773e4d0daca7b

commit r14-10589-g32b21292adb6ad6b5e1d60d923a773e4d0daca7b
Author: Richard Sandiford 
Date:   Fri Aug 16 07:53:01 2024 +0100

aarch64: Fix invalid nested subregs [PR115464]

The testcase extracts one arm_neon.h vector from a pair (one subreg)
and then reinterprets the result as an SVE vector (another subreg).
Each subreg makes sense individually, but we can't fold them together
into a single subreg: it's 32 bytes -> 16 bytes -> 16*N bytes,
but the interpretation of 32 bytes -> 16*N bytes depends on
whether N==1 or N>1.

Since the second subreg makes sense individually, simplify_subreg
should bail out rather than ICE on it.  simplify_gen_subreg will
then do the same (because it already checks validate_subreg).
This leaves simplify_gen_subreg returning null, requiring the
caller to take appropriate action.

I think this is relatively likely to occur elsewhere, so the patch
adds a helper for forcing a subreg, allowing a temporary pseudo to
be created where necessary.

I'll follow up by using force_subreg in more places.  This patch
is intended to be a minimal backportable fix for the PR.
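
As a usage sketch (hypothetical call site): where code previously
wrote

    rtx x = simplify_gen_subreg (mode, op, GET_MODE (op), 0);

and implicitly relied on X being nonnull, it can now write

    rtx x = force_subreg (mode, op, GET_MODE (op), 0);

which copies OP into a fresh pseudo and retries when the subreg
cannot be formed directly.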

gcc/
PR target/115464
* simplify-rtx.cc (simplify_context::simplify_subreg): Don't try
to fold two subregs together if their relationship isn't known
at compile time.
* explow.h (force_subreg): Declare.
* explow.cc (force_subreg): New function.
* config/aarch64/aarch64-sve-builtins-base.cc
(svset_neonq_impl::expand): Use it instead of simplify_gen_subreg.

gcc/testsuite/
PR target/115464
* gcc.target/aarch64/sve/acle/general/pr115464.c: New test.

(cherry picked from commit 0970ff46ba6330fc80e8736fc05b2eaeeae0b6a0)

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc   |  2 +-
 gcc/explow.cc | 15 +++
 gcc/explow.h  |  2 ++
 gcc/simplify-rtx.cc   |  5 +
 .../gcc.target/aarch64/sve/acle/general/pr115464.c| 13 +
 5 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 0d2edf3f19e..c9182594bc1 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1174,7 +1174,7 @@ public:
Advanced SIMD argument as an SVE vector.  */
 if (!BYTES_BIG_ENDIAN
&& is_undef (CALL_EXPR_ARG (e.call_expr, 0)))
-  return simplify_gen_subreg (mode, e.args[1], GET_MODE (e.args[1]), 0);
+  return force_subreg (mode, e.args[1], GET_MODE (e.args[1]), 0);
 
 rtx_vector_builder builder (VNx16BImode, 16, 2);
 for (unsigned int i = 0; i < 16; i++)
diff --git a/gcc/explow.cc b/gcc/explow.cc
index 8e5f6b8e680..f6843398c4b 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -745,6 +745,21 @@ force_reg (machine_mode mode, rtx x)
   return temp;
 }
 
+/* Like simplify_gen_subreg, but force OP into a new register if the
+   subreg cannot be formed directly.  */
+
+rtx
+force_subreg (machine_mode outermode, rtx op,
+ machine_mode innermode, poly_uint64 byte)
+{
+  rtx x = simplify_gen_subreg (outermode, op, innermode, byte);
+  if (x)
+return x;
+
+  op = copy_to_mode_reg (innermode, op);
+  return simplify_gen_subreg (outermode, op, innermode, byte);
+}
+
 /* If X is a memory ref, copy its contents to a new temp reg and return
that reg.  Otherwise, return X.  */
 
diff --git a/gcc/explow.h b/gcc/explow.h
index 16aa02cfb68..cbd1fcb7eb3 100644
--- a/gcc/explow.h
+++ b/gcc/explow.h
@@ -42,6 +42,8 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);
Args are mode (in case value is a constant) and the value.  */
 extern rtx force_reg (machine_mode, rtx);
 
+extern rtx force_subreg (machine_mode, rtx, machine_mode, poly_uint64);
+
 /* Return given rtx, copied into a new temp reg if it was in memory.  */
 extern rtx force_not_mem (rtx);
 
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index dceaa1c..729d408aa55 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -7612,6 +7612,11 @@ simplify_context::simplify_subreg (machine_mode outermode, rtx op,
   poly_uint64 innermostsize = GET_MODE_SIZE (innermostmode);
   rtx newx;
 
+  /* Make sure that the relationship between the two subregs is
+known at compile time.  */
+  if (!ordered_p (outersize, innermostsize))
+   return NULL_RTX;
+
   if (outermode == innermostmode
  && known_eq (byte, 0U)
  && known_eq (SUBREG_BYTE (op), 0))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464.c

[gcc r14-10590] aarch64: Add another use of force_subreg [PR115464]

2024-08-15 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:86dacfb06b90371458d58872f461d358a0834305

commit r14-10590-g86dacfb06b90371458d58872f461d358a0834305
Author: Richard Sandiford 
Date:   Fri Aug 16 07:53:02 2024 +0100

aarch64: Add another use of force_subreg [PR115464]

This patch includes the testcase from r15-1399 plus a minimal
fix for it, without the other proactive uses of force_subreg.
We can backport other force_subreg calls later if they're shown
to be needed.

gcc/
PR target/115464
* config/aarch64/aarch64-sve-builtins-base.cc
(svset_neonq_impl::expand): Use force_subreg instead of
lowpart_subreg.

gcc/testsuite/
PR target/115464
* gcc.target/aarch64/sve/acle/general/pr115464_2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc   |  4 +++-
 .../gcc.target/aarch64/sve/acle/general/pr115464_2.c  | 11 +++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index c9182594bc1..241a249503f 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1185,7 +1185,9 @@ public:
 if (BYTES_BIG_ENDIAN)
   return e.use_exact_insn (code_for_aarch64_sve_set_neonq (mode));
 insn_code icode = code_for_vcond_mask (mode, mode);
-e.args[1] = lowpart_subreg (mode, e.args[1], GET_MODE (e.args[1]));
+e.args[1] = force_subreg (mode, e.args[1], GET_MODE (e.args[1]),
+ subreg_lowpart_offset (mode,
+GET_MODE (e.args[1])));
 e.add_output_operand (icode);
 e.add_input_operand (icode, e.args[1]);
 e.add_input_operand (icode, e.args[0]);
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464_2.c
new file mode 100644
index 000..f561c34f732
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464_2.c
@@ -0,0 +1,11 @@
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+svuint16_t
+convolve4_4_x (uint16x8x2_t permute_tbl, svuint16_t a)
+{
+return svset_neonq_u16 (a, permute_tbl.val[1]);
+}


[gcc r13-8976] aarch64: Fix expansion of svsudot [PR114607]

2024-08-15 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:22c6a11686d3f20f8682c2fbe9e33867a7e8af0e

commit r13-8976-g22c6a11686d3f20f8682c2fbe9e33867a7e8af0e
Author: Richard Sandiford 
Date:   Fri Aug 16 07:58:24 2024 +0100

aarch64: Fix expansion of svsudot [PR114607]

Not sure how this happened, but: svsudot is supposed to be expanded
as USDOT with the operands swapped.  However, a thinko in the
expansion of svsudot meant that the arguments weren't in fact
swapped; the attempted swap was just a no-op.  And the testcases
blithely accepted that.

gcc/
PR target/114607
* config/aarch64/aarch64-sve-builtins-base.cc
(svusdot_impl::expand): Fix botched attempt to swap the operands
for svsudot.

gcc/testsuite/
PR target/114607
* gcc.target/aarch64/sve/acle/asm/sudot_s32.c: New test.

(cherry picked from commit 2c1c2485a4b1aca746ac693041e51ea6da5c64ca)
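
To spell out the intended mapping (a sketch based on the commit
message, not a new interface): for

    svint32_t f (svint32_t acc, svint8_t s, svuint8_t u)
    {
      return svsudot_s32 (acc, s, u);
    }

the expected expansion is USDOT with the data inputs swapped, i.e.
"usdot zacc, zu, zs".  rotate_inputs_left (1, 3) performs that swap,
whereas the old rotate_inputs_left (1, 2) rotated a single-element
window and therefore changed nothing.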

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc   | 2 +-
 gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c | 8 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index cd9cace3c9b..34f2d8c6e4e 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2403,7 +2403,7 @@ public:
version) is through the USDOT instruction but with the second and third
inputs swapped.  */
 if (m_su)
-  e.rotate_inputs_left (1, 2);
+  e.rotate_inputs_left (1, 3);
 /* The ACLE function has the same order requirements as for svdot.
While there's no requirement for the RTL pattern to have the same sort
of order as that for dot_prod, it's easier to read.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c
index 4b452619eee..e06b69affab 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c
@@ -6,7 +6,7 @@
 
 /*
 ** sudot_s32_tied1:
-** usdot   z0\.s, z2\.b, z4\.b
+** usdot   z0\.s, z4\.b, z2\.b
 ** ret
 */
 TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, svuint8_t,
@@ -17,7 +17,7 @@ TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, svuint8_t,
 ** sudot_s32_tied2:
 ** mov (z[0-9]+)\.d, z0\.d
 ** movprfx z0, z4
-** usdot   z0\.s, z2\.b, \1\.b
+** usdot   z0\.s, \1\.b, z2\.b
 ** ret
 */
 TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, svuint8_t,
@@ -27,7 +27,7 @@ TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, svuint8_t,
 /*
 ** sudot_w0_s32_tied:
 ** mov (z[0-9]+\.b), w0
-** usdot   z0\.s, z2\.b, \1
+** usdot   z0\.s, \1, z2\.b
 ** ret
 */
 TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, uint8_t,
@@ -37,7 +37,7 @@ TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, uint8_t,
 /*
 ** sudot_9_s32_tied:
 ** mov (z[0-9]+\.b), #9
-** usdot   z0\.s, z2\.b, \1
+** usdot   z0\.s, \1, z2\.b
 ** ret
 */
 TEST_TRIPLE_Z (sudot_9_s32_tied, svint32_t, svint8_t, uint8_t,


[gcc r13-8977] aarch64: Fix bogus cnot optimisation [PR114603]

2024-08-15 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:959d6529df206c1983be14383da081f374416e47

commit r13-8977-g959d6529df206c1983be14383da081f374416e47
Author: Richard Sandiford 
Date:   Fri Aug 16 07:58:25 2024 +0100

aarch64: Fix bogus cnot optimisation [PR114603]

aarch64-sve.md had a pattern that combined:

cmpeq   pb.T, pa/z, zc.T, #0
mov zd.T, pb/z, #1

into:

cnotzd.T, pa/m, zc.T

But this is only valid if pa.T is a ptrue.  In other cases, the
original would set inactive elements of zd.T to 0, whereas the
combined form would copy elements from zc.T.

gcc/
PR target/114603
* config/aarch64/aarch64-sve.md (@aarch64_pred_cnot): Replace
with...
(@aarch64_ptrue_cnot): ...this, requiring operand 1 to be
a ptrue.
(*cnot): Require operand 1 to be a ptrue.
* config/aarch64/aarch64-sve-builtins-base.cc (svcnot_impl::expand):
Use aarch64_ptrue_cnot for _x operations that are predicated
with a ptrue.  Represent other _x operations as fully-defined _m
operations.

gcc/testsuite/
PR target/114603
* gcc.target/aarch64/sve/acle/general/cnot_1.c: New test.

(cherry picked from commit 67cbb1c638d6ab3a9cb77e674541e2b291fb67df)

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc| 25 ++
 gcc/config/aarch64/aarch64-sve.md  | 20 -
 .../gcc.target/aarch64/sve/acle/general/cnot_1.c   | 23 
 3 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 34f2d8c6e4e..852f569461a 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -496,15 +496,22 @@ public:
   expand (function_expander &e) const override
   {
 machine_mode mode = e.vector_mode (0);
-if (e.pred == PRED_x)
-  {
-   /* The pattern for CNOT includes an UNSPEC_PRED_Z, so needs
-  a ptrue hint.  */
-   e.add_ptrue_hint (0, e.gp_mode (0));
-   return e.use_pred_x_insn (code_for_aarch64_pred_cnot (mode));
-  }
-
-return e.use_cond_insn (code_for_cond_cnot (mode), 0);
+machine_mode pred_mode = e.gp_mode (0);
+/* The underlying _x pattern is effectively:
+
+dst = src == 0 ? 1 : 0
+
+   rather than an UNSPEC_PRED_X.  Using this form allows autovec
+   constructs to be matched by combine, but it means that the
+   predicate on the src == 0 comparison must be all-true.
+
+   For simplicity, represent other _x operations as fully-defined _m
+   operations rather than using a separate bespoke pattern.  */
+if (e.pred == PRED_x
+   && gen_lowpart (pred_mode, e.args[0]) == CONSTM1_RTX (pred_mode))
+  return e.use_pred_x_insn (code_for_aarch64_ptrue_cnot (mode));
+return e.use_cond_insn (code_for_cond_cnot (mode),
+   e.pred == PRED_x ? 1 : 0);
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 7533b956686..0a05aecd1a3 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3225,24 +3225,24 @@
 ;; - CNOT
 ;; -
 
-;; Predicated logical inverse.
-(define_expand "@aarch64_pred_cnot"
+;; Logical inverse, predicated with a ptrue.
+(define_expand "@aarch64_ptrue_cnot"
   [(set (match_operand:SVE_FULL_I 0 "register_operand")
(unspec:SVE_FULL_I
  [(unspec:
 [(match_operand: 1 "register_operand")
- (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (const_int SVE_KNOWN_PTRUE)
  (eq:
-   (match_operand:SVE_FULL_I 3 "register_operand")
-   (match_dup 4))]
+   (match_operand:SVE_FULL_I 2 "register_operand")
+   (match_dup 3))]
 UNSPEC_PRED_Z)
-  (match_dup 5)
-  (match_dup 4)]
+  (match_dup 4)
+  (match_dup 3)]
  UNSPEC_SEL))]
   "TARGET_SVE"
   {
-operands[4] = CONST0_RTX (mode);
-operands[5] = CONST1_RTX (mode);
+operands[3] = CONST0_RTX (mode);
+operands[4] = CONST1_RTX (mode);
   }
 )
 
@@ -3251,7 +3251,7 @@
(unspec:SVE_I
  [(unspec:
 [(match_operand: 1 "register_operand" "Upl, Upl")
- (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (const_int SVE_KNOWN_PTRUE)
  (eq:
(match_operand:SVE_I 2 "register_operand" "0, w")
(match_operand:SVE_I 3 "aarch64_simd_imm_zero"))]
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnot_1.c
new file mode 100644
index 000..b1a489f0cf0
--- /dev/null
+++ b/gcc/tests

[gcc r12-10673] aarch64: Fix expansion of svsudot [PR114607]

2024-08-16 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:2d1b1f404f3361a0e3d9d2a2bee5cf68c1290fe5

commit r12-10673-g2d1b1f404f3361a0e3d9d2a2bee5cf68c1290fe5
Author: Richard Sandiford 
Date:   Fri Aug 16 15:37:50 2024 +0100

aarch64: Fix expansion of svsudot [PR114607]

Not sure how this happened, but: svsudot is supposed to be expanded
as USDOT with the operands swapped.  However, a thinko in the
expansion of svsudot meant that the arguments weren't in fact
swapped; the attempted swap was just a no-op.  And the testcases
blithely accepted that.

gcc/
PR target/114607
* config/aarch64/aarch64-sve-builtins-base.cc
(svusdot_impl::expand): Fix botched attempt to swap the operands
for svsudot.

gcc/testsuite/
PR target/114607
* gcc.target/aarch64/sve/acle/asm/sudot_s32.c: New test.

(cherry picked from commit 2c1c2485a4b1aca746ac693041e51ea6da5c64ca)

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc   | 2 +-
 gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c | 8 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index c24c0548724..e5e0d6ed5c9 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2359,7 +2359,7 @@ public:
version) is through the USDOT instruction but with the second and third
inputs swapped.  */
 if (m_su)
-  e.rotate_inputs_left (1, 2);
+  e.rotate_inputs_left (1, 3);
 /* The ACLE function has the same order requirements as for svdot.
While there's no requirement for the RTL pattern to have the same sort
of order as that for dot_prod, it's easier to read.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c
index 4b452619eee..e06b69affab 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c
@@ -6,7 +6,7 @@
 
 /*
 ** sudot_s32_tied1:
-** usdot   z0\.s, z2\.b, z4\.b
+** usdot   z0\.s, z4\.b, z2\.b
 ** ret
 */
 TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, svuint8_t,
@@ -17,7 +17,7 @@ TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, svuint8_t,
 ** sudot_s32_tied2:
 ** mov (z[0-9]+)\.d, z0\.d
 ** movprfx z0, z4
-** usdot   z0\.s, z2\.b, \1\.b
+** usdot   z0\.s, \1\.b, z2\.b
 ** ret
 */
 TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, svuint8_t,
@@ -27,7 +27,7 @@ TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, svuint8_t,
 /*
 ** sudot_w0_s32_tied:
 ** mov (z[0-9]+\.b), w0
-** usdot   z0\.s, z2\.b, \1
+** usdot   z0\.s, \1, z2\.b
 ** ret
 */
 TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, uint8_t,
@@ -37,7 +37,7 @@ TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, uint8_t,
 /*
 ** sudot_9_s32_tied:
 ** mov (z[0-9]+\.b), #9
-** usdot   z0\.s, z2\.b, \1
+** usdot   z0\.s, \1, z2\.b
 ** ret
 */
 TEST_TRIPLE_Z (sudot_9_s32_tied, svint32_t, svint8_t, uint8_t,


[gcc r12-10674] aarch64: Fix bogus cnot optimisation [PR114603]

2024-08-16 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:33b11c6d9a600fac25b7cc714e9905aac049685b

commit r12-10674-g33b11c6d9a600fac25b7cc714e9905aac049685b
Author: Richard Sandiford 
Date:   Fri Aug 16 15:37:50 2024 +0100

aarch64: Fix bogus cnot optimisation [PR114603]

aarch64-sve.md had a pattern that combined:

cmpeq   pb.T, pa/z, zc.T, #0
mov zd.T, pb/z, #1

into:

cnotzd.T, pa/m, zc.T

But this is only valid if pa.T is a ptrue.  In other cases, the
original would set inactive elements of zd.T to 0, whereas the
combined form would copy elements from zc.T.

gcc/
PR target/114603
* config/aarch64/aarch64-sve.md (@aarch64_pred_cnot): Replace
with...
(@aarch64_ptrue_cnot): ...this, requiring operand 1 to be
a ptrue.
(*cnot): Require operand 1 to be a ptrue.
* config/aarch64/aarch64-sve-builtins-base.cc (svcnot_impl::expand):
Use aarch64_ptrue_cnot for _x operations that are predicated
with a ptrue.  Represent other _x operations as fully-defined _m
operations.

gcc/testsuite/
PR target/114603
* gcc.target/aarch64/sve/acle/general/cnot_1.c: New test.

(cherry picked from commit 67cbb1c638d6ab3a9cb77e674541e2b291fb67df)

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc| 25 ++
 gcc/config/aarch64/aarch64-sve.md  | 20 -
 .../gcc.target/aarch64/sve/acle/general/cnot_1.c   | 23 
 3 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index e5e0d6ed5c9..f96cb3ccc7b 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -494,15 +494,22 @@ public:
   expand (function_expander &e) const OVERRIDE
   {
 machine_mode mode = e.vector_mode (0);
-if (e.pred == PRED_x)
-  {
-   /* The pattern for CNOT includes an UNSPEC_PRED_Z, so needs
-  a ptrue hint.  */
-   e.add_ptrue_hint (0, e.gp_mode (0));
-   return e.use_pred_x_insn (code_for_aarch64_pred_cnot (mode));
-  }
-
-return e.use_cond_insn (code_for_cond_cnot (mode), 0);
+machine_mode pred_mode = e.gp_mode (0);
+/* The underlying _x pattern is effectively:
+
+dst = src == 0 ? 1 : 0
+
+   rather than an UNSPEC_PRED_X.  Using this form allows autovec
+   constructs to be matched by combine, but it means that the
+   predicate on the src == 0 comparison must be all-true.
+
+   For simplicity, represent other _x operations as fully-defined _m
+   operations rather than using a separate bespoke pattern.  */
+if (e.pred == PRED_x
+   && gen_lowpart (pred_mode, e.args[0]) == CONSTM1_RTX (pred_mode))
+  return e.use_pred_x_insn (code_for_aarch64_ptrue_cnot (mode));
+return e.use_cond_insn (code_for_cond_cnot (mode),
+   e.pred == PRED_x ? 1 : 0);
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index b8cc47ef5fc..c68a3598423 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3205,24 +3205,24 @@
 ;; - CNOT
 ;; -
 
-;; Predicated logical inverse.
-(define_expand "@aarch64_pred_cnot"
+;; Logical inverse, predicated with a ptrue.
+(define_expand "@aarch64_ptrue_cnot"
   [(set (match_operand:SVE_FULL_I 0 "register_operand")
(unspec:SVE_FULL_I
  [(unspec:
 [(match_operand: 1 "register_operand")
- (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (const_int SVE_KNOWN_PTRUE)
  (eq:
-   (match_operand:SVE_FULL_I 3 "register_operand")
-   (match_dup 4))]
+   (match_operand:SVE_FULL_I 2 "register_operand")
+   (match_dup 3))]
 UNSPEC_PRED_Z)
-  (match_dup 5)
-  (match_dup 4)]
+  (match_dup 4)
+  (match_dup 3)]
  UNSPEC_SEL))]
   "TARGET_SVE"
   {
-operands[4] = CONST0_RTX (mode);
-operands[5] = CONST1_RTX (mode);
+operands[3] = CONST0_RTX (mode);
+operands[4] = CONST1_RTX (mode);
   }
 )
 
@@ -3231,7 +3231,7 @@
(unspec:SVE_I
  [(unspec:
 [(match_operand: 1 "register_operand" "Upl, Upl")
- (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (const_int SVE_KNOWN_PTRUE)
  (eq:
(match_operand:SVE_I 2 "register_operand" "0, w")
(match_operand:SVE_I 3 "aarch64_simd_imm_zero"))]
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnot_1.c
new file mode 100644
index 000..b1a489f0cf0
--- /dev/null
+++ b/gcc/test

[gcc r15-752] Cache the set of EH_RETURN_DATA_REGNOs

2024-05-21 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:7f35863ebbf7ba63e2f075edfbec105de272578a

commit r15-752-g7f35863ebbf7ba63e2f075edfbec105de272578a
Author: Richard Sandiford 
Date:   Tue May 21 10:21:16 2024 +0100

Cache the set of EH_RETURN_DATA_REGNOs

While reviewing Andrew's fix for PR114843, it seemed like it would
be convenient to have a HARD_REG_SET of EH_RETURN_DATA_REGNOs.
This patch adds one and uses it to simplify a couple of use sites.
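
For instance, on a (hypothetical) target defining

    #define EH_RETURN_DATA_REGNO(N) ((N) < 4 ? (N) : INVALID_REGNUM)

the new initialization leaves eh_return_data_regs containing hard
registers 0-3, and a use site can simply write

    conflicts |= eh_return_data_regs;

instead of open-coding the INVALID_REGNUM-terminated loop.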

gcc/
* hard-reg-set.h (target_hard_regs::x_eh_return_data_regs): New 
field.
(eh_return_data_regs): New macro.
* reginfo.cc (init_reg_sets_1): Initialize x_eh_return_data_regs.
* df-scan.cc (df_get_exit_block_use_set): Use it.
* ira-lives.cc (process_out_of_region_eh_regs): Likewise.

Diff:
---
 gcc/df-scan.cc |  8 +---
 gcc/hard-reg-set.h |  5 +
 gcc/ira-lives.cc   | 10 ++
 gcc/reginfo.cc | 10 ++
 4 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/gcc/df-scan.cc b/gcc/df-scan.cc
index 1bade2cd71e..c8ab3c09cee 100644
--- a/gcc/df-scan.cc
+++ b/gcc/df-scan.cc
@@ -3702,13 +3702,7 @@ df_get_exit_block_use_set (bitmap exit_block_uses)
 
   /* Mark the registers that will contain data for the handler.  */
   if (reload_completed && crtl->calls_eh_return)
-for (i = 0; ; ++i)
-  {
-   unsigned regno = EH_RETURN_DATA_REGNO (i);
-   if (regno == INVALID_REGNUM)
- break;
-   bitmap_set_bit (exit_block_uses, regno);
-  }
+IOR_REG_SET_HRS (exit_block_uses, eh_return_data_regs);
 
 #ifdef EH_RETURN_STACKADJ_RTX
   if ((!targetm.have_epilogue () || ! epilogue_completed)
diff --git a/gcc/hard-reg-set.h b/gcc/hard-reg-set.h
index 8c1d1512ca2..340eb425c10 100644
--- a/gcc/hard-reg-set.h
+++ b/gcc/hard-reg-set.h
@@ -421,6 +421,9 @@ struct target_hard_regs {
  with the local stack frame are safe, but scant others.  */
   HARD_REG_SET x_regs_invalidated_by_call;
 
+  /* The set of registers that are used by EH_RETURN_DATA_REGNO.  */
+  HARD_REG_SET x_eh_return_data_regs;
+
   /* Table of register numbers in the order in which to try to use them.  */
   int x_reg_alloc_order[FIRST_PSEUDO_REGISTER];
 
@@ -485,6 +488,8 @@ extern struct target_hard_regs *this_target_hard_regs;
 #define call_used_or_fixed_regs \
   (regs_invalidated_by_call | fixed_reg_set)
 #endif
+#define eh_return_data_regs \
+  (this_target_hard_regs->x_eh_return_data_regs)
 #define reg_alloc_order \
   (this_target_hard_regs->x_reg_alloc_order)
 #define inv_reg_alloc_order \
diff --git a/gcc/ira-lives.cc b/gcc/ira-lives.cc
index e07d3dc3e89..958eabb9708 100644
--- a/gcc/ira-lives.cc
+++ b/gcc/ira-lives.cc
@@ -1260,14 +1260,8 @@ process_out_of_region_eh_regs (basic_block bb)
   for (int n = ALLOCNO_NUM_OBJECTS (a) - 1; n >= 0; n--)
{
  ira_object_t obj = ALLOCNO_OBJECT (a, n);
- for (int k = 0; ; k++)
-   {
- unsigned int regno = EH_RETURN_DATA_REGNO (k);
- if (regno == INVALID_REGNUM)
-   break;
- SET_HARD_REG_BIT (OBJECT_CONFLICT_HARD_REGS (obj), regno);
- SET_HARD_REG_BIT (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), regno);
-   }
+ OBJECT_CONFLICT_HARD_REGS (obj) |= eh_return_data_regs;
+ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= eh_return_data_regs;
}
 }
 }
diff --git a/gcc/reginfo.cc b/gcc/reginfo.cc
index a0baeb90e12..73121365c47 100644
--- a/gcc/reginfo.cc
+++ b/gcc/reginfo.cc
@@ -420,6 +420,16 @@ init_reg_sets_1 (void)
}
 }
 
+  /* Recalculate eh_return_data_regs.  */
+  CLEAR_HARD_REG_SET (eh_return_data_regs);
+  for (i = 0; ; ++i)
+{
+  unsigned int regno = EH_RETURN_DATA_REGNO (i);
+  if (regno == INVALID_REGNUM)
+   break;
+  SET_HARD_REG_BIT (eh_return_data_regs, regno);
+}
+
   memset (have_regs_of_mode, 0, sizeof (have_regs_of_mode));
   memset (contains_reg_of_mode, 0, sizeof (contains_reg_of_mode));
   for (m = 0; m < (unsigned int) MAX_MACHINE_MODE; m++)


[gcc r15-820] vect: Fix access size alignment assumption [PR115192]

2024-05-24 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:a0fe4fb1c8d7804515845dd5d2a814b3c7a1ccba

commit r15-820-ga0fe4fb1c8d7804515845dd5d2a814b3c7a1ccba
Author: Richard Sandiford 
Date:   Fri May 24 13:47:21 2024 +0100

vect: Fix access size alignment assumption [PR115192]

create_intersect_range_checks checks whether two access ranges
a and b are alias-free using something equivalent to:

  end_a <= start_b || end_b <= start_a

It has two ways of doing this: a "vanilla" way that calculates
the exact exclusive end pointers, and another way that uses the
last inclusive aligned pointers (and changes the comparisons
accordingly).  The comment for the latter is:

  /* Calculate the minimum alignment shared by all four pointers,
 then arrange for this alignment to be subtracted from the
 exclusive maximum values to get inclusive maximum values.
 This "- min_align" is cumulative with a "+ access_size"
 in the calculation of the maximum values.  In the best
 (and common) case, the two cancel each other out, leaving
 us with an inclusive bound based only on seg_len.  In the
 worst case we're simply adding a smaller number than before.

The problem is that the associated code implicitly assumed that the
access size was a multiple of the pointer alignment, and so the
alignment could be carried over to the exclusive end pointer.

The testcase started failing after g:9fa5b473b5b8e289b6542
because that commit improved the alignment information for
the accesses.

gcc/
PR tree-optimization/115192
* tree-data-ref.cc (create_intersect_range_checks): Take the
alignment of the access sizes into account.

gcc/testsuite/
PR tree-optimization/115192
* gcc.dg/vect/pr115192.c: New test.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115192.c | 28 
 gcc/tree-data-ref.cc |  5 -
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115192.c b/gcc/testsuite/gcc.dg/vect/pr115192.c
new file mode 100644
index 000..923d377c1bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115192.c
@@ -0,0 +1,28 @@
+#include "tree-vect.h"
+
+int data[4 * 16 * 16] __attribute__((aligned(16)));
+
+__attribute__((noipa)) void
+foo (__SIZE_TYPE__ n)
+{
+  for (__SIZE_TYPE__ i = 1; i < n; ++i)
+{
+  data[i * n * 4] = data[(i - 1) * n * 4] + 1;
+  data[i * n * 4 + 1] = data[(i - 1) * n * 4 + 1] + 2;
+}
+}
+
+int
+main ()
+{
+  check_vect ();
+
+  data[0] = 10;
+  data[1] = 20;
+
+  foo (3);
+
+  if (data[24] != 12 || data[25] != 24)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index db15ddb43de..7c4049faf34 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -73,6 +73,7 @@ along with GCC; see the file COPYING3.  If not see
 
 */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -2640,7 +2641,9 @@ create_intersect_range_checks (class loop *loop, tree *cond_expr,
 Because the maximum values are inclusive, there is an alias
 if the maximum value of one segment is equal to the minimum
 value of the other.  */
-  min_align = MIN (dr_a.align, dr_b.align);
+  min_align = std::min (dr_a.align, dr_b.align);
+  min_align = std::min (min_align, known_alignment (dr_a.access_size));
+  min_align = std::min (min_align, known_alignment (dr_b.access_size));
   cmp_code = LT_EXPR;
 }
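
To see why the old minimum-alignment computation was unsafe, here is a worked
instance with illustrative numbers (not taken from the PR):

  /* Both pointers 8-byte aligned; access_size = 4; seg_len = 8.
     start_a = 0, so the exclusive end of A is
       end_a = start_a + seg_len + access_size = 12    (bytes 0..11)
     start_b = 8, so B overlaps bytes 8..11 of A.
     Old code:  min_align = MIN (8, 8) = 8
                inclusive_end_a = 12 - 8 = 4
                4 < 8  =>  "no alias"             -- wrong.
     New code:  min_align = min (8, access alignment 4) = 4
                inclusive_end_a = 12 - 4 = 8
                8 < 8 is false  =>  alias found   -- correct.  */

The subtraction is only reversible when the exclusive end is itself aligned to
min_align, which holds when the access size is a multiple of the pointer
alignment -- exactly the assumption the patch removes.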


[gcc r15-906] aarch64: Split aarch64_combinev16qi before RA [PR115258]

2024-05-29 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:39263ed2d39ac1cebde59bc5e72ddcad5dc7a1ec

commit r15-906-g39263ed2d39ac1cebde59bc5e72ddcad5dc7a1ec
Author: Richard Sandiford 
Date:   Wed May 29 16:43:33 2024 +0100

aarch64: Split aarch64_combinev16qi before RA [PR115258]

Two-vector TBL instructions are fed by an aarch64_combinev16qi, whose
purpose is to put the two input data vectors into consecutive registers.
This aarch64_combinev16qi was then split after reload into individual
moves (from the first input to the first half of the output, and from
the second input to the second half of the output).

In the worst case, the RA might allocate things so that the destination
of the aarch64_combinev16qi is the second input followed by the first
input.  In that case, the split form of aarch64_combinev16qi uses three
eors to swap the registers around.

This PR is about a test where this worst case occurred.  And given the
insn description, that allocation doesn't seem unreasonable.

early-ra should (hopefully) mean that we're now better at allocating
subregs of vector registers.  The upcoming RA subreg patches should
improve things further.  The best fix for the PR therefore seems
to be to split the combination before RA, so that the RA can see
the underlying moves.

Perhaps it even makes sense to do this at expand time, avoiding the need
for aarch64_combinev16qi entirely.  That deserves more experimentation
though.

gcc/
PR target/115258
* config/aarch64/aarch64-simd.md (aarch64_combinev16qi): Allow
the split before reload.
* config/aarch64/aarch64.cc (aarch64_split_combinev16qi): Generalize
into a form that handles pseudo registers.

gcc/testsuite/
PR target/115258
* gcc.target/aarch64/pr115258.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md  |  2 +-
 gcc/config/aarch64/aarch64.cc   | 29 ++---
 gcc/testsuite/gcc.target/aarch64/pr115258.c | 19 +++
 3 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c311888e4bd..868f4486218 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -8474,7 +8474,7 @@
UNSPEC_CONCAT))]
   "TARGET_SIMD"
   "#"
-  "&& reload_completed"
+  "&& 1"
   [(const_int 0)]
 {
   aarch64_split_combinev16qi (operands);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index ee12d8897a8..13191ec8e34 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25333,27 +25333,26 @@ aarch64_output_sve_ptrues (rtx const_unspec)
 void
 aarch64_split_combinev16qi (rtx operands[3])
 {
-  unsigned int dest = REGNO (operands[0]);
-  unsigned int src1 = REGNO (operands[1]);
-  unsigned int src2 = REGNO (operands[2]);
   machine_mode halfmode = GET_MODE (operands[1]);
-  unsigned int halfregs = REG_NREGS (operands[1]);
-  rtx destlo, desthi;
 
   gcc_assert (halfmode == V16QImode);
 
-  if (src1 == dest && src2 == dest + halfregs)
+  rtx destlo = simplify_gen_subreg (halfmode, operands[0],
+   GET_MODE (operands[0]), 0);
+  rtx desthi = simplify_gen_subreg (halfmode, operands[0],
+   GET_MODE (operands[0]),
+   GET_MODE_SIZE (halfmode));
+
+  bool skiplo = rtx_equal_p (destlo, operands[1]);
+  bool skiphi = rtx_equal_p (desthi, operands[2]);
+
+  if (skiplo && skiphi)
 {
   /* No-op move.  Can't split to nothing; emit something.  */
   emit_note (NOTE_INSN_DELETED);
   return;
 }
 
-  /* Preserve register attributes for variable tracking.  */
-  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
-  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
-  GET_MODE_SIZE (halfmode));
-
   /* Special case of reversed high/low parts.  */
   if (reg_overlap_mentioned_p (operands[2], destlo)
   && reg_overlap_mentioned_p (operands[1], desthi))
@@ -25366,16 +25365,16 @@ aarch64_split_combinev16qi (rtx operands[3])
 {
   /* Try to avoid unnecessary moves if part of the result
 is in the right place already.  */
-  if (src1 != dest)
+  if (!skiplo)
emit_move_insn (destlo, operands[1]);
-  if (src2 != dest + halfregs)
+  if (!skiphi)
emit_move_insn (desthi, operands[2]);
 }
   else
 {
-  if (src2 != dest + halfregs)
+  if (!skiphi)
emit_move_insn (desthi, operands[2]);
-  if (src1 != dest)
+  if (!skiplo)
emit_move_insn (destlo, operands[1]);
 }
 }
diff --git a/gcc/testsuite/gcc.target/aarch64/pr115258.c b/gcc/testsuite/gcc.target/aarch64/pr115258.c
new file mode 100644
index 000..9a489d4604c
--- /dev/null
+++ b
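
The "three eors" fallback mentioned above is the classic XOR swap; in scalar C
terms (illustrative only -- the real split emits vector EOR instructions):

  /* Exchange a and b without a scratch register.  */
  a ^= b;
  b ^= a;
  a ^= b;

Splitting before register allocation lets the RA see the two underlying moves
and avoid creating this reversed-overlap case in the first place.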

[gcc r15-929] ira: Fix go_through_subreg offset calculation [PR115281]

2024-05-30 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:46d931b3dd31cbba7c3355ada63f155aa24a4e2b

commit r15-929-g46d931b3dd31cbba7c3355ada63f155aa24a4e2b
Author: Richard Sandiford 
Date:   Thu May 30 16:17:58 2024 +0100

ira: Fix go_through_subreg offset calculation [PR115281]

go_through_subreg used:

  else if (!can_div_trunc_p (SUBREG_BYTE (x),
 REGMODE_NATURAL_SIZE (GET_MODE (x)), offset))

to calculate the register offset for a pseudo subreg x.  In the blessed
days before poly-int, this was:

*offset = (SUBREG_BYTE (x) / REGMODE_NATURAL_SIZE (GET_MODE (x)));

But I think this is testing the wrong natural size.  If we exclude
paradoxical subregs (which will get an offset of zero regardless),
it's the inner register that is being split, so it should be the
inner register's natural size that we use.

This matters in the testcase because we have an SFmode lowpart
subreg into the last of three variable-sized vectors.  The
SUBREG_BYTE is therefore equal to the size of two variable-sized
vectors.  Dividing by the vector size gives a register offset of 2,
as expected, but dividing by the size of a scalar FPR would give
a variable offset.

I think something similar could happen for fixed-size targets if
REGMODE_NATURAL_SIZE is different for vectors and integers (say),
although that case would trade an ICE for an incorrect offset.

gcc/
PR rtl-optimization/115281
* ira-conflicts.cc (go_through_subreg): Use the natural size of
the inner mode rather than the outer mode.

gcc/testsuite/
PR rtl-optimization/115281
* gfortran.dg/pr115281.f90: New test.

Diff:
---
 gcc/ira-conflicts.cc   |  3 ++-
 gcc/testsuite/gfortran.dg/pr115281.f90 | 39 ++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/gcc/ira-conflicts.cc b/gcc/ira-conflicts.cc
index 83274c53330..15ac42d8848 100644
--- a/gcc/ira-conflicts.cc
+++ b/gcc/ira-conflicts.cc
@@ -227,8 +227,9 @@ go_through_subreg (rtx x, int *offset)
   if (REGNO (reg) < FIRST_PSEUDO_REGISTER)
 *offset = subreg_regno_offset (REGNO (reg), GET_MODE (reg),
   SUBREG_BYTE (x), GET_MODE (x));
+  /* The offset is always 0 for paradoxical subregs.  */
   else if (!can_div_trunc_p (SUBREG_BYTE (x),
-REGMODE_NATURAL_SIZE (GET_MODE (x)), offset))
+REGMODE_NATURAL_SIZE (GET_MODE (reg)), offset))
 /* Checked by validate_subreg.  We must know at compile time which
inner hard registers are being accessed.  */
 gcc_unreachable ();
diff --git a/gcc/testsuite/gfortran.dg/pr115281.f90 b/gcc/testsuite/gfortran.dg/pr115281.f90
new file mode 100644
index 000..80aa822e745
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr115281.f90
@@ -0,0 +1,39 @@
+! { dg-options "-O3" }
+! { dg-additional-options "-mcpu=neoverse-v1" { target aarch64*-*-* } }
+
+SUBROUTINE fn0(ma, mb, nt)
+  CHARACTER ca
+  REAL r0(ma)
+  INTEGER i0(mb)
+  REAL r1(3,mb)
+  REAL r2(3,mb)
+  REAL r3(3,3)
+  zero=0.0
+  do na = 1, nt
+ nt = i0(na)
+ do l = 1, 3
+r1 (l, na) =   r0 (nt)
+r2(l, na) = zero
+ enddo
+  enddo
+  if (ca  .ne.'z') then
+ do j = 1, 3
+do i = 1, 3
+   r4  = zero
+enddo
+ enddo
+ do na = 1, nt
+do k =  1, 3
+   do l = 1, 3
+  do m = 1, 3
+ r3 = r4 * v
+  enddo
+   enddo
+enddo
+ do i = 1, 3
+   do k = 1, ifn (r3)
+   enddo
+enddo
+ enddo
+ endif
+END

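A worked instance of the offset calculation (illustrative sizes, using the SVE
convention that a vector occupies 16 + 16x bytes and taking the scalar FPR
natural size as 8 bytes):

  /* SFmode lowpart subreg into the last of three variable-length
     vectors:
       SUBREG_BYTE = 2 * (16 + 16x) = 32 + 32x
     Divide by the inner (vector) natural size:
       (32 + 32x) / (16 + 16x) = 2        -- constant, offset = 2
     Divide by the outer (SFmode) natural size, as the old code did:
       (32 + 32x) / 8 = 4 + 4x            -- not a compile-time
     constant, so can_div_trunc_p fails and gcc_unreachable () is
     reached.  */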

[gcc r14-10263] vect: Fix access size alignment assumption [PR115192]

2024-05-31 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:36575f5fe491d86b6851ff3f47cbfb7dad0fc8ae

commit r14-10263-g36575f5fe491d86b6851ff3f47cbfb7dad0fc8ae
Author: Richard Sandiford 
Date:   Fri May 31 08:22:55 2024 +0100

vect: Fix access size alignment assumption [PR115192]

create_intersect_range_checks checks whether two access ranges
a and b are alias-free using something equivalent to:

  end_a <= start_b || end_b <= start_a

It has two ways of doing this: a "vanilla" way that calculates
the exact exclusive end pointers, and another way that uses the
last inclusive aligned pointers (and changes the comparisons
accordingly).  The comment for the latter is:

  /* Calculate the minimum alignment shared by all four pointers,
 then arrange for this alignment to be subtracted from the
 exclusive maximum values to get inclusive maximum values.
 This "- min_align" is cumulative with a "+ access_size"
 in the calculation of the maximum values.  In the best
 (and common) case, the two cancel each other out, leaving
 us with an inclusive bound based only on seg_len.  In the
 worst case we're simply adding a smaller number than before.

The problem is that the associated code implicitly assumed that the
access size was a multiple of the pointer alignment, and so the
alignment could be carried over to the exclusive end pointer.

The testcase started failing after g:9fa5b473b5b8e289b6542
because that commit improved the alignment information for
the accesses.

gcc/
PR tree-optimization/115192
* tree-data-ref.cc (create_intersect_range_checks): Take the
alignment of the access sizes into account.

gcc/testsuite/
PR tree-optimization/115192
* gcc.dg/vect/pr115192.c: New test.

(cherry picked from commit a0fe4fb1c8d7804515845dd5d2a814b3c7a1ccba)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115192.c | 28 
 gcc/tree-data-ref.cc |  5 -
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115192.c b/gcc/testsuite/gcc.dg/vect/pr115192.c
new file mode 100644
index 000..923d377c1bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115192.c
@@ -0,0 +1,28 @@
+#include "tree-vect.h"
+
+int data[4 * 16 * 16] __attribute__((aligned(16)));
+
+__attribute__((noipa)) void
+foo (__SIZE_TYPE__ n)
+{
+  for (__SIZE_TYPE__ i = 1; i < n; ++i)
+{
+  data[i * n * 4] = data[(i - 1) * n * 4] + 1;
+  data[i * n * 4 + 1] = data[(i - 1) * n * 4 + 1] + 2;
+}
+}
+
+int
+main ()
+{
+  check_vect ();
+
+  data[0] = 10;
+  data[1] = 20;
+
+  foo (3);
+
+  if (data[24] != 12 || data[25] != 24)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index f37734b5340..654a8220214 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -73,6 +73,7 @@ along with GCC; see the file COPYING3.  If not see
 
 */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -2640,7 +2641,9 @@ create_intersect_range_checks (class loop *loop, tree *cond_expr,
 Because the maximum values are inclusive, there is an alias
 if the maximum value of one segment is equal to the minimum
 value of the other.  */
-  min_align = MIN (dr_a.align, dr_b.align);
+  min_align = std::min (dr_a.align, dr_b.align);
+  min_align = std::min (min_align, known_alignment (dr_a.access_size));
+  min_align = std::min (min_align, known_alignment (dr_b.access_size));
   cmp_code = LT_EXPR;
 }


[gcc r13-8812] vect: Fix access size alignment assumption [PR115192]

2024-05-31 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:0836216693749f3b0b383d015bd36c004754f1da

commit r13-8812-g0836216693749f3b0b383d015bd36c004754f1da
Author: Richard Sandiford 
Date:   Fri May 31 15:56:04 2024 +0100

vect: Fix access size alignment assumption [PR115192]

create_intersect_range_checks checks whether two access ranges
a and b are alias-free using something equivalent to:

  end_a <= start_b || end_b <= start_a

It has two ways of doing this: a "vanilla" way that calculates
the exact exclusive end pointers, and another way that uses the
last inclusive aligned pointers (and changes the comparisons
accordingly).  The comment for the latter is:

  /* Calculate the minimum alignment shared by all four pointers,
 then arrange for this alignment to be subtracted from the
 exclusive maximum values to get inclusive maximum values.
 This "- min_align" is cumulative with a "+ access_size"
 in the calculation of the maximum values.  In the best
 (and common) case, the two cancel each other out, leaving
 us with an inclusive bound based only on seg_len.  In the
 worst case we're simply adding a smaller number than before.

The problem is that the associated code implicitly assumed that the
access size was a multiple of the pointer alignment, and so the
alignment could be carried over to the exclusive end pointer.

The testcase started failing after g:9fa5b473b5b8e289b6542
because that commit improved the alignment information for
the accesses.

gcc/
PR tree-optimization/115192
* tree-data-ref.cc (create_intersect_range_checks): Take the
alignment of the access sizes into account.

gcc/testsuite/
PR tree-optimization/115192
* gcc.dg/vect/pr115192.c: New test.

(cherry picked from commit a0fe4fb1c8d7804515845dd5d2a814b3c7a1ccba)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115192.c | 28 
 gcc/tree-data-ref.cc |  5 -
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115192.c b/gcc/testsuite/gcc.dg/vect/pr115192.c
new file mode 100644
index 000..923d377c1bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115192.c
@@ -0,0 +1,28 @@
+#include "tree-vect.h"
+
+int data[4 * 16 * 16] __attribute__((aligned(16)));
+
+__attribute__((noipa)) void
+foo (__SIZE_TYPE__ n)
+{
+  for (__SIZE_TYPE__ i = 1; i < n; ++i)
+{
+  data[i * n * 4] = data[(i - 1) * n * 4] + 1;
+  data[i * n * 4 + 1] = data[(i - 1) * n * 4 + 1] + 2;
+}
+}
+
+int
+main ()
+{
+  check_vect ();
+
+  data[0] = 10;
+  data[1] = 20;
+
+  foo (3);
+
+  if (data[24] != 12 || data[25] != 24)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 6cd5f7aa3cf..96934addff1 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -73,6 +73,7 @@ along with GCC; see the file COPYING3.  If not see
 
 */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -2629,7 +2630,9 @@ create_intersect_range_checks (class loop *loop, tree *cond_expr,
 Because the maximum values are inclusive, there is an alias
 if the maximum value of one segment is equal to the minimum
 value of the other.  */
-  min_align = MIN (dr_a.align, dr_b.align);
+  min_align = std::min (dr_a.align, dr_b.align);
+  min_align = std::min (min_align, known_alignment (dr_a.access_size));
+  min_align = std::min (min_align, known_alignment (dr_b.access_size));
   cmp_code = LT_EXPR;
 }


[gcc r13-8813] vect: Tighten vect_determine_precisions_from_range [PR113281]

2024-05-31 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:2602b71103d5ef2ef86000cac832b31dad3dfe2b

commit r13-8813-g2602b71103d5ef2ef86000cac832b31dad3dfe2b
Author: Richard Sandiford 
Date:   Fri May 31 15:56:05 2024 +0100

vect: Tighten vect_determine_precisions_from_range [PR113281]

This was another PR caused by the way that
vect_determine_precisions_from_range handles shifts.  We tried to
narrow 32768 >> x to a 16-bit shift based on range information for
the inputs and outputs, with vect_recog_over_widening_pattern
(after PR110828) adjusting the shift amount.  But this doesn't
work for the case where x is in [16, 31], since then 32-bit
32768 >> x is a well-defined zero, whereas no well-defined
16-bit 32768 >> y will produce 0.

We could perhaps generate x < 16 ? 32768 >> x : 0 instead,
but since vect_determine_precisions_from_range was never really
supposed to rely on fix-ups, it seems better to fix that instead.

The patch also makes the code more selective about which codes
can be narrowed based on input and output ranges.  This showed
that vect_truncatable_operation_p was missing cases for
BIT_NOT_EXPR (equivalent to BIT_XOR_EXPR of -1) and NEGATE_EXPR
(equivalent to BIT_NOT_EXPR followed by a PLUS_EXPR of 1).

pr113281-1.c is the original testcase.  pr113281-[23].c failed
before the patch due to overly optimistic narrowing.  pr113281-[45].c
previously passed and are meant to protect against accidental
optimisation regressions.

gcc/
PR target/113281
* tree-vect-patterns.cc (vect_recog_over_widening_pattern): Remove
workaround for right shifts.
(vect_truncatable_operation_p): Handle NEGATE_EXPR and BIT_NOT_EXPR.
(vect_determine_precisions_from_range): Be more selective about
which codes can be narrowed based on their input and output ranges.
For shifts, require at least one more bit of precision than the
maximum shift amount.

gcc/testsuite/
PR target/113281
* gcc.dg/vect/pr113281-1.c: New test.
* gcc.dg/vect/pr113281-2.c: Likewise.
* gcc.dg/vect/pr113281-3.c: Likewise.
* gcc.dg/vect/pr113281-4.c: Likewise.
* gcc.dg/vect/pr113281-5.c: Likewise.

(cherry picked from commit 1a8261e047f7a2c2b0afb95716f7615cba718cd1)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr113281-1.c |  17 ++
 gcc/testsuite/gcc.dg/vect/pr113281-2.c |  50 +++
 gcc/testsuite/gcc.dg/vect/pr113281-3.c |  39 
 gcc/testsuite/gcc.dg/vect/pr113281-4.c |  55 +
 gcc/testsuite/gcc.dg/vect/pr113281-5.c |  66 
 gcc/tree-vect-patterns.cc  | 107 -
 6 files changed, 305 insertions(+), 29 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-1.c b/gcc/testsuite/gcc.dg/vect/pr113281-1.c
new file mode 100644
index 000..6df4231cb5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr113281-1.c
@@ -0,0 +1,17 @@
+#include "tree-vect.h"
+
+unsigned char a;
+
+int main() {
+  check_vect ();
+
+  short b = a = 0;
+  for (; a != 19; a++)
+if (a)
+  b = 32872 >> a;
+
+  if (b == 0)
+return 0;
+  else
+return 1;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-2.c b/gcc/testsuite/gcc.dg/vect/pr113281-2.c
new file mode 100644
index 000..3a1170c28b6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr113281-2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+
+#define N 128
+
+short x[N];
+short y[N];
+
+void
+f1 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= y[i];
+}
+
+void
+f2 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 32 ? y[i] : 32);
+}
+
+void
+f3 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 31 ? y[i] : 31);
+}
+
+void
+f4 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] & 31);
+}
+
+void
+f5 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= 0x8000 >> y[i];
+}
+
+void
+f6 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= 0x8000 >> (y[i] & 31);
+}
+
+/* { dg-final { scan-tree-dump-not {can narrow[^\n]+>>} "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-3.c b/gcc/testsuite/gcc.dg/vect/pr113281-3.c
new file mode 100644
index 000..5982dd2d16f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr113281-3.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+
+#define N 128
+
+short x[N];
+short y[N];
+
+void
+f1 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 30 ? y[i] : 30);
+}
+
+void
+f2 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= ((y[i] & 15) + 2);
+}
+
+void
+f3 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 16 ? y[i] : 16);
+}
+
+void
+f4 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] = 32768 >> ((y[i] & 15) + 3);
+}
+
+/* { dg-final { scan-tree-dump {can narrow to signed:31 without loss [^\n]+>>} "vect" } } */
+/* { dg-final { scan-tree-dump {can n
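
The hazard the commit describes is easy to see in plain C (illustrative,
assuming 32-bit int):

  int x = 20;              /* any value in [16, 31] */
  int wide = 32768 >> x;   /* well defined: the result is 0 */

  /* There is no 16-bit counterpart: 32768 does not even fit in a
     signed 16-bit element, and a 16-bit shift by 16 or more is out
     of range, so no 16-bit "32768 >> y" can produce the 0 that the
     32-bit form guarantees.  */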

[gcc r12-10488] vect: Fix access size alignment assumption [PR115192]

2024-06-04 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:f510e59db482456160b8a63dc083c78b0c1f6c09

commit r12-10488-gf510e59db482456160b8a63dc083c78b0c1f6c09
Author: Richard Sandiford 
Date:   Tue Jun 4 08:47:47 2024 +0100

vect: Fix access size alignment assumption [PR115192]

create_intersect_range_checks checks whether two access ranges
a and b are alias-free using something equivalent to:

  end_a <= start_b || end_b <= start_a

It has two ways of doing this: a "vanilla" way that calculates
the exact exclusive end pointers, and another way that uses the
last inclusive aligned pointers (and changes the comparisons
accordingly).  The comment for the latter is:

  /* Calculate the minimum alignment shared by all four pointers,
 then arrange for this alignment to be subtracted from the
 exclusive maximum values to get inclusive maximum values.
 This "- min_align" is cumulative with a "+ access_size"
 in the calculation of the maximum values.  In the best
 (and common) case, the two cancel each other out, leaving
 us with an inclusive bound based only on seg_len.  In the
 worst case we're simply adding a smaller number than before.

The problem is that the associated code implicitly assumed that the
access size was a multiple of the pointer alignment, and so the
alignment could be carried over to the exclusive end pointer.

The testcase started failing after g:9fa5b473b5b8e289b6542
because that commit improved the alignment information for
the accesses.

gcc/
PR tree-optimization/115192
* tree-data-ref.cc (create_intersect_range_checks): Take the
alignment of the access sizes into account.

gcc/testsuite/
PR tree-optimization/115192
* gcc.dg/vect/pr115192.c: New test.

(cherry picked from commit a0fe4fb1c8d7804515845dd5d2a814b3c7a1ccba)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115192.c | 28 
 gcc/tree-data-ref.cc |  5 -
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115192.c b/gcc/testsuite/gcc.dg/vect/pr115192.c
new file mode 100644
index 000..923d377c1bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115192.c
@@ -0,0 +1,28 @@
+#include "tree-vect.h"
+
+int data[4 * 16 * 16] __attribute__((aligned(16)));
+
+__attribute__((noipa)) void
+foo (__SIZE_TYPE__ n)
+{
+  for (__SIZE_TYPE__ i = 1; i < n; ++i)
+{
+  data[i * n * 4] = data[(i - 1) * n * 4] + 1;
+  data[i * n * 4 + 1] = data[(i - 1) * n * 4 + 1] + 2;
+}
+}
+
+int
+main ()
+{
+  check_vect ();
+
+  data[0] = 10;
+  data[1] = 20;
+
+  foo (3);
+
+  if (data[24] != 12 || data[25] != 24)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 0df4a3525f4..706a49f226e 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -73,6 +73,7 @@ along with GCC; see the file COPYING3.  If not see
 
 */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -2627,7 +2628,9 @@ create_intersect_range_checks (class loop *loop, tree *cond_expr,
 Because the maximum values are inclusive, there is an alias
 if the maximum value of one segment is equal to the minimum
 value of the other.  */
-  min_align = MIN (dr_a.align, dr_b.align);
+  min_align = std::min (dr_a.align, dr_b.align);
+  min_align = std::min (min_align, known_alignment (dr_a.access_size));
+  min_align = std::min (min_align, known_alignment (dr_b.access_size));
   cmp_code = LT_EXPR;
 }


[gcc r12-10489] vect: Tighten vect_determine_precisions_from_range [PR113281]

2024-06-04 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:dfaa13455d67646805bc611aa4373728a460a37d

commit r12-10489-gdfaa13455d67646805bc611aa4373728a460a37d
Author: Richard Sandiford 
Date:   Tue Jun 4 08:47:48 2024 +0100

vect: Tighten vect_determine_precisions_from_range [PR113281]

This was another PR caused by the way that
vect_determine_precisions_from_range handles shifts.  We tried to
narrow 32768 >> x to a 16-bit shift based on range information for
the inputs and outputs, with vect_recog_over_widening_pattern
(after PR110828) adjusting the shift amount.  But this doesn't
work for the case where x is in [16, 31], since then 32-bit
32768 >> x is a well-defined zero, whereas no well-defined
16-bit 32768 >> y will produce 0.

We could perhaps generate x < 16 ? 32768 >> x : 0 instead,
but since vect_determine_precisions_from_range was never really
supposed to rely on fix-ups, it seems better to fix that instead.

The patch also makes the code more selective about which codes
can be narrowed based on input and output ranges.  This showed
that vect_truncatable_operation_p was missing cases for
BIT_NOT_EXPR (equivalent to BIT_XOR_EXPR of -1) and NEGATE_EXPR
(equivalent to BIT_NOT_EXPR followed by a PLUS_EXPR of 1).

pr113281-1.c is the original testcase.  pr113281-[23].c failed
before the patch due to overly optimistic narrowing.  pr113281-[45].c
previously passed and are meant to protect against accidental
optimisation regressions.

gcc/
PR target/113281
* tree-vect-patterns.cc (vect_recog_over_widening_pattern): Remove
workaround for right shifts.
(vect_truncatable_operation_p): Handle NEGATE_EXPR and BIT_NOT_EXPR.
(vect_determine_precisions_from_range): Be more selective about
which codes can be narrowed based on their input and output ranges.
For shifts, require at least one more bit of precision than the
maximum shift amount.

gcc/testsuite/
PR target/113281
* gcc.dg/vect/pr113281-1.c: New test.
* gcc.dg/vect/pr113281-2.c: Likewise.
* gcc.dg/vect/pr113281-3.c: Likewise.
* gcc.dg/vect/pr113281-4.c: Likewise.
* gcc.dg/vect/pr113281-5.c: Likewise.

(cherry picked from commit 1a8261e047f7a2c2b0afb95716f7615cba718cd1)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr113281-1.c |  17 ++
 gcc/testsuite/gcc.dg/vect/pr113281-2.c |  50 +++
 gcc/testsuite/gcc.dg/vect/pr113281-3.c |  39 
 gcc/testsuite/gcc.dg/vect/pr113281-4.c |  55 +
 gcc/testsuite/gcc.dg/vect/pr113281-5.c |  66 
 gcc/tree-vect-patterns.cc  | 107 -
 6 files changed, 305 insertions(+), 29 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-1.c b/gcc/testsuite/gcc.dg/vect/pr113281-1.c
new file mode 100644
index 000..6df4231cb5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr113281-1.c
@@ -0,0 +1,17 @@
+#include "tree-vect.h"
+
+unsigned char a;
+
+int main() {
+  check_vect ();
+
+  short b = a = 0;
+  for (; a != 19; a++)
+if (a)
+  b = 32872 >> a;
+
+  if (b == 0)
+return 0;
+  else
+return 1;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-2.c b/gcc/testsuite/gcc.dg/vect/pr113281-2.c
new file mode 100644
index 000..3a1170c28b6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr113281-2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+
+#define N 128
+
+short x[N];
+short y[N];
+
+void
+f1 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= y[i];
+}
+
+void
+f2 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 32 ? y[i] : 32);
+}
+
+void
+f3 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 31 ? y[i] : 31);
+}
+
+void
+f4 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] & 31);
+}
+
+void
+f5 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= 0x8000 >> y[i];
+}
+
+void
+f6 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= 0x8000 >> (y[i] & 31);
+}
+
+/* { dg-final { scan-tree-dump-not {can narrow[^\n]+>>} "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-3.c b/gcc/testsuite/gcc.dg/vect/pr113281-3.c
new file mode 100644
index 000..5982dd2d16f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr113281-3.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+
+#define N 128
+
+short x[N];
+short y[N];
+
+void
+f1 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 30 ? y[i] : 30);
+}
+
+void
+f2 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= ((y[i] & 15) + 2);
+}
+
+void
+f3 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 16 ? y[i] : 16);
+}
+
+void
+f4 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] = 32768 >> ((y[i] & 15) + 3);
+}
+
+/* { dg-final { scan-tree-dump {can narrow to signed:31 without loss [^\n]+>>} "vect" } } */
+/* { dg-final { scan-tree-dump {can n

[gcc r11-11465] vect: Fix access size alignment assumption [PR115192]

2024-06-04 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:741ea10418987ac02eb8e680f2946a6e5928eb23

commit r11-11465-g741ea10418987ac02eb8e680f2946a6e5928eb23
Author: Richard Sandiford 
Date:   Tue Jun 4 13:47:34 2024 +0100

vect: Fix access size alignment assumption [PR115192]

create_intersect_range_checks checks whether two access ranges
a and b are alias-free using something equivalent to:

  end_a <= start_b || end_b <= start_a

It has two ways of doing this: a "vanilla" way that calculates
the exact exclusive end pointers, and another way that uses the
last inclusive aligned pointers (and changes the comparisons
accordingly).  The comment for the latter is:

  /* Calculate the minimum alignment shared by all four pointers,
 then arrange for this alignment to be subtracted from the
 exclusive maximum values to get inclusive maximum values.
 This "- min_align" is cumulative with a "+ access_size"
 in the calculation of the maximum values.  In the best
 (and common) case, the two cancel each other out, leaving
 us with an inclusive bound based only on seg_len.  In the
 worst case we're simply adding a smaller number than before.

The problem is that the associated code implicitly assumed that the
access size was a multiple of the pointer alignment, and so the
alignment could be carried over to the exclusive end pointer.

The testcase started failing after g:9fa5b473b5b8e289b6542
because that commit improved the alignment information for
the accesses.

gcc/
PR tree-optimization/115192
* tree-data-ref.c (create_intersect_range_checks): Take the
alignment of the access sizes into account.

gcc/testsuite/
PR tree-optimization/115192
* gcc.dg/vect/pr115192.c: New test.

(cherry picked from commit a0fe4fb1c8d7804515845dd5d2a814b3c7a1ccba)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115192.c | 28 
 gcc/tree-data-ref.c  |  5 -
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115192.c b/gcc/testsuite/gcc.dg/vect/pr115192.c
new file mode 100644
index 000..923d377c1bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115192.c
@@ -0,0 +1,28 @@
+#include "tree-vect.h"
+
+int data[4 * 16 * 16] __attribute__((aligned(16)));
+
+__attribute__((noipa)) void
+foo (__SIZE_TYPE__ n)
+{
+  for (__SIZE_TYPE__ i = 1; i < n; ++i)
+{
+  data[i * n * 4] = data[(i - 1) * n * 4] + 1;
+  data[i * n * 4 + 1] = data[(i - 1) * n * 4 + 1] + 2;
+}
+}
+
+int
+main ()
+{
+  check_vect ();
+
+  data[0] = 10;
+  data[1] = 20;
+
+  foo (3);
+
+  if (data[24] != 12 || data[25] != 24)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index b3dd2f0ca41..d127aba8792 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -73,6 +73,7 @@ along with GCC; see the file COPYING3.  If not see
 
 */
 
+#define INCLUDE_ALGORITHM
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -2629,7 +2630,9 @@ create_intersect_range_checks (class loop *loop, tree *cond_expr,
 Because the maximum values are inclusive, there is an alias
 if the maximum value of one segment is equal to the minimum
 value of the other.  */
-  min_align = MIN (dr_a.align, dr_b.align);
+  min_align = std::min (dr_a.align, dr_b.align);
+  min_align = std::min (min_align, known_alignment (dr_a.access_size));
+  min_align = std::min (min_align, known_alignment (dr_b.access_size));
   cmp_code = LT_EXPR;
 }


[gcc r11-11466] vect: Tighten vect_determine_precisions_from_range [PR113281]

2024-06-04 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:95e4252f53bc0e5b66a200c611fd2c9f6f7f2a62

commit r11-11466-g95e4252f53bc0e5b66a200c611fd2c9f6f7f2a62
Author: Richard Sandiford 
Date:   Tue Jun 4 13:47:35 2024 +0100

vect: Tighten vect_determine_precisions_from_range [PR113281]

This was another PR caused by the way that
vect_determine_precisions_from_range handles shifts.  We tried to
narrow 32768 >> x to a 16-bit shift based on range information for
the inputs and outputs, with vect_recog_over_widening_pattern
(after PR110828) adjusting the shift amount.  But this doesn't
work for the case where x is in [16, 31], since then 32-bit
32768 >> x is a well-defined zero, whereas no well-defined
16-bit 32768 >> y will produce 0.

We could perhaps generate x < 16 ? 32768 >> x : 0 instead,
but since vect_determine_precisions_from_range was never really
supposed to rely on fix-ups, it seems better to fix that instead.

The patch also makes the code more selective about which codes
can be narrowed based on input and output ranges.  This showed
that vect_truncatable_operation_p was missing cases for
BIT_NOT_EXPR (equivalent to BIT_XOR_EXPR of -1) and NEGATE_EXPR
(equivalent to BIT_NOT_EXPR followed by a PLUS_EXPR of 1).

pr113281-1.c is the original testcase.  pr113281-[23].c failed
before the patch due to overly optimistic narrowing.  pr113281-[45].c
previously passed and are meant to protect against accidental
optimisation regressions.

gcc/
PR target/113281
* tree-vect-patterns.c (vect_recog_over_widening_pattern): Remove
workaround for right shifts.
(vect_truncatable_operation_p): Handle NEGATE_EXPR and BIT_NOT_EXPR.
(vect_determine_precisions_from_range): Be more selective about
which codes can be narrowed based on their input and output ranges.
For shifts, require at least one more bit of precision than the
maximum shift amount.

gcc/testsuite/
PR target/113281
* gcc.dg/vect/pr113281-1.c: New test.
* gcc.dg/vect/pr113281-2.c: Likewise.
* gcc.dg/vect/pr113281-3.c: Likewise.
* gcc.dg/vect/pr113281-4.c: Likewise.
* gcc.dg/vect/pr113281-5.c: Likewise.

(cherry picked from commit 1a8261e047f7a2c2b0afb95716f7615cba718cd1)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr113281-1.c |  17 ++
 gcc/testsuite/gcc.dg/vect/pr113281-2.c |  50 +++
 gcc/testsuite/gcc.dg/vect/pr113281-3.c |  39 
 gcc/testsuite/gcc.dg/vect/pr113281-4.c |  55 +
 gcc/testsuite/gcc.dg/vect/pr113281-5.c |  66 
 gcc/tree-vect-patterns.c   | 107 -
 6 files changed, 305 insertions(+), 29 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-1.c b/gcc/testsuite/gcc.dg/vect/pr113281-1.c
new file mode 100644
index 000..6df4231cb5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr113281-1.c
@@ -0,0 +1,17 @@
+#include "tree-vect.h"
+
+unsigned char a;
+
+int main() {
+  check_vect ();
+
+  short b = a = 0;
+  for (; a != 19; a++)
+if (a)
+  b = 32872 >> a;
+
+  if (b == 0)
+return 0;
+  else
+return 1;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-2.c b/gcc/testsuite/gcc.dg/vect/pr113281-2.c
new file mode 100644
index 000..3a1170c28b6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr113281-2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+
+#define N 128
+
+short x[N];
+short y[N];
+
+void
+f1 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= y[i];
+}
+
+void
+f2 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 32 ? y[i] : 32);
+}
+
+void
+f3 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 31 ? y[i] : 31);
+}
+
+void
+f4 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] & 31);
+}
+
+void
+f5 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= 0x8000 >> y[i];
+}
+
+void
+f6 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= 0x8000 >> (y[i] & 31);
+}
+
+/* { dg-final { scan-tree-dump-not {can narrow[^\n]+>>} "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-3.c b/gcc/testsuite/gcc.dg/vect/pr113281-3.c
new file mode 100644
index 000..5982dd2d16f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr113281-3.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+
+#define N 128
+
+short x[N];
+short y[N];
+
+void
+f1 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 30 ? y[i] : 30);
+}
+
+void
+f2 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= ((y[i] & 15) + 2);
+}
+
+void
+f3 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] >>= (y[i] < 16 ? y[i] : 16);
+}
+
+void
+f4 (void)
+{
+  for (int i = 0; i < N; ++i)
+x[i] = 32768 >> ((y[i] & 15) + 3);
+}
+
+/* { dg-final { scan-tree-dump {can narrow to signed:31 without loss [^\n]+>>} "vect" } } */
+/* { dg-final { scan-tree-dump {can na

[gcc r11-11467] rtl-ssa: Extend m_num_defs to a full unsigned int [PR108086]

2024-06-04 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:66d01cc3f4a248ccc471a978f0bfe3615c3f3a30

commit r11-11467-g66d01cc3f4a248ccc471a978f0bfe3615c3f3a30
Author: Richard Sandiford 
Date:   Tue Jun 4 13:47:35 2024 +0100

rtl-ssa: Extend m_num_defs to a full unsigned int [PR108086]

insn_info tried to save space by storing the number of
definitions in a 16-bit bitfield.  The justification was:

  // ...  FIRST_PSEUDO_REGISTER + 1
  // is the maximum number of accesses to hard registers and memory, and
  // MAX_RECOG_OPERANDS is the maximum number of pseudos that can be
  // defined by an instruction, so the number of definitions should fit
  // easily in 16 bits.

But while that reasoning holds (I think) for real instructions,
it doesn't hold for artificial instructions.  I don't think there's
any sensible higher limit we can use, so this patch goes for a full
unsigned int.

gcc/
PR rtl-optimization/108086
* rtl-ssa/insns.h (insn_info): Make m_num_defs a full unsigned int.
Adjust size-related commentary accordingly.

(cherry picked from commit cd41085a37b8288dbdfe0f81027ce04b978578f1)

Diff:
---
 gcc/rtl-ssa/insns.h | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/gcc/rtl-ssa/insns.h b/gcc/rtl-ssa/insns.h
index e4aa6d1d5ce..ab715adc151 100644
--- a/gcc/rtl-ssa/insns.h
+++ b/gcc/rtl-ssa/insns.h
@@ -141,7 +141,7 @@ using insn_call_clobbers_tree = default_splay_tree<insn_call_clobbers_note *>;
 // of "notes", a bit like REG_NOTES for the underlying RTL insns.
 class insn_info
 {
-  // Size: 8 LP64 words.
+  // Size: 9 LP64 words.
   friend class ebb_info;
   friend class function_info;
 
@@ -401,10 +401,11 @@ private:
   // The number of definitions and the number of uses.  FIRST_PSEUDO_REGISTER + 1
   // is the maximum number of accesses to hard registers and memory, and
   // MAX_RECOG_OPERANDS is the maximum number of pseudos that can be
-  // defined by an instruction, so the number of definitions should fit
-  // easily in 16 bits.
+  // defined by an instruction, so the number of definitions in a real
+  // instruction should fit easily in 16 bits.  However, there are no
+  // limits on the number of definitions in artificial instructions.
   unsigned int m_num_uses;
-  unsigned int m_num_defs : 16;
+  unsigned int m_num_defs;
 
   // Flags returned by the accessors above.
   unsigned int m_is_debug_insn : 1;
@@ -414,7 +415,7 @@ private:
   unsigned int m_has_volatile_refs : 1;
 
   // For future expansion.
-  unsigned int m_spare : 11;
+  unsigned int m_spare : 27;
 
   // The program point at which the instruction occurs.
   //
@@ -431,6 +432,9 @@ private:
   // instruction.
   mutable int m_cost_or_uid;
 
+  // On LP64 systems, there's a gap here that could be used for future
+  // expansion.
+
   // The list of notes that have been attached to the instruction.
   insn_note *m_first_note;
 };

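The failure mode being avoided is ordinary bitfield wrap-around; a minimal
illustration (not GCC code):

  #include <cassert>

  struct counter { unsigned int n : 16; };

  int main ()
  {
    counter c {};
    c.n = 70000;                    /* silently reduced modulo 2^16 */
    assert (c.n == 70000 % 65536);  /* 4464, not 70000 */
  }

An artificial instruction with more than 65535 definitions would have seen
m_num_defs wrap in exactly this way.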

[gcc r11-11468] rtl-ssa: Fix -fcompare-debug failure [PR100303]

2024-06-04 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:a1fb76e041740e7dd8cdf71dff3ae7aa31b3ea9b

commit r11-11468-ga1fb76e041740e7dd8cdf71dff3ae7aa31b3ea9b
Author: Richard Sandiford 
Date:   Tue Jun 4 13:47:36 2024 +0100

rtl-ssa: Fix -fcompare-debug failure [PR100303]

This patch fixes an oversight in the handling of debug instructions
in rtl-ssa.  At the moment (and whether this is a good idea or not
remains to be seen), we maintain a linear RPO sequence of definitions
and non-debug uses.  If a register is defined more than once, we use
a degenerate phi to reestablish a previous definition where necessary.

However, debug instructions shouldn't of course affect codegen,
so we can't create a new definition just for them.  In those situations
we instead hang the debug use off the real definition (meaning that
debug uses do not follow a linear order wrt definitions).  Again,
it remains to be seen whether that's a good idea.

The problem in the PR was that we weren't taking this into account
when increasing (or potentially increasing) the live range of an
existing definition.  We'd create the phi even if it would only
be used by debug instructions.

The patch goes for the simple but inelegant approach of passing
a bool to say whether the use is a debug use or not.  I imagine
this area will need some tweaking based on experience in future.

gcc/
PR rtl-optimization/100303
* rtl-ssa/accesses.cc (function_info::make_use_available): Take a
boolean that indicates whether the use will only be used in
debug instructions.  Treat it in the same way that existing
cross-EBB debug references would be handled if so.
(function_info::make_uses_available): Likewise.
* rtl-ssa/functions.h (function_info::make_uses_available): Update
prototype accordingly.
(function_info::make_uses_available): Likewise.
* fwprop.c (try_fwprop_subst): Update call accordingly.

(cherry picked from commit c97351c0cf4872cc0e99e73ed17fb16659fd38b3)

Diff:
---
 gcc/fwprop.c|   3 +-
 gcc/rtl-ssa/accesses.cc |  15 +++--
 gcc/rtl-ssa/functions.h |   7 +-
 gcc/testsuite/g++.dg/torture/pr100303.C | 112 
 4 files changed, 129 insertions(+), 8 deletions(-)

diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index d7203672886..73284a7ae3e 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -606,7 +606,8 @@ try_fwprop_subst (use_info *use, set_info *def,
   if (def_insn->bb () != use_insn->bb ())
 {
   src_uses = crtl->ssa->make_uses_available (attempt, src_uses,
-use_insn->bb ());
+use_insn->bb (),
+use_insn->is_debug_insn ());
   if (!src_uses.is_valid ())
return false;
 }
diff --git a/gcc/rtl-ssa/accesses.cc b/gcc/rtl-ssa/accesses.cc
index af7b568fa98..0621ea22880 100644
--- a/gcc/rtl-ssa/accesses.cc
+++ b/gcc/rtl-ssa/accesses.cc
@@ -1290,7 +1290,10 @@ function_info::insert_temp_clobber (obstack_watermark &watermark,
 }
 
 // A subroutine of make_uses_available.  Try to make USE's definition
-// available at the head of BB.  On success:
+// available at the head of BB.  WILL_BE_DEBUG_USE is true if the
+// definition will be used only in debug instructions.
+//
+// On success:
 //
 // - If the use would have the same def () as USE, return USE.
 //
@@ -1302,7 +1305,8 @@ function_info::insert_temp_clobber (obstack_watermark &watermark,
 //
 // Return null on failure.
 use_info *
-function_info::make_use_available (use_info *use, bb_info *bb)
+function_info::make_use_available (use_info *use, bb_info *bb,
+  bool will_be_debug_use)
 {
   set_info *def = use->def ();
   if (!def)
@@ -1318,7 +1322,7 @@ function_info::make_use_available (use_info *use, bb_info *bb)
   && single_pred (cfg_bb) == use_bb->cfg_bb ()
   && remains_available_on_exit (def, use_bb))
 {
-  if (def->ebb () == bb->ebb ())
+  if (def->ebb () == bb->ebb () || will_be_debug_use)
return use;
 
   resource_info resource = use->resource ();
@@ -1362,7 +1366,8 @@ function_info::make_use_available (use_info *use, bb_info *bb)
 // See the comment above the declaration.
 use_array
 function_info::make_uses_available (obstack_watermark &watermark,
-   use_array uses, bb_info *bb)
+   use_array uses, bb_info *bb,
+   bool will_be_debug_uses)
 {
   unsigned int num_uses = uses.size ();
   if (num_uses == 0)
@@ -1371,7 +1376,7 @@ function_info::make_uses_available (obstack_watermark &watermark,
   auto **new_uses = XOBNEWVEC (watermark, access_info *, num_uses);
   for (unsigned int i = 0; i < num_uses; ++i)
  

[gcc r14-10303] ira: Fix go_through_subreg offset calculation [PR115281]

2024-06-11 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:7d64bc0990381221c480ba15cb9cc950e51e2cef

commit r14-10303-g7d64bc0990381221c480ba15cb9cc950e51e2cef
Author: Richard Sandiford 
Date:   Tue Jun 11 09:58:48 2024 +0100

ira: Fix go_through_subreg offset calculation [PR115281]

go_through_subreg used:

  else if (!can_div_trunc_p (SUBREG_BYTE (x),
 REGMODE_NATURAL_SIZE (GET_MODE (x)), offset))

to calculate the register offset for a pseudo subreg x.  In the blessed
days before poly-int, this was:

*offset = (SUBREG_BYTE (x) / REGMODE_NATURAL_SIZE (GET_MODE (x)));

But I think this is testing the wrong natural size.  If we exclude
paradoxical subregs (which will get an offset of zero regardless),
it's the inner register that is being split, so it should be the
inner register's natural size that we use.

This matters in the testcase because we have an SFmode lowpart
subreg into the last of three variable-sized vectors.  The
SUBREG_BYTE is therefore equal to the size of two variable-sized
vectors.  Dividing by the vector size gives a register offset of 2,
as expected, but dividing by the size of a scalar FPR would give
a variable offset.

I think something similar could happen for fixed-size targets if
REGMODE_NATURAL_SIZE is different for vectors and integers (say),
although that case would trade an ICE for an incorrect offset.

gcc/
PR rtl-optimization/115281
* ira-conflicts.cc (go_through_subreg): Use the natural size of
the inner mode rather than the outer mode.

gcc/testsuite/
PR rtl-optimization/115281
* gfortran.dg/pr115281.f90: New test.

(cherry picked from commit 46d931b3dd31cbba7c3355ada63f155aa24a4e2b)

Diff:
---
 gcc/ira-conflicts.cc   |  3 ++-
 gcc/testsuite/gfortran.dg/pr115281.f90 | 39 ++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/gcc/ira-conflicts.cc b/gcc/ira-conflicts.cc
index 83274c53330..15ac42d8848 100644
--- a/gcc/ira-conflicts.cc
+++ b/gcc/ira-conflicts.cc
@@ -227,8 +227,9 @@ go_through_subreg (rtx x, int *offset)
   if (REGNO (reg) < FIRST_PSEUDO_REGISTER)
 *offset = subreg_regno_offset (REGNO (reg), GET_MODE (reg),
   SUBREG_BYTE (x), GET_MODE (x));
+  /* The offset is always 0 for paradoxical subregs.  */
   else if (!can_div_trunc_p (SUBREG_BYTE (x),
-REGMODE_NATURAL_SIZE (GET_MODE (x)), offset))
+REGMODE_NATURAL_SIZE (GET_MODE (reg)), offset))
 /* Checked by validate_subreg.  We must know at compile time which
inner hard registers are being accessed.  */
 gcc_unreachable ();
diff --git a/gcc/testsuite/gfortran.dg/pr115281.f90 b/gcc/testsuite/gfortran.dg/pr115281.f90
new file mode 100644
index 000..80aa822e745
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr115281.f90
@@ -0,0 +1,39 @@
+! { dg-options "-O3" }
+! { dg-additional-options "-mcpu=neoverse-v1" { target aarch64*-*-* } }
+
+SUBROUTINE fn0(ma, mb, nt)
+  CHARACTER ca
+  REAL r0(ma)
+  INTEGER i0(mb)
+  REAL r1(3,mb)
+  REAL r2(3,mb)
+  REAL r3(3,3)
+  zero=0.0
+  do na = 1, nt
+ nt = i0(na)
+ do l = 1, 3
+r1 (l, na) =   r0 (nt)
+r2(l, na) = zero
+ enddo
+  enddo
+  if (ca  .ne.'z') then
+ do j = 1, 3
+do i = 1, 3
+   r4  = zero
+enddo
+ enddo
+ do na = 1, nt
+do k =  1, 3
+   do l = 1, 3
+  do m = 1, 3
+ r3 = r4 * v
+  enddo
+   enddo
+enddo
+ do i = 1, 3
+   do k = 1, ifn (r3)
+   enddo
+enddo
+ enddo
+ endif
+END


[gcc r15-1244] aarch64: Fix invalid nested subregs [PR115464]

2024-06-13 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:0970ff46ba6330fc80e8736fc05b2eaeeae0b6a0

commit r15-1244-g0970ff46ba6330fc80e8736fc05b2eaeeae0b6a0
Author: Richard Sandiford 
Date:   Thu Jun 13 12:48:21 2024 +0100

aarch64: Fix invalid nested subregs [PR115464]

The testcase extracts one arm_neon.h vector from a pair (one subreg)
and then reinterprets the result as an SVE vector (another subreg).
Each subreg makes sense individually, but we can't fold them together
into a single subreg: it's 32 bytes -> 16 bytes -> 16*N bytes,
but the interpretation of 32 bytes -> 16*N bytes depends on
whether N==1 or N>1.

Since the second subreg makes sense individually, simplify_subreg
should bail out rather than ICE on it.  simplify_gen_subreg will
then do the same (because it already checks validate_subreg).
This leaves simplify_gen_subreg returning null, requiring the
caller to take appropriate action.

I think this is relatively likely to occur elsewhere, so the patch
adds a helper for forcing a subreg, allowing a temporary pseudo to
be created where necessary.

I'll follow up by using force_subreg in more places.  This patch
is intended to be a minimal backportable fix for the PR.

gcc/
PR target/115464
* simplify-rtx.cc (simplify_context::simplify_subreg): Don't try
to fold two subregs together if their relationship isn't known
at compile time.
* explow.h (force_subreg): Declare.
* explow.cc (force_subreg): New function.
* config/aarch64/aarch64-sve-builtins-base.cc
(svset_neonq_impl::expand): Use it instead of simplify_gen_subreg.

gcc/testsuite/
PR target/115464
* gcc.target/aarch64/sve/acle/general/pr115464.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc   |  2 +-
 gcc/explow.cc | 15 +++
 gcc/explow.h  |  2 ++
 gcc/simplify-rtx.cc   |  5 +
 .../gcc.target/aarch64/sve/acle/general/pr115464.c| 13 +
 5 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index dea2f6e6bfc4..823d60040f9a 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1174,7 +1174,7 @@ public:
Advanced SIMD argument as an SVE vector.  */
 if (!BYTES_BIG_ENDIAN
&& is_undef (CALL_EXPR_ARG (e.call_expr, 0)))
-  return simplify_gen_subreg (mode, e.args[1], GET_MODE (e.args[1]), 0);
+  return force_subreg (mode, e.args[1], GET_MODE (e.args[1]), 0);
 
 rtx_vector_builder builder (VNx16BImode, 16, 2);
 for (unsigned int i = 0; i < 16; i++)
diff --git a/gcc/explow.cc b/gcc/explow.cc
index 8e5f6b8e6804..f6843398c4b0 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -745,6 +745,21 @@ force_reg (machine_mode mode, rtx x)
   return temp;
 }
 
+/* Like simplify_gen_subreg, but force OP into a new register if the
+   subreg cannot be formed directly.  */
+
+rtx
+force_subreg (machine_mode outermode, rtx op,
+ machine_mode innermode, poly_uint64 byte)
+{
+  rtx x = simplify_gen_subreg (outermode, op, innermode, byte);
+  if (x)
+return x;
+
+  op = copy_to_mode_reg (innermode, op);
+  return simplify_gen_subreg (outermode, op, innermode, byte);
+}
+
 /* If X is a memory ref, copy its contents to a new temp reg and return
that reg.  Otherwise, return X.  */
 
diff --git a/gcc/explow.h b/gcc/explow.h
index 16aa02cfb689..cbd1fcb7eb34 100644
--- a/gcc/explow.h
+++ b/gcc/explow.h
@@ -42,6 +42,8 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);
Args are mode (in case value is a constant) and the value.  */
 extern rtx force_reg (machine_mode, rtx);
 
+extern rtx force_subreg (machine_mode, rtx, machine_mode, poly_uint64);
+
 /* Return given rtx, copied into a new temp reg if it was in memory.  */
 extern rtx force_not_mem (rtx);
 
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 3ee95f74d3db..35ba54c62921 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -7737,6 +7737,11 @@ simplify_context::simplify_subreg (machine_mode outermode, rtx op,
   poly_uint64 innermostsize = GET_MODE_SIZE (innermostmode);
   rtx newx;
 
+  /* Make sure that the relationship between the two subregs is
+known at compile time.  */
+  if (!ordered_p (outersize, innermostsize))
+   return NULL_RTX;
+
   if (outermode == innermostmode
  && known_eq (byte, 0U)
  && known_eq (SUBREG_BYTE (op), 0))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464.c
new file mode 100644
index ..d728d1325edb
--- /dev/null
++
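
Based on the declaration added to explow.h above, a caller-side sketch
(hypothetical modes and variable names):

  /* Reinterpret the 16-byte value in OP, whose mode is V8HImode,
     as V4SImode.  If the subreg cannot be formed directly,
     force_subreg copies OP into a fresh pseudo first, so this is
     only valid where new pseudos may be created.  */
  rtx narrow = force_subreg (V4SImode, op, V8HImode, 0);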

[gcc r15-1395] Make force_subreg emit nothing on failure

2024-06-18 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:01044471ea39f9be4803c583ef2a946abc657f99

commit r15-1395-g01044471ea39f9be4803c583ef2a946abc657f99
Author: Richard Sandiford 
Date:   Tue Jun 18 12:22:30 2024 +0100

Make force_subreg emit nothing on failure

While adding more uses of force_subreg, I realised that it should
be more careful to emit no instructions on failure.  This kind of
failure should be very rare, so I don't think it's a case worth
optimising for.

gcc/
* explow.cc (force_subreg): Emit no instructions on failure.

Diff:
---
 gcc/explow.cc | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/explow.cc b/gcc/explow.cc
index f6843398c4b0..bd93c8780649 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -756,8 +756,12 @@ force_subreg (machine_mode outermode, rtx op,
   if (x)
 return x;
 
+  auto *start = get_last_insn ();
   op = copy_to_mode_reg (innermode, op);
-  return simplify_gen_subreg (outermode, op, innermode, byte);
+  rtx res = simplify_gen_subreg (outermode, op, innermode, byte);
+  if (!res)
+delete_insns_since (start);
+  return res;
 }
 
 /* If X is a memory ref, copy its contents to a new temp reg and return

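With this change, callers get a clean failure: a null result and no stray
instructions left in the stream.  The expected calling pattern is therefore
(illustrative):

  rtx res = force_subreg (outermode, op, innermode, byte);
  if (!res)
    /* Nothing was emitted, so simply fall back to another
       strategy (or report failure to the caller).  */
    return false;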

[gcc r15-1396] aarch64: Use force_subreg in more places

2024-06-18 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:1474a8eead4ab390e59ee014befa8c40346679f4

commit r15-1396-g1474a8eead4ab390e59ee014befa8c40346679f4
Author: Richard Sandiford 
Date:   Tue Jun 18 12:22:30 2024 +0100

aarch64: Use force_subreg in more places

This patch makes the aarch64 code use force_subreg instead of
simplify_gen_subreg in more places.  The criteria were:

(1) The code is obviously specific to expand (where new pseudos
can be created).

(2) The value is obviously an rvalue rather than an lvalue.

(3) The offset wasn't a simple lowpart or highpart calculation;
a later patch will deal with those.

gcc/
* config/aarch64/aarch64-builtins.cc (aarch64_expand_fcmla_builtin):
Use force_subreg instead of simplify_gen_subreg.
* config/aarch64/aarch64-simd.md (ctz2): Likewise.
* config/aarch64/aarch64-sve-builtins-base.cc
(svget_impl::expand): Likewise.
(svget_neonq_impl::expand): Likewise.
* config/aarch64/aarch64-sve-builtins-functions.h
(multireg_permute::expand): Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc  | 4 ++--
 gcc/config/aarch64/aarch64-simd.md  | 4 ++--
 gcc/config/aarch64/aarch64-sve-builtins-base.cc | 8 +++-
 gcc/config/aarch64/aarch64-sve-builtins-functions.h | 6 +++---
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index d589e59defc2..7d827cbc2ac0 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2592,12 +2592,12 @@ aarch64_expand_fcmla_builtin (tree exp, rtx target, int 
fcode)
   rtx temp2 = gen_reg_rtx (DImode);
   temp1 = simplify_gen_subreg (d->mode, op2, quadmode,
   subreg_lowpart_offset (d->mode, quadmode));
-  temp1 = simplify_gen_subreg (V2DImode, temp1, d->mode, 0);
+  temp1 = force_subreg (V2DImode, temp1, d->mode, 0);
   if (BYTES_BIG_ENDIAN)
emit_insn (gen_aarch64_get_lanev2di (temp2, temp1, const0_rtx));
   else
emit_insn (gen_aarch64_get_lanev2di (temp2, temp1, const1_rtx));
-  op2 = simplify_gen_subreg (d->mode, temp2, GET_MODE (temp2), 0);
+  op2 = force_subreg (d->mode, temp2, GET_MODE (temp2), 0);
 
   /* And recalculate the index.  */
   lane -= nunits / 4;
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 0bb39091a385..01b084d8ccb5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -389,8 +389,8 @@
   "TARGET_SIMD"
   {
  emit_insn (gen_bswap2 (operands[0], operands[1]));
- rtx op0_castsi2qi = simplify_gen_subreg(mode, operands[0],
-mode, 0);
+ rtx op0_castsi2qi = force_subreg (mode, operands[0],
+  mode, 0);
  emit_insn (gen_aarch64_rbit (op0_castsi2qi, op0_castsi2qi));
  emit_insn (gen_clz2 (operands[0], operands[0]));
  DONE;
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 823d60040f9a..999320371247 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1121,9 +1121,8 @@ public:
   expand (function_expander &e) const override
   {
 /* Fold the access into a subreg rvalue.  */
-return simplify_gen_subreg (e.vector_mode (0), e.args[0],
-   GET_MODE (e.args[0]),
-   INTVAL (e.args[1]) * BYTES_PER_SVE_VECTOR);
+return force_subreg (e.vector_mode (0), e.args[0], GET_MODE (e.args[0]),
+INTVAL (e.args[1]) * BYTES_PER_SVE_VECTOR);
   }
 };
 
@@ -1157,8 +1156,7 @@ public:
e.add_fixed_operand (indices);
return e.generate_insn (icode);
   }
-return simplify_gen_subreg (e.result_mode (), e.args[0],
-   GET_MODE (e.args[0]), 0);
+return force_subreg (e.result_mode (), e.args[0], GET_MODE (e.args[0]), 0);
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-functions.h 
b/gcc/config/aarch64/aarch64-sve-builtins-functions.h
index 3b8e575e98e7..7d06a57ff834 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-functions.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins-functions.h
@@ -639,9 +639,9 @@ public:
   {
machine_mode elt_mode = e.vector_mode (0);
rtx arg = e.args[0];
-   e.args[0] = simplify_gen_subreg (elt_mode, arg, GET_MODE (arg), 0);
-   e.args.safe_push (simplify_gen_subreg (elt_mode, arg, GET_MODE (arg),
-  GET_MODE_SIZE (elt_mode)));
+   e.args[0] = force_subreg (elt_mode, arg, GET_MODE (arg), 0);
+   e.args.safe_push (force_subreg (elt_mode, arg, GET_MODE (arg),
+				   GET_MODE_SIZE (elt_mode)));
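
For contrast, a hedged sketch of the difference between the two helpers
(the modes are chosen only for illustration):

  /* Both request the same 16-byte V2DI view of a V4SI value.  */
  rtx a = simplify_gen_subreg (V2DImode, x, V4SImode, 0);
  /* A may be null if the subreg cannot be represented directly.  */
  rtx b = force_subreg (V2DImode, x, V4SImode, 0);
  /* B copies X to a fresh pseudo and retries first, which is only
     valid during expand.  */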

[gcc r15-1397] Make more use of force_subreg

2024-06-18 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:d4047da6a070175aae7121c739d1cad6b08ff4b2

commit r15-1397-gd4047da6a070175aae7121c739d1cad6b08ff4b2
Author: Richard Sandiford 
Date:   Tue Jun 18 12:22:30 2024 +0100

Make more use of force_subreg

This patch makes target-independent code use force_subreg instead
of simplify_gen_subreg in some places.  The criteria were:

(1) The code is obviously specific to expand (where new pseudos
can be created), or at least would be invalid to call when
!can_create_pseudo_p () and temporaries are needed.

(2) The value is obviously an rvalue rather than an lvalue.

(3) The offset wasn't a simple lowpart or highpart calculation;
a later patch will deal with those.

Doing this should reduce the likelihood of bugs like PR115464
occurring in other situations.

gcc/
* expmed.cc (store_bit_field_using_insv): Use force_subreg
instead of simplify_gen_subreg.
(store_bit_field_1): Likewise.
(extract_bit_field_as_subreg): Likewise.
(extract_integral_bit_field): Likewise.
(emit_store_flag_1): Likewise.
* expr.cc (convert_move): Likewise.
(convert_modes): Likewise.
(emit_group_load_1): Likewise.
(emit_group_store): Likewise.
(expand_assignment): Likewise.

Diff:
---
 gcc/expmed.cc | 22 --
 gcc/expr.cc   | 27 ---
 2 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 9ba01695f538..1f68e7be721d 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -695,13 +695,7 @@ store_bit_field_using_insv (const extraction_insn *insv, 
rtx op0,
 if we must narrow it, be sure we do it correctly.  */
 
  if (GET_MODE_SIZE (value_mode) < GET_MODE_SIZE (op_mode))
-   {
- tmp = simplify_subreg (op_mode, value1, value_mode, 0);
- if (! tmp)
-   tmp = simplify_gen_subreg (op_mode,
-  force_reg (value_mode, value1),
-  value_mode, 0);
-   }
+   tmp = force_subreg (op_mode, value1, value_mode, 0);
  else
{
  if (targetm.mode_rep_extended (op_mode, value_mode) != UNKNOWN)
@@ -806,7 +800,7 @@ store_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, 
poly_uint64 bitnum,
   if (known_eq (bitnum, 0U)
  && known_eq (bitsize, GET_MODE_BITSIZE (GET_MODE (op0
{
- sub = simplify_gen_subreg (GET_MODE (op0), value, fieldmode, 0);
+ sub = force_subreg (GET_MODE (op0), value, fieldmode, 0);
  if (sub)
{
  if (reverse)
@@ -1633,7 +1627,7 @@ extract_bit_field_as_subreg (machine_mode mode, rtx op0,
   && known_eq (bitsize, GET_MODE_BITSIZE (mode))
   && lowpart_bit_field_p (bitnum, bitsize, op0_mode)
   && TRULY_NOOP_TRUNCATION_MODES_P (mode, op0_mode))
-return simplify_gen_subreg (mode, op0, op0_mode, bytenum);
+return force_subreg (mode, op0, op0_mode, bytenum);
   return NULL_RTX;
 }
 
@@ -2000,11 +1994,11 @@ extract_integral_bit_field (rtx op0, 
opt_scalar_int_mode op0_mode,
  return convert_extracted_bit_field (target, mode, tmode, unsignedp);
}
   /* If OP0 is a hard register, copy it to a pseudo before calling
-simplify_gen_subreg.  */
+force_subreg.  */
   if (REG_P (op0) && HARD_REGISTER_P (op0))
op0 = copy_to_reg (op0);
-  op0 = simplify_gen_subreg (word_mode, op0, op0_mode.require (),
-bitnum / BITS_PER_WORD * UNITS_PER_WORD);
+  op0 = force_subreg (word_mode, op0, op0_mode.require (),
+ bitnum / BITS_PER_WORD * UNITS_PER_WORD);
   op0_mode = word_mode;
   bitnum %= BITS_PER_WORD;
 }
@@ -5774,8 +5768,8 @@ emit_store_flag_1 (rtx target, enum rtx_code code, rtx 
op0, rtx op1,
 
  /* Do a logical OR or AND of the two words and compare the
 result.  */
- op00 = simplify_gen_subreg (word_mode, op0, int_mode, 0);
- op01 = simplify_gen_subreg (word_mode, op0, int_mode, UNITS_PER_WORD);
+ op00 = force_subreg (word_mode, op0, int_mode, 0);
+ op01 = force_subreg (word_mode, op0, int_mode, UNITS_PER_WORD);
  tem = expand_binop (word_mode,
  op1 == const0_rtx ? ior_optab : and_optab,
  op00, op01, NULL_RTX, unsignedp,
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 9cecc1758f5c..31a7346e33f0 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -301,7 +301,7 @@ convert_move (rtx to, rtx from, int unsignedp)
GET_MODE_BITSIZE (to_mode)));
 
   if (VECTOR_MODE_P (to_mode))
-   from = simplify_gen_subreg (to_mode, from, GET_MODE (from), 0);
+   from = force_subreg (to_mode, from, GET_MODE (from), 0);
   else
   

[gcc r15-1398] Add force_lowpart_subreg

2024-06-18 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:5f40d1c0cc6ce91ef28d326b8707b3f05e6f239c

commit r15-1398-g5f40d1c0cc6ce91ef28d326b8707b3f05e6f239c
Author: Richard Sandiford 
Date:   Tue Jun 18 12:22:31 2024 +0100

Add force_lowpart_subreg

optabs had a local function called lowpart_subreg_maybe_copy
that is very similar to the lowpart version of force_subreg.
This patch adds a force_lowpart_subreg wrapper around
force_subreg and uses it in optabs.cc.

The only difference between the old and new functions is that
the old one asserted success while the new one doesn't.
It's common not to assert elsewhere when taking subregs;
normally a null result is enough.

Later patches will make more use of the new function.

gcc/
* explow.h (force_lowpart_subreg): Declare.
* explow.cc (force_lowpart_subreg): New function.
* optabs.cc (lowpart_subreg_maybe_copy): Delete.
(expand_absneg_bit): Use force_lowpart_subreg instead of
lowpart_subreg_maybe_copy.
(expand_copysign_bit): Likewise.

Diff:
---
 gcc/explow.cc | 14 ++
 gcc/explow.h  |  1 +
 gcc/optabs.cc | 24 ++--
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/gcc/explow.cc b/gcc/explow.cc
index bd93c8780649..2a91cf76ea62 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -764,6 +764,20 @@ force_subreg (machine_mode outermode, rtx op,
   return res;
 }
 
+/* Try to return an rvalue expression for the OUTERMODE lowpart of OP,
+   which has mode INNERMODE.  Allow OP to be forced into a new register
+   if necessary.
+
+   Return null on failure.  */
+
+rtx
+force_lowpart_subreg (machine_mode outermode, rtx op,
+ machine_mode innermode)
+{
+  auto byte = subreg_lowpart_offset (outermode, innermode);
+  return force_subreg (outermode, op, innermode, byte);
+}
+
 /* If X is a memory ref, copy its contents to a new temp reg and return
that reg.  Otherwise, return X.  */
 
diff --git a/gcc/explow.h b/gcc/explow.h
index cbd1fcb7eb34..dd654649b068 100644
--- a/gcc/explow.h
+++ b/gcc/explow.h
@@ -43,6 +43,7 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);
 extern rtx force_reg (machine_mode, rtx);
 
 extern rtx force_subreg (machine_mode, rtx, machine_mode, poly_uint64);
+extern rtx force_lowpart_subreg (machine_mode, rtx, machine_mode);
 
 /* Return given rtx, copied into a new temp reg if it was in memory.  */
 extern rtx force_not_mem (rtx);
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index c54d275b8b7a..d569742beea9 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -3096,26 +3096,6 @@ expand_ffs (scalar_int_mode mode, rtx op0, rtx target)
   return 0;
 }
 
-/* Extract the OMODE lowpart from VAL, which has IMODE.  Under certain
-   conditions, VAL may already be a SUBREG against which we cannot generate
-   a further SUBREG.  In this case, we expect forcing the value into a
-   register will work around the situation.  */
-
-static rtx
-lowpart_subreg_maybe_copy (machine_mode omode, rtx val,
-  machine_mode imode)
-{
-  rtx ret;
-  ret = lowpart_subreg (omode, val, imode);
-  if (ret == NULL)
-{
-  val = force_reg (imode, val);
-  ret = lowpart_subreg (omode, val, imode);
-  gcc_assert (ret != NULL);
-}
-  return ret;
-}
-
 /* Expand a floating point absolute value or negation operation via a
logical operation on the sign bit.  */
 
@@ -3204,7 +3184,7 @@ expand_absneg_bit (enum rtx_code code, scalar_float_mode 
mode,
   gen_lowpart (imode, op0),
   immed_wide_int_const (mask, imode),
   gen_lowpart (imode, target), 1, OPTAB_LIB_WIDEN);
-  target = lowpart_subreg_maybe_copy (mode, temp, imode);
+  target = force_lowpart_subreg (mode, temp, imode);
 
   set_dst_reg_note (get_last_insn (), REG_EQUAL,
gen_rtx_fmt_e (code, mode, copy_rtx (op0)),
@@ -4043,7 +4023,7 @@ expand_copysign_bit (scalar_float_mode mode, rtx op0, rtx 
op1, rtx target,
 
   temp = expand_binop (imode, ior_optab, op0, op1,
   gen_lowpart (imode, target), 1, OPTAB_LIB_WIDEN);
-  target = lowpart_subreg_maybe_copy (mode, temp, imode);
+  target = force_lowpart_subreg (mode, temp, imode);
 }
 
   return target;

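A hedged usage sketch (hypothetical, not from the patch): unlike the
deleted lowpart_subreg_maybe_copy, the new helper reports failure with
a null return rather than asserting, so callers should check it:

  rtx lo = force_lowpart_subreg (SImode, val, DImode);
  if (!lo)
    return NULL_RTX;  /* nothing was emitted */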

[gcc r15-1399] aarch64: Add some uses of force_lowpart_subreg

2024-06-18 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:6bd4fbae45d11795a9a6f54b866308d4d7134def

commit r15-1399-g6bd4fbae45d11795a9a6f54b866308d4d7134def
Author: Richard Sandiford 
Date:   Tue Jun 18 12:22:31 2024 +0100

aarch64: Add some uses of force_lowpart_subreg

This patch makes more use of force_lowpart_subreg, similarly
to the recent patch for force_subreg.  The criteria were:

(1) The code is obviously specific to expand (where new pseudos
can be created).

(2) The value is obviously an rvalue rather than an lvalue.

gcc/
PR target/115464
* config/aarch64/aarch64-builtins.cc (aarch64_expand_fcmla_builtin)
(aarch64_expand_rwsr_builtin): Use force_lowpart_subreg instead of
simplify_gen_subreg and lowpart_subreg.
* config/aarch64/aarch64-sve-builtins-base.cc
(svset_neonq_impl::expand): Likewise.
* config/aarch64/aarch64-sve-builtins-sme.cc
(add_load_store_slice_operand): Likewise.
* config/aarch64/aarch64.cc (aarch64_sve_reinterpret): Likewise.
(aarch64_addti_scratch_regs, aarch64_subvti_scratch_regs): Likewise.

gcc/testsuite/
PR target/115464
* gcc.target/aarch64/sve/acle/general/pr115464_2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc | 11 +--
 gcc/config/aarch64/aarch64-sve-builtins-base.cc|  2 +-
 gcc/config/aarch64/aarch64-sve-builtins-sme.cc |  2 +-
 gcc/config/aarch64/aarch64.cc  | 14 +-
 .../gcc.target/aarch64/sve/acle/general/pr115464_2.c   | 11 +++
 5 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 7d827cbc2ac0..30669f8aa182 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2579,8 +2579,7 @@ aarch64_expand_fcmla_builtin (tree exp, rtx target, int 
fcode)
   int lane = INTVAL (lane_idx);
 
   if (lane < nunits / 4)
-op2 = simplify_gen_subreg (d->mode, op2, quadmode,
-  subreg_lowpart_offset (d->mode, quadmode));
+op2 = force_lowpart_subreg (d->mode, op2, quadmode);
   else
 {
   /* Select the upper 64 bits, either a V2SF or V4HF, this however
@@ -2590,8 +2589,7 @@ aarch64_expand_fcmla_builtin (tree exp, rtx target, int 
fcode)
 gen_highpart_mode generates code that isn't optimal.  */
   rtx temp1 = gen_reg_rtx (d->mode);
   rtx temp2 = gen_reg_rtx (DImode);
-  temp1 = simplify_gen_subreg (d->mode, op2, quadmode,
-  subreg_lowpart_offset (d->mode, quadmode));
+  temp1 = force_lowpart_subreg (d->mode, op2, quadmode);
   temp1 = force_subreg (V2DImode, temp1, d->mode, 0);
   if (BYTES_BIG_ENDIAN)
emit_insn (gen_aarch64_get_lanev2di (temp2, temp1, const0_rtx));
@@ -2836,7 +2834,7 @@ aarch64_expand_rwsr_builtin (tree exp, rtx target, int 
fcode)
case AARCH64_WSR64:
case AARCH64_WSRF64:
case AARCH64_WSR128:
- subreg = lowpart_subreg (sysreg_mode, input_val, mode);
+ subreg = force_lowpart_subreg (sysreg_mode, input_val, mode);
  break;
case AARCH64_WSRF:
  subreg = gen_lowpart_SUBREG (SImode, input_val);
@@ -2871,7 +2869,8 @@ aarch64_expand_rwsr_builtin (tree exp, rtx target, int 
fcode)
 case AARCH64_RSR64:
 case AARCH64_RSRF64:
 case AARCH64_RSR128:
-  return lowpart_subreg (TYPE_MODE (TREE_TYPE (exp)), target, sysreg_mode);
+  return force_lowpart_subreg (TYPE_MODE (TREE_TYPE (exp)),
+  target, sysreg_mode);
 case AARCH64_RSRF:
   subreg = gen_lowpart_SUBREG (SImode, target);
   return gen_lowpart_SUBREG (SFmode, subreg);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 999320371247..aa26370d397f 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1183,7 +1183,7 @@ public:
 if (BYTES_BIG_ENDIAN)
   return e.use_exact_insn (code_for_aarch64_sve_set_neonq (mode));
 insn_code icode = code_for_vcond_mask (mode, mode);
-e.args[1] = lowpart_subreg (mode, e.args[1], GET_MODE (e.args[1]));
+e.args[1] = force_lowpart_subreg (mode, e.args[1], GET_MODE (e.args[1]));
 e.add_output_operand (icode);
 e.add_input_operand (icode, e.args[1]);
 e.add_input_operand (icode, e.args[0]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sme.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-sme.cc
index f4c91bcbb95d..b66b35ae60b7 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sme.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sme.cc
@@ -112,7 +112,7 @@ add_load_store_slice_operand (function_expander &e, 
insn_code icode,
   rtx base = e.args[argno];
   if (e.mode_suffix_

[gcc r15-1401] Add force_highpart_subreg

2024-06-18 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:e0700fbe35286d31fe64782b255c8d2caec673dc

commit r15-1401-ge0700fbe35286d31fe64782b255c8d2caec673dc
Author: Richard Sandiford 
Date:   Tue Jun 18 12:22:32 2024 +0100

Add force_highpart_subreg

This patch adds a force_highpart_subreg to go along with the
recently added force_lowpart_subreg.

gcc/
* explow.h (force_highpart_subreg): Declare.
* explow.cc (force_highpart_subreg): New function.
* builtins.cc (expand_builtin_issignaling): Use it.
* expmed.cc (emit_store_flag_1): Likewise.

Diff:
---
 gcc/builtins.cc | 15 ---
 gcc/explow.cc   | 14 ++
 gcc/explow.h|  1 +
 gcc/expmed.cc   |  4 +---
 4 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index bde517b639e8..d467d1697b45 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -2835,9 +2835,7 @@ expand_builtin_issignaling (tree exp, rtx target)
 it is, working on the DImode high part is usually better.  */
  if (!MEM_P (temp))
{
- if (rtx t = simplify_gen_subreg (imode, temp, fmode,
-  subreg_highpart_offset (imode,
-  fmode)))
+ if (rtx t = force_highpart_subreg (imode, temp, fmode))
hi = t;
  else
{
@@ -2845,9 +2843,7 @@ expand_builtin_issignaling (tree exp, rtx target)
  if (int_mode_for_mode (fmode).exists (&imode2))
{
  rtx temp2 = gen_lowpart (imode2, temp);
- poly_uint64 off = subreg_highpart_offset (imode, imode2);
- if (rtx t = simplify_gen_subreg (imode, temp2,
-  imode2, off))
+ if (rtx t = force_highpart_subreg (imode, temp2, imode2))
hi = t;
}
}
@@ -2938,8 +2934,7 @@ expand_builtin_issignaling (tree exp, rtx target)
   it is, working on DImode parts is usually better.  */
if (!MEM_P (temp))
  {
-   hi = simplify_gen_subreg (imode, temp, fmode,
- subreg_highpart_offset (imode, fmode));
+   hi = force_highpart_subreg (imode, temp, fmode);
lo = force_lowpart_subreg (imode, temp, fmode);
if (!hi || !lo)
  {
@@ -2947,9 +2942,7 @@ expand_builtin_issignaling (tree exp, rtx target)
if (int_mode_for_mode (fmode).exists (&imode2))
  {
rtx temp2 = gen_lowpart (imode2, temp);
-   hi = simplify_gen_subreg (imode, temp2, imode2,
- subreg_highpart_offset (imode,
- imode2));
+   hi = force_highpart_subreg (imode, temp2, imode2);
lo = force_lowpart_subreg (imode, temp2, imode2);
  }
  }
diff --git a/gcc/explow.cc b/gcc/explow.cc
index 2a91cf76ea62..b4a0df89bc36 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -778,6 +778,20 @@ force_lowpart_subreg (machine_mode outermode, rtx op,
   return force_subreg (outermode, op, innermode, byte);
 }
 
+/* Try to return an rvalue expression for the OUTERMODE highpart of OP,
+   which has mode INNERMODE.  Allow OP to be forced into a new register
+   if necessary.
+
+   Return null on failure.  */
+
+rtx
+force_highpart_subreg (machine_mode outermode, rtx op,
+  machine_mode innermode)
+{
+  auto byte = subreg_highpart_offset (outermode, innermode);
+  return force_subreg (outermode, op, innermode, byte);
+}
+
 /* If X is a memory ref, copy its contents to a new temp reg and return
that reg.  Otherwise, return X.  */
 
diff --git a/gcc/explow.h b/gcc/explow.h
index dd654649b068..de89e9e2933e 100644
--- a/gcc/explow.h
+++ b/gcc/explow.h
@@ -44,6 +44,7 @@ extern rtx force_reg (machine_mode, rtx);
 
 extern rtx force_subreg (machine_mode, rtx, machine_mode, poly_uint64);
 extern rtx force_lowpart_subreg (machine_mode, rtx, machine_mode);
+extern rtx force_highpart_subreg (machine_mode, rtx, machine_mode);
 
 /* Return given rtx, copied into a new temp reg if it was in memory.  */
 extern rtx force_not_mem (rtx);
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 1f68e7be721d..3b9475f5aa0b 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -5784,9 +5784,7 @@ emit_store_flag_1 (rtx target, enum rtx_code code, rtx 
op0, rtx op1,
  rtx op0h;
 
  /* If testing the sign bit, can just test on high word.  */
- op0h = simplify_gen_subreg (word_mode, op0, int_mode,
- subreg_highpart_offset (word_mode,
- int_mode));
+ op0h = force_highpart_subreg (word_mode, op0, int_mode);

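For intuition, a hedged sketch of what the new helper computes
(little-endian layout assumed; the modes are illustrative):

  /* For a TImode value split into DImode halves,
     subreg_highpart_offset (DImode, TImode) is 8, so this...  */
  rtx hi = force_highpart_subreg (DImode, val, TImode);
  /* ...is equivalent to force_subreg (DImode, val, TImode, 8)
     on a little-endian target.  */
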
[gcc r15-1402] aarch64: Add some uses of force_highpart_subreg

2024-06-18 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:c67a9a9c8e934234b640a613b0ae3c15e7fa9733

commit r15-1402-gc67a9a9c8e934234b640a613b0ae3c15e7fa9733
Author: Richard Sandiford 
Date:   Tue Jun 18 12:22:33 2024 +0100

aarch64: Add some uses of force_highpart_subreg

This patch adds uses of force_highpart_subreg to places that
already use force_lowpart_subreg.

gcc/
* config/aarch64/aarch64.cc (aarch64_addti_scratch_regs): Use
force_highpart_subreg instead of gen_highpart and 
simplify_gen_subreg.
(aarch64_subvti_scratch_regs): Likewise.

Diff:
---
 gcc/config/aarch64/aarch64.cc | 17 -
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c952a7cdefec..026f8627a893 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26873,19 +26873,12 @@ aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx 
*low_dest,
   *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
   *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
   *high_dest = gen_reg_rtx (DImode);
-  *high_in1 = gen_highpart (DImode, op1);
-  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
-  subreg_highpart_offset (DImode, TImode));
+  *high_in1 = force_highpart_subreg (DImode, op1, TImode);
+  *high_in2 = force_highpart_subreg (DImode, op2, TImode);
 }
 
 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
 
-   This function differs from 'arch64_addti_scratch_regs' in that
-   OP1 can be an immediate constant (zero). We must call
-   subreg_highpart_offset with DImode and TImode arguments, otherwise
-   VOIDmode will be used for the const_int which generates an internal
-   error from subreg_size_highpart_offset which does not expect a size of zero.
-
OP1 represents the TImode destination operand 1
OP2 represents the TImode destination operand 2
LOW_DEST represents the low half (DImode) of TImode operand 0
@@ -26907,10 +26900,8 @@ aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx 
*low_dest,
   *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
   *high_dest = gen_reg_rtx (DImode);
 
-  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
-  subreg_highpart_offset (DImode, TImode));
-  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
-  subreg_highpart_offset (DImode, TImode));
+  *high_in1 = force_highpart_subreg (DImode, op1, TImode);
+  *high_in2 = force_highpart_subreg (DImode, op2, TImode);
 }
 
 /* Generate RTL for 128-bit (TImode) subtraction with overflow.


[gcc r15-1400] Make more use of force_lowpart_subreg

2024-06-18 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:a573ed4367ee685fb1bc50b79239b8b4b69872ee

commit r15-1400-ga573ed4367ee685fb1bc50b79239b8b4b69872ee
Author: Richard Sandiford 
Date:   Tue Jun 18 12:22:32 2024 +0100

Make more use of force_lowpart_subreg

This patch makes target-independent code use force_lowpart_subreg
instead of simplify_gen_subreg and lowpart_subreg in some places.
The criteria were:

(1) The code is obviously specific to expand (where new pseudos
can be created), or at least would be invalid to call when
!can_create_pseudo_p () and temporaries are needed.

(2) The value is obviously an rvalue rather than an lvalue.

Doing this should reduce the likelihood of bugs like PR115464
occurring in other situations.

gcc/
* builtins.cc (expand_builtin_issignaling): Use force_lowpart_subreg
instead of simplify_gen_subreg and lowpart_subreg.
* expr.cc (convert_mode_scalar, expand_expr_real_2): Likewise.
* optabs.cc (expand_doubleword_mod): Likewise.

Diff:
---
 gcc/builtins.cc |  7 ++-
 gcc/expr.cc | 17 +
 gcc/optabs.cc   |  2 +-
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 5b5307c67b8c..bde517b639e8 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -2940,8 +2940,7 @@ expand_builtin_issignaling (tree exp, rtx target)
  {
hi = simplify_gen_subreg (imode, temp, fmode,
  subreg_highpart_offset (imode, fmode));
-   lo = simplify_gen_subreg (imode, temp, fmode,
- subreg_lowpart_offset (imode, fmode));
+   lo = force_lowpart_subreg (imode, temp, fmode);
if (!hi || !lo)
  {
scalar_int_mode imode2;
@@ -2951,9 +2950,7 @@ expand_builtin_issignaling (tree exp, rtx target)
hi = simplify_gen_subreg (imode, temp2, imode2,
  subreg_highpart_offset (imode,
  imode2));
-   lo = simplify_gen_subreg (imode, temp2, imode2,
- subreg_lowpart_offset (imode,
-imode2));
+   lo = force_lowpart_subreg (imode, temp2, imode2);
  }
  }
if (!hi || !lo)
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 31a7346e33f0..ffbac5136923 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -423,7 +423,8 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)
0).exists (&toi_mode))
{
  start_sequence ();
- rtx fromi = lowpart_subreg (fromi_mode, from, from_mode);
+ rtx fromi = force_lowpart_subreg (fromi_mode, from,
+   from_mode);
  rtx tof = NULL_RTX;
  if (fromi)
{
@@ -443,7 +444,7 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)
  NULL_RTX, 1);
  if (toi)
{
- tof = lowpart_subreg (to_mode, toi, toi_mode);
+ tof = force_lowpart_subreg (to_mode, toi, toi_mode);
  if (tof)
emit_move_insn (to, tof);
}
@@ -475,7 +476,7 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)
0).exists (&toi_mode))
{
  start_sequence ();
- rtx fromi = lowpart_subreg (fromi_mode, from, from_mode);
+ rtx fromi = force_lowpart_subreg (fromi_mode, from, from_mode);
  rtx tof = NULL_RTX;
  do
{
@@ -510,11 +511,11 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)
  temp4, shift, NULL_RTX, 1);
  if (!temp5)
break;
- rtx temp6 = lowpart_subreg (toi_mode, temp5, fromi_mode);
+ rtx temp6 = force_lowpart_subreg (toi_mode, temp5,
+   fromi_mode);
  if (!temp6)
break;
- tof = lowpart_subreg (to_mode, force_reg (toi_mode, temp6),
-   toi_mode);
+ tof = force_lowpart_subreg (to_mode, temp6, toi_mode);
  if (tof)
emit_move_insn (to, tof);
}
@@ -9784,9 +9785,9 @@ expand_expr_real_2 (const_sepops ops, rtx target, 
machine_mode tmode,
inner_mode = TYPE_MODE (inner_type);
 
  if (modifier == EXPAND_INITIALIZER)
-   op0 = lowpart_subreg (mode, op0, inner_mode);
+

[gcc r15-1531] sh: Make *minus_plus_one work after RA

2024-06-21 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:f49267e1636872128249431e9e5d20c0908b7e8e

commit r15-1531-gf49267e1636872128249431e9e5d20c0908b7e8e
Author: Richard Sandiford 
Date:   Fri Jun 21 09:52:42 2024 +0100

sh: Make *minus_plus_one work after RA

*minus_plus_one had no constraints, which meant that it could be
matched after RA with operands 0, 1 and 2 all being different.
The associated split instead requires operand 0 to be tied to
operand 1.

gcc/
* config/sh/sh.md (*minus_plus_one): Add constraints.

Diff:
---
 gcc/config/sh/sh.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md
index 92a1efeb811..9491b49e55b 100644
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -1642,9 +1642,9 @@
 ;; matched.  Split this up into a simple sub add sequence, as this will save
 ;; us one sett insn.
 (define_insn_and_split "*minus_plus_one"
-  [(set (match_operand:SI 0 "arith_reg_dest" "")
-   (plus:SI (minus:SI (match_operand:SI 1 "arith_reg_operand" "")
-  (match_operand:SI 2 "arith_reg_operand" ""))
+  [(set (match_operand:SI 0 "arith_reg_dest" "=r")
+   (plus:SI (minus:SI (match_operand:SI 1 "arith_reg_operand" "0")
+  (match_operand:SI 2 "arith_reg_operand" "r"))
 (const_int 1)))]
   "TARGET_SH1"
   "#"


[gcc r15-1545] rtl-ssa: Don't cost no-op moves

2024-06-21 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:4a43a06c7b2bcc3402ac69d6e5ce7b8008acc69a

commit r15-1545-g4a43a06c7b2bcc3402ac69d6e5ce7b8008acc69a
Author: Richard Sandiford 
Date:   Fri Jun 21 15:40:10 2024 +0100

rtl-ssa: Don't cost no-op moves

No-op moves are given the code NOOP_MOVE_INSN_CODE if we plan
to delete them later.  Such insns shouldn't be costed, partly
because they're going to disappear, and partly because targets
won't recognise the insn code.

gcc/
* rtl-ssa/changes.cc (rtl_ssa::changes_are_worthwhile): Don't
cost no-op moves.
* rtl-ssa/insns.cc (insn_info::calculate_cost): Likewise.

Diff:
---
 gcc/rtl-ssa/changes.cc | 6 +-
 gcc/rtl-ssa/insns.cc   | 7 ++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/gcc/rtl-ssa/changes.cc b/gcc/rtl-ssa/changes.cc
index 11639e81bb7..3101f2dc4fc 100644
--- a/gcc/rtl-ssa/changes.cc
+++ b/gcc/rtl-ssa/changes.cc
@@ -177,13 +177,17 @@ rtl_ssa::changes_are_worthwhile (array_slice<insn_change *> changes,
   auto entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
   for (insn_change *change : changes)
 {
+  // Count zero for the old cost if the old instruction was a no-op
+  // move or had an unknown cost.  This should reduce the chances of
+  // making an unprofitable change.
   old_cost += change->old_cost ();
   basic_block cfg_bb = change->bb ()->cfg_bb ();
   bool for_speed = optimize_bb_for_speed_p (cfg_bb);
   if (for_speed)
weighted_old_cost += (cfg_bb->count.to_sreal_scale (entry_count)
  * change->old_cost ());
-  if (!change->is_deletion ())
+  if (!change->is_deletion ()
+ && INSN_CODE (change->rtl ()) != NOOP_MOVE_INSN_CODE)
{
  change->new_cost = insn_cost (change->rtl (), for_speed);
  new_cost += change->new_cost;
diff --git a/gcc/rtl-ssa/insns.cc b/gcc/rtl-ssa/insns.cc
index 0171d93c357..68365e323ec 100644
--- a/gcc/rtl-ssa/insns.cc
+++ b/gcc/rtl-ssa/insns.cc
@@ -48,7 +48,12 @@ insn_info::calculate_cost () const
 {
   basic_block cfg_bb = BLOCK_FOR_INSN (m_rtl);
   temporarily_undo_changes (0);
-  m_cost_or_uid = insn_cost (m_rtl, optimize_bb_for_speed_p (cfg_bb));
+  if (INSN_CODE (m_rtl) == NOOP_MOVE_INSN_CODE)
+// insn_cost also uses 0 to mean "don't know".  Callers that
+// want to distinguish the cases will need to check INSN_CODE.
+m_cost_or_uid = 0;
+  else
+m_cost_or_uid = insn_cost (m_rtl, optimize_bb_for_speed_p (cfg_bb));
   redo_changes (0);
 }

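A hedged sketch of how a consumer can tell the two zero-cost cases
apart, as the new comment suggests (hypothetical code, assuming the
rtl-ssa insn_info interface):

  bool known_noop = (insn->cost () == 0
                     && INSN_CODE (insn->rtl ()) == NOOP_MOVE_INSN_CODE);
  /* Otherwise a zero cost just means "don't know".  */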

[gcc r15-1546] iq2000: Fix test and branch instructions

2024-06-21 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:8f254cd4e40b692e5f01a3b40f2b5b60c8528a1e

commit r15-1546-g8f254cd4e40b692e5f01a3b40f2b5b60c8528a1e
Author: Richard Sandiford 
Date:   Fri Jun 21 15:40:10 2024 +0100

iq2000: Fix test and branch instructions

The iq2000 test and branch instructions had patterns like:

  [(set (pc)
(if_then_else
 (eq (and:SI (match_operand:SI 0 "register_operand" "r")
 (match_operand:SI 1 "power_of_2_operand" "I"))
  (const_int 0))
 (match_operand 2 "pc_or_label_operand" "")
 (match_operand 3 "pc_or_label_operand" "")))]

power_of_2_operand allows any 32-bit power of 2, whereas "I" only
accepts 16-bit signed constants.  This meant that any power of 2
greater than 32768 would cause an "insn does not satisfy its
constraints" ICE.

Also, the %p operand modifier barfed on 1<<31, which is sign-
rather than zero-extended to 64 bits.  The code is inherently
limited to 32-bit operands -- power_of_2_operand contains a test
involving "unsigned" -- so this patch just ands with 0x.

gcc/
* config/iq2000/iq2000.cc (iq2000_print_operand): Make %p handle 
1<<31.
* config/iq2000/iq2000.md: Remove "I" constraints on
power_of_2_operands.

Diff:
---
 gcc/config/iq2000/iq2000.cc | 2 +-
 gcc/config/iq2000/iq2000.md | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/iq2000/iq2000.cc b/gcc/config/iq2000/iq2000.cc
index f9f8c417841..136675d0fbb 100644
--- a/gcc/config/iq2000/iq2000.cc
+++ b/gcc/config/iq2000/iq2000.cc
@@ -3127,7 +3127,7 @@ iq2000_print_operand (FILE *file, rtx op, int letter)
 {
   int value;
   if (code != CONST_INT
- || (value = exact_log2 (INTVAL (op))) < 0)
+ || (value = exact_log2 (UINTVAL (op) & 0x)) < 0)
output_operand_lossage ("invalid %%p value");
   else
fprintf (file, "%d", value);
diff --git a/gcc/config/iq2000/iq2000.md b/gcc/config/iq2000/iq2000.md
index 8617efac3c6..e62c250ce8c 100644
--- a/gcc/config/iq2000/iq2000.md
+++ b/gcc/config/iq2000/iq2000.md
@@ -1175,7 +1175,7 @@
   [(set (pc)
(if_then_else
 (eq (and:SI (match_operand:SI 0 "register_operand" "r")
-(match_operand:SI 1 "power_of_2_operand" "I"))
+(match_operand:SI 1 "power_of_2_operand"))
  (const_int 0))
 (match_operand 2 "pc_or_label_operand" "")
 (match_operand 3 "pc_or_label_operand" "")))]
@@ -1189,7 +1189,7 @@
   [(set (pc)
(if_then_else
 (ne (and:SI (match_operand:SI 0 "register_operand" "r")
-(match_operand:SI 1 "power_of_2_operand" "I"))
+(match_operand:SI 1 "power_of_2_operand"))
 (const_int 0))
 (match_operand 2 "pc_or_label_operand" "")
 (match_operand 3 "pc_or_label_operand" "")))]

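A standalone model of the %p fix (assumed 64-bit host, any
GCC-compatible C++ compiler; exact_log2 is modelled with builtins):

  #include <cassert>
  #include <cstdint>

  int main ()
  {
    /* INTVAL of a CONST_INT holding 1<<31 is sign-extended.  */
    int64_t intval = int64_t (int32_t (1u << 31)); /* 0xffffffff80000000 */
    /* The sign-extended value is not a power of two, so the old
       exact_log2 (INTVAL (op)) call returned -1.  */
    assert (__builtin_popcountll ((uint64_t) intval) != 1);
    /* Masking to 32 bits first recovers the intended result.  */
    uint64_t masked = (uint64_t) intval & 0xffffffff;
    assert (masked == 0x80000000u && __builtin_ctzll (masked) == 31);
    return 0;
  }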

[gcc r15-1547] xstormy16: Fix xs_hi_nonmemory_operand

2024-06-21 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:5320bcbd342a985a6e1db60bff2918f73dcad1a0

commit r15-1547-g5320bcbd342a985a6e1db60bff2918f73dcad1a0
Author: Richard Sandiford 
Date:   Fri Jun 21 15:40:11 2024 +0100

xstormy16: Fix xs_hi_nonmemory_operand

All uses of xs_hi_nonmemory_operand allow constraint "i",
which means that they allow consts, symbol_refs and label_refs.
The definition of xs_hi_nonmemory_operand accounted for consts,
but not for symbol_refs and label_refs.

gcc/
* config/stormy16/predicates.md (xs_hi_nonmemory_operand): Handle
symbol_ref and label_ref.

Diff:
---
 gcc/config/stormy16/predicates.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/stormy16/predicates.md 
b/gcc/config/stormy16/predicates.md
index 67c2ddc107c..085c9c5ed2d 100644
--- a/gcc/config/stormy16/predicates.md
+++ b/gcc/config/stormy16/predicates.md
@@ -152,7 +152,7 @@
 })
 
 (define_predicate "xs_hi_nonmemory_operand"
-  (match_code "const_int,reg,subreg,const")
+  (match_code "const_int,reg,subreg,const,symbol_ref,label_ref")
 {
   return nonmemory_operand (op, mode);
 })


[gcc r15-1578] rtl-ssa: Rework _ignoring interfaces

2024-06-24 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:5185274c76cc3b68a38713273779ec29ae4fe5d2

commit r15-1578-g5185274c76cc3b68a38713273779ec29ae4fe5d2
Author: Richard Sandiford 
Date:   Mon Jun 24 08:43:18 2024 +0100

rtl-ssa: Rework _ignoring interfaces

rtl-ssa has routines for scanning forwards or backwards for something
under the control of an exclusion set.  These searches are currently
used for two main things:

- to work out where an instruction can be moved within its EBB
- to work out whether recog can add a new hard register clobber

The exclusion set was originally a callback function that returned
true for insns that should be ignored.  However, for the late-combine
work, I'd also like to be able to skip an entire definition, along
with all its uses.

This patch prepares for that by turning the exclusion set into an
object that provides predicate member functions.  Currently the
only two member functions are:

- should_ignore_insn: what the old callback did
- should_ignore_def: the new functionality

but more could be added later.

Doing this also makes it easy to remove some asymmetry that I think
in hindsight was a mistake: in forward scans, ignoring an insn meant
ignoring all definitions in that insn (ok) and all uses of those
definitions (non-obvious).  The new interface makes it possible
to select the required behaviour, with that behaviour being applied
consistently in both directions.

Now that the exclusion set is a dedicated object, rather than
just a "random" function, I think it makes sense to remove the
_ignoring suffix from the function names.  The suffix was originally
there to describe the callback, and in particular to emphasise that
a true return meant "ignore" rather than "heed".

gcc/
* rtl-ssa.h: Include predicates.h.
* rtl-ssa/predicates.h: New file.
* rtl-ssa/access-utils.h (prev_call_clobbers_ignoring): Rename to...
(prev_call_clobbers): ...this and treat the ignore parameter as an
object with the same interface as ignore_nothing.
(next_call_clobbers_ignoring): Rename to...
(next_call_clobbers): ...this and treat the ignore parameter as an
object with the same interface as ignore_nothing.
(first_nondebug_insn_use_ignoring): Rename to...
(first_nondebug_insn_use): ...this and treat the ignore parameter as
an object with the same interface as ignore_nothing.
(last_nondebug_insn_use_ignoring): Rename to...
(last_nondebug_insn_use): ...this and treat the ignore parameter as
an object with the same interface as ignore_nothing.
(last_access_ignoring): Rename to...
(last_access): ...this and treat the ignore parameter as an object
with the same interface as ignore_nothing.  Conditionally skip
definitions.
(prev_access_ignoring): Rename to...
(prev_access): ...this and treat the ignore parameter as an object
with the same interface as ignore_nothing.
(first_def_ignoring): Replace with...
(first_access): ...this new function.
(next_access_ignoring): Rename to...
(next_access): ...this and treat the ignore parameter as an object
with the same interface as ignore_nothing.  Conditionally skip
definitions.
* rtl-ssa/change-utils.h (insn_is_changing): Delete.
(restrict_movement_ignoring): Rename to...
(restrict_movement): ...this and treat the ignore parameter as an
object with the same interface as ignore_nothing.
(recog_ignoring): Rename to...
(recog): ...this and treat the ignore parameter as an object with
the same interface as ignore_nothing.
* rtl-ssa/changes.h (insn_is_changing_closure): Delete.
* rtl-ssa/functions.h (function_info::add_regno_clobber): Treat
the ignore parameter as an object with the same interface as
ignore_nothing.
* rtl-ssa/insn-utils.h (insn_is): Delete.
* rtl-ssa/insns.h (insn_is_closure): Delete.
* rtl-ssa/member-fns.inl
(insn_is_changing_closure::insn_is_changing_closure): Delete.
(insn_is_changing_closure::operator()): Likewise.
(function_info::add_regno_clobber): Treat the ignore parameter
as an object with the same interface as ignore_nothing.
(ignore_changing_insns::ignore_changing_insns): New function.
(ignore_changing_insns::should_ignore_insn): Likewise.
* rtl-ssa/movement.h (restrict_movement_for_dead_range): Treat
the ignore parameter as an object with the same interface as
ignore_nothing.
(restrict_movement_for_defs_ignoring): R

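A hedged sketch of the new predicate-object style (the exact shape of
the in-tree classes is an assumption based on the description above):
any object exposing these two members can drive the scans, for example
one that skips a whole definition together with its uses:

  struct ignore_def_and_uses
  {
    rtl_ssa::def_info *m_def;
    bool should_ignore_insn (const rtl_ssa::insn_info *) { return false; }
    bool should_ignore_def (rtl_ssa::def_info *def) { return def == m_def; }
  };
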
[gcc r15-1579] Add a late-combine pass [PR106594]

2024-06-24 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:792f97b44ffc5e6a967292b3747fd835e99396e7

commit r15-1579-g792f97b44ffc5e6a967292b3747fd835e99396e7
Author: Richard Sandiford 
Date:   Mon Jun 24 08:43:19 2024 +0100

Add a late-combine pass [PR106594]

This patch adds a combine pass that runs late in the pipeline.
There are two instances: one between combine and split1, and one
after postreload.

The pass currently has a single objective: remove definitions by
substituting into all uses.  The pre-RA version tries to restrict
itself to cases that are likely to have a neutral or beneficial
effect on register pressure.

The patch fixes PR106594.  It also fixes a few FAILs and XFAILs
in the aarch64 test results, mostly due to making proper use of
MOVPRFX in cases where we didn't previously.

This is just a first step.  I'm hoping that the pass could be
used for other combine-related optimisations in future.  In particular,
the post-RA version doesn't need to restrict itself to cases where all
uses are substitutable, since it doesn't have to worry about register
pressure.  If we did that, and if we extended it to handle multi-register
REGs, the pass might be a viable replacement for regcprop, which in
turn might reduce the cost of having a post-RA instance of the new pass.

On most targets, the pass is enabled by default at -O2 and above.
However, it has a tendency to undo x86's STV and RPAD passes,
by folding the more complex post-STV/RPAD form back into the
simpler pre-pass form.

Also, running a pass after register allocation means that we can
now match define_insn_and_splits that were previously only matched
before register allocation.  This trips things like:

  (define_insn_and_split "..."
[...pattern...]
"...cond..."
"#"
"&& 1"
[...pattern...]
{
  ...unconditional use of gen_reg_rtx ()...;
}

because matching and splitting after RA will call gen_reg_rtx when
pseudos are no longer allowed.  rs6000 has several instances of this.

xtensa has a variation in which the split condition is:

"&& can_create_pseudo_p ()"

The failure then is that, if we match after RA, we'll never be
able to split the instruction.

The patch therefore disables the pass by default on i386, rs6000
and xtensa.  Hopefully we can fix those ports later (if their
maintainers want).  It seems better to add the pass first, though,
to make it easier to test any such fixes.

gcc.target/aarch64/bitfield-bitint-abi-align{16,8}.c would need
quite a few updates for the late-combine output.  That might be
worth doing, but it seems too complex to do as part of this patch.

I tried compiling at least one target per CPU directory and comparing
the assembly output for parts of the GCC testsuite.  This is just a way
of getting a flavour of how the pass performs; it obviously isn't a
meaningful benchmark.  All targets seemed to improve on average:

Target                  Tests   Good    Bad   %Good    Delta  Median
======                  =====   ====    ===   =====    =====  ======
aarch64-linux-gnu        2215   1975    240  89.16%    -4159      -1
aarch64_be-linux-gnu     1569   1483     86  94.52%   -10117      -1
alpha-linux-gnu          1454   1370     84  94.22%    -9502      -1
amdgcn-amdhsa            5122   4671    451  91.19%   -35737      -1
arc-elf                  2166   1932    234  89.20%   -37742      -1
arm-linux-gnueabi        1953   1661    292  85.05%   -12415      -1
arm-linux-gnueabihf      1834   1549    285  84.46%   -11137      -1
avr-elf                  4789   4330    459  90.42%  -441276      -4
bfin-elf                 2795   2394    401  85.65%   -19252      -1
bpf-elf                  3122   2928    194  93.79%    -8785      -1
c6x-elf                  2227   1929    298  86.62%   -17339      -1
cris-elf                 3464   3270    194  94.40%   -23263      -2
csky-elf                 2915   2591    324  88.89%   -22146      -1
epiphany-elf             2399   2304     95  96.04%   -28698      -2
fr30-elf                 7712   7299    413  94.64%   -99830      -2
frv-linux-gnu            3332   2877    455  86.34%   -25108      -1
ft32-elf                 2775   2667    108  96.11%   -25029      -1
h8300-elf                3176   2862    314  90.11%   -29305      -2
hppa64-hp-hpux11.23      4287   4247     40  99.07%   -45963      -2
ia64-linux-gnu           2343   1946    397  83.06%    -9907      -2
iq2000-elf               9684   9637     47  99.51%  -126557      -2
lm32-elf                 2681   2608     73  97.28%   -59884      -3
loongarch64-linux-gnu    1303   1218     85  93.48%   -13375      -2
m32r-elf                 1626   1517    109  93.30%    -9323      -2
m68k-linux-gnu

[gcc r15-1580] Regenerate common.opt.urls

2024-06-24 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:a6f7e3ca2961e9315a23ffd99b40f004848f900e

commit r15-1580-ga6f7e3ca2961e9315a23ffd99b40f004848f900e
Author: Richard Sandiford 
Date:   Mon Jun 24 09:42:16 2024 +0100

Regenerate common.opt.urls

gcc/
* common.opt.urls: Regenerate.

Diff:
---
 gcc/common.opt.urls | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/common.opt.urls b/gcc/common.opt.urls
index 1f2eb67c8e0..1ec32670633 100644
--- a/gcc/common.opt.urls
+++ b/gcc/common.opt.urls
@@ -712,6 +712,9 @@ 
UrlSuffix(gcc/Optimize-Options.html#index-fhoist-adjacent-loads)
 flarge-source-files
 UrlSuffix(gcc/Preprocessor-Options.html#index-flarge-source-files)
 
+flate-combine-instructions
+UrlSuffix(gcc/Optimize-Options.html#index-flate-combine-instructions)
+
 floop-parallelize-all
 UrlSuffix(gcc/Optimize-Options.html#index-floop-parallelize-all)


[gcc r15-4111] aarch64: Fix general permutes of svbfloat16_ts

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:4fd473f66faf5bd95c84fe5c0fa41be735a7c09f

commit r15-4111-g4fd473f66faf5bd95c84fe5c0fa41be735a7c09f
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:03 2024 +0100

aarch64: Fix general permutes of svbfloat16_ts

Testing gcc.target/aarch64/sve/permute_2.c without the associated GCC
patch triggered an unrecognisable insn ICE for the svbfloat16_t tests.
This was because the implementation of general two-vector permutes
requires two TBLs and an ORR, with the ORR being represented as an
unspec for floating-point modes.  The associated pattern did not
cover VNx8BF.

gcc/
* config/aarch64/iterators.md (SVE_I): Move further up file.
(SVE_F): New mode iterator.
(SVE_ALL): Redefine in terms of SVE_I and SVE_F.
* config/aarch64/aarch64-sve.md (*<optab><mode>3): Extend
to all SVE_F.

gcc/testsuite/
* gcc.target/aarch64/sve/permute_5.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md|  8 +++
 gcc/config/aarch64/iterators.md  | 27 
 gcc/testsuite/gcc.target/aarch64/sve/permute_5.c | 10 +
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index ec1d059a2b1b..90db51e51b9d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6455,10 +6455,10 @@
 ;; by providing this, but we need to use UNSPECs since rtx logical ops
 ;; aren't defined for floating-point modes.
 (define_insn "*3"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
-   (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand" "w")
-  (match_operand:SVE_FULL_F 2 "register_operand" "w")]
+  [(set (match_operand:SVE_F 0 "register_operand" "=w")
+   (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand" "w")
+  (match_operand:SVE_F 2 "register_operand" "w")]
  LOGICALF))]
   "TARGET_SVE"
   "\t%0.d, %1.d, %2.d"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fcad236eee9f..1322193b027c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -519,15 +519,20 @@
 VNx4HI VNx2HI
 VNx2SI])
 
+;; All SVE integer vector modes.
+(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
+VNx8HI VNx4HI VNx2HI
+VNx4SI VNx2SI
+VNx2DI])
+
+;; All SVE floating-point vector modes.
+(define_mode_iterator SVE_F [VNx8HF VNx4HF VNx2HF
+VNx8BF VNx4BF VNx2BF
+VNx4SF VNx2SF
+VNx2DF])
+
 ;; All SVE vector modes.
-(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
-  VNx8HI VNx4HI VNx2HI
-  VNx8HF VNx4HF VNx2HF
-  VNx8BF VNx4BF VNx2BF
-  VNx4SI VNx2SI
-  VNx4SF VNx2SF
-  VNx2DI
-  VNx2DF])
+(define_mode_iterator SVE_ALL [SVE_I SVE_F])
 
 ;; All SVE 2-vector modes.
 (define_mode_iterator SVE_FULLx2 [VNx32QI VNx16HI VNx8SI VNx4DI
@@ -549,12 +554,6 @@
 ;; All SVE vector and structure modes.
 (define_mode_iterator SVE_ALL_STRUCT [SVE_ALL SVE_STRUCT])
 
-;; All SVE integer vector modes.
-(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
-VNx8HI VNx4HI VNx2HI
-VNx4SI VNx2SI
-VNx2DI])
-
 ;; All SVE integer vector modes and Advanced SIMD 64-bit vector
 ;; element modes
 (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/permute_5.c 
b/gcc/testsuite/gcc.target/aarch64/sve/permute_5.c
new file mode 100644
index ..786b05ee3e72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/permute_5.c
@@ -0,0 +1,10 @@
+/* { dg-options "-O -msve-vector-bits=256" } */
+
+typedef __SVBfloat16_t vbfloat16 __attribute__((arm_sve_vector_bits(256)));
+
+vbfloat16
+foo (vbfloat16 x, vbfloat16 y)
+{
+  return __builtin_shufflevector (x, y, 0, 2, 1, 3, 16, 19, 17, 18,
+ 8, 9, 10, 11, 23, 22, 21, 20);
+}


[gcc r15-4110] aarch64: Handle SVE modes in aarch64_evpc_reencode [PR116583]

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:1048ebbbdc98a5928a974356d7f4244603b6bd32

commit r15-4110-g1048ebbbdc98a5928a974356d7f4244603b6bd32
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:02 2024 +0100

aarch64: Handle SVE modes in aarch64_evpc_reencode [PR116583]

For Advanced SIMD modes, aarch64_evpc_reencode tests whether
a permute in a narrow element mode can be done more cheaply
in a wider mode.  For example, { 0, 1, 8, 9, 4, 5, 12, 13 }
on V8HI is a natural TRN1 on V4SI ({ 0, 4, 2, 6 }).

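A standalone model of the index arithmetic in that example (an assumed
illustration, not the in-tree code):

  #include <cassert>

  int main ()
  {
    int hi[8] = {0, 1, 8, 9, 4, 5, 12, 13};  /* V8HI permute */
    int si[4];
    for (int i = 0; i < 4; ++i)
      {
        /* Each output pair must be {2k, 2k+1} for reencoding.  */
        assert (hi[2 * i] % 2 == 0 && hi[2 * i + 1] == hi[2 * i] + 1);
        si[i] = hi[2 * i] / 2;
      }
    /* The equivalent V4SI permute: a natural TRN1.  */
    assert (si[0] == 0 && si[1] == 4 && si[2] == 2 && si[3] == 6);
    return 0;
  }
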
This patch extends the code to handle SVE data and predicate
modes as well.  This is a prerequisite to getting good results
for PR116583.

gcc/
PR target/116583
* config/aarch64/aarch64.cc (aarch64_coalesce_units): New function,
extending the Advanced SIMD handling from...
(aarch64_evpc_reencode): ...here to SVE data and predicate modes.

gcc/testsuite/
PR target/116583
* gcc.target/aarch64/sve/permute_1.c: New test.
* gcc.target/aarch64/sve/permute_2.c: Likewise.
* gcc.target/aarch64/sve/permute_3.c: Likewise.
* gcc.target/aarch64/sve/permute_4.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64.cc|  55 -
 gcc/testsuite/gcc.target/aarch64/sve/permute_1.c | 106 +
 gcc/testsuite/gcc.target/aarch64/sve/permute_2.c | 277 +++
 gcc/testsuite/gcc.target/aarch64/sve/permute_3.c |  91 
 gcc/testsuite/gcc.target/aarch64/sve/permute_4.c | 113 +
 5 files changed, 633 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e7bb3278a27e..102680a0efca 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1933,6 +1933,46 @@ aarch64_sve_int_mode (machine_mode mode)
   return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
 }
 
+/* Look for a vector mode with the same classification as VEC_MODE,
+   but with each group of FACTOR elements coalesced into a single element.
+   In other words, look for a mode in which the elements are FACTOR times
+   larger and in which the number of elements is FACTOR times smaller.
+
+   Return the mode found, if one exists.  */
+
+static opt_machine_mode
+aarch64_coalesce_units (machine_mode vec_mode, unsigned int factor)
+{
+  auto elt_bits = vector_element_size (GET_MODE_BITSIZE (vec_mode),
+  GET_MODE_NUNITS (vec_mode));
+  auto vec_flags = aarch64_classify_vector_mode (vec_mode);
+  if (vec_flags & VEC_SVE_PRED)
+{
+  if (known_eq (GET_MODE_SIZE (vec_mode), BYTES_PER_SVE_PRED))
+   return aarch64_sve_pred_mode (elt_bits * factor);
+  return {};
+}
+
+  scalar_mode new_elt_mode;
+  if (!int_mode_for_size (elt_bits * factor, false).exists (&new_elt_mode))
+return {};
+
+  if (vec_flags == VEC_ADVSIMD)
+{
+  auto mode = aarch64_simd_container_mode (new_elt_mode,
+  GET_MODE_BITSIZE (vec_mode));
+  if (mode != word_mode)
+   return mode;
+}
+  else if (vec_flags & VEC_SVE_DATA)
+{
+  poly_uint64 new_nunits;
+  if (multiple_p (GET_MODE_NUNITS (vec_mode), factor, &new_nunits))
+   return aarch64_sve_data_mode (new_elt_mode, new_nunits);
+}
+  return {};
+}
+
 /* Implement TARGET_VECTORIZE_RELATED_MODE.  */
 
 static opt_machine_mode
@@ -25731,26 +25771,23 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
 {
   expand_vec_perm_d newd;
 
-  if (d->vec_flags != VEC_ADVSIMD)
+  /* The subregs that we'd create are not supported for big-endian SVE;
+ see aarch64_modes_compatible_p for details.  */
+  if (BYTES_BIG_ENDIAN && (d->vec_flags & VEC_ANY_SVE))
 return false;
 
   /* Get the new mode.  Always twice the size of the inner
  and half the elements.  */
-  poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
-  unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
-  auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
-  machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
-
-  if (new_mode == word_mode)
+  machine_mode new_mode;
+  if (!aarch64_coalesce_units (d->vmode, 2).exists (&new_mode))
 return false;
 
   vec_perm_indices newpermindices;
-
   if (!newpermindices.new_shrunk_vector (d->perm, 2))
 return false;
 
   newd.vmode = new_mode;
-  newd.vec_flags = VEC_ADVSIMD;
+  newd.vec_flags = d->vec_flags;
   newd.op_mode = newd.vmode;
   newd.op_vec_flags = newd.vec_flags;
   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/permute_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/permute_1.c
new file mode 100644
index ..90aeef321882
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/permute_1.c
@@ -0,0 +1,106 @@
+/* { dg-options "-O -msve-vector-bits=256" } */
+/* { dg-final { chec

[gcc r15-4109] testsuite: Unset torture_current_flags after use

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:fce02baff53bbcb673f41ea18668103edb2f7c00

commit r15-4109-gfce02baff53bbcb673f41ea18668103edb2f7c00
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:02 2024 +0100

testsuite: Unset torture_current_flags after use

Before running a test with specific torture options, gcc-dg-runtest
sets the global variable torture_current_flags to the set of torture
options that will be used.  However, it never unset the variable
afterwards, which meant that the last options would hang around
and potentially confuse later non-torture tests.

I saw this with a follow-on patch to check-function-bodies, but it's
probably possible to construct artificial test combinations that
expose it with check-function-bodies's existing flag filtering.

gcc/testsuite/
* lib/gcc-dg.exp (gcc-dg-runtest): Unset torture_current_flags
after each test.

Diff:
---
 gcc/testsuite/lib/gcc-dg.exp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/lib/gcc-dg.exp b/gcc/testsuite/lib/gcc-dg.exp
index cb401a704359..7adca02f9377 100644
--- a/gcc/testsuite/lib/gcc-dg.exp
+++ b/gcc/testsuite/lib/gcc-dg.exp
@@ -628,6 +628,7 @@ proc gcc-dg-runtest { testcases flags default-extra-flags } 
{
set torture_current_flags "$flags_t"
verbose "Testing $nshort, $flags $flags_t" 1
dg-test $test "$flags $flags_t" ${default-extra-flags}
+   unset torture_current_flags
}
 }


[gcc r15-4112] vect: Variable lane indices in vectorizable_slp_permutation_1 [PR116583]

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:1732298d51028ae50a802e538df5d7249556255d

commit r15-4112-g1732298d51028ae50a802e538df5d7249556255d
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:03 2024 +0100

vect: Variable lane indices in vectorizable_slp_permutation_1 [PR116583]

The main patch for PR116583 needs to create variable indices into
an input vector.  This pre-patch changes the types to allow that.

There is no pretty-print format for poly_uint64 because of issues
with passing C++ objects through "...".

gcc/
PR tree-optimization/116583
* tree-vect-slp.cc (vectorizable_slp_permutation_1): Use
poly_uint64 for scalar lane indices.

Diff:
---
 gcc/tree-vect-slp.cc | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 125e69cf0eb0..97a471ad9108 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10310,8 +10310,8 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
  from the { SLP operand, scalar lane } permutation as recorded in the
  SLP node as intermediate step.  This part should already work
  with SLP children with arbitrary number of lanes.  */
-  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
-  auto_vec<unsigned> active_lane;
+  auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
+  auto_vec<poly_uint64> active_lane;
   vperm.create (olanes);
   active_lane.safe_grow_cleared (children.length (), true);
   for (unsigned i = 0; i < ncopies; ++i)
@@ -10326,8 +10326,9 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
{
  /* We checked above that the vectors are constant-length.  */
  unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
- unsigned vi = (active_lane[p.first] + p.second) / vnunits;
- unsigned vl = (active_lane[p.first] + p.second) % vnunits;
+ unsigned lane = active_lane[p.first].to_constant ();
+ unsigned vi = (lane + p.second) / vnunits;
+ unsigned vl = (lane + p.second) % vnunits;
  vperm.quick_push ({{p.first, vi}, vl});
}
}
@@ -10353,9 +10354,10 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
  ? multiple_p (i, npatterns)
  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype
dump_printf (MSG_NOTE, ",");
- dump_printf (MSG_NOTE, " vops%u[%u][%u]",
-  vperm[i].first.first, vperm[i].first.second,
-  vperm[i].second);
+ dump_printf (MSG_NOTE, " vops%u[%u][",
+  vperm[i].first.first, vperm[i].first.second);
+ dump_dec (MSG_NOTE, vperm[i].second);
+ dump_printf (MSG_NOTE, "]");
}
   dump_printf (MSG_NOTE, "\n");
 }


[gcc r15-4114] vect: Support more VLA SLP permutations [PR116583]

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:8157f3f2d211bfbf53fbf8dd209b47ce583f4142

commit r15-4114-g8157f3f2d211bfbf53fbf8dd209b47ce583f4142
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:04 2024 +0100

vect: Support more VLA SLP permutations [PR116583]

This is the main patch for PR116583.  Previously, we only
supported VLA SLP permutations for which the output and inputs
have the same number of lanes, and for which that number of
lanes divides the number of vector elements.

The patch extends this to handle:

(1) "packs" of a single 2N-vector input into an N-vector output
(2) "unpacks" of N-vector inputs into an XN-vector output

Hopefully the comments in the code explain the approach.

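A standalone illustration of the two shapes (lane counts assumed for
the example): a pack keeps N of the 2N lanes of a single input, while
an unpack with factor X spreads the input lanes across X output
vectors:

  #include <cassert>

  int main ()
  {
    int in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    /* Pack: one 8-lane input down to a 4-lane output (even lanes).  */
    int packed[4];
    for (int i = 0; i < 4; ++i)
      packed[i] = in[2 * i];
    assert (packed[3] == 6);
    /* Unpack with X == 2: lanes duplicated across 2 output vectors.  */
    int out[2][4];
    for (int i = 0; i < 8; ++i)
      out[i / 4][i % 4] = in[i / 2];
    assert (out[0][1] == 0 && out[1][3] == 3);
    return 0;
  }
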
The contents of the:

  for (unsigned i = 0; i < ncopies; ++i)

loop do not change; the patch simply adds an outer loop around it.

The patch removes the XFAIL in slp-13.c and also improves
the SVE vect.exp results with vect-force-slp=1.  I haven't
added new tests specifically for this, since presumably the
existing ones will cover it once the SLP switch is flipped.

gcc/
PR tree-optimization/116583
* tree-vect-slp.cc (vectorizable_slp_permutation_1): Handle
variable-length pack and unpack permutations.

gcc/testsuite/
PR tree-optimization/116583
* gcc.dg/vect/slp-13.c: Remove xfail for vect_variable_length.
* gcc.dg/vect/slp-13-big-array.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-13-big-array.c |   2 +-
 gcc/testsuite/gcc.dg/vect/slp-13.c   |   2 +-
 gcc/tree-vect-slp.cc | 107 ---
 3 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
index ca70856c1dd5..e45f8aab1339 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
@@ -137,4 +137,4 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { ! vect_pack_trunc } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-13.c b/gcc/testsuite/gcc.dg/vect/slp-13.c
index b7f947e6dbe1..d6346aef9788 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13.c
@@ -131,4 +131,4 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { ! vect_pack_trunc } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 1c986a652521..a5cd596fd285 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10208,6 +10208,13 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
   unsigned i;
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
+  /* True if we're permuting a single input of 2N vectors down
+     to N vectors.  This case doesn't generalize beyond 2 since
+     VEC_PERM_EXPR only takes 2 inputs.  */
+  bool pack_p = false;
+  /* If we're permuting inputs of N vectors each into X*N outputs,
+     this is the value of X, otherwise it is 1.  */
+  unsigned int unpack_factor = 1;
   tree op_vectype = NULL_TREE;
   FOR_EACH_VEC_ELT (children, i, child)
 if (SLP_TREE_VECTYPE (child))
@@ -10229,7 +10236,20 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 "Unsupported vector types in lane permutation\n");
  return -1;
}
-  if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
+  auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
+  unsigned int this_unpack_factor;
+  /* Check whether the input has twice as many lanes per vector.  */
+  if (children.le
