https://gcc.gnu.org/g:161e246cf32f1298400aa3c1d86110490a3cd0ce
commit r15-4963-g161e246cf32f1298400aa3c1d86110490a3cd0ce
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Tue Nov 5 05:10:22 2024 -0800

    PR target/117449: Restrict vector rotate match and split to pre-reload

    The vector rotate splitter has some logic to deal with post-reload
    splitting but not all cases in aarch64_emit_opt_vec_rotate are
    post-reload-safe.  In particular the ROTATE+XOR expansion for TARGET_SHA3
    can create RTL that can later be simplified to a simple ROTATE post-reload,
    which would then match the insn again and try to split it.
    So do a clean split pre-reload and avoid going down this path post-reload
    by restricting the insn_and_split to can_create_pseudo_p ().

    Bootstrapped and tested on aarch64-none-linux.

    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>

    gcc/
            PR target/117449
            * config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
            Match only when can_create_pseudo_p ().
            * config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate):
            Assume can_create_pseudo_p ().

    gcc/testsuite/
            PR target/117449
            * gcc.c-torture/compile/pr117449.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md             |  6 ++++--
 gcc/config/aarch64/aarch64.cc                  | 11 ++++++-----
 gcc/testsuite/gcc.c-torture/compile/pr117449.c |  8 ++++++++
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index a91222b6e3b2..cfe95bd4c316 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1296,11 +1296,13 @@
 
 ;; After all the combinations and propagations of ROTATE have been
 ;; attempted split any remaining vector rotates into SHL + USRA sequences.
+;; Don't match this after reload as the various possible sequence for this
+;; require temporary registers.
 (define_insn_and_split "*aarch64_simd_rotate_imm<mode>"
   [(set (match_operand:VDQ_I 0 "register_operand" "=&w")
	(rotate:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
		      (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm")))]
-  "TARGET_SIMD"
+  "TARGET_SIMD && can_create_pseudo_p ()"
   "#"
   "&& 1"
   [(set (match_dup 3)
@@ -1316,7 +1318,7 @@
     if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
       DONE;
 
-    operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
+    operands[3] = gen_reg_rtx (<MODE>mode);
     rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
     int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
     operands[4]
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9347e06f0e9e..f2b53475adbe 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16030,6 +16030,8 @@ aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
   gcc_assert (CONST_INT_P (amnt));
   HOST_WIDE_INT rotamnt = UINTVAL (amnt);
   machine_mode mode = GET_MODE (reg);
+  /* Don't end up here after reload.  */
+  gcc_assert (can_create_pseudo_p ());
   /* Rotates by half the element width map down to REV* instructions and
      should always be preferred when possible.  */
   if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
@@ -16037,11 +16039,10 @@ aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
     return true;
   /* 64 and 128-bit vector modes can use the XAR instruction when available.
    */
-  else if (can_create_pseudo_p ()
-	   && ((TARGET_SHA3 && mode == V2DImode)
-	       || (TARGET_SVE2
-		   && (known_eq (GET_MODE_SIZE (mode), 8)
-		       || known_eq (GET_MODE_SIZE (mode), 16)))))
+  else if ((TARGET_SHA3 && mode == V2DImode)
+	   || (TARGET_SVE2
+	       && (known_eq (GET_MODE_SIZE (mode), 8)
+		   || known_eq (GET_MODE_SIZE (mode), 16))))
     {
       rtx zeroes = aarch64_gen_shareable_zero (mode);
       rtx xar_op
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr117449.c b/gcc/testsuite/gcc.c-torture/compile/pr117449.c
new file mode 100644
index 000000000000..8ae0071fca6b
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr117449.c
@@ -0,0 +1,8 @@
+/* { dg-additional-options "-march=armv8.2-a+sha3" { target aarch64*-*-* } } */
+
+unsigned long *a;
+int i;
+void f() {
+  for (i = 0; i < 80; i++)
+    a[i] = (a[i] >> 8 | a[i] << 64 - 8) ^ a[i];
+}
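For context, here is a small illustrative sketch (not part of the commit) of the two rotate
shapes involved, written as plain C in the style of the new test.  The function names and the
ROTR64 macro below are hypothetical.  A plain rotate by immediate is what
*aarch64_simd_rotate_imm<mode> matches and, failing a better expansion, splits into an
SHL + USRA sequence; with -march=armv8.2-a+sha3 the V2DImode case can instead go through the
ROTATE+XOR (XAR) expansion in aarch64_emit_opt_vec_rotate, which is the path the pr117449.c
reproducer exercises.

/* Illustrative sketch only; names are hypothetical.  Compile for AArch64
   with e.g. -O2 -march=armv8.2-a+sha3 to target the paths described above.  */

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

unsigned long v1[80], v2[80];

/* Plain vector rotate by immediate: once vectorized, a candidate for the
   SHL + USRA split in *aarch64_simd_rotate_imm<mode>.  */
void
rotate_only (void)
{
  for (int i = 0; i < 80; i++)
    v1[i] = ROTR64 (v1[i], 8);
}

/* Rotate XORed with the input, mirroring gcc.c-torture/compile/pr117449.c:
   on TARGET_SHA3 this is the reproducer shape that reached the ROTATE+XOR
   (XAR) expansion and the post-reload re-match fixed above.  */
void
rotate_xor (void)
{
  for (int i = 0; i < 80; i++)
    v2[i] = ROTR64 (v2[i], 8) ^ v2[i];
}

Whether the vectorizer actually produces a V2DImode rotate for these loops depends on the
optimization level, so treat the sketch as illustrative rather than a guaranteed reproducer.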