On 22/05/25 16:06, Richard Sandiford wrote:
<dhr...@nvidia.com> writes:
[...]
+;; The RTL combiners are able to combine "ior (ashift, ashiftrt)" to a "bswap".
+;; Match that as well.
+(define_insn_and_split "*v_revvnx8hi"
+ [(parallel
+ [(set (match_operand:VNx8HI 0 "register_operand")
+ (bswap:VNx8HI (match_operand 1 "register_operand")))
+ (clobber (match_scratch:VNx8BI 2))])]
Sorry for not noticing last time, but operand 0 should have a "=w"
constraint, operand 1 should have a "w" constraint, and the match_scratch
should have a "=Upl" constraint.
Ah, thanks, sorry about forgetting to add those in the first place.
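For my own reference, annotating the three constraints as they now appear in the fixed pattern below:

  (match_operand:VNx8HI 0 "register_operand" "=w")        ;; result, SVE vector register
  (bswap:VNx8HI (match_operand 1 "register_operand" "w")) ;; input, SVE vector register
  (clobber (match_scratch:VNx8BI 2 "=Upl"))               ;; scratch governing predicate (p0-p7)

("w" being the FP/SIMD/SVE vector registers and "Upl" the low predicate registers that can serve as a governing predicate.)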
+ "TARGET_SVE"
+ "#"
+ ""
The last line should be "&& 1", since the TARGET_SVE test doesn't
automatically apply to the define_split.
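Got it - the split only inherits the insn condition when the split condition starts with "&&", so with "" it would have been willing to split even without TARGET_SVE. Annotating the three condition lines of the fixed pattern below:

  "TARGET_SVE"  ;; insn condition
  "#"           ;; no output template - always rely on the split
  "&& 1"        ;; split condition, combined into "TARGET_SVE && 1"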
+ [(set (match_dup 0)
+ (unspec:VNx8HI
+ [(match_dup 2)
+ (unspec:VNx8HI
+ [(match_dup 1)]
+ UNSPEC_REVB)]
+ UNSPEC_PRED_X))]
+ {
+ if (!can_create_pseudo_p ())
+ operands[2] = CONSTM1_RTX (VNx8BImode);
+ else
+ operands[2] = aarch64_ptrue_reg (VNx8BImode);
This should be:
if (!can_create_pseudo_p ())
emit_move_insn (operands[2], CONSTM1_RTX (VNx8BImode));
else
operands[2] = aarch64_ptrue_reg (VNx8BImode);
That is, after register allocation, the pattern gives us a scratch
predicate register, but we need to initialise it to a ptrue.
Ah right, that makes sense, my bad - I had just copied the else-block
and forgot to think about it.
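To spell it out for myself, the fixed block (as it appears in the patch below) is:

  {
    if (!can_create_pseudo_p ())
      /* After register allocation, operands[2] is the scratch predicate
         register picked for the "=Upl" clobber, so it still has to be
         initialised to an all-true predicate before the REVB.  */
      emit_move_insn (operands[2], CONSTM1_RTX (VNx8BImode));
    else
      /* Before register allocation we can simply use a pseudo that is
         known to contain a ptrue.  */
      operands[2] = aarch64_ptrue_reg (VNx8BImode);
  }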
+ }
+)
+
;; Predicated integer unary operations.
(define_insn "@aarch64_pred_<optab><mode>"
[(set (match_operand:SVE_FULL_I 0 "register_operand")
[...]
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
new file mode 100644
index 00000000000..3a30f80d152
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
@@ -0,0 +1,83 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+sve" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_sve.h>
+
+/*
+** ror32_sve_lsl_imm:
+** ptrue p3.b, all
+** revw z0.d, p3/m, z0.d
There's no requirement to choose p3 for the predicate, so this would
be better as:
** ptrue (p[0-3]).b, all
** revw z0.d, \1/m, z0.d
Same for the others.
OK with those changes, thanks.

Richard
Here's a version of the patch with those changes applied - I will commit it after
receiving write-after-approval access and adding myself to the MAINTAINERS
file :) Thanks for sponsoring!
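For reference, check-function-bodies treats each "**" line as a regex matched against the generated function body, so "(p[0-3])" captures whichever predicate register the compiler picks and "\1" then requires that same register on the revw/revh/revb line, e.g.:

/*
** ror32_sve_lsl_imm:
**	ptrue	(p[0-3]).b, all
**	revw	z0.d, \1/m, z0.d
**	ret
*/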
-- >8 --
[PATCH] aarch64: Fold lsl+lsr+orr to rev for half-width shifts
This patch folds the following pattern:
lsl <y>, <x>, <shift>
lsr <z>, <x>, <shift>
orr <r>, <y>, <z>
to:
revb/h/w <r>, <x>
when the shift amount is equal to half the bitwidth of the <x>
register.
Bootstrapped and regtested on aarch64-linux-gnu.
Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
Co-authored-by: Richard Sandiford <richard.sandif...@arm.com>
gcc/ChangeLog:
* expmed.cc (expand_rotate_as_vec_perm): Avoid a no-op move if the
target already provided the result in the expected register.
* config/aarch64/aarch64.cc (aarch64_vectorize_vec_perm_const):
Avoid forcing subregs into fresh registers unnecessarily.
* config/aarch64/aarch64-sve.md: Add define_split for rotate.
(*v_revvnx8hi): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/shift_rev_1.c: New test.
* gcc.target/aarch64/sve/shift_rev_2.c: Likewise.
* gcc.target/aarch64/sve/shift_rev_3.c: Likewise.
---
gcc/config/aarch64/aarch64-sve.md | 55 ++++++++++++
gcc/config/aarch64/aarch64.cc | 10 ++-
gcc/expmed.cc | 3 +-
.../gcc.target/aarch64/sve/shift_rev_1.c | 83 +++++++++++++++++++
.../gcc.target/aarch64/sve/shift_rev_2.c | 63 ++++++++++++++
.../gcc.target/aarch64/sve/shift_rev_3.c | 83 +++++++++++++++++++
6 files changed, 294 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/shift_rev_2.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/shift_rev_3.c
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index e1ec778b10d..c5d3e8cd3b3 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3317,6 +3317,61 @@
;; - REVW
;; -------------------------------------------------------------------------
+(define_split
+ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
+ (rotate:SVE_FULL_HSDI
+ (match_operand:SVE_FULL_HSDI 1 "register_operand")
+ (match_operand:SVE_FULL_HSDI 2 "aarch64_constant_vector_operand")))]
+ "TARGET_SVE && can_create_pseudo_p ()"
+ [(set (match_dup 3)
+ (ashift:SVE_FULL_HSDI (match_dup 1)
+ (match_dup 2)))
+ (set (match_dup 0)
+ (plus:SVE_FULL_HSDI
+ (lshiftrt:SVE_FULL_HSDI (match_dup 1)
+ (match_dup 4))
+ (match_dup 3)))]
+ {
+ if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+ DONE;
+
+ if (!TARGET_SVE2)
+ FAIL;
+
+ operands[3] = gen_reg_rtx (<MODE>mode);
+ HOST_WIDE_INT shift_amount =
+ INTVAL (unwrap_const_vec_duplicate (operands[2]));
+ int bitwidth = GET_MODE_UNIT_BITSIZE (<MODE>mode);
+ operands[4] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+ bitwidth - shift_amount);
+ }
+)
+
+;; The RTL combiners are able to combine "ior (ashift, ashiftrt)" to a "bswap".
+;; Match that as well.
+(define_insn_and_split "*v_revvnx8hi"
+ [(parallel
+ [(set (match_operand:VNx8HI 0 "register_operand" "=w")
+ (bswap:VNx8HI (match_operand 1 "register_operand" "w")))
+ (clobber (match_scratch:VNx8BI 2 "=Upl"))])]
+ "TARGET_SVE"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:VNx8HI
+ [(match_dup 2)
+ (unspec:VNx8HI
+ [(match_dup 1)]
+ UNSPEC_REVB)]
+ UNSPEC_PRED_X))]
+ {
+ if (!can_create_pseudo_p ())
+ emit_move_insn (operands[2], CONSTM1_RTX (VNx8BImode));
+ else
+ operands[2] = aarch64_ptrue_reg (VNx8BImode);
+ }
+)
+
;; Predicated integer unary operations.
(define_insn "@aarch64_pred_<optab><mode>"
[(set (match_operand:SVE_FULL_I 0 "register_operand")
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 1da615c8955..7cdd5fda903 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27067,11 +27067,17 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
d.op_mode = op_mode;
d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
d.target = target;
- d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
+ d.op0 = op0;
+ if (d.op0 && !register_operand (d.op0, op_mode))
+ d.op0 = force_reg (op_mode, d.op0);
if (op0 && d.one_vector_p)
d.op1 = copy_rtx (d.op0);
else
- d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
+ {
+ d.op1 = op1;
+ if (d.op1 && !register_operand (d.op1, op_mode))
+ d.op1 = force_reg (op_mode, d.op1);
+ }
d.testing_p = !target;
if (!d.testing_p)
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 72dbafe5d9f..deb4e48d14f 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -6324,7 +6324,8 @@ expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
qimode, perm_dst);
if (!res)
return NULL_RTX;
- emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+ if (!rtx_equal_p (res, perm_dst))
+ emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
return dst;
}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
new file mode 100644
index 00000000000..29ed378eb1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
@@ -0,0 +1,83 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+sve" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_sve.h>
+
+/*
+** ror32_sve_lsl_imm:
+** ptrue (p[0-3]).b, all
+** revw z0.d, \1/m, z0.d
+** ret
+*/
+svuint64_t
+ror32_sve_lsl_imm (svuint64_t r)
+{
+ return svorr_u64_z (svptrue_b64 (), svlsl_n_u64_z (svptrue_b64 (), r, 32),
+ svlsr_n_u64_z (svptrue_b64 (), r, 32));
+}
+
+/*
+** ror32_sve_lsl_operand:
+** ptrue (p[0-3]).b, all
+** revw z0.d, \1/m, z0.d
+** ret
+*/
+svuint64_t
+ror32_sve_lsl_operand (svuint64_t r)
+{
+ svbool_t pt = svptrue_b64 ();
+ return svorr_u64_z (pt, svlsl_n_u64_z (pt, r, 32), svlsr_n_u64_z (pt, r, 32));
+}
+
+/*
+** ror16_sve_lsl_imm:
+** ptrue (p[0-3]).b, all
+** revh z0.s, \1/m, z0.s
+** ret
+*/
+svuint32_t
+ror16_sve_lsl_imm (svuint32_t r)
+{
+ return svorr_u32_z (svptrue_b32 (), svlsl_n_u32_z (svptrue_b32 (), r, 16),
+ svlsr_n_u32_z (svptrue_b32 (), r, 16));
+}
+
+/*
+** ror16_sve_lsl_operand:
+** ptrue (p[0-3]).b, all
+** revh z0.s, \1/m, z0.s
+** ret
+*/
+svuint32_t
+ror16_sve_lsl_operand (svuint32_t r)
+{
+ svbool_t pt = svptrue_b32 ();
+ return svorr_u32_z (pt, svlsl_n_u32_z (pt, r, 16), svlsr_n_u32_z (pt, r, 16));
+}
+
+/*
+** ror8_sve_lsl_imm:
+** ptrue (p[0-3]).b, all
+** revb z0.h, \1/m, z0.h
+** ret
+*/
+svuint16_t
+ror8_sve_lsl_imm (svuint16_t r)
+{
+ return svorr_u16_z (svptrue_b16 (), svlsl_n_u16_z (svptrue_b16 (), r, 8),
+ svlsr_n_u16_z (svptrue_b16 (), r, 8));
+}
+
+/*
+** ror8_sve_lsl_operand:
+** ptrue (p[0-3]).b, all
+** revb z0.h, \1/m, z0.h
+** ret
+*/
+svuint16_t
+ror8_sve_lsl_operand (svuint16_t r)
+{
+ svbool_t pt = svptrue_b16 ();
+ return svorr_u16_z (pt, svlsl_n_u16_z (pt, r, 8), svlsr_n_u16_z (pt, r, 8));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_2.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_2.c
new file mode 100644
index 00000000000..2d380b14582
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_2.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+sve" } */
+
+#include <arm_sve.h>
+
+#define PTRUE_B(BITWIDTH) svptrue_b##BITWIDTH ()
+
+#define ROR_SVE_LSL(NAME, INPUT_TYPE, SHIFT_AMOUNT, BITWIDTH)                 \
+  INPUT_TYPE                                                                  \
+  NAME##_imm (INPUT_TYPE r)                                                   \
+  {                                                                           \
+    return svorr_u##BITWIDTH##_z (PTRUE_B (BITWIDTH),                         \
+				  svlsl_n_u##BITWIDTH##_z (PTRUE_B (BITWIDTH), \
+							   r, SHIFT_AMOUNT),   \
+				  svlsr_n_u##BITWIDTH##_z (PTRUE_B (BITWIDTH), \
+							   r, SHIFT_AMOUNT));  \
+  }                                                                           \
+                                                                              \
+  INPUT_TYPE                                                                  \
+  NAME##_operand (INPUT_TYPE r)                                               \
+  {                                                                           \
+    svbool_t pt = PTRUE_B (BITWIDTH);                                         \
+    return svorr_u##BITWIDTH##_z (                                            \
+      pt, svlsl_n_u##BITWIDTH##_z (pt, r, SHIFT_AMOUNT),                      \
+      svlsr_n_u##BITWIDTH##_z (pt, r, SHIFT_AMOUNT));                         \
+  }
+
+/* Make sure that the pattern doesn't match incorrect bit-widths, eg. a shift of
+   8 matching the 32-bit mode. */
+
+ROR_SVE_LSL (higher_ror32, svuint64_t, 64, 64);
+ROR_SVE_LSL (higher_ror16, svuint32_t, 32, 32);
+ROR_SVE_LSL (higher_ror8, svuint16_t, 16, 16);
+
+ROR_SVE_LSL (lower_ror32, svuint64_t, 16, 64);
+ROR_SVE_LSL (lower_ror16, svuint32_t, 8, 32);
+ROR_SVE_LSL (lower_ror8, svuint16_t, 4, 16);
+
+/* Check off-by-one cases. */
+
+ROR_SVE_LSL (off_1_high_ror32, svuint64_t, 33, 64);
+ROR_SVE_LSL (off_1_high_ror16, svuint32_t, 17, 32);
+ROR_SVE_LSL (off_1_high_ror8, svuint16_t, 9, 16);
+
+ROR_SVE_LSL (off_1_low_ror32, svuint64_t, 31, 64);
+ROR_SVE_LSL (off_1_low_ror16, svuint32_t, 15, 32);
+ROR_SVE_LSL (off_1_low_ror8, svuint16_t, 7, 16);
+
+/* Check out of bounds cases. */
+
+ROR_SVE_LSL (oob_ror32, svuint64_t, 65, 64);
+ROR_SVE_LSL (oob_ror16, svuint32_t, 33, 32);
+ROR_SVE_LSL (oob_ror8, svuint16_t, 17, 16);
+
+/* Check zero case. */
+
+ROR_SVE_LSL (zero_ror32, svuint64_t, 0, 64);
+ROR_SVE_LSL (zero_ror16, svuint32_t, 0, 32);
+ROR_SVE_LSL (zero_ror8, svuint16_t, 0, 16);
+
+/* { dg-final { scan-assembler-times "\trevb\t" 0 } } */
+/* { dg-final { scan-assembler-times "\trevh\t" 0 } } */
+/* { dg-final { scan-assembler-times "\trevw\t" 0 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_3.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_3.c
new file mode 100644
index 00000000000..126766d0a80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_3.c
@@ -0,0 +1,83 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+sve+sve2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_sve.h>
+
+/*
+** lsl_usra_32_sve_lsl_imm:
+** lsl z0.d, z1.d, #34
+** usra z0.d, z1.d, #30
+** ret
+*/
+svuint64_t
+lsl_usra_32_sve_lsl_imm (svuint64_t __attribute__ ((unused)) dummy, svuint64_t r)
+{
+ return svorr_u64_z (svptrue_b64 (), svlsl_n_u64_z (svptrue_b64 (), r, 34),
+ svlsr_n_u64_z (svptrue_b64 (), r, 30));
+}
+
+/*
+** lsl_usra_32_sve_lsl_operand:
+** lsl z0.d, z1.d, #34
+** usra z0.d, z1.d, #30
+** ret
+*/
+svuint64_t
+lsl_usra_32_sve_lsl_operand (svuint64_t __attribute__ ((unused)) dummy, svuint64_t r)
+{
+ svbool_t pt = svptrue_b64 ();
+ return svorr_u64_z (pt, svlsl_n_u64_z (pt, r, 34), svlsr_n_u64_z (pt, r, 30));
+}
+
+/*
+** lsl_usra_16_sve_lsl_imm:
+** lsl z0.s, z1.s, #14
+** usra z0.s, z1.s, #18
+** ret
+*/
+svuint32_t
+lsl_usra_16_sve_lsl_imm (svuint32_t __attribute__ ((unused)) dummy, svuint32_t r)
+{
+ return svorr_u32_z (svptrue_b32 (), svlsl_n_u32_z (svptrue_b32 (), r, 14),
+ svlsr_n_u32_z (svptrue_b32 (), r, 18));
+}
+
+/*
+** lsl_usra_16_sve_lsl_operand:
+** lsl z0.s, z1.s, #14
+** usra z0.s, z1.s, #18
+** ret
+*/
+svuint32_t
+lsl_usra_16_sve_lsl_operand (svuint32_t __attribute__ ((unused)) dummy, svuint32_t r)
+{
+ svbool_t pt = svptrue_b32 ();
+ return svorr_u32_z (pt, svlsl_n_u32_z (pt, r, 14), svlsr_n_u32_z (pt, r, 18));
+}
+
+/*
+** lsl_usra_8_sve_lsl_imm:
+** lsl z0.h, z1.h, #6
+** usra z0.h, z1.h, #10
+** ret
+*/
+svuint16_t
+lsl_usra_8_sve_lsl_imm (svuint16_t __attribute__ ((unused)) dummy, svuint16_t r)
+{
+ return svorr_u16_z (svptrue_b16 (), svlsl_n_u16_z (svptrue_b16 (), r, 6),
+ svlsr_n_u16_z (svptrue_b16 (), r, 10));
+}
+
+/*
+** lsl_usra_8_sve_lsl_operand:
+** lsl z0.h, z1.h, #6
+** usra z0.h, z1.h, #10
+** ret
+*/
+svuint16_t
+lsl_usra_8_sve_lsl_operand (svuint16_t __attribute__ ((unused)) dummy, svuint16_t r)
+{
+ svbool_t pt = svptrue_b16 ();
+ return svorr_u16_z (pt, svlsl_n_u16_z (pt, r, 6), svlsr_n_u16_z (pt, r, 10));
+}
--
2.44.0
--
Regards,
Dhruv