From 4d0c48c79d24d27724111906bcfb0b709041fa08 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <ktkachov@nvidia.com>
Date: Mon, 5 Aug 2024 11:29:44 -0700
Subject: [PATCH 2/2] aarch64: Emit ADD X, Y, Y instead of SHL X, Y, #1 for
 Advanced SIMD

On many cores, including Neoverse V2, the throughput of vector ADD
instructions is higher than vector shifts like SHL.  We can lean on that
to emit code like:
  add     v0.4s, v0.4s, v0.4s
instead of:
  shl     v0.4s, v0.4s, 1

LLVM already does this trick.
In RTL the code gets canonicalised from (plus x x) to (ashift x 1), so I
opted to do this at the final assembly printing stage instead, similar
to how we emit CMLT instead of SSHR elsewhere in the backend.
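As a minimal C illustration (not part of the patch itself; it mirrors the
new test), both forms below reach the same (ashift x 1) RTL, so with this
patch they emit ADD rather than SHL at -O1 and above:
  typedef int __attribute__ ((vector_size (16))) v4si;
  v4si shl_one (v4si x) { return x << 1; }  /* add v0.4s, v0.4s, v0.4s */
  v4si dbl (v4si x)     { return x + x; }   /* add v0.4s, v0.4s, v0.4s */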

I'd like to also do this for SVE shifts, but those will have to be
separate patches.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md
	(aarch64_simd_imm_shl<mode><vczle><vczbe>): Rewrite to new
	syntax.  Add =w,w,vs1 alternative.
	* config/aarch64/constraints.md (vs1): New constraint.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/advsimd_shl_add.c: New test.
---
 gcc/config/aarch64/aarch64-simd.md            | 12 ++--
 gcc/config/aarch64/constraints.md             |  6 ++
 .../gcc.target/aarch64/advsimd_shl_add.c      | 64 +++++++++++++++++++
 3 files changed, 77 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 816f499e963..dff00b68a06 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1352,12 +1352,14 @@ (define_expand "aarch64_<sra_op>rsra_n<mode>"
 )
 
 (define_insn "aarch64_simd_imm_shl<mode><vczle><vczbe>"
- [(set (match_operand:VDQ_I 0 "register_operand" "=w")
-       (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
-		   (match_operand:VDQ_I  2 "aarch64_simd_lshift_imm" "Dl")))]
+ [(set (match_operand:VDQ_I 0 "register_operand")
+       (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand")
+		   (match_operand:VDQ_I  2 "aarch64_simd_lshift_imm")))]
  "TARGET_SIMD"
-  "shl\t%0.<Vtype>, %1.<Vtype>, %2"
-  [(set_attr "type" "neon_shift_imm<q>")]
+  {@ [ cons: =0, 1,  2   ; attrs: type       ]
+     [ w       , w,  vs1 ; neon_add<q>       ] add\t%0.<Vtype>, %1.<Vtype>, %1.<Vtype>
+     [ w       , w,  Dl  ; neon_shift_imm<q> ] shl\t%0.<Vtype>, %1.<Vtype>, %2
+  }
 )
 
 (define_insn "aarch64_simd_reg_sshl<mode><vczle><vczbe>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index a2878f580d9..f491e4bd6a0 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -667,6 +667,12 @@ (define_constraint "vsm"
    SMAX and SMIN operations."
  (match_operand 0 "aarch64_sve_vsm_immediate"))
 
+(define_constraint "vs1"
+  "@internal
+ A constraint that matches a vector of immediate one."
+ (and (match_code "const,const_vector")
+      (match_test "op == CONST1_RTX (GET_MODE (op))")))
+
 (define_constraint "vsA"
   "@internal
    A constraint that matches an immediate operand valid for SVE FADD
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c b/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c
new file mode 100644
index 00000000000..a161f89a3ac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-additional-options "--save-temps -O1" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef __INT64_TYPE__ __attribute__ ((vector_size (16))) v2di;
+typedef int __attribute__ ((vector_size (16))) v4si;
+typedef short __attribute__ ((vector_size (16))) v8hi;
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef short __attribute__ ((vector_size (8))) v4hi;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+
+#define FUNC(S) \
+S               \
+foo_##S (S a)   \
+{ return a << 1; }
+
+/*
+** foo_v2di:
+**      add	v0.2d, v0.2d, v0.2d
+**      ret
+*/
+
+FUNC (v2di)
+
+/*
+** foo_v4si:
+**      add	v0.4s, v0.4s, v0.4s
+**      ret
+*/
+
+FUNC (v4si)
+
+/*
+** foo_v8hi:
+**      add	v0.8h, v0.8h, v0.8h
+**      ret
+*/
+
+FUNC (v8hi)
+
+/*
+** foo_v16qi:
+**      add	v0.16b, v0.16b, v0.16b
+**      ret
+*/
+
+FUNC (v16qi)
+
+/*
+** foo_v4hi:
+**      add	v0.4h, v0.4h, v0.4h
+**      ret
+*/
+
+FUNC (v4hi)
+
+/*
+** foo_v8qi:
+**      add	v0.8b, v0.8b, v0.8b
+**      ret
+*/
+
+FUNC (v8qi)
+
-- 
2.43.2

