https://gcc.gnu.org/g:6069f02a486054484ad638b083cb3b9486bb4321

commit r15-6609-g6069f02a486054484ad638b083cb3b9486bb4321
Author: Akram Ahmad <akram.ah...@arm.com>
Date:   Mon Jan 6 20:09:30 2025 +0000

    aarch64: remove extra XTN in vector concatenation
    
    GIMPLE code which performs a narrowing truncation on the result of a
    vector concatenation currently results in an unnecessary XTN being
    emitted after the UZP1 which concatenates the operands. In such cases
    UZP1 should instead use a smaller arrangement specifier, making the
    XTN instruction redundant. This is seen in GIMPLE such as the
    following example:
    
            int32x2_t foo (svint64_t a, svint64_t b)
            {
              vector(2) int vect__2.8;
              long int _1;
              long int _3;
              vector(2) long int _12;
    
              <bb 2> [local count: 1073741824]:
              _1 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, a_6(D));
              _3 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, b_7(D));
              _12 = {_1, _3};
              vect__2.8_13 = (vector(2) int) _12;
              return vect__2.8_13;
    
            }
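
    The C source that gives rise to this GIMPLE is along the lines of f3
    in the new test added below (the function name and the exact choice of
    vset_lane intrinsics here are illustrative; svptrue_b64 corresponds to
    the { -1, 0, ... } predicate constant above):

            int32x2_t foo (svint64_t a, svint64_t b)
            {
              int32x2_t ab = vdup_n_s32 (0);
              ab = vset_lane_s32 ((int) svaddv_s64 (svptrue_b64 (), a), ab, 0);
              ab = vset_lane_s32 ((int) svaddv_s64 (svptrue_b64 (), b), ab, 1);
              return ab;
            }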
    
    Original assembly generated:
    
            foo:
                    ptrue   p3.b, all
                    uaddv   d0, p3, z0.d
                    uaddv   d1, p3, z1.d
                    uzp1    v0.2d, v0.2d, v1.2d
                    xtn     v0.2s, v0.2d
                    ret
    
    This patch therefore defines the *aarch64_trunc_concat<mode> insn,
    which truncates the result of the concatenation rather than
    concatenating the already-truncated operands (as is done in
    *aarch64_narrow_trunc<mode>), resulting in the following optimised
    assembly being emitted:
    
            foo:
                    ptrue   p3.b, all
                    uaddv   d0, p3, z0.d
                    uaddv   d1, p3, z1.d
                    uzp1    v0.2s, v0.2s, v1.2s
                    ret
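
    The same transformation applies to pure Advanced SIMD code. A minimal
    reproducer along the lines of f2 in the new test (the function name
    here is illustrative) is:

            int16x4_t narrow_concat (int32x2_t a, int32x2_t b)
            {
              int32x4_t ab = vcombine_s32 (a, b);  /* vec_concat  */
              return vmovn_s32 (ab);               /* truncate    */
            }

    which now emits a single uzp1 with the 4h arrangement instead of a
    128-bit uzp1 followed by xtn.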
    
    This patch passes all regression tests on aarch64 with no new failures.
    A supporting test for this optimisation is included and passes.
    
    OK for master? I do not have commit rights so I cannot push the patch
    myself.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-simd.md (*aarch64_trunc_concat<mode>):
            New insn definition.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/truncated_concatenation_1.c: New test
            for the above example and other modes covered by the new insn
            definition.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md                 | 16 +++++++++++
 .../aarch64/sve/truncated_concatenation_1.c        | 32 ++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f43f0f6af0df..eeb626f129a8 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1874,6 +1874,22 @@
   [(set_attr "type" "neon_permute<q>")]
 )
 
+(define_insn "*aarch64_trunc_concat<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+       (truncate:<VNARROWQ>
+         (vec_concat:VQN
+           (match_operand:<VHALF> 1 "register_operand" "w")
+           (match_operand:<VHALF> 2 "register_operand" "w"))))]
+  "TARGET_SIMD"
+{
+  if (!BYTES_BIG_ENDIAN)
+    return "uzp1\\t%0.<Vntype>, %1.<Vntype>, %2.<Vntype>";
+  else
+    return "uzp1\\t%0.<Vntype>, %2.<Vntype>, %1.<Vntype>";
+}
+  [(set_attr "type" "neon_permute<q>")]
+)
+
 ;; Packing doubles.
 
 (define_expand "vec_pack_trunc_<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
new file mode 100644
index 000000000000..95577a1a9efb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -Wall -march=armv8.2-a+sve" } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+int8x8_t f1 (int16x4_t a, int16x4_t b) {
+    int8x8_t ab = vdup_n_s8 (0);
+    int16x8_t ab_concat = vcombine_s16 (a, b);
+    ab = vmovn_s16 (ab_concat);
+    return ab;
+}
+
+int16x4_t f2 (int32x2_t a, int32x2_t b) {
+    int16x4_t ab = vdup_n_s16 (0);
+    int32x4_t ab_concat = vcombine_s32 (a, b);
+    ab = vmovn_s32 (ab_concat);
+    return ab;
+}
+
+int32x2_t f3 (svint64_t a, svint64_t b) {
+    int32x2_t ab = vdup_n_s32 (0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), a), ab, 0);
+    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), b), ab, 1);
+    return ab;
+}
+
+/* { dg-final { scan-assembler-not {\txtn\t} } }*/
+/* { dg-final { scan-assembler-not {\tfcvtn\t} } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.8b, v[0-9]+\.8b, v[0-9]+\.8b} 1 } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.4h, v[0-9]+\.4h, v[0-9]+\.4h} 1 } }*/
+/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.2s, v[0-9]+\.2s, v[0-9]+\.2s} 1 } }*/
\ No newline at end of file
