Hi,

Attached is the updated patch for the aarch64 conversion of some 
mvn+shrn patterns into mvni+subhn. Hopefully this attachment fixes the 
tab issues; the cover letter was updated to better explain what the 
patch does, the code was changed to use emit_move_insn, and the 
testcase was cleaned up per Richard's and Alex's suggestions.

Sorry for the delay in addressing all the suggestions, but I was 
traveling for the past two weeks.

Remi
From 102b7358be9d030f9f518c2accd329d14fe545a3 Mon Sep 17 00:00:00 2001
From: Remi Machet <rmac...@nvidia.com>
Date: Fri, 13 Jun 2025 18:44:53 +0000
Subject: [PATCH v3] AArch64 SIMD: convert mvn+shrn into mvni+subhn

Add an optimization to aarch64 SIMD that converts mvn+shrn into mvni+subhn
when possible. Because the mvni loads a constant, the constant can be hoisted
out of a loop, which allows for better optimization when the code is inside a
loop.

The conversion is based on the fact that for an unsigned integer:
  -x = ~x + 1 => ~x = -1 - x
thus '(u8)(~x >> imm)' is equivalent to '(u8)(((u16)-1 - x) >> imm)'.
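
As a quick sanity check of this identity (illustrative only, not part of the
patch, and the function name is made up), the equivalence can be verified
exhaustively for 16-bit values:

#include <assert.h>
#include <stdint.h>

void check_identity(void) {
  for (uint32_t x = 0; x <= UINT16_MAX; x++) {
    uint16_t nx = ~(uint16_t)x;               /* ~x */
    uint16_t sx = (uint16_t)-1 - (uint16_t)x; /* (u16)-1 - x */
    assert((uint8_t)(nx >> 8) == (uint8_t)(sx >> 8));
  }
}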

For the following function:
uint8x8_t neg_narrow_v8hi(uint16x8_t a) {
  uint16x8_t b = vmvnq_u16(a);
  return vshrn_n_u16(b, 8);
}

Without this patch the assembly looks like:
        not     v0.16b, v0.16b
        shrn    v0.8b, v0.8h, 8

After the patch it becomes:
        mvni    v31.4s, 0
        subhn   v0.8b, v31.8h, v0.8h
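
The gain inside a loop comes from the fact that the all-ones vector consumed
by subhn is a loop-invariant constant, so the mvni can be hoisted out of the
loop, whereas the not in the original sequence has to execute on every
iteration. A hypothetical loop (not part of the testsuite) where this applies:

#include <arm_neon.h>

void narrow_not_loop(const uint16_t *src, uint8_t *dst, int n) {
  for (int i = 0; i < n; i += 8) {
    uint16x8_t a = vld1q_u16(src + i);
    uint16x8_t b = vmvnq_u16(a);          /* mvn ... */
    vst1_u8(dst + i, vshrn_n_u16(b, 8));  /* ... and shrn combine into subhn */
  }
}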

Bootstrapped and regtested on aarch64-linux-gnu.

Signed-off-by: Remi Machet <rmac...@nvidia.com>

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (*shrn_to_subhn_<mode>): Add pattern
        converting mvn+shrn into mvni+subhn.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/shrn2subhn.c: New test.
---
 gcc/config/aarch64/aarch64-simd.md            | 30 ++++++++++++++++
 .../gcc.target/aarch64/simd/shrn2subhn.c      | 36 +++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6e30dc48934..29796f7cf1b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5028,6 +5028,36 @@
   DONE;
 })
 
+;; convert (truncate)(~x >> imm) into (truncate)(((u16)-1 - x) >> imm)
+;; because it will result in the 'not' being replaced with a constant load,
+;; which allows for better loop optimization.
+;; We limit this to truncations that take the upper half and shift it to the
+;; lower half as we use subhn (patterns that would have generated an shrn
+;; otherwise).
+;; On some implementations the use of subhn also results in better throughput.
+(define_insn_and_split "*shrn_to_subhn_<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=&w")
+       (truncate:<VNARROWQ>
+         (lshiftrt:VQN
+           (not:VQN (match_operand:VQN 1 "register_operand" "w"))
+           (match_operand:VQN 2 "aarch64_simd_shift_imm_vec_exact_top"))))]
+  "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(const_int 0)]
+{
+  rtx tmp;
+  if (can_create_pseudo_p ())
+    tmp = gen_reg_rtx (<MODE>mode);
+  else
+    tmp = gen_rtx_REG (<MODE>mode, REGNO (operands[0]));
+  emit_move_insn (tmp, CONSTM1_RTX (<MODE>mode));
+  emit_insn (gen_aarch64_subhn<mode>_insn (operands[0], tmp,
+                                          operands[1], operands[2]));
+  DONE;
+})
+
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c b/gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c
new file mode 100644
index 00000000000..f90ea134f09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c
@@ -0,0 +1,36 @@
+/* This test case checks that replacing a not+shift with a subtract from -1 works.  */
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { scan-assembler-times "\\tsubhn\\t" 6 } } */
+
+#include <arm_neon.h>
+
+uint8x8_t neg_narrow_v8hi(uint16x8_t a) {
+  uint16x8_t b = vmvnq_u16(a);
+  return vshrn_n_u16(b, 8);
+}
+
+uint8x8_t neg_narrow_vsubhn_v8hi(uint16x8_t a) {
+  uint16x8_t ones = vdupq_n_u16(0xffff);
+  return vsubhn_u16(ones, a);
+}
+
+uint16x4_t neg_narrow_v4si(uint32x4_t a) {
+  uint32x4_t b = vmvnq_u32(a);
+  return vshrn_n_u32(b, 16);
+}
+
+uint16x4_t neg_narrow_vsubhn_v4si(uint32x4_t a) {
+  uint32x4_t ones = vdupq_n_u32(0xffffffff);
+  return vsubhn_u32(ones, a);
+}
+
+uint32x2_t neg_narrow_v2di(uint64x2_t a) {
+  uint64x2_t b = ~a;
+  return vshrn_n_u64(b, 32);
+}
+
+uint32x2_t neg_narrow_vsubhn_v2di(uint64x2_t a) {
+  uint64x2_t ones = vdupq_n_u64(0xffffffffffffffff);
+  return vsubhn_u64(ones, a);
+}
-- 
2.34.1
