https://gcc.gnu.org/g:dc501cb0dc857663f7fa762f3dbf0ae60973d2c3
commit r16-702-gdc501cb0dc857663f7fa762f3dbf0ae60973d2c3
Author: Pengxuan Zheng <quic_pzh...@quicinc.com>
Date:   Wed May 7 10:47:37 2025 -0700

    aarch64: Recognize vector permute patterns which can be interpreted as AND [PR100165]

    Certain permutes that blend a vector with zero can be interpreted as an
    AND with a mask.  This idea was suggested by Richard Sandiford when he
    was reviewing my patch which tries to optimize certain vector permutes
    with the FMOV instruction for the aarch64 target.

    For example, for the aarch64 target, at present:

    v4hi
    f_v4hi (v4hi x)
    {
      return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
    }

    generates:

    f_v4hi:
            uzp1    v0.2d, v0.2d, v0.2d
            adrp    x0, .LC0
            ldr     d31, [x0, #:lo12:.LC0]
            tbl     v0.8b, {v0.16b}, v31.8b
            ret
    .LC0:
            .byte   -1
            .byte   -1
            .byte   2
            .byte   3
            .byte   -1
            .byte   -1
            .byte   6
            .byte   7

    With this patch, it generates:

    f_v4hi:
            mvni    v31.2s, 0xff, msl 8
            and     v0.8b, v0.8b, v31.8b
            ret

    This patch also provides a target-independent routine for detecting
    vector permute patterns which can be interpreted as AND.

    Changes since v1:
    * v2: Rework the patch to only perform the optimization for aarch64 by
      calling the target-independent routine vec_perm_and_mask.

            PR target/100165

    gcc/ChangeLog:

            * config/aarch64/aarch64.cc (aarch64_evpc_and): New.
            (aarch64_expand_vec_perm_const_1): Call aarch64_evpc_and.
            * optabs.cc (vec_perm_and_mask): New.
            * optabs.h (vec_perm_and_mask): New prototype.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/and-be.c: New test.
            * gcc.target/aarch64/and-le.c: New test.

    Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
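As a standalone illustration (not part of the patch), the equivalence the
patch exploits can be checked directly in GNU C: every lane the selector
takes from the zero operand corresponds to an all-zeros mask element, and
every lane kept in place from x corresponds to an all-ones mask element.
The demo below reuses the f_v4hi selector from the commit message.

/* Standalone demo, not part of the patch: a permute blending X with a zero
   vector equals an AND of X with a 0xffff/0x0000 lane mask.  */
#include <assert.h>

typedef short v4hi __attribute__ ((vector_size (8)));

int
main (void)
{
  v4hi x = { 1, 2, 3, 4 };
  /* Selector indices 0-3 pick lanes of X; 4-7 pick lanes of the zero
     vector.  { 4, 1, 6, 3 } therefore zeros lanes 0 and 2.  */
  v4hi perm = __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 },
                                 (v4hi){ 4, 1, 6, 3 });
  v4hi mask = { 0, -1, 0, -1 };
  v4hi anded = x & mask;
  for (int i = 0; i < 4; i++)
    assert (perm[i] == anded[i]);
  return 0;
}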
Diff:
---
 gcc/config/aarch64/aarch64.cc             |  36 +++++++++
 gcc/optabs.cc                             |  44 +++++++++++
 gcc/optabs.h                              |   4 +
 gcc/testsuite/gcc.target/aarch64/and-be.c | 123 ++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/and-le.c | 123 ++++++++++++++++++++++++++++++
 5 files changed, 330 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f5552e4b86ce..34f9725485d2 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26886,6 +26886,40 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns suitable for the AND instructions.  */
+static bool
+aarch64_evpc_and (struct expand_vec_perm_d *d)
+{
+  /* Either d->op0 or d->op1 should be a vector of all zeros.  */
+  if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
+    return false;
+
+  machine_mode mode = d->vmode;
+  machine_mode sel_mode;
+  if (!related_int_vector_mode (mode).exists (&sel_mode))
+    return false;
+
+  insn_code and_code = optab_handler (and_optab, sel_mode);
+  rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p);
+  if (and_code == CODE_FOR_nothing || !and_mask)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  class expand_operand ops[3];
+  rtx in = d->zero_op0_p ? d->op1 : d->op0;
+  create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target), sel_mode);
+  create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode);
+  create_input_operand (&ops[2], and_mask, sel_mode);
+  expand_insn (and_code, 3, ops);
+  rtx result = gen_lowpart (mode, ops[0].value);
+  if (!rtx_equal_p (d->target, result))
+    emit_move_insn (d->target, result);
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -26921,6 +26955,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	return true;
       else if (aarch64_evpc_uzp (d))
 	return true;
+      else if (aarch64_evpc_and (d))
+	return true;
      else if (aarch64_evpc_trn (d))
 	return true;
      else if (aarch64_evpc_sel (d))
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 92d6d50d55a0..5c9450f61450 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -6362,6 +6362,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target,
   return NULL_RTX;
 }
 
+/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
+   the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
+   assuming the other vec_perm operand is a constant vector of zeros.  Return
+   the mask for the equivalent and operation, or NULL_RTX if the vec_perm can
+   not be modeled as an and.  MODE is the mode of the value being anded.
+   ZERO_OP0_P is true if the first operand of the vec_perm is a constant vector
+   of zeros or false if the second operand of the vec_perm is a constant vector
+   of zeros.  */
+rtx
+vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
+		   bool zero_op0_p)
+{
+  unsigned int nelt;
+  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
+    return NULL_RTX;
+
+  rtx_vector_builder builder (mode, nelt, 1);
+  machine_mode emode = GET_MODE_INNER (mode);
+
+  for (unsigned int i = 0; i < nelt; i++)
+    {
+      if (zero_op0_p)
+	{
+	  if (known_eq (sel[i], nelt + i))
+	    builder.quick_push (CONSTM1_RTX (emode));
+	  else if (known_lt (sel[i], nelt))
+	    builder.quick_push (CONST0_RTX (emode));
+	  else
+	    return NULL_RTX;
+	}
+      else
+	{
+	  if (known_eq (sel[i], i))
+	    builder.quick_push (CONSTM1_RTX (emode));
+	  else if (known_ge (sel[i], nelt))
+	    builder.quick_push (CONST0_RTX (emode));
+	  else
+	    return NULL_RTX;
+	}
+    }
+
+  return builder.build ();
+}
+
 /* Implement a permutation of vectors v0 and v1 using the permutation
    vector in SEL and return the result.  Use TARGET to hold the result if
    nonnull and convenient.
diff --git a/gcc/optabs.h b/gcc/optabs.h
index ae525c848d32..a8b0e93d60bc 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -334,6 +334,10 @@ extern bool have_insn_for (enum rtx_code, machine_mode);
 /* Generate a conditional trap instruction.  */
 extern rtx_insn *gen_cond_trap (enum rtx_code, rtx, rtx, rtx);
 
+/* Check whether the vec_perm can be interpreted as an and operation.  */
+extern rtx vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
+			      bool zero_op0_p);
+
 /* Generate code for VEC_PERM_EXPR.  */
 extern rtx expand_vec_perm_var (machine_mode, rtx, rtx, rtx, rtx);
 extern rtx expand_vec_perm_const (machine_mode, rtx, rtx,
diff --git a/gcc/testsuite/gcc.target/aarch64/and-be.c b/gcc/testsuite/gcc.target/aarch64/and-be.c
new file mode 100644
index 000000000000..7457dd5e35cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/and-be.c
@@ -0,0 +1,123 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+**	movi	v([0-9]+).2s, 0xff, msl 8
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+**	mvni	v([0-9]+).2s, 0xff, msl 8
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+**	...
+**	and	v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+**	ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff00000000
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+**	movi	v([0-9]+).2d, 0xffffffff00000000
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+**	movi	d([0-9]+), 0xff00ff00ff000000
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+**	...
+**	and	v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+**	ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return __builtin_shuffle (
+      x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/and-le.c b/gcc/testsuite/gcc.target/aarch64/and-le.c
new file mode 100644
index 000000000000..398813bd7148
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/and-le.c
@@ -0,0 +1,123 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+**	mvni	v([0-9]+).2s, 0xff, msl 8
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+**	movi	v([0-9]+).2s, 0xff, msl 8
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+**	...
+**	and	v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+**	ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff00000000
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+**	movi	v([0-9]+).2d, 0xffffffff00000000
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+**	movi	v([0-9]+).2d, 0xffffffff
+**	and	v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+**	ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+**	movi	d([0-9]+), 0xff00ff00ff
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+			    (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+**	...
+**	and	v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+**	ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return __builtin_shuffle (
+      x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}
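For reference, the selector classification done by vec_perm_and_mask boils
down to a simple rule: a lane maps to an all-ones mask element only if it
comes from the *same position* of the nonzero operand (an index into the
nonzero operand at a different position would be a genuine shuffle), and to
an all-zeros mask element if it comes from anywhere in the zero operand;
anything else makes the routine give up.  Below is an illustrative plain-C
sketch of that rule, not part of the patch; classify_sel and the int arrays
are hypothetical stand-ins for the vec_perm_indices/rtx machinery.

#include <stdio.h>

/* Sketch of vec_perm_and_mask's lane classification.  SEL is the permute
   selector for two NELT-lane operands; ZERO_OP0_P says which operand is the
   zero vector.  Fills MASK with -1 (keep lane) or 0 (zero lane) and returns
   1 if the permute is AND-equivalent, 0 otherwise.  */
static int
classify_sel (const int *sel, int nelt, int zero_op0_p, int *mask)
{
  for (int i = 0; i < nelt; i++)
    {
      if (zero_op0_p)
	{
	  if (sel[i] == nelt + i)	/* Same lane of the nonzero op1.  */
	    mask[i] = -1;
	  else if (sel[i] < nelt)	/* Any lane of the zero op0.  */
	    mask[i] = 0;
	  else				/* A real shuffle: give up.  */
	    return 0;
	}
      else
	{
	  if (sel[i] == i)		/* Same lane of the nonzero op0.  */
	    mask[i] = -1;
	  else if (sel[i] >= nelt)	/* Any lane of the zero op1.  */
	    mask[i] = 0;
	  else
	    return 0;
	}
    }
  return 1;
}

int
main (void)
{
  /* The f_v4hi selector from the tests; op1 is the zero vector.  */
  int sel[4] = { 4, 1, 6, 3 }, mask[4];
  if (classify_sel (sel, 4, /*zero_op0_p=*/0, mask))
    printf ("mask: %d %d %d %d\n", mask[0], mask[1], mask[2], mask[3]);
  return 0;
}

On the f_v4hi selector this prints "mask: 0 -1 0 -1", which is exactly the
halfword mask the patch materializes with mvni v31.2s, 0xff, msl 8 on
little-endian.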