[gcc r16-1298] Also handle avx512 kmask & immediate 15 or 3 when VF is 4/2.

hongtao Liu via Gcc-cvs Sun, 08 Jun 2025 19:22:00 -0700

https://gcc.gnu.org/g:cdfa5fe03512f7ac5a293480f634df68fc973060


commit r16-1298-gcdfa5fe03512f7ac5a293480f634df68fc973060
Author: liuhongt <hongtao....@intel.com>
Date:   Tue Jun 3 14:12:23 2025 +0800

    Also handle avx512 kmask & immediate 15 or 3 when VF is 4/2.
    
    Like r16-105-g599bca27dc37b3, the patch handles redundant clean up of
    upper bits for maskload,
    i.e.
    Successfully matched this instruction:
    (set (reg:V4DF 175)
        (vec_merge:V4DF (unspec:V4DF [
                    (mem:V4DF (plus:DI (reg/v/f:DI 155 [ b ])
                            (reg:DI 143 [ ivtmp.56 ])) [1  S32 A64])
                ] UNSPEC_MASKLOAD)
            (const_vector:V4DF [
                    (const_double:DF 0.0 [0x0.0p+0]) repeated x4
                ])
            (and:QI (reg:QI 125 [ mask__29.16 ])
                (const_int 15 [0xf]))))
    
    For maskstore, it looks like it's already optimal (at least I can't make a
    testcase).
    So the patch only handles maskload.
    
    gcc/ChangeLog:
    
            PR target/103750
            * config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for
            maskload.
            * config/i386/sse.md (*<avx512>_load<mode>mask_and15): New
            define_insn_and_split.
            (*<avx512>_load<mode>mask_and3): Ditto.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/avx512f-pr103750-3.c: New test.

Diff:
---
 gcc/config/i386/i386.cc                            | 12 ++++++-
 gcc/config/i386/sse.md                             | 38 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/avx512f-pr103750-3.c | 26 +++++++++++++++
 3 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d48654a729a1..20ee360dcb0f 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22938,7 +22938,17 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
        }
       /* This is masked instruction, assume the same cost,
         as nonmasked variant.  */
-      else if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+      else if (TARGET_AVX512F
+              && (register_operand (mask, GET_MODE (mask))
+                  /* Redunduant clean up of high bits for kmask with VL=2/4
+                     .i.e (vec_merge op0, op1, (and op3 15)).  */
+                  || (GET_CODE (mask) == AND
+                      && register_operand (XEXP (mask, 0), GET_MODE (mask))
+                      && CONST_INT_P (XEXP (mask, 1))
+                      && ((INTVAL (XEXP (mask, 1)) == 3
+                           && GET_MODE_NUNITS (mode) == 2)
+                          || (INTVAL (XEXP (mask, 1)) == 15
+                              && GET_MODE_NUNITS (mode) == 4)))))
        {
          *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
                   + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c40b0fd49978..252ba0796065 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1589,6 +1589,44 @@
   "&& 1"
   [(set (match_dup 0) (match_dup 1))])
 
+(define_insn_and_split "*<avx512>_load<mode>mask_and15"
+  [(set (match_operand:V48_AVX512VL_4 0 "register_operand" "=v")
+       (vec_merge:V48_AVX512VL_4
+        (unspec:V48_AVX512VL_4
+         [(match_operand:V48_AVX512VL_4 1 "memory_operand" "m")]
+         UNSPEC_MASKLOAD)
+        (match_operand:V48_AVX512VL_4 2 "nonimm_or_0_operand" "0C")
+        (and:QI
+         (match_operand:QI 3 "register_operand" "Yk")
+         (const_int 15))))]
+  "TARGET_AVX512F"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (vec_merge:V48_AVX512VL_4
+        (unspec:V48_AVX512VL_4 [(match_dup 1)] UNSPEC_MASKLOAD)
+        (match_dup 2)
+        (match_dup 3)))])
+
+(define_insn_and_split "*<avx512>_load<mode>mask_and3"
+  [(set (match_operand:V8_AVX512VL_2 0 "register_operand" "=v")
+       (vec_merge:V8_AVX512VL_2
+        (unspec:V8_AVX512VL_2
+         [(match_operand:V8_AVX512VL_2 1 "memory_operand" "m")]
+         UNSPEC_MASKLOAD)
+        (match_operand:V8_AVX512VL_2 2 "nonimm_or_0_operand" "0C")
+        (and:QI
+         (match_operand:QI 3 "register_operand" "Yk")
+         (const_int 3))))]
+  "TARGET_AVX512F"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+       (vec_merge:V8_AVX512VL_2
+        (unspec:V8_AVX512VL_2 [(match_dup 1)] UNSPEC_MASKLOAD)
+        (match_dup 2)
+        (match_dup 3)))])
+
 (define_expand "<avx512>_load<mode>_mask"
   [(set (match_operand:VI12_AVX512VL 0 "register_operand")
        (vec_merge:VI12_AVX512VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr103750-3.c b/gcc/testsuite/gcc.target/i386/avx512f-pr103750-3.c
new file mode 100644
index 000000000000..9965e633b201
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr103750-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -mprefer-vector-width=256 -Ofast" } */
+/* { dg-final { scan-assembler-not "kmov" } } */
+
+void
+foo (double* a, double* __restrict b, double* c, double* d, int n)
+{
+  for (int i = 0; i != n; i++)
+    {
+      double tmp = 0.0;
+      if (c[i] > d[i])
+       tmp = b[i];
+      a[i] = tmp;
+    }
+}
+
+void
+foo1 (double* a, double* __restrict b, double* c, double* d, int n)
+{
+  for (int i = 0; i != n; i++)
+    {
+      double tmp = 0.0;
+      if (c[i] > d[i])
+       a[i] = b[i];
+    }
+}

[gcc r16-1298] Also handle avx512 kmask & immediate 15 or 3 when VF is 4/2.

Reply via email to