https://gcc.gnu.org/g:20d184e3f84d859e7e9f44a8d91772a02b658872

commit r16-367-g20d184e3f84d859e7e9f44a8d91772a02b658872
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Sat May 3 00:26:29 2025 +0200

    Improve ix86 VEC_MERGE costs
    
    ix86_rtx_costs handles VEC_MERGE by special casing AVX512 mask operations and 
otherwise
    returning cost->sse_op, completely ignoring costs of the operands.  Since
    VEC_MERGE is also used to represent scalar variant of SSE/AVX operation, 
this
    means that many instructions (such as SSE conversions) are often costed as
    sse_op instead of their real cost.
    
    This patch adds pattern matching for the VEC_MERGE pattern which also 
forced me
    to add special cases for masked versions and vcmp otherwise combine is 
confused
    by the default cost compared to the cost of the recognized version of the
    instruction.
    
    Since now the important cases should be handled, I also added recursion to 
the
    remaining cases so substituting constants and memory is adequately costed.
    
    gcc/ChangeLog:
    
            * config/i386/i386.cc (unspec_pcmp_p): New function.
            (ix86_rtx_costs): Cost VEC_MERGE more realistically.

Diff:
---
 gcc/config/i386/i386.cc | 82 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 77 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0c808c22b4f0..5ad47e194348 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22025,6 +22025,15 @@ vec_fp_conversion_cost (const struct processor_costs 
*cost, int size)
     return cost->vcvtps2pd512;
 }
 
+/* Return true if X is UNSPEC with UNSPEC_PCMP or UNSPEC_UNSIGNED_PCMP.  */
+
+static bool
+unspec_pcmp_p (rtx x)
+{
+  return GET_CODE (x) == UNSPEC
+        && (XINT (x, 1) == UNSPEC_PCMP || XINT (x, 1) == UNSPEC_UNSIGNED_PCMP);
+}
+
 /* Compute a (partial) cost for rtx X.  Return true if the complete
    cost has been computed, and false if subexpressions should be
    scanned.  In either case, *TOTAL contains the cost result.  */
@@ -22807,14 +22816,77 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 
     case VEC_MERGE:
       mask = XEXP (x, 2);
+      /* Scalar versions of SSE instructions may be represented as:
+
+        (vec_merge (vec_duplicate (operation ....))
+                    (register or memory)
+                    (const_int 1))
+
+        In this case vec_merge and vec_duplicate are free.
+        Just recurse into operation and second operand.  */
+      if (mask == const1_rtx
+         && GET_CODE (XEXP (x, 0)) == VEC_DUPLICATE)
+       {
+         *total = rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+                            outer_code, opno, speed)
+                  + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+         return true;
+       }
       /* This is masked instruction, assume the same cost,
         as nonmasked variant.  */
-      if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
-       *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
+      else if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+       {
+         *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+                  + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+         return true;
+       }
+      /* Combination of the two above:
+
+        (vec_merge (vec_merge (vec_duplicate (operation ...))
+                      (register or memory)
+                      (reg:QI mask))
+                   (register or memory)
+                   (const_int 1))
+
+        i.e. avx512fp16_vcvtss2sh_mask.  */
+      else if (TARGET_AVX512F
+              && mask == const1_rtx
+              && GET_CODE (XEXP (x, 0)) == VEC_MERGE
+              && GET_CODE (XEXP (XEXP (x, 0), 0)) == VEC_DUPLICATE
+              && register_operand (XEXP (XEXP (x, 0), 2),
+                                   GET_MODE (XEXP (XEXP (x, 0), 2))))
+       {
+         *total = rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
+                            mode, outer_code, opno, speed)
+                  + rtx_cost (XEXP (XEXP (x, 0), 1),
+                              mode, outer_code, opno, speed)
+                  + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+         return true;
+       }
+      /* vcmp.  */
+      else if (unspec_pcmp_p (mask)
+              || (GET_CODE (mask) == NOT
+                  && unspec_pcmp_p (XEXP (mask, 0))))
+       {
+         rtx uns = GET_CODE (mask) == NOT ? XEXP (mask, 0) : mask;
+         rtx unsop0 = XVECEXP (uns, 0, 0);
+         /* Make (subreg:V4SI (not:V16QI (reg:V16QI ..)) 0)
+            cost the same as register.
+            This is used by avx_cmp<mode>3_ltint_not.  */
+         if (GET_CODE (unsop0) == SUBREG)
+           unsop0 = XEXP (unsop0, 0);
+         if (GET_CODE (unsop0) == NOT)
+           unsop0 = XEXP (unsop0, 0);
+         *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+                  + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
+                  + rtx_cost (unsop0, mode, UNSPEC, opno, speed)
+                  + rtx_cost (XVECEXP (uns, 0, 1), mode, UNSPEC, opno, speed)
+                  + cost->sse_op;
+         return true;
+       }
       else
-       /* ??? We should still recruse when computing cost.  */
        *total = cost->sse_op;
-      return true;
+      return false;
 
     case MEM:
       /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast.
@@ -22831,7 +22903,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
        }
 
       /* An insn that accesses memory is slightly more expensive
-         than one that does not.  */
+        than one that does not.  */
       if (speed)
        {
          *total += 1;

Reply via email to