https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91154
--- Comment #15 from Richard Biener <rguenth at gcc dot gnu.org> ---
So another idea would be to provide [us]{min,max} patterns for integer modes
that split after reload into a compare&cmov or jumpy sequence if allocated
using GPR regs, but also allow SSE reg alternatives which would fit into
existing SSE code if we'd allow SI-WITH-SSE, similar to how we do
TARGET_MMX_WITH_SSE operations.  We already seem to have movsi patterns
{r,m}<->v, so that part is done; for the testcase that would leave addsi3
plus appropriate costing of the min/max case.  The smaxsi3 "splitter"
(well ;)) is

Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md    (revision 273592)
+++ gcc/config/i386/i386.md    (working copy)
@@ -1881,6 +1881,27 @@ (define_expand "mov<mode>"
   ""
   "ix86_expand_move (<MODE>mode, operands); DONE;")
 
+(define_insn "smaxsi3"
+  [(set (match_operand:SI 0 "register_operand" "=r,x")
+        (smax:SI (match_operand:SI 1 "register_operand" "%0,x")
+                 (match_operand:SI 2 "register_operand" "r,x")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX2"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_SSEADD:
+      return "vpmaxsd\t{%2, %1, %0|%0, %1, %2}";
+    case TYPE_ICMOV:
+      return "cmpl\t{%2, %0|%0, %2}\n"
+             "cmovl\t{%2, %0|%0, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "icmov,sseadd")])
+
 (define_insn "*mov<mode>_xor"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (match_operand:SWI48 1 "const0_operand"))

With that we get the elision of the zeroing between the vpmaxsd, but even
-mtune=bdver2 doesn't disparage the cross-unit moves enough for the RA to
choose the first alternative.  Huh.  But it then goes through the stack...

.L3:
        vmovd   %xmm0, %r8d
        addl    (%rdx,%rax,4), %r8d
        movl    %r8d, 4(%rdi,%rax,4)
        movl    (%rcx,%rax,4), %r9d
        addl    (%rsi,%rax,4), %r9d
        movl    %r9d, -4(%rsp)
        vmovd   -4(%rsp), %xmm2
        movl    %r8d, -4(%rsp)
        movq    %rax, %r8
        vmovd   -4(%rsp), %xmm3
        vpmaxsd %xmm3, %xmm2, %xmm0
        vpmaxsd %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, 4(%rdi,%rax,4)
        incq    %rax
        cmpq    %r8, %r10
        jne     .L3

Not sure why the RA selected the 2nd alternative at all... but yes, we'd want
to slightly prefer it.  I expected the inter-unit moves to push us to the
first alternative, though.
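As a side note, a sketch of one possible knob (not something the patch above
uses, and it may well be the wrong fix compared to proper cost tuning): the
'?' constraint modifier disparages the alternative it appears in by one unit,
so something like

  [(set (match_operand:SI 0 "register_operand" "=?r,x")
        (smax:SI (match_operand:SI 1 "register_operand" "%0,x")
                 (match_operand:SI 2 "register_operand" "r,x")))
   (clobber (reg:CC FLAGS_REG))]

would statically bias the RA away from the cmp/cmov alternative, leaving it
for the case where the operands already live in GPRs.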
As you can see we can automatically handle stores of SImode in SSE regs, and
I verified we can also handle loads by slightly altering the testcase.

That leaves implementing the addsi3 alternative for SSE regs, like the
following quite incomplete hack:

Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md    (revision 273592)
+++ gcc/config/i386/i386.md    (working copy)
@@ -5368,10 +5389,10 @@ (define_insn_and_split "*add<dwi>3_doubl
 })
 
 (define_insn "*add<mode>_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r")
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r,v")
        (plus:SWI48
-         (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r")
-         (match_operand:SWI48 2 "x86_64_general_operand" "re,m,0,le")))
+         (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r,v")
+         (match_operand:SWI48 2 "x86_64_general_operand" "re,m,0,le,v")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
 {
@@ -5390,6 +5411,9 @@ (define_insn "*add<mode>_1"
          return "dec{<imodesuffix>}\t%0";
        }
 
+    case TYPE_SSEADD:
+      return "vpaddd\t{%2, %1, %0|%0, %1, %2}";
+
     default:
       /* For most processors, ADD is faster than LEA.  This alternative
         was added to use ADD as much as possible.  */
@@ -5406,6 +5430,8 @@ (define_insn "*add<mode>_1"
   [(set (attr "type")
     (cond [(eq_attr "alternative" "3")
              (const_string "lea")
+          (eq_attr "alternative" "4")
+             (const_string "sseadd")
           (match_operand:SWI48 2 "incdec_operand")
              (const_string "incdec")
        ]

but somehow that doesn't trigger for me.
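One detail worth noting, as a sketch of what a less incomplete version would
need (this is not claimed to be the reason the alternative doesn't trigger,
and the attribute name below is invented): the hard-coded vpaddd is only
right for the SImode instance of the SWI48 iterator; the DImode instance
would need vpaddq, which a mode attribute could select, roughly

  ;; Illustrative only -- attribute name made up.
  (define_mode_attr sseintsuffix [(SI "d") (DI "q")])

      case TYPE_SSEADD:
        return "vpadd<sseintsuffix>\t{%2, %1, %0|%0, %1, %2}";

analogous to how the existing alternatives already use <imodesuffix> in their
templates.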