https://gcc.gnu.org/g:37554bacfd38b1466278b529d9e70a44d7b1b909

commit r15-4105-g37554bacfd38b1466278b529d9e70a44d7b1b909
Author: Jakub Jelinek <ja...@redhat.com>
Date:   Mon Oct 7 10:50:39 2024 +0200

    ssa-math-opts, i386: Improve spaceship expansion [PR116896]
    
    The PR notes that we don't emit optimal code for C++ spaceship
    operator if the result is returned as an integer rather than the
    result just being compared against different values and different
    code executed based on that.
    So e.g. for
    template <typename T>
    auto foo (T x, T y) { return x <=> y; }
    for both floating point types, signed integer types and unsigned integer
    types.  auto in that case is std::strong_ordering or std::partial_ordering,
    which are fancy C++ abstractions around struct with signed char member
    which is -1, 0, 1 for the strong ordering and -1, 0, 1, 2 for the partial
    ordering (but for -ffast-math 2 is never the case).
    I'm afraid functions like that are fairly common and unless they are
    inlined, we really need to map the comparison to those -1, 0, 1 or
    -1, 0, 1, 2 values.
    
    Now, for floating point spaceship I've in the past already added an
    optimization (with tree-ssa-math-opts.cc discovery and named optab, the
    optab only defined on x86 though right now), which ensures there is just
    a single comparison instruction and then just tests based on flags.
    Now, if we have code like:
      auto a = x <=> y;
      if (a == std::partial_ordering::less)
        bar ();
      else if (a == std::partial_ordering::greater)
        baz ();
      else if (a == std::partial_ordering::equivalent)
        qux ();
      else if (a == std::partial_ordering::unordered)
        corge ();
    etc., that results in decent code generation, the spaceship named pattern
    on x86 optimizes for the jumps, so emits comparisons on the flags, followed
    by setting the result to -1, 0, 1, 2 and subsequent jump pass optimizes that
    well.  But if the result needs to be stored into an integer and just
    returned that way or there are no immediate jumps based on it (or turned
    into some non-standard integer values like -42, 0, 36, 75 etc.), then CE
    doesn't do a good job for that, we end up with say
            comiss  %xmm1, %xmm0
            jp      .L4
            seta    %al
            movl    $0, %edx
            leal    -1(%rax,%rax), %eax
            cmove   %edx, %eax
            ret
    .L4:
            movl    $2, %eax
            ret
    The jp is good, that is the unlikely case and can't be easily handled in
    straight line code due to the layout of the flags, but the rest uses cmov,
    which often isn't a win, and weird math.
    With the patch below we can get instead
            xorl    %eax, %eax
            comiss  %xmm1, %xmm0
            jp      .L2
            seta    %al
            sbbl    $0, %eax
            ret
    .L2:
            movl    $2, %eax
            ret
    
    The patch changes the discovery in the generic code, by detecting if
    the future .SPACESHIP result is just used in a PHI with -1, 0, 1 or
    -1, 0, 1, 2 values (the latter for HONOR_NANS) and passes that as a flag in
    a new argument to .SPACESHIP ifn, so that the named pattern is told whether
    it should optimize for branches or for loading the result into a -1, 0, 1
    (, 2) integer.  Additionally, it doesn't detect just floating point <=>
    anymore, but also integer and unsigned integer, but in those cases only
    if an integer -1, 0, 1 is wanted (otherwise == and > or similar comparisons
    result in good code).
    The backend then can for those integer or unsigned integer <=>s return
    effectively (x > y) - (x < y) in a way that is efficient on the target
    (so for x86 with ensuring zero initialization first when needed before
    setcc; one for floating point and unsigned, where there is just one setcc
    and the second one optimized into sbb instruction, two for the signed int
    case).  So e.g. for signed int we now emit
            xorl    %edx, %edx
            xorl    %eax, %eax
            cmpl    %esi, %edi
            setl    %dl
            setg    %al
            subl    %edx, %eax
            ret
    and for unsigned
            xorl    %eax, %eax
            cmpl    %esi, %edi
            seta    %al
            sbbb    $0, %al
            ret
    
    Note, I wonder if other targets wouldn't benefit from defining the
    named optab too...
    
    2024-10-07  Jakub Jelinek  <ja...@redhat.com>
    
            PR middle-end/116896
            * optabs.def (spaceship_optab): Use spaceship$a4 rather than
            spaceship$a3.
            * internal-fn.cc (expand_SPACESHIP): Expect 3 call arguments
            rather than 2, expand the last one, expect 4 operands of
            spaceship_optab.
            * tree-ssa-math-opts.cc: Include cfghooks.h.
            (optimize_spaceship): Check if a single PHI is initialized to
            -1, 0, 1, 2 or -1, 0, 1 values, in that case pass 1 as last (new)
            argument to .SPACESHIP and optimize away the comparisons,
            otherwise pass 0.  Also check for integer comparisons rather than
            floating point, in that case do it only if there is a single PHI
            with -1, 0, 1 values and pass 1 to last argument of .SPACESHIP
            if the <=> is signed, 2 if unsigned.
            * config/i386/i386-protos.h (ix86_expand_fp_spaceship): Add
            another rtx argument.
            (ix86_expand_int_spaceship): Declare.
            * config/i386/i386-expand.cc (ix86_expand_fp_spaceship): Add
            arg3 argument, if it is const0_rtx, expand like before, otherwise
            emit optimized sequence for setting the result into a GPR.
            (ix86_expand_int_spaceship): New function.
            * config/i386/i386.md (UNSPEC_SETCC_SI_SLP): New UNSPEC code.
            (setcc_si_slp): New define_expand.
            (*setcc_si_slp): New define_insn_and_split.
            (setcc + setcc + movzbl): New define_peephole2.
            (spaceship<mode>3): Renamed to ...
            (spaceship<mode>4): ... this.  Add an extra operand, pass it
            to ix86_expand_fp_spaceship.
            (spaceshipxf3): Renamed to ...
            (spaceshipxf4): ... this.  Add an extra operand, pass it
            to ix86_expand_fp_spaceship.
            (spaceship<mode>4): New define_expand for SWI modes.
            * doc/md.texi (spaceship@var{m}3): Renamed to ...
            (spaceship@var{m}4): ... this.  Document the meaning of last
            operand.
    
            * g++.target/i386/pr116896-1.C: New test.
            * g++.target/i386/pr116896-2.C: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc             | 128 ++++++++++++++++++++++++-----
 gcc/config/i386/i386-protos.h              |   3 +-
 gcc/config/i386/i386.md                    |  76 +++++++++++++++--
 gcc/doc/md.texi                            |   8 +-
 gcc/internal-fn.cc                         |   7 +-
 gcc/optabs.def                             |   2 +-
 gcc/testsuite/g++.target/i386/pr116896-1.C |  35 ++++++++
 gcc/testsuite/g++.target/i386/pr116896-2.C |  41 +++++++++
 gcc/tree-ssa-math-opts.cc                  | 114 +++++++++++++++++++++++--
 9 files changed, 373 insertions(+), 41 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a9f83a299e3a..81dd50649007 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3146,12 +3146,15 @@ ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
    dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */
 
 void
-ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
+ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
 {
   gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
+  rtx zero = NULL_RTX;
+  if (op2 != const0_rtx && TARGET_IEEE_FP && GET_MODE (dest) == SImode)
+    zero = force_reg (SImode, const0_rtx);
   rtx gt = ix86_expand_fp_compare (GT, op0, op1);
-  rtx l0 = gen_label_rtx ();
-  rtx l1 = gen_label_rtx ();
+  rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
+  rtx l1 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
   rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
   rtx lend = gen_label_rtx ();
   rtx tmp;
@@ -3165,23 +3168,68 @@ ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
       jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
     }
-  rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
-                          gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
-  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
-                             gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
-  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
-  add_reg_br_prob_note (jmp, profile_probability::unlikely ());
-  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
-                             gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
-  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
-  add_reg_br_prob_note (jmp, profile_probability::even ());
-  emit_move_insn (dest, constm1_rtx);
-  emit_jump (lend);
-  emit_label (l0);
-  emit_move_insn (dest, const0_rtx);
-  emit_jump (lend);
-  emit_label (l1);
-  emit_move_insn (dest, const1_rtx);
+  if (op2 == const0_rtx)
+    {
+      rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
+                              gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
+      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
+                                 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
+      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+      add_reg_br_prob_note (jmp, profile_probability::unlikely ());
+      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
+                                 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
+      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+      add_reg_br_prob_note (jmp, profile_probability::even ());
+      emit_move_insn (dest, constm1_rtx);
+      emit_jump (lend);
+      emit_label (l0);
+      emit_move_insn (dest, const0_rtx);
+      emit_jump (lend);
+      emit_label (l1);
+      emit_move_insn (dest, const1_rtx);
+    }
+  else
+    {
+      rtx lt_tmp = gen_reg_rtx (QImode);
+      ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
+                        const0_rtx);
+      if (GET_MODE (dest) != QImode)
+       {
+         tmp = gen_reg_rtx (GET_MODE (dest));
+         emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
+                                                           lt_tmp)));
+         lt_tmp = tmp;
+       }
+      rtx gt_tmp;
+      if (zero)
+       {
+         /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear
+            before the floating point comparison and use setcc_si_slp
+            pattern to hide it from the combiner, so that it doesn't
+            undo it.  */
+         tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx);
+         PUT_MODE (tmp, QImode);
+         emit_insn (gen_setcc_si_slp (zero, tmp, zero));
+         gt_tmp = zero;
+       }
+      else
+       {
+         gt_tmp = gen_reg_rtx (QImode);
+         ix86_expand_setcc (gt_tmp, GT, XEXP (gt, 0), const0_rtx);
+         if (GET_MODE (dest) != QImode)
+           {
+             tmp = gen_reg_rtx (GET_MODE (dest));
+             emit_insn (gen_rtx_SET (tmp,
+                                     gen_rtx_ZERO_EXTEND (GET_MODE (dest),
+                                                          gt_tmp)));
+             gt_tmp = tmp;
+           }
+       }
+      tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest,
+                                0, OPTAB_DIRECT);
+      if (!rtx_equal_p (tmp, dest))
+       emit_move_insn (dest, tmp);
+    }
   emit_jump (lend);
   if (l2)
     {
@@ -3191,6 +3239,46 @@ ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
   emit_label (lend);
 }
 
+/* Expand integral op0 <=> op1, i.e.
+   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : 1.  */
+
+void
+ix86_expand_int_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
+{
+  gcc_assert (INTVAL (op2));
+  /* Not using ix86_expand_int_compare here, so that it doesn't swap
+     operands nor optimize CC mode - we need a mode usable for both
+     LT and GT resp. LTU and GTU comparisons with the same unswapped
+     operands.  */
+  rtx flags = gen_rtx_REG (INTVAL (op2) == 1 ? CCGCmode : CCmode, FLAGS_REG);
+  rtx tmp = gen_rtx_COMPARE (GET_MODE (flags), op0, op1);
+  emit_insn (gen_rtx_SET (flags, tmp));
+  rtx lt_tmp = gen_reg_rtx (QImode);
+  ix86_expand_setcc (lt_tmp, INTVAL (op2) == 1 ? LT : LTU, flags,
+                    const0_rtx);
+  if (GET_MODE (dest) != QImode)
+    {
+      tmp = gen_reg_rtx (GET_MODE (dest));
+      emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
+                                                       lt_tmp)));
+      lt_tmp = tmp;
+    }
+  rtx gt_tmp = gen_reg_rtx (QImode);
+  ix86_expand_setcc (gt_tmp, INTVAL (op2) == 1 ? GT : GTU, flags,
+                    const0_rtx);
+  if (GET_MODE (dest) != QImode)
+    {
+      tmp = gen_reg_rtx (GET_MODE (dest));
+      emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
+                                                       gt_tmp)));
+      gt_tmp = tmp;
+    }
+  tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest,
+                            0, OPTAB_DIRECT);
+  if (!rtx_equal_p (tmp, dest))
+    emit_move_insn (dest, tmp);
+}
+
 /* Expand comparison setting or clearing carry flag.  Return true when
    successful and set pop for the operation.  */
 static bool
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 3a7bc949e56c..c1f9147769cb 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -164,7 +164,8 @@ extern bool ix86_expand_fp_vec_cmp (rtx[]);
 extern void ix86_expand_sse_movcc (rtx, rtx, rtx, rtx);
 extern void ix86_expand_sse_extend (rtx, rtx, bool);
 extern void ix86_expand_sse_unpack (rtx, rtx, bool, bool);
-extern void ix86_expand_fp_spaceship (rtx, rtx, rtx);
+extern void ix86_expand_fp_spaceship (rtx, rtx, rtx, rtx);
+extern void ix86_expand_int_spaceship (rtx, rtx, rtx, rtx);
 extern bool ix86_expand_int_addcc (rtx[]);
 extern void ix86_expand_carry (rtx arg);
 extern rtx_insn *ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9c2a0aa61126..fb9befcf65b3 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -118,6 +118,7 @@
   UNSPEC_PUSHFL
   UNSPEC_POPFL
   UNSPEC_OPTCOMX
+  UNSPEC_SETCC_SI_SLP
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -19281,6 +19282,27 @@
   [(set_attr "type" "setcc")
    (set_attr "mode" "QI")])
 
+(define_expand "setcc_si_slp"
+  [(set (match_operand:SI 0 "register_operand")
+       (unspec:SI
+         [(match_operand:QI 1)
+          (match_operand:SI 2 "register_operand")] UNSPEC_SETCC_SI_SLP))])
+
+(define_insn_and_split "*setcc_si_slp"
+  [(set (match_operand:SI 0 "register_operand" "=q")
+       (unspec:SI
+         [(match_operator:QI 1 "ix86_comparison_operator"
+            [(reg FLAGS_REG) (const_int 0)])
+          (match_operand:SI 2 "register_operand" "0")] UNSPEC_SETCC_SI_SLP))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (match_dup 2))
+   (set (strict_low_part (match_dup 3)) (match_dup 1))]
+{
+  operands[3] = gen_lowpart (QImode, operands[0]);
+})
+
 ;; In general it is not safe to assume too much about CCmode registers,
 ;; so simplify-rtx stops when it sees a second one.  Under certain
 ;; conditions this is safe on x86, so help combine not create
@@ -19776,6 +19798,32 @@
   operands[8] = gen_lowpart (QImode, operands[4]);
   ix86_expand_clear (operands[4]);
 })
+
+(define_peephole2
+  [(set (match_operand 4 "flags_reg_operand") (match_operand 0))
+   (set (strict_low_part (match_operand:QI 5 "register_operand"))
+       (match_operator:QI 6 "ix86_comparison_operator"
+         [(reg FLAGS_REG) (const_int 0)]))
+   (set (match_operand:QI 1 "register_operand")
+       (match_operator:QI 2 "ix86_comparison_operator"
+         [(reg FLAGS_REG) (const_int 0)]))
+   (set (match_operand 3 "any_QIreg_operand")
+       (zero_extend (match_dup 1)))]
+  "(peep2_reg_dead_p (4, operands[1])
+    || operands_match_p (operands[1], operands[3]))
+   && ! reg_overlap_mentioned_p (operands[3], operands[0])
+   && ! reg_overlap_mentioned_p (operands[3], operands[5])
+   && ! reg_overlap_mentioned_p (operands[1], operands[5])
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(set (match_dup 4) (match_dup 0))
+   (set (strict_low_part (match_dup 5))
+       (match_dup 6))
+   (set (strict_low_part (match_dup 7))
+       (match_dup 2))]
+{
+  operands[7] = gen_lowpart (QImode, operands[3]);
+  ix86_expand_clear (operands[3]);
+})
 
 ;; Call instructions.
 
@@ -29494,24 +29542,40 @@
    (set_attr "length" "4")])
 
 ;; Spaceship optimization
-(define_expand "spaceship<mode>3"
+(define_expand "spaceship<mode>4"
   [(match_operand:SI 0 "register_operand")
    (match_operand:MODEF 1 "cmp_fp_expander_operand")
-   (match_operand:MODEF 2 "cmp_fp_expander_operand")]
+   (match_operand:MODEF 2 "cmp_fp_expander_operand")
+   (match_operand:SI 3 "const_int_operand")]
   "(TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))
    && (TARGET_CMOVE || (TARGET_SAHF && TARGET_USE_SAHF))"
 {
-  ix86_expand_fp_spaceship (operands[0], operands[1], operands[2]);
+  ix86_expand_fp_spaceship (operands[0], operands[1], operands[2],
+                           operands[3]);
   DONE;
 })
 
-(define_expand "spaceshipxf3"
+(define_expand "spaceshipxf4"
   [(match_operand:SI 0 "register_operand")
    (match_operand:XF 1 "nonmemory_operand")
-   (match_operand:XF 2 "nonmemory_operand")]
+   (match_operand:XF 2 "nonmemory_operand")
+   (match_operand:SI 3 "const_int_operand")]
   "TARGET_80387 && (TARGET_CMOVE || (TARGET_SAHF && TARGET_USE_SAHF))"
 {
-  ix86_expand_fp_spaceship (operands[0], operands[1], operands[2]);
+  ix86_expand_fp_spaceship (operands[0], operands[1], operands[2],
+                           operands[3]);
+  DONE;
+})
+
+(define_expand "spaceship<mode>4"
+  [(match_operand:SI 0 "register_operand")
+   (match_operand:SWI 1 "nonimmediate_operand")
+   (match_operand:SWI 2 "<general_operand>")
+   (match_operand:SI 3 "const_int_operand")]
+  ""
+{
+  ix86_expand_int_spaceship (operands[0], operands[1], operands[2],
+                            operands[3]);
   DONE;
 })
 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 7001dafdc9e1..c58072ea76ca 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -8568,11 +8568,15 @@ inclusive and operand 1 exclusive.
 If this pattern is not defined, a call to the library function
 @code{__clear_cache} is used.
 
-@cindex @code{spaceship@var{m}3} instruction pattern
-@item @samp{spaceship@var{m}3}
+@cindex @code{spaceship@var{m}4} instruction pattern
+@item @samp{spaceship@var{m}4}
 Initialize output operand 0 with mode of integer type to -1, 0, 1 or 2
 if operand 1 with mode @var{m} compares less than operand 2, equal to
 operand 2, greater than operand 2 or is unordered with operand 2.
+Operand 3 should be @code{const0_rtx} if the result is used in comparisons,
+@code{const1_rtx} if the result is used as integer value and the comparison
+is signed, @code{const2_rtx} if the result is used as integer value and
+the comparison is unsigned.
 @var{m} should be a scalar floating point mode.
 
 This pattern is not allowed to @code{FAIL}.
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index b55f089cf56d..d89a04fe4122 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -5107,6 +5107,7 @@ expand_SPACESHIP (internal_fn, gcall *stmt)
   tree lhs = gimple_call_lhs (stmt);
   tree rhs1 = gimple_call_arg (stmt, 0);
   tree rhs2 = gimple_call_arg (stmt, 1);
+  tree rhs3 = gimple_call_arg (stmt, 2);
   tree type = TREE_TYPE (rhs1);
 
   do_pending_stack_adjust ();
@@ -5114,13 +5115,15 @@ expand_SPACESHIP (internal_fn, gcall *stmt)
   rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
   rtx op1 = expand_normal (rhs1);
   rtx op2 = expand_normal (rhs2);
+  rtx op3 = expand_normal (rhs3);
 
-  class expand_operand ops[3];
+  class expand_operand ops[4];
   create_call_lhs_operand (&ops[0], target, TYPE_MODE (TREE_TYPE (lhs)));
   create_input_operand (&ops[1], op1, TYPE_MODE (type));
   create_input_operand (&ops[2], op2, TYPE_MODE (type));
+  create_input_operand (&ops[3], op3, TYPE_MODE (TREE_TYPE (rhs3)));
   insn_code icode = optab_handler (spaceship_optab, TYPE_MODE (type));
-  expand_insn (icode, 3, ops);
+  expand_insn (icode, 4, ops);
   assign_call_lhs (lhs, target, &ops[0]);
 }
 
diff --git a/gcc/optabs.def b/gcc/optabs.def
index ba860144d8be..b48e2e5a5aca 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -308,7 +308,7 @@ OPTAB_D (negv3_optab, "negv$I$a3")
 OPTAB_D (uaddc5_optab, "uaddc$I$a5")
 OPTAB_D (usubc5_optab, "usubc$I$a5")
 OPTAB_D (addptr3_optab, "addptr$a3")
-OPTAB_D (spaceship_optab, "spaceship$a3")
+OPTAB_D (spaceship_optab, "spaceship$a4")
 
 OPTAB_D (smul_highpart_optab, "smul$a3_highpart")
 OPTAB_D (umul_highpart_optab, "umul$a3_highpart")
diff --git a/gcc/testsuite/g++.target/i386/pr116896-1.C b/gcc/testsuite/g++.target/i386/pr116896-1.C
new file mode 100644
index 000000000000..3925ad8f603a
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr116896-1.C
@@ -0,0 +1,35 @@
+// PR middle-end/116896
+// { dg-do compile { target c++20 } }
+// { dg-options "-O2 -masm=att -fno-stack-protector" }
+// { dg-final { scan-assembler-times "\tjp\t" 1 } }
+// { dg-final { scan-assembler-not "\tj\[^mp\]\[a-z\]*\t" } }
+// { dg-final { scan-assembler-times "\tsbb\[bl\]\t\\\$0, " 3 } }
+// { dg-final { scan-assembler-times "\tseta\t" 3 } }
+// { dg-final { scan-assembler-times "\tsetg\t" 1 } }
+// { dg-final { scan-assembler-times "\tsetl\t" 1 } }
+
+#include <compare>
+
+[[gnu::noipa]] auto
+foo (float x, float y)
+{
+  return x <=> y;
+}
+
+[[gnu::noipa, gnu::optimize ("fast-math")]] auto
+bar (float x, float y)
+{
+  return x <=> y;
+}
+
+[[gnu::noipa]] auto
+baz (int x, int y)
+{
+  return x <=> y;
+}
+
+[[gnu::noipa]] auto
+qux (unsigned x, unsigned y)
+{
+  return x <=> y;
+}
diff --git a/gcc/testsuite/g++.target/i386/pr116896-2.C b/gcc/testsuite/g++.target/i386/pr116896-2.C
new file mode 100644
index 000000000000..1bf690cf100b
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr116896-2.C
@@ -0,0 +1,41 @@
+// PR middle-end/116896
+// { dg-do run { target c++20 } }
+// { dg-options "-O2" }
+
+#include "pr116896-1.C"
+
+[[gnu::noipa]] auto
+corge (int x)
+{
+  return x <=> 0;
+}
+
+[[gnu::noipa]] auto
+garply (unsigned x)
+{
+  return x <=> 0;
+}
+
+int
+main ()
+{
+  if (foo (-1.0f, 1.0f) != std::partial_ordering::less
+      || foo (1.0f, -1.0f) != std::partial_ordering::greater
+      || foo (1.0f, 1.0f) != std::partial_ordering::equivalent
+      || foo (__builtin_nanf (""), 1.0f) != std::partial_ordering::unordered
+      || bar (-2.0f, 2.0f) != std::partial_ordering::less
+      || bar (2.0f, -2.0f) != std::partial_ordering::greater
+      || bar (-5.0f, -5.0f) != std::partial_ordering::equivalent
+      || baz (-42, 42) != std::strong_ordering::less
+      || baz (42, -42) != std::strong_ordering::greater
+      || baz (42, 42) != std::strong_ordering::equal
+      || qux (40, 42) != std::strong_ordering::less
+      || qux (42, 40) != std::strong_ordering::greater
+      || qux (40, 40) != std::strong_ordering::equal
+      || corge (-15) != std::strong_ordering::less
+      || corge (15) != std::strong_ordering::greater
+      || corge (0) != std::strong_ordering::equal
+      || garply (15) != std::strong_ordering::greater
+      || garply (0) != std::strong_ordering::equal)
+    __builtin_abort ();
+}
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index a61559c52a94..37a3faac7319 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -117,6 +117,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "domwalk.h"
 #include "tree-ssa-math-opts.h"
 #include "dbgcnt.h"
+#include "cfghooks.h"
 
 /* This structure represents one basic block that either computes a
    division, or is a common dominator for basic block that compute a
@@ -5869,7 +5870,7 @@ convert_mult_to_highpart (gassign *stmt, gimple_stmt_iterator *gsi)
    <bb 6> [local count: 1073741824]:
    and turn it into:
    <bb 2> [local count: 1073741824]:
-   _1 = .SPACESHIP (a_2(D), b_3(D));
+   _1 = .SPACESHIP (a_2(D), b_3(D), 0);
    if (_1 == 0)
      goto <bb 6>; [34.00%]
    else
@@ -5891,7 +5892,13 @@ convert_mult_to_highpart (gassign *stmt, gimple_stmt_iterator *gsi)
 
    <bb 6> [local count: 1073741824]:
    so that the backend can emit optimal comparison and
-   conditional jump sequence.  */
+   conditional jump sequence.  If the
+   <bb 6> [local count: 1073741824]:
+   above has a single PHI like:
+   # _27 = PHI<0(2), -1(3), 2(4), 1(5)>
+   then replace it with effectively
+   _1 = .SPACESHIP (a_2(D), b_3(D), 1);
+   _27 = _1;  */
 
 static void
 optimize_spaceship (gcond *stmt)
@@ -5901,7 +5908,8 @@ optimize_spaceship (gcond *stmt)
     return;
   tree arg1 = gimple_cond_lhs (stmt);
   tree arg2 = gimple_cond_rhs (stmt);
-  if (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (arg1))
+  if ((!SCALAR_FLOAT_TYPE_P (TREE_TYPE (arg1))
+       && !INTEGRAL_TYPE_P (TREE_TYPE (arg1)))
       || optab_handler (spaceship_optab,
                        TYPE_MODE (TREE_TYPE (arg1))) == CODE_FOR_nothing
       || operand_equal_p (arg1, arg2, 0))
@@ -6013,12 +6021,105 @@ optimize_spaceship (gcond *stmt)
        }
     }
 
-  gcall *gc = gimple_build_call_internal (IFN_SPACESHIP, 2, arg1, arg2);
+  /* Check if there is a single bb into which all failed conditions
+     jump to (perhaps through an empty block) and if it results in
+     a single integral PHI which just sets it to -1, 0, 1, 2
+     (or -1, 0, 1 when NaNs can't happen).  In that case use 1 rather
+     than 0 as last .SPACESHIP argument to tell backends it might
+     consider different code generation and just cast the result
+     of .SPACESHIP to the PHI result.  */
+  tree arg3 = integer_zero_node;
+  edge e = EDGE_SUCC (bb0, 0);
+  if (e->dest == bb1)
+    e = EDGE_SUCC (bb0, 1);
+  basic_block bbp = e->dest;
+  gphi *phi = NULL;
+  for (gphi_iterator psi = gsi_start_phis (bbp);
+       !gsi_end_p (psi); gsi_next (&psi))
+    {
+      gphi *gp = psi.phi ();
+      tree res = gimple_phi_result (gp);
+
+      if (phi != NULL
+         || virtual_operand_p (res)
+         || !INTEGRAL_TYPE_P (TREE_TYPE (res))
+         || TYPE_PRECISION (TREE_TYPE (res)) < 2)
+       {
+         phi = NULL;
+         break;
+       }
+      phi = gp;
+    }
+  if (phi
+      && integer_zerop (gimple_phi_arg_def_from_edge (phi, e))
+      && EDGE_COUNT (bbp->preds) == (HONOR_NANS (TREE_TYPE (arg1)) ? 4 : 3))
+    {
+      for (unsigned i = 0; phi && i < EDGE_COUNT (bbp->preds) - 1; ++i)
+       {
+         edge e3 = i == 0 ? e1 : i == 1 ? em1 : e2;
+         if (e3->dest != bbp)
+           {
+             if (!empty_block_p (e3->dest)
+                 || !single_succ_p (e3->dest)
+                 || single_succ (e3->dest) != bbp)
+               {
+                 phi = NULL;
+                 break;
+               }
+             e3 = single_succ_edge (e3->dest);
+           }
+         tree a = gimple_phi_arg_def_from_edge (phi, e3);
+         if (TREE_CODE (a) != INTEGER_CST
+             || (i == 0 && !integer_onep (a))
+             || (i == 1 && !integer_all_onesp (a))
+             || (i == 2 && wi::to_widest (a) != 2))
+           {
+             phi = NULL;
+             break;
+           }
+       }
+      if (phi)
+       arg3 = build_int_cst (integer_type_node,
+                             TYPE_UNSIGNED (TREE_TYPE (arg1)) ? 2 : 1);
+    }
+
+  /* For integral <=> comparisons only use .SPACESHIP if it is turned
+     into an integer (-1, 0, 1).  */
+  if (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (arg1)) && arg3 == integer_zero_node)
+    return;
+
+  gcall *gc = gimple_build_call_internal (IFN_SPACESHIP, 3, arg1, arg2, arg3);
   tree lhs = make_ssa_name (integer_type_node);
   gimple_call_set_lhs (gc, lhs);
   gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
   gsi_insert_before (&gsi, gc, GSI_SAME_STMT);
 
+  wide_int wm1 = wi::minus_one (TYPE_PRECISION (integer_type_node));
+  wide_int w2 = (HONOR_NANS (TREE_TYPE (arg1))
+                ? wi::two (TYPE_PRECISION (integer_type_node))
+                : wi::one (TYPE_PRECISION (integer_type_node)));
+  int_range<1> vr (TREE_TYPE (lhs), wm1, w2);
+  set_range_info (lhs, vr);
+
+  if (arg3 != integer_zero_node)
+    {
+      tree type = TREE_TYPE (gimple_phi_result (phi));
+      if (!useless_type_conversion_p (type, integer_type_node))
+       {
+         tree tem = make_ssa_name (type);
+         gimple *gcv = gimple_build_assign (tem, NOP_EXPR, lhs);
+         gsi_insert_before (&gsi, gcv, GSI_SAME_STMT);
+         lhs = tem;
+       }
+      SET_PHI_ARG_DEF_ON_EDGE (phi, e, lhs);
+      gimple_cond_set_lhs (stmt, boolean_false_node);
+      gimple_cond_set_rhs (stmt, boolean_false_node);
+      gimple_cond_set_code (stmt, (e->flags & EDGE_TRUE_VALUE)
+                                 ? EQ_EXPR : NE_EXPR);
+      update_stmt (stmt);
+      return;
+    }
+
   gimple_cond_set_lhs (stmt, lhs);
   gimple_cond_set_rhs (stmt, integer_zero_node);
   update_stmt (stmt);
@@ -6055,11 +6156,6 @@ optimize_spaceship (gcond *stmt)
                            (e2->flags & EDGE_TRUE_VALUE) ? NE_EXPR : EQ_EXPR);
       update_stmt (cond);
     }
-
-  wide_int wm1 = wi::minus_one (TYPE_PRECISION (integer_type_node));
-  wide_int w2 = wi::two (TYPE_PRECISION (integer_type_node));
-  int_range<1> vr (TREE_TYPE (lhs), wm1, w2);
-  set_range_info (lhs, vr);
 }

Reply via email to