On Sun, Aug 14, 2011 at 8:00 PM, Uros Bizjak <ubiz...@gmail.com> wrote:

>> We can use ROUNDSP/ROUNDSD in round(a) expansion. Currently, we expand
>> round(a) as (-O2 -ffast-math):
>
> I forgot to add that this expansion is expanded only under
> flag_unsafe_math_optimizations due to addition of 0.5. For the input
> of 0x1.fffffffffffffp-2, new insn sequence returns 1.0.

Actually, using the algorithm proposed by Richi - sgn(a) * trunc(fabs(a)
+ nextafter(0.5, 0.0)) - solves this failure.

New version of patch attached.

2011-08-14  Uros Bizjak  <ubiz...@gmail.com>

        * config/i386/i386.c (ix86_expand_round_sse4): New function.
        * config/i386/i386-protos.h (ix86_expand_round_sse4): New prototype.
        * config/i386/i386.md (round<mode>2): Use ix86_expand_round_sse4
        for TARGET_ROUND.

        (rint<mode>2): Simplify TARGET_ROUND check.
        (floor<mode>2): Ditto.
        (ceil<mode>2): Ditto.
        (btrunc<mode>2): Ditto.

Bootstrapped on x86_64-pc-linux-gnu {,-m32}, regression test still in progress.

Uros.
Index: i386.md
===================================================================
--- i386.md     (revision 177746)
+++ i386.md     (working copy)
@@ -14394,11 +14394,11 @@
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
       && !flag_trapping_math)
     {
-      if (!TARGET_ROUND && optimize_insn_for_size_p ())
-       FAIL;
       if (TARGET_ROUND)
        emit_insn (gen_sse4_1_round<mode>2
                   (operands[0], operands[1], GEN_INT (ROUND_MXCSR)));
+      else if (optimize_insn_for_size_p ())
+        FAIL;
       else
        ix86_expand_rint (operand0, operand1);
     }
@@ -14431,7 +14431,12 @@
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
       && !flag_trapping_math && !flag_rounding_math)
     {
-      if (TARGET_64BIT || (<MODE>mode != DFmode))
+      if (TARGET_ROUND)
+        {
+         operands[1] = force_reg (<MODE>mode, operands[1]);
+         ix86_expand_round_sse4 (operands[0], operands[1]);
+       }
+      else if (TARGET_64BIT || (<MODE>mode != DFmode))
        ix86_expand_round (operands[0], operands[1]);
       else
        ix86_expand_rounddf_32 (operands[0], operands[1]);
@@ -14663,14 +14668,13 @@
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
-      if (!TARGET_ROUND && optimize_insn_for_size_p ())
-       FAIL;
       if (TARGET_ROUND)
        emit_insn (gen_sse4_1_round<mode>2
                   (operands[0], operands[1], GEN_INT (ROUND_FLOOR)));
+      else if (optimize_insn_for_size_p ())
+        FAIL;
       else if (TARGET_64BIT || (<MODE>mode != DFmode))
        ix86_expand_floorceil (operand0, operand1, true);
       else
@@ -14922,8 +14926,7 @@
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
       if (TARGET_ROUND)
        emit_insn (gen_sse4_1_round<mode>2
@@ -15179,8 +15182,7 @@
        && !flag_trapping_math)"
 {
   if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
-      && !flag_trapping_math
-      && (TARGET_ROUND || optimize_insn_for_speed_p ()))
+      && !flag_trapping_math)
     {
       if (TARGET_ROUND)
        emit_insn (gen_sse4_1_round<mode>2
Index: i386-protos.h
===================================================================
--- i386-protos.h       (revision 177746)
+++ i386-protos.h       (working copy)
@@ -174,6 +174,7 @@ extern void ix86_expand_lfloorceil (rtx, rtx, bool
 extern void ix86_expand_rint (rtx, rtx);
 extern void ix86_expand_floorceil (rtx, rtx, bool);
 extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
+extern void ix86_expand_round_sse4 (rtx, rtx);
 extern void ix86_expand_round (rtx, rtx);
 extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_trunc (rtx, rtx);
Index: i386.c
===================================================================
--- i386.c      (revision 177746)
+++ i386.c      (working copy)
@@ -32676,6 +32676,52 @@ ix86_expand_round (rtx operand0, rtx operand1)
 
   emit_move_insn (operand0, res);
 }
+
+/* Expand SSE sequence for computing round
+   from OP1 storing into OP0 using sse4 round insn.  */
+void
+ix86_expand_round_sse4 (rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx e1, e2, e3, res, half, mask;
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+  rtx (*gen_round) (rtx, rtx, rtx);
+
+  switch (mode)
+    {
+    case SFmode:
+      gen_round = gen_sse4_1_roundsf2;
+      break;
+    case DFmode:
+      gen_round = gen_sse4_1_rounddf2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* e1 = fabs(op1) */
+  e1 = ix86_expand_sse_fabs (op1, &mask);
+
+  /* load nextafter (0.5, 0.0) */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+
+  /* e2 = e1 + 0.5 */
+  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
+  e2 = expand_simple_binop (mode, PLUS, e1, half, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* e3 = trunc(e2) */
+  e3 = gen_reg_rtx (mode);
+  emit_insn (gen_round (e3, e2, GEN_INT (ROUND_TRUNC)));
+
+  /* res = copysign (e3, op1) */
+  res = gen_reg_rtx (mode);
+  ix86_sse_copysign_to_positive (res, e3, op1, mask);
+
+  emit_move_insn (op0, res);
+}
 
 
 /* Table of valid machine attributes.  */

Reply via email to