On Sun, Aug 14, 2011 at 8:00 PM, Uros Bizjak <ubiz...@gmail.com> wrote:
>> We can use ROUNDSS/ROUNDSD in round(a) expansion. Currently, we expand
>> round(a) as (-O2 -ffast-math):
>
> I forgot to add that this expansion is used only under
> flag_unsafe_math_optimizations due to the addition of 0.5. For the input
> of 0x1.fffffffffffffp-2, the new insn sequence returns 1.0 instead of
> the expected 0.0.

Actually, using the algorithm proposed by Richi - sgn(a)*trunc(fabs(a) + nextafter(0.5, 0.0)) - solves this failure: the constant added is the largest representable value below 0.5, so the sum for the input above no longer rounds up to 1.0.

New version of patch attached.

2011-08-14  Uros Bizjak  <ubiz...@gmail.com>

	* config/i386/i386.c (ix86_expand_round_sse4): New function.
	* config/i386/i386-protos.h (ix86_expand_round_sse4): New prototype.
	* config/i386/i386.md (round<mode>2): Use ix86_expand_round_sse4
	for TARGET_ROUND.
	(rint<mode>2): Simplify TARGET_ROUND check.
	(floor<mode>2): Ditto.
	(ceil<mode>2): Ditto.
	(btrunc<mode>2): Ditto.

Bootstrapped on x86_64-pc-linux-gnu {,-m32}, regression test still in progress.

Uros.
Index: i386.md =================================================================== --- i386.md (revision 177746) +++ i386.md (working copy) @@ -14394,11 +14394,11 @@ if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH && !flag_trapping_math) { - if (!TARGET_ROUND && optimize_insn_for_size_p ()) - FAIL; if (TARGET_ROUND) emit_insn (gen_sse4_1_round<mode>2 (operands[0], operands[1], GEN_INT (ROUND_MXCSR))); + else if (optimize_insn_for_size_p ()) + FAIL; else ix86_expand_rint (operand0, operand1); } @@ -14431,7 +14431,12 @@ if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH && !flag_trapping_math && !flag_rounding_math) { - if (TARGET_64BIT || (<MODE>mode != DFmode)) + if (TARGET_ROUND) + { + operands[1] = force_reg (<MODE>mode, operands[1]); + ix86_expand_round_sse4 (operands[0], operands[1]); + } + else if (TARGET_64BIT || (<MODE>mode != DFmode)) ix86_expand_round (operands[0], operands[1]); else ix86_expand_rounddf_32 (operands[0], operands[1]); @@ -14663,14 +14668,13 @@ && !flag_trapping_math)" { if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH - && !flag_trapping_math - && (TARGET_ROUND || optimize_insn_for_speed_p ())) + && !flag_trapping_math) { - if (!TARGET_ROUND && optimize_insn_for_size_p ()) - FAIL; if (TARGET_ROUND) emit_insn (gen_sse4_1_round<mode>2 (operands[0], operands[1], GEN_INT (ROUND_FLOOR))); + else if (optimize_insn_for_size_p ()) + FAIL; else if (TARGET_64BIT || (<MODE>mode != DFmode)) ix86_expand_floorceil (operand0, operand1, true); else @@ -14922,8 +14926,7 @@ && !flag_trapping_math)" { if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH - && !flag_trapping_math - && (TARGET_ROUND || optimize_insn_for_speed_p ())) + && !flag_trapping_math) { if (TARGET_ROUND) emit_insn (gen_sse4_1_round<mode>2 @@ -15179,8 +15182,7 @@ && !flag_trapping_math)" { if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH - && !flag_trapping_math - && (TARGET_ROUND || optimize_insn_for_speed_p ())) + && !flag_trapping_math) { if (TARGET_ROUND) emit_insn 
(gen_sse4_1_round<mode>2 Index: i386-protos.h =================================================================== --- i386-protos.h (revision 177746) +++ i386-protos.h (working copy) @@ -174,6 +174,7 @@ extern void ix86_expand_lfloorceil (rtx, rtx, bool extern void ix86_expand_rint (rtx, rtx); extern void ix86_expand_floorceil (rtx, rtx, bool); extern void ix86_expand_floorceildf_32 (rtx, rtx, bool); +extern void ix86_expand_round_sse4 (rtx, rtx); extern void ix86_expand_round (rtx, rtx); extern void ix86_expand_rounddf_32 (rtx, rtx); extern void ix86_expand_trunc (rtx, rtx); Index: i386.c =================================================================== --- i386.c (revision 177746) +++ i386.c (working copy) @@ -32676,6 +32676,52 @@ ix86_expand_round (rtx operand0, rtx operand1) emit_move_insn (operand0, res); } + +/* Expand SSE sequence for computing round + from OP1 storing into OP0 using sse4 round insn. */ +void +ix86_expand_round_sse4 (rtx op0, rtx op1) +{ + enum machine_mode mode = GET_MODE (op0); + rtx e1, e2, e3, res, half, mask; + const struct real_format *fmt; + REAL_VALUE_TYPE pred_half, half_minus_pred_half; + rtx (*gen_round) (rtx, rtx, rtx); + + switch (mode) + { + case SFmode: + gen_round = gen_sse4_1_roundsf2; + break; + case DFmode: + gen_round = gen_sse4_1_rounddf2; + break; + default: + gcc_unreachable (); + } + + /* e1 = fabs(op1) */ + e1 = ix86_expand_sse_fabs (op1, &mask); + + /* load nextafter (0.5, 0.0) */ + fmt = REAL_MODE_FORMAT (mode); + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); + REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half); + + /* e2 = e1 + 0.5 */ + half = force_reg (mode, const_double_from_real_value (pred_half, mode)); + e2 = expand_simple_binop (mode, PLUS, e1, half, NULL_RTX, 0, OPTAB_DIRECT); + + /* e3 = trunc(e2) */ + e3 = gen_reg_rtx (mode); + emit_insn (gen_round (e3, e2, GEN_INT (ROUND_TRUNC))); + + /* res = copysign (e3, op1) */ + res = gen_reg_rtx (mode); + 
ix86_sse_copysign_to_positive (res, e3, op1, mask); + + emit_move_insn (op0, res); +} /* Table of valid machine attributes. */