Hi! As mentioned in the last mail, the testcase in that patch shows a bug in the unsigned int -> float vectorization on i?86/x86_64. E.g. 0x8000008fU converted to float by scalar code is 0x1.000002p+31, but by vector code is 0x1p+31, i.e. one ulp smaller.
We currently generate: vxorps %xmm2, %xmm2, %xmm2 vmovaps .LC0(%rip), %xmm3 ! { 1333788672, 1333788672, 1333788672, 1333788672 } ... .L2: vcvtdq2ps ui(%rax), %xmm0 vcmpltps %xmm2, %xmm0, %xmm1 vandps %xmm3, %xmm1, %xmm1 vaddps %xmm1, %xmm0, %xmm0 vmovaps %xmm0, f(%rax) For the scalar code we just zero extend it to 64-bit register and convert as signed 64-bit integer to float, but we don't have a vector insn for that. Attached are 3 different fix alternatives: X313a is based on scalar code we generate for ulonglong to double code, i.e. if 0x80000000 or bigger divide by two with oring the shifted off bit into the lowest bit, then signed conversion to float and finally doubling that value. vpxor %xmm4, %xmm4, %xmm4 vmovdqa .LC0(%rip), %xmm5 ! { 1, 1, 1, 1 } ... .L2: vmovdqa ui(%rax), %xmm0 vpand %xmm5, %xmm0, %xmm2 vpsrld $1, %xmm0, %xmm3 vpcmpgtd %xmm0, %xmm4, %xmm1 vpor %xmm2, %xmm3, %xmm2 vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 vcvtdq2ps %xmm0, %xmm0 vandps %xmm1, %xmm0, %xmm1 vaddps %xmm1, %xmm0, %xmm0 vmovaps %xmm0, f(%rax) X313b is based on Richard's sequence, vmovdqa .LC0(%rip), %xmm2 ! { 65535, 65535, 65535, 65535 } ... .L2: vmovdqa ui(%rax), %xmm1 vpandn %xmm1, %xmm2, %xmm0 vpand %xmm2, %xmm1, %xmm1 vpsrld $1, %xmm0, %xmm0 vcvtdq2ps %xmm0, %xmm0 vaddps %xmm0, %xmm0, %xmm0 vcvtdq2ps %xmm1, %xmm1 vaddps %xmm0, %xmm1, %xmm0 vmovaps %xmm0, f(%rax) and finally X313c is based on scalar code we generate for -m32 -O2 -msse2 -mfpmath=sse: vmovdqa .LC0(%rip), %xmm3 ! { 65535, 65535, 65535, 65535 } vmovaps .LC1(%rip), %xmm2 ! { 65536.0f, 65536.0f, 65536.0f, 65536.0f } ... .L2: vmovdqa ui(%rax), %xmm0 vpand %xmm3, %xmm0, %xmm1 vpsrld $16, %xmm0, %xmm0 vcvtdq2ps %xmm0, %xmm0 vmulps %xmm2, %xmm0, %xmm0 vcvtdq2ps %xmm1, %xmm1 vaddps %xmm0, %xmm1, %xmm0 vmovaps %xmm0, f(%rax) I've done some benchmarking of these, X313b has the advantage that it needs just one constant while all other need two, but the last variant seems to win, it is one insn shorter and surprisingly the multiplication isn't any slower than the addition. All this is using the attached testcase which is just a cut down version of the vect-cvt-1.c testcase from my last patch, just with the ui2f routine executed many times. Sandybridge, -O3 -mavx vanilla -fno-tree-vectorize 0m4.136s vanilla 0m1.052s patch X313a 0m1.689s patch X313b 0m1.398s patch X313c 0m1.115s Sandybridge, -O3 -msse2 vanilla -fno-tree-vectorize 0m4.099s vanilla 0m1.054s patch X313a 0m2.527s patch X313b 0m1.394s patch X313c 0m1.267s Barcelona, -O3 -msse2 vanilla -fno-tree-vectorize 0m9.176s vanilla 0m2.706s patch X313a 0m6.515s patch X313b 0m3.825s patch X313c 0m3.310s So, what do you prefer and do you want the expander to be moved into i386.c or kept like this? Do we perhaps want for -ffast-math keep the slightly faster, but imprecise version (I'd prefer not to)? Jakub
2011-11-03 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (floatuns<sseintvecmodelower><mode>2): Rewritten. * config/i386/i386.c (ix86_expand_sse_movcc): No longer static. * gcc.dg/torture/vec-cvt-1.c: Enable commented out inttoflttestui test. --- gcc/config/i386/i386.c.jj 2011-11-03 16:11:12.000000000 +0100 +++ gcc/config/i386/i386.c 2011-11-03 16:34:23.000000000 +0100 @@ -19006,7 +19006,9 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_ /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical operations. This is used for both scalar and vector conditional moves. */ -static void +extern void ix86_expand_sse_movcc (rtx, rtx, rtx, rtx); + +void ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) { enum machine_mode mode = GET_MODE (dest); --- gcc/config/i386/sse.md.jj 2011-11-03 16:11:05.000000000 +0100 +++ gcc/config/i386/sse.md 2011-11-03 16:49:13.000000000 +0100 @@ -2242,30 +2242,41 @@ (define_insn "float<sseintvecmodelower>< (set_attr "mode" "<sseinsnmode>")]) (define_expand "floatuns<sseintvecmodelower><mode>2" - [(set (match_dup 5) - (float:VF1 - (match_operand:<sseintvecmode> 1 "nonimmediate_operand" ""))) - (set (match_dup 6) - (lt:VF1 (match_dup 5) (match_dup 3))) - (set (match_dup 7) - (and:VF1 (match_dup 6) (match_dup 4))) - (set (match_operand:VF1 0 "register_operand" "") - (plus:VF1 (match_dup 5) (match_dup 7)))] - "TARGET_SSE2" -{ - REAL_VALUE_TYPE TWO32r; - rtx x; - int i; - - real_ldexp (&TWO32r, &dconst1, 32); - x = const_double_from_real_value (TWO32r, SFmode); - - operands[3] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode)); - operands[4] = force_reg (<MODE>mode, - ix86_build_const_vector (<MODE>mode, 1, x)); - - for (i = 5; i < 8; i++) - operands[i] = gen_reg_rtx (<MODE>mode); + [(match_operand:VF1 0 "register_operand" "") + (match_operand:<sseintvecmode> 1 "register_operand" "")] + "TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)" +{ + extern void ix86_expand_sse_movcc (rtx, rtx, rtx, rtx); + + rtx tmp[9]; + tmp[0] = gen_reg_rtx (<sseintvecmode>mode); + tmp[1] = force_reg (<sseintvecmode>mode, CONST0_RTX (<sseintvecmode>mode)); + if (<sseintvecmode>mode == V4SImode) + emit_insn (gen_sse2_gtv4si3 (tmp[0], tmp[1], operands[1])); + else + emit_insn (gen_avx2_gtv8si3 (tmp[0], tmp[1], operands[1])); + tmp[1] = expand_simple_binop (<sseintvecmode>mode, LSHIFTRT, + operands[1], const1_rtx, NULL_RTX, + 1, OPTAB_DIRECT); + tmp[2] = force_reg (<sseintvecmode>mode, + ix86_build_const_vector (<sseintvecmode>mode, + 1, const1_rtx)); + tmp[3] = expand_simple_binop (<sseintvecmode>mode, AND, + operands[1], tmp[2], NULL_RTX, 1, + OPTAB_DIRECT); + tmp[4] = expand_simple_binop (<sseintvecmode>mode, IOR, + tmp[1], tmp[3], NULL_RTX, 1, OPTAB_DIRECT); + tmp[5] = gen_reg_rtx (<sseintvecmode>mode); + ix86_expand_sse_movcc (tmp[5], tmp[0], tmp[4], operands[1]); + tmp[6] = gen_reg_rtx (<MODE>mode); + emit_insn (gen_float<sseintvecmodelower><mode>2 (tmp[6], tmp[5])); + tmp[7] = expand_simple_binop (<MODE>mode, AND, + gen_lowpart (<MODE>mode, tmp[0]), tmp[6], + NULL_RTX, 1, OPTAB_DIRECT); + tmp[8] = expand_simple_binop (<MODE>mode, PLUS, tmp[6], tmp[7], operands[0], + 1, OPTAB_DIRECT); + gcc_assert (tmp[8] == operands[0]); + DONE; }) (define_insn "avx_cvtps2dq256" --- gcc/testsuite/gcc.dg/torture/vec-cvt-1.c.jj 2011-11-03 14:07:23.000000000 +0100 +++ gcc/testsuite/gcc.dg/torture/vec-cvt-1.c 2011-11-03 17:43:11.000000000 +0100 @@ -205,7 +205,7 @@ main () inttoflttestsl (); inttoflttestuc (); inttoflttestus (); -// inttoflttestui (); + inttoflttestui (); inttoflttestul (); return 0; }
2011-11-03 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (floatuns<sseintvecmodelower><mode>2): Rewritten. * gcc.dg/torture/vec-cvt-1.c: Enable commented out inttoflttestui test. --- gcc/config/i386/sse.md.jj 2011-11-03 16:11:05.000000000 +0100 +++ gcc/config/i386/sse.md 2011-11-03 17:12:28.000000000 +0100 @@ -2242,30 +2242,35 @@ (define_insn "float<sseintvecmodelower>< (set_attr "mode" "<sseinsnmode>")]) (define_expand "floatuns<sseintvecmodelower><mode>2" - [(set (match_dup 5) - (float:VF1 - (match_operand:<sseintvecmode> 1 "nonimmediate_operand" ""))) - (set (match_dup 6) - (lt:VF1 (match_dup 5) (match_dup 3))) - (set (match_dup 7) - (and:VF1 (match_dup 6) (match_dup 4))) - (set (match_operand:VF1 0 "register_operand" "") - (plus:VF1 (match_dup 5) (match_dup 7)))] - "TARGET_SSE2" -{ - REAL_VALUE_TYPE TWO32r; - rtx x; - int i; - - real_ldexp (&TWO32r, &dconst1, 32); - x = const_double_from_real_value (TWO32r, SFmode); - - operands[3] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode)); - operands[4] = force_reg (<MODE>mode, - ix86_build_const_vector (<MODE>mode, 1, x)); - - for (i = 5; i < 8; i++) - operands[i] = gen_reg_rtx (<MODE>mode); + [(match_operand:VF1 0 "register_operand" "") + (match_operand:<sseintvecmode> 1 "register_operand" "")] + "TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)" +{ + rtx tmp[8]; + tmp[0] = force_reg (<sseintvecmode>mode, + ix86_build_const_vector (<sseintvecmode>mode, 1, + GEN_INT (0xffff))); + tmp[1] = gen_reg_rtx (<sseintvecmode>mode); + if (<sseintvecmode>mode == V4SImode) + emit_insn (gen_sse2_andnotv4si3 (tmp[1], tmp[0], operands[1])); + else + emit_insn (gen_avx2_andnotv8si3 (tmp[1], tmp[0], operands[1])); + tmp[2] = expand_simple_binop (<sseintvecmode>mode, AND, + operands[1], tmp[0], NULL_RTX, + 1, OPTAB_DIRECT); + tmp[3] = expand_simple_binop (<sseintvecmode>mode, LSHIFTRT, + tmp[1], const1_rtx, NULL_RTX, + 1, OPTAB_DIRECT); + tmp[4] = gen_reg_rtx (<MODE>mode); + emit_insn (gen_float<sseintvecmodelower><mode>2 (tmp[4], tmp[2])); + tmp[5] = gen_reg_rtx (<MODE>mode); + emit_insn (gen_float<sseintvecmodelower><mode>2 (tmp[5], tmp[3])); + tmp[6] = expand_simple_binop (<MODE>mode, PLUS, tmp[5], tmp[5], + NULL_RTX, 1, OPTAB_DIRECT); + tmp[7] = expand_simple_binop (<MODE>mode, PLUS, tmp[4], tmp[6], + operands[0], 1, OPTAB_DIRECT); + gcc_assert (tmp[7] == operands[0]); + DONE; }) (define_insn "avx_cvtps2dq256" --- gcc/testsuite/gcc.dg/torture/vec-cvt-1.c.jj 2011-11-03 14:07:23.000000000 +0100 +++ gcc/testsuite/gcc.dg/torture/vec-cvt-1.c 2011-11-03 17:43:11.000000000 +0100 @@ -205,7 +205,7 @@ main () inttoflttestsl (); inttoflttestuc (); inttoflttestus (); -// inttoflttestui (); + inttoflttestui (); inttoflttestul (); return 0; }
2011-11-03 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (floatuns<sseintvecmodelower><mode>2): Rewritten. * gcc.dg/torture/vec-cvt-1.c: Enable commented out inttoflttestui test. --- gcc/config/i386/sse.md.jj 2011-11-03 16:11:05.000000000 +0100 +++ gcc/config/i386/sse.md 2011-11-03 17:25:22.000000000 +0100 @@ -2242,30 +2242,35 @@ (define_insn "float<sseintvecmodelower>< (set_attr "mode" "<sseinsnmode>")]) (define_expand "floatuns<sseintvecmodelower><mode>2" - [(set (match_dup 5) - (float:VF1 - (match_operand:<sseintvecmode> 1 "nonimmediate_operand" ""))) - (set (match_dup 6) - (lt:VF1 (match_dup 5) (match_dup 3))) - (set (match_dup 7) - (and:VF1 (match_dup 6) (match_dup 4))) - (set (match_operand:VF1 0 "register_operand" "") - (plus:VF1 (match_dup 5) (match_dup 7)))] - "TARGET_SSE2" -{ - REAL_VALUE_TYPE TWO32r; - rtx x; - int i; - - real_ldexp (&TWO32r, &dconst1, 32); - x = const_double_from_real_value (TWO32r, SFmode); - - operands[3] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode)); - operands[4] = force_reg (<MODE>mode, - ix86_build_const_vector (<MODE>mode, 1, x)); - - for (i = 5; i < 8; i++) - operands[i] = gen_reg_rtx (<MODE>mode); + [(match_operand:VF1 0 "register_operand" "") + (match_operand:<sseintvecmode> 1 "register_operand" "")] + "TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)" +{ + rtx tmp[8]; + REAL_VALUE_TYPE TWO16r; + tmp[0] = force_reg (<sseintvecmode>mode, + ix86_build_const_vector (<sseintvecmode>mode, 1, + GEN_INT (0xffff))); + tmp[1] = expand_simple_binop (<sseintvecmode>mode, AND, + operands[1], tmp[0], NULL_RTX, + 1, OPTAB_DIRECT); + tmp[2] = expand_simple_binop (<sseintvecmode>mode, LSHIFTRT, + operands[1], GEN_INT (16), NULL_RTX, + 1, OPTAB_DIRECT); + tmp[3] = gen_reg_rtx (<MODE>mode); + emit_insn (gen_float<sseintvecmodelower><mode>2 (tmp[3], tmp[1])); + tmp[4] = gen_reg_rtx (<MODE>mode); + emit_insn (gen_float<sseintvecmodelower><mode>2 (tmp[4], tmp[2])); + real_ldexp (&TWO16r, &dconst1, 16); + tmp[5] = const_double_from_real_value (TWO16r, SFmode); + tmp[5] = force_reg (<MODE>mode, + ix86_build_const_vector (<MODE>mode, 1, tmp[5])); + tmp[6] = expand_simple_binop (<MODE>mode, MULT, tmp[4], tmp[5], + NULL_RTX, 1, OPTAB_DIRECT); + tmp[7] = expand_simple_binop (<MODE>mode, PLUS, tmp[3], tmp[6], + operands[0], 1, OPTAB_DIRECT); + gcc_assert (tmp[7] == operands[0]); + DONE; }) (define_insn "avx_cvtps2dq256" --- gcc/testsuite/gcc.dg/torture/vec-cvt-1.c.jj 2011-11-03 14:07:23.000000000 +0100 +++ gcc/testsuite/gcc.dg/torture/vec-cvt-1.c 2011-11-03 17:43:11.000000000 +0100 @@ -205,7 +205,7 @@ main () inttoflttestsl (); inttoflttestuc (); inttoflttestus (); -// inttoflttestui (); + inttoflttestui (); inttoflttestul (); return 0; }
#define N 1024 unsigned int ui[N]; float f[N]; #define FN1(from, to) \ __attribute__((noinline, noclone)) void \ from##2##to (void) \ { \ int i; \ for (i = 0; i < N; i++) \ to[i] = from[i]; \ } #define FN(intt, fltt) FN1 (intt, fltt) FN1 (fltt, intt) FN (ui, f) #define FLTTEST(min, max, intt) \ __attribute__((noinline, noclone)) void \ inttoflttest##intt (void) \ { \ int i; \ volatile float vf; \ volatile double vd; \ for (i = 0; i < N; i++) \ { \ asm (""); \ if (i < N / 4) \ intt[i] = min + i; \ else if (i < 3 * N / 4) \ intt[i] = (max + min) / 2 - N * 8 + 16 * i; \ else \ intt[i] = max - N + 1 + i; \ } \ for (i = 0; i < 5000000; i++) \ intt##2f (); \ } FLTTEST (0, 2U * __INT_MAX__ + 1, ui) int main () { inttoflttestui (); return 0; }