On Sat, Apr 27, 2013 at 12:20 AM, Jakub Jelinek <ja...@redhat.com> wrote:
> On > #define N 4096 > unsigned int b[N], d[N]; > > void > foo (void) > { > int i; > for (i = 0; i < N; i++) > d[i] = b[i] / 3; > } > testcase I was looking earlier today because of the XOP issues, > I've noticed we generate unnecessary code: > vmovdqa .LC0(%rip), %ymm2 > ... > vpsrlq $32, %ymm2, %ymm3 > before the loop and in the loop: > vmovdqa b(%rax), %ymm0 > vpmuludq b(%rax), %ymm2, %ymm1 > ... > vpsrlq $32, %ymm0, %ymm0 > vpmuludq %ymm3, %ymm0, %ymm0 > ... > .LC0: > .long -1431655765 > .long -1431655765 > .long -1431655765 > .long -1431655765 > .long -1431655765 > .long -1431655765 > .long -1431655765 > .long -1431655765 > The first vpsrlq and having an extra register live across the loop is not > needed, if each pair of constants in the constant vector is equal, we can > just use .LC0(%rip) (i.e. %ymm2 above) in both places. > > Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux. > > Apparently ix86_expand_binop_builtin wasn't prepared for NULL predicates > (but generic code is), alternatively perhaps I could add a predicate > that would accept nonimmediate_operand or CONSTANT_VECTOR if that is > preferrable that way. Yes, please add a new predicate, the pattern is much more descriptive this way. (Without the predicate, it looks like an expander that generates a RTX fragment, used instead of gen_RTX... sequences). OTOH, does vector mode "general_operand" still accept scalar immediates? The predicate, proposed above is effectively "general_vector_operand", where only operands (including immediates) with vector modes should be accepted. IIRC, there were some problems with VOIDmode CONST0_RTX leaking through vector expanders, and there is some code in ix86_expand_..._builtin that deals with this situation. > Also, not sure if force_reg or copy_to_mode_reg is preferrable. I think this approach was just copied from generic code - or perhaps historical code due to CONST0_RTX leaks, as described above (I'm away from my keyboard, and not in the position to check this issue in the code ATM). > 2013-04-26 Jakub Jelinek <ja...@redhat.com> > > * config/i386/i386.c (ix86_expand_binop_builtin): Allow NULL > predicate. > (const_vector_equal_evenodd_p): New function. > (ix86_expand_mul_widen_evenodd): Force op1 resp. op2 into register > if they aren't nonimmediate operands. If their original values > satisfy const_vector_equal_evenodd_p, don't shift them. > * config/i386/sse.md (mul<mode>3): Remove predicates. For the > SSE4.1 case force operands[{1,2}] into registers if not > nonimmediate_operand. > (vec_widen_smult_even_v4si): Use nonimmediate_operand predicates > instead of register_operand. > (vec_widen_<s>mult_odd_<mode>): Remove predicates. > > --- gcc/config/i386/i386.c.jj 2013-04-26 15:11:37.000000000 +0200 > +++ gcc/config/i386/i386.c 2013-04-26 19:03:54.777293448 +0200 > @@ -30149,9 +30150,11 @@ ix86_expand_binop_builtin (enum insn_cod > op1 = gen_lowpart (TImode, x); > } > > - if (!insn_data[icode].operand[1].predicate (op0, mode0)) > + if (insn_data[icode].operand[1].predicate > + && !insn_data[icode].operand[1].predicate (op0, mode0)) > op0 = copy_to_mode_reg (mode0, op0); > - if (!insn_data[icode].operand[2].predicate (op1, mode1)) > + if (insn_data[icode].operand[2].predicate > + && !insn_data[icode].operand[2].predicate (op1, mode1)) > op1 = copy_to_mode_reg (mode1, op1); > > pat = GEN_FCN (icode) (target, op0, op1); > @@ -40826,6 +40829,24 @@ ix86_expand_vecop_qihi (enum rtx_code co > gen_rtx_fmt_ee (code, qimode, op1, op2)); > } > > +/* Helper function of ix86_expand_mul_widen_evenodd. Return true > + if op is CONST_VECTOR with all odd elements equal to their > + preceeding element. */ > + > +static bool > +const_vector_equal_evenodd_p (rtx op) > +{ > + enum machine_mode mode = GET_MODE (op); > + int i, nunits = GET_MODE_NUNITS (mode); > + if (GET_CODE (op) != CONST_VECTOR > + || nunits != CONST_VECTOR_NUNITS (op)) > + return false; > + for (i = 0; i < nunits; i += 2) > + if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1)) > + return false; > + return true; > +} > + > void > ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, > bool uns_p, bool odd_p) > @@ -40833,6 +40854,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, > enum machine_mode mode = GET_MODE (op1); > enum machine_mode wmode = GET_MODE (dest); > rtx x; > + rtx orig_op1 = op1, orig_op2 = op2; > + > + if (!nonimmediate_operand (op1, mode)) > + op1 = force_reg (mode, op1); > + if (!nonimmediate_operand (op2, mode)) > + op2 = force_reg (mode, op2); > > /* We only play even/odd games with vectors of SImode. */ > gcc_assert (mode == V4SImode || mode == V8SImode); > @@ -40849,10 +40876,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, > } > > x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); > - op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), > - x, NULL, 1, OPTAB_DIRECT); > - op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), > - x, NULL, 1, OPTAB_DIRECT); > + if (!const_vector_equal_evenodd_p (orig_op1)) > + op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), > + x, NULL, 1, OPTAB_DIRECT); > + if (!const_vector_equal_evenodd_p (orig_op2)) > + op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), > + x, NULL, 1, OPTAB_DIRECT); > op1 = gen_lowpart (mode, op1); > op2 = gen_lowpart (mode, op2); > } > --- gcc/config/i386/sse.md.jj 2013-04-26 15:11:37.000000000 +0200 > +++ gcc/config/i386/sse.md 2013-04-26 18:59:03.838753277 +0200 > @@ -5631,14 +5631,16 @@ (define_insn "*sse2_pmaddwd" > (define_expand "mul<mode>3" > [(set (match_operand:VI4_AVX2 0 "register_operand") > (mult:VI4_AVX2 > - (match_operand:VI4_AVX2 1 "nonimmediate_operand") > - (match_operand:VI4_AVX2 2 "nonimmediate_operand")))] > + (match_operand:VI4_AVX2 1) > + (match_operand:VI4_AVX2 2)))] > "TARGET_SSE2" > { > if (TARGET_SSE4_1) > { > - if (CONSTANT_P (operands[2])) > - operands[2] = force_const_mem (<MODE>mode, operands[2]); > + if (!nonimmediate_operand (operands[1], <MODE>mode)) > + operands[1] = force_reg (<MODE>mode, operands[1]); > + if (!nonimmediate_operand (operands[2], <MODE>mode)) > + operands[2] = force_reg (<MODE>mode, operands[2]); > ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands); > } > else > @@ -5702,8 +5704,8 @@ (define_expand "vec_widen_<s>mult_lo_<mo > ;; named patterns, but signed V4SI needs special help for plain SSE2. > (define_expand "vec_widen_smult_even_v4si" > [(match_operand:V2DI 0 "register_operand") > - (match_operand:V4SI 1 "register_operand") > - (match_operand:V4SI 2 "register_operand")] > + (match_operand:V4SI 1 "nonimmediate_operand") > + (match_operand:V4SI 2 "nonimmediate_operand")] > "TARGET_SSE2" > { > ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2], > @@ -5714,8 +5716,8 @@ (define_expand "vec_widen_smult_even_v4s > (define_expand "vec_widen_<s>mult_odd_<mode>" > [(match_operand:<sseunpackmode> 0 "register_operand") > (any_extend:<sseunpackmode> > - (match_operand:VI4_AVX2 1 "register_operand")) > - (match_operand:VI4_AVX2 2 "register_operand")] > + (match_operand:VI4_AVX2 1)) > + (match_operand:VI4_AVX2 2)] > "TARGET_SSE2" > { > ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2], > > Jakub