--- gcc/ChangeLog | 19 ++++++ gcc/config/i386/i386-builtin-types.def | 5 +- gcc/config/i386/i386.c | 103 +++++++++++++++++++++++++++++++- gcc/config/i386/sse.md | 14 +++++ 4 files changed, 137 insertions(+), 4 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b95eab5..f63f523 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,24 @@ 2012-06-25 Richard Henderson <r...@redhat.com> + * config/i386/i386-builtin-types.def (V4UDI, V8USI): New. + (V2UDI_FUNC_V4USI_V4USI): New. + (V4UDI_FUNC_V8USI_V8USI): New. + * config/i386/i386.c (ix86_expand_args_builtin): Handle them. + (IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI): New. + (IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI): New. + (IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI): New. + (IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI): New. + (IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI): New. + (IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI): New. + (bdesc_args): Add them. + (ix86_builtin_mul_widen_even, ix86_builtin_mul_widen_odd): New. + (TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN): New. + (TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD): New. + (ix86_expand_mul_widen_evenodd): Use xop_pmacsdqh. + * config/i386/sse.md (vec_widen_<s>mult_odd_<VI124_AVX2>): New. + +2012-06-25 Richard Henderson <r...@redhat.com> + * config/i386/sse.md (mul<VI4_AVX2>3): Use xop_pmacsdd. 
2012-06-25 Richard Henderson <r...@redhat.com> diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 401668a..398bf0a 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -97,7 +97,8 @@ DEF_VECTOR_TYPE (V4DI, DI) DEF_VECTOR_TYPE (V8SI, SI) DEF_VECTOR_TYPE (V16HI, HI) DEF_VECTOR_TYPE (V32QI, QI) - +DEF_VECTOR_TYPE (V4UDI, UDI, V4DI) +DEF_VECTOR_TYPE (V8USI, USI, V8SI) DEF_POINTER_TYPE (PCCHAR, CHAR, CONST) DEF_POINTER_TYPE (PCDOUBLE, DOUBLE, CONST) @@ -283,6 +284,7 @@ DEF_FUNCTION_TYPE (V2DI, V2DI, SI) DEF_FUNCTION_TYPE (V2DI, V2DI, V16QI) DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI) DEF_FUNCTION_TYPE (V2DI, V4SI, V4SI) +DEF_FUNCTION_TYPE (V2UDI, V4USI, V4USI) DEF_FUNCTION_TYPE (V2DI, PCV2DI, V2DI) DEF_FUNCTION_TYPE (V2SF, V2SF, V2SF) DEF_FUNCTION_TYPE (V2SI, INT, INT) @@ -349,6 +351,7 @@ DEF_FUNCTION_TYPE (V8SI, V8SI, SI) DEF_FUNCTION_TYPE (V8SI, PCV8SI, V8SI) DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI) DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI) +DEF_FUNCTION_TYPE (V4UDI, V8USI, V8USI) DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI) DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI) DEF_FUNCTION_TYPE (V4DI, V4DI, INT) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index a1b7628..c825033 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -25754,6 +25754,13 @@ enum ix86_builtins IX86_BUILTIN_CPYSGNPS256, IX86_BUILTIN_CPYSGNPD256, + IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, + IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI, + IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI, + IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI, + IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI, + IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI, + /* FMA4 instructions. 
*/ IX86_BUILTIN_VFMADDSS, IX86_BUILTIN_VFMADDSD, @@ -26612,6 +26619,8 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_vw_umul_even_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_odd_v4si, "__builtin_ia32_vw_umul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI }, @@ -26738,6 +26747,7 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI }, { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI }, { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI }, /* SSE4.1 */ @@ -27004,12 +27014,15 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI }, { OPTION_MASK_ISA_AVX2, 
CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_odd_v8si, "__builtin_ia32_vw_smul_odd_v8si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_vw_umul_even_v8si" , IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI, UNKNOWN, (int) V4UDI_FTYPE_V8USI_V8USI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_odd_v8si, "__builtin_ia32_vw_umul_odd_v8si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI, UNKNOWN, (int) V4UDI_FTYPE_V8USI_V8USI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, 
"__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI }, @@ -29142,6 +29155,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DI_FTYPE_V2DI_V2DI: case V2DI_FTYPE_V16QI_V16QI: case V2DI_FTYPE_V4SI_V4SI: + case V2UDI_FTYPE_V4USI_V4USI: case V2DI_FTYPE_V2DI_V16QI: case V2DI_FTYPE_V2DF_V2DF: case V2SI_FTYPE_V2SI_V2SI: @@ -29166,6 +29180,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8SI_FTYPE_V16HI_V16HI: case V4DI_FTYPE_V4DI_V4DI: case V4DI_FTYPE_V8SI_V8SI: + case V4UDI_FTYPE_V8USI_V8USI: if (comparison == UNKNOWN) return ix86_expand_binop_builtin (icode, exp, target); nargs = 2; @@ -31042,6 +31057,78 @@ ix86_builtin_reciprocal (unsigned int fn, bool md_fn, return NULL_TREE; } } + +static tree +ix86_builtin_mul_widen_even (tree type) +{ + bool uns_p = TYPE_UNSIGNED (type); + enum ix86_builtins code; + + switch (TYPE_MODE (type)) + { + case V4SImode: + if (uns_p) + { + if (!TARGET_SSE2) + return NULL; + code = IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI; + } + else + { + if (!TARGET_SSE4_1) + return NULL; + code = IX86_BUILTIN_PMULDQ128; + } + break; + + case V8SImode: + if (!TARGET_AVX2) + return NULL; + code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI + : IX86_BUILTIN_PMULDQ256); + break; + + default: + return NULL; + } + return ix86_builtins[code]; +} + +static tree +ix86_builtin_mul_widen_odd (tree type) +{ + bool uns_p = TYPE_UNSIGNED (type); + enum ix86_builtins code; + + switch (TYPE_MODE (type)) + { + case V4SImode: + if (uns_p) + { + if (!TARGET_SSE2) + return NULL; + code = IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI; + } + else + { + if (!TARGET_SSE4_1) + return NULL; + code = IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI; + } + break; + + case V8SImode: + if (!TARGET_AVX2) + return NULL; + code = (uns_p ? 
IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI + : IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI); + break; + + default: + return NULL; + } + return ix86_builtins[code]; +} /* Helper for avx_vpermilps256_operand et al. This is also used by the expansion functions to turn the parallel back into a mask. @@ -38663,6 +38750,7 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, bool uns_p, bool odd_p) { enum machine_mode mode = GET_MODE (op1); + enum machine_mode wmode = GET_MODE (dest); rtx x; /* We only play even/odd games with vectors of SImode. */ @@ -38672,8 +38760,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, the even slots. For some cpus this is faster than a PSHUFD. */ if (odd_p) { - enum machine_mode wmode = GET_MODE (dest); - + if (TARGET_XOP && mode == V4SImode) + { + x = force_reg (wmode, CONST0_RTX (wmode)); + emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); + return; + } op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL, 1, OPTAB_DIRECT); @@ -38697,7 +38789,7 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2); else if (TARGET_XOP) { - x = force_reg (V2DImode, CONST0_RTX (V2DImode)); + x = force_reg (wmode, CONST0_RTX (wmode)); x = gen_xop_pmacsdql (dest, op1, op2, x); } else @@ -39980,6 +40072,11 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val) #undef TARGET_VECTORIZE_BUILTIN_GATHER #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather +#undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN +#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN ix86_builtin_mul_widen_even +#undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD +#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD ix86_builtin_mul_widen_odd + #undef TARGET_BUILTIN_RECIPROCAL #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 93cd9d7..45d3a9c 100644 --- a/gcc/config/i386/sse.md +++ 
b/gcc/config/i386/sse.md @@ -5708,6 +5708,20 @@ DONE; }) +(define_expand "vec_widen_<s>mult_odd_<mode>" + [(match_operand:<sseunpackmode> 0 "register_operand") + (any_extend:<sseunpackmode> + (match_operand:VI124_AVX2 1 "register_operand")) + (match_operand:VI124_AVX2 2 "register_operand")] + ; Note that SSE2 does not have signed SI multiply + "TARGET_AVX || TARGET_XOP || TARGET_SSE4_1 + || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))" +{ + ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2], + <u_bool>, true); + DONE; +}) + (define_expand "sdot_prod<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") (match_operand:VI2_AVX2 1 "register_operand") -- 1.7.10.2