Hi Jakub,

For 401.bzip2 it looks perfect.  This loop is vectorized:

.L6:
        vmovdqa (%rsi,%rax), %ymm0
        addl    $1, %ecx
        vpsrad  $8, %ymm0, %ymm0
        vpsrld  $31, %ymm0, %ymm1
        vpaddd  %ymm1, %ymm0, %ymm0
        vpsrad  $1, %ymm0, %ymm0
        vpaddd  %ymm2, %ymm0, %ymm0
        vpslld  $8, %ymm0, %ymm0
        vmovdqa %ymm0, (%rsi,%rax)
        addq    $32, %rax
        cmpl    %r8d, %ecx
        jne     .L3
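For reference, each vector lane above computes the following scalar C;
this is the standard branchless signed division by 2, truncating toward
zero.  This is only a sketch of mine (the f1_scalar name is not from the
patch), assuming 32-bit int and GCC's arithmetic right shift on signed
values:

/* Scalar equivalent of one lane of the vectorized loop above;
   corresponds to f1 from the testcase in the patch below.  */
void
f1_scalar (int *a, int n)
{
  int i;
  for (i = 1; i <= n; i++)
    {
      int j = a[i] >> 8;                  /* vpsrad $8 */
      int bias = (unsigned int) j >> 31;  /* vpsrld $31: 1 iff j < 0 */
      j = (j + bias) >> 1;                /* vpaddd; vpsrad $1: j / 2 */
      a[i] = (1 + j) << 8;                /* vpaddd %ymm2 (1 per lane); vpslld $8 */
    }
}

The vpsrld $31 / vpaddd pair is exactly the S3/S2 bias-and-shift
sequence the new pattern emits.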
Unfortunately, I have no performance simulator for AVX2, but I believe
it will benefit, since in the absence of that optimization we get (same
options):

.L3:
        movl    (%rax), %edx
        sarl    $8, %edx
        movl    %edx, %ecx
        shrl    $31, %ecx
        addl    %ecx, %edx
        sarl    %edx
        addl    $1, %edx
        sall    $8, %edx
        movl    %edx, (%rax)
        addq    $4, %rax
        cmpq    %rsi, %rax
        jne     .L3

Thanks, K

On Wed, Dec 14, 2011 at 4:25 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> On Tue, Dec 13, 2011 at 05:57:40PM +0400, Kirill Yukhin wrote:
>> > Let me hack up a quick pattern recognizer for this...
>
> Here it is, untested so far.
> On the testcase doing 2000000 f1+f2+f3+f4 calls in the loop with -O3 -mavx
> on Sandybridge (so, vectorized just with 16 byte vectors) gives:
> vanilla                                  0m34.571s
> the tree-vect* parts of this patch only  0m9.013s
> the whole patch                          0m8.824s
> The i386 parts are just a small optimization, I guess it could be
> done in the vectorizer too (but then we'd have to check whether the
> arithmetic/logical right shifts are supported and check costs?), or
> perhaps in the generic vcond expander (again, we'd need to check some
> costs).
>
> Can you see if it gives a measurable speedup on SPEC bzip2?
>
> 2011-12-14  Jakub Jelinek  <ja...@redhat.com>
>
>         * tree-vectorizer.h (NUM_PATTERNS): Bump to 10.
>         * tree-vect-patterns.c (vect_recog_sdivmod_pow2_pattern): New
>         function.
>         (vect_vect_recog_func_ptrs): Add it.
>
>         * config/i386/sse.md (vcond<V_256:mode><VI_256:mode>,
>         vcond<V_128:mode><VI124_128:mode>, vcond<VI8F_128:mode>v2di):
>         Use general_operand instead of nonimmediate_operand for
>         operand 5 and no predicate for operands 1 and 2.
>         * config/i386/i386.c (ix86_expand_int_vcond): Optimize
>         x < 0 ? -1 : 0 and x < 0 ? 1 : 0 into vector arithmetic
>         resp. logical shift.
>
>         * gcc.dg/vect/vect-sdivmod-1.c: New test.
>
> --- gcc/tree-vectorizer.h.jj    2011-12-14 08:11:03.592010101 +0100
> +++ gcc/tree-vectorizer.h       2011-12-14 08:28:57.006693371 +0100
> @@ -929,7 +929,7 @@ extern void vect_slp_transform_bb (basic
>     Additional pattern recognition functions can (and will) be added
>     in the future.  */
>  typedef gimple (* vect_recog_func_ptr) (VEC (gimple, heap) **, tree *, tree *);
> -#define NUM_PATTERNS 9
> +#define NUM_PATTERNS 10
>  void vect_pattern_recog (loop_vec_info);
>
>  /* In tree-vectorizer.c.  */
> --- gcc/tree-vect-patterns.c.jj 2011-12-02 01:52:27.918883940 +0100
> +++ gcc/tree-vect-patterns.c    2011-12-14 11:50:10.439763740 +0100
> @@ -53,6 +53,8 @@ static gimple vect_recog_widen_shift_pat
>                                                   tree *, tree *);
>  static gimple vect_recog_vector_vector_shift_pattern (VEC (gimple, heap) **,
>                                                        tree *, tree *);
> +static gimple vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **,
> +                                               tree *, tree *);
>  static gimple vect_recog_mixed_size_cond_pattern (VEC (gimple, heap) **,
>                                                    tree *, tree *);
>  static gimple vect_recog_bool_pattern (VEC (gimple, heap) **, tree *, tree *);
> @@ -64,6 +66,7 @@ static vect_recog_func_ptr vect_vect_rec
>         vect_recog_over_widening_pattern,
>         vect_recog_widen_shift_pattern,
>         vect_recog_vector_vector_shift_pattern,
> +       vect_recog_sdivmod_pow2_pattern,
>         vect_recog_mixed_size_cond_pattern,
>         vect_recog_bool_pattern};
>
> @@ -1573,6 +1576,211 @@ vect_recog_vector_vector_shift_pattern (
>    return pattern_stmt;
>  }
>
> +/* Detect a signed division by power of two constant that wouldn't be
> +   otherwise vectorized:
> +
> +   type a_t, b_t;
> +
> +   S1 a_t = b_t / N;
> +
> +   where type 'type' is a signed integral type and N is a constant positive
> +   power of two.
> +
> +   Similarly handle signed modulo by power of two constant:
> +
> +   S4 a_t = b_t % N;
> +
> +   Input/Output:
> +
> +   * STMTS: Contains a stmt from which the pattern search begins,
> +     i.e. the division stmt.  S1 is replaced by:
> +     S3  y_t = b_t < 0 ? N - 1 : 0;
> +     S2  x_t = b_t + y_t;
> +     S1' a_t = x_t >> log2 (N);
> +
> +     S4 is replaced by (where *_T temporaries have unsigned type):
> +     S9  y_T = b_t < 0 ? -1U : 0U;
> +     S8  z_T = y_T >> (sizeof (type_t) * CHAR_BIT - log2 (N));
> +     S7  z_t = (type) z_T;
> +     S6  w_t = b_t + z_t;
> +     S5  x_t = w_t & (N - 1);
> +     S4' a_t = x_t - z_t;
> +
> +   Output:
> +
> +   * TYPE_IN: The type of the input arguments to the pattern.
> +
> +   * TYPE_OUT: The type of the output of this pattern.
> +
> +   * Return value: A new stmt that will be used to replace the division
> +     S1 or modulo S4 stmt.  */
> +
> +static gimple
> +vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **stmts,
> +                                 tree *type_in, tree *type_out)
> +{
> +  gimple last_stmt = VEC_pop (gimple, *stmts);
> +  gimple_stmt_iterator gsi;
> +  tree oprnd0, oprnd1, vectype, itype, cond;
> +  gimple pattern_stmt, def_stmt;
> +  enum tree_code rhs_code;
> +  stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
> +  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
> +  optab optab;
> +
> +  if (!is_gimple_assign (last_stmt))
> +    return NULL;
> +
> +  rhs_code = gimple_assign_rhs_code (last_stmt);
> +  switch (rhs_code)
> +    {
> +    case TRUNC_DIV_EXPR:
> +    case TRUNC_MOD_EXPR:
> +      break;
> +    default:
> +      return NULL;
> +    }
> +
> +  if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
> +    return NULL;
> +
> +  oprnd0 = gimple_assign_rhs1 (last_stmt);
> +  oprnd1 = gimple_assign_rhs2 (last_stmt);
> +  itype = TREE_TYPE (oprnd0);
> +  if (TREE_CODE (oprnd0) != SSA_NAME
> +      || TREE_CODE (oprnd1) != INTEGER_CST
> +      || TREE_CODE (itype) != INTEGER_TYPE
> +      || TYPE_UNSIGNED (itype)
> +      || TYPE_PRECISION (itype) != GET_MODE_PRECISION (TYPE_MODE (itype))
> +      || !integer_pow2p (oprnd1)
> +      || tree_int_cst_sgn (oprnd1) != 1)
> +    return NULL;
> +
> +  vectype = get_vectype_for_scalar_type (itype);
> +  if (vectype == NULL_TREE)
> +    return NULL;
> +
> +  /* If the target can handle vectorized division or modulo natively,
> +     don't attempt to optimize this.  */
> +  optab = optab_for_tree_code (rhs_code, vectype, optab_default);
> +  if (optab != NULL)
> +    {
> +      enum machine_mode vec_mode = TYPE_MODE (vectype);
> +      int icode = (int) optab_handler (optab, vec_mode);
> +      if (icode != CODE_FOR_nothing
> +          || GET_MODE_SIZE (vec_mode) == UNITS_PER_WORD)
> +        return NULL;
> +    }
> +
> +  /* Pattern detected.  */
> +  if (vect_print_dump_info (REPORT_DETAILS))
> +    fprintf (vect_dump, "vect_recog_sdivmod_pow2_pattern: detected: ");
> +
> +  cond = build2 (LT_EXPR, boolean_type_node, oprnd0,
> +                 build_int_cst (itype, 0));
> +  gsi = gsi_for_stmt (last_stmt);
> +  if (rhs_code == TRUNC_DIV_EXPR)
> +    {
> +      tree var = vect_recog_temp_ssa_var (itype, NULL);
> +      def_stmt
> +        = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
> +                                         fold_build2 (MINUS_EXPR, itype,
> +                                                      oprnd1,
> +                                                      build_int_cst (itype,
> +                                                                     1)),
> +                                         build_int_cst (itype, 0));
> +      gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +      set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
> +                                                       NULL));
> +      var = vect_recog_temp_ssa_var (itype, NULL);
> +      def_stmt
> +        = gimple_build_assign_with_ops (PLUS_EXPR, var, oprnd0,
> +                                        gimple_assign_lhs (def_stmt));
> +      STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
> +
> +      pattern_stmt
> +        = gimple_build_assign_with_ops (RSHIFT_EXPR,
> +                                        vect_recog_temp_ssa_var (itype, NULL),
> +                                        var,
> +                                        build_int_cst (itype,
> +                                                       tree_log2 (oprnd1)));
> +    }
> +  else
> +    {
> +      tree signmask;
> +      tree utype = build_nonstandard_integer_type (TYPE_PRECISION (itype), 1);
> +      tree shift = build_int_cst (utype, GET_MODE_BITSIZE (TYPE_MODE (itype))
> +                                         - tree_log2 (oprnd1));
> +      if (compare_tree_int (oprnd1, 2) == 0)
> +        {
> +          signmask = vect_recog_temp_ssa_var (itype, NULL);
> +          def_stmt
> +            = gimple_build_assign_with_ops3 (COND_EXPR, signmask, cond,
> +                                             build_int_cst (itype, 1),
> +                                             build_int_cst (itype, 0));
> +          gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +          set_vinfo_for_stmt (def_stmt,
> +                              new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> +        }
> +      else
> +        {
> +          tree var = vect_recog_temp_ssa_var (utype, NULL);
> +          def_stmt
> +            = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
> +                                             build_int_cst (utype, -1),
> +                                             build_int_cst (utype, 0));
> +          gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +          set_vinfo_for_stmt (def_stmt,
> +                              new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> +          var = vect_recog_temp_ssa_var (utype, NULL);
> +          def_stmt
> +            = gimple_build_assign_with_ops (RSHIFT_EXPR, var,
> +                                            gimple_assign_lhs (def_stmt),
> +                                            shift);
> +          gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +          set_vinfo_for_stmt (def_stmt,
> +                              new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> +          signmask = vect_recog_temp_ssa_var (itype, NULL);
> +          def_stmt
> +            = gimple_build_assign_with_ops (NOP_EXPR, signmask, var,
> +                                            NULL_TREE);
> +          gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +          set_vinfo_for_stmt (def_stmt,
> +                              new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
> +        }
> +      def_stmt
> +        = gimple_build_assign_with_ops (PLUS_EXPR,
> +                                        vect_recog_temp_ssa_var (itype, NULL),
> +                                        oprnd0, signmask);
> +      gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
> +      set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
> +                                                       NULL));
> +      def_stmt
> +        = gimple_build_assign_with_ops (BIT_AND_EXPR,
> +                                        vect_recog_temp_ssa_var (itype, NULL),
> +                                        gimple_assign_lhs (def_stmt),
> +                                        fold_build2 (MINUS_EXPR, itype,
> +                                                     oprnd1,
> +                                                     build_int_cst (itype,
> +                                                                    1)));
> +      STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
> +
> +      pattern_stmt
> +        = gimple_build_assign_with_ops (MINUS_EXPR,
> +                                        vect_recog_temp_ssa_var (itype, NULL),
> +                                        gimple_assign_lhs (def_stmt),
> +                                        signmask);
> +    }
> +
> +  if (vect_print_dump_info (REPORT_DETAILS))
> +    print_gimple_stmt (vect_dump, pattern_stmt, 0, TDF_SLIM);
> +
> +  VEC_safe_push (gimple, heap, *stmts, last_stmt);
> +
> +  *type_in = vectype;
> +  *type_out = vectype;
> +
> +  return pattern_stmt;
> +}
> +
>  /* Function vect_recog_mixed_size_cond_pattern
>
>     Try to find the following pattern:
> --- gcc/config/i386/sse.md.jj   2011-12-03 12:22:33.000000000 +0100
> +++ gcc/config/i386/sse.md      2011-12-14 13:00:47.695788256 +0100
> @@ -6340,9 +6340,9 @@ (define_expand "vcond<V_256:mode><VI_256
>         (if_then_else:V_256
>           (match_operator 3 ""
>             [(match_operand:VI_256 4 "nonimmediate_operand" "")
> -            (match_operand:VI_256 5 "nonimmediate_operand" "")])
> -         (match_operand:V_256 1 "general_operand" "")
> -         (match_operand:V_256 2 "general_operand" "")))]
> +            (match_operand:VI_256 5 "general_operand" "")])
> +         (match_operand:V_256 1 "" "")
> +         (match_operand:V_256 2 "" "")))]
>    "TARGET_AVX2
>     && (GET_MODE_NUNITS (<V_256:MODE>mode)
>         == GET_MODE_NUNITS (<VI_256:MODE>mode))"
> @@ -6357,9 +6357,9 @@ (define_expand "vcond<V_128:mode><VI124_
>         (if_then_else:V_128
>           (match_operator 3 ""
>             [(match_operand:VI124_128 4 "nonimmediate_operand" "")
> -            (match_operand:VI124_128 5 "nonimmediate_operand" "")])
> -         (match_operand:V_128 1 "general_operand" "")
> -         (match_operand:V_128 2 "general_operand" "")))]
> +            (match_operand:VI124_128 5 "general_operand" "")])
> +         (match_operand:V_128 1 "" "")
> +         (match_operand:V_128 2 "" "")))]
>    "TARGET_SSE2
>     && (GET_MODE_NUNITS (<V_128:MODE>mode)
>         == GET_MODE_NUNITS (<VI124_128:MODE>mode))"
> @@ -6374,9 +6374,9 @@ (define_expand "vcond<VI8F_128:mode>v2di
>         (if_then_else:VI8F_128
>           (match_operator 3 ""
>             [(match_operand:V2DI 4 "nonimmediate_operand" "")
> -            (match_operand:V2DI 5 "nonimmediate_operand" "")])
> -         (match_operand:VI8F_128 1 "general_operand" "")
> -         (match_operand:VI8F_128 2 "general_operand" "")))]
> +            (match_operand:V2DI 5 "general_operand" "")])
> +         (match_operand:VI8F_128 1 "" "")
> +         (match_operand:VI8F_128 2 "" "")))]
>    "TARGET_SSE4_2"
>  {
>    bool ok = ix86_expand_int_vcond (operands);
> --- gcc/config/i386/i386.c.jj   2011-12-14 08:11:01.000000000 +0100
> +++ gcc/config/i386/i386.c      2011-12-14 13:01:30.207538465 +0100
> @@ -19434,6 +19434,45 @@ ix86_expand_int_vcond (rtx operands[])
>    cop0 = operands[4];
>    cop1 = operands[5];
>
> +  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
> +     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
> +  if ((code == LT || code == GE)
> +      && data_mode == mode
> +      && cop1 == CONST0_RTX (mode)
> +      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
> +      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
> +      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
> +      && (GET_MODE_SIZE (data_mode) == 16
> +          || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
> +    {
> +      rtx negop = operands[2 - (code == LT)];
> +      int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
> +      if (negop == CONST1_RTX (data_mode))
> +        {
> +          rtx res = expand_simple_binop (mode, LSHIFTRT, cop0,
> +                                         GEN_INT (shift),
> +                                         operands[0], 1, OPTAB_DIRECT);
> +          if (res != operands[0])
> +            emit_move_insn (operands[0], res);
> +          return true;
> +        }
> +      else if (GET_MODE_INNER (data_mode) != DImode
> +               && vector_all_ones_operand (negop, data_mode))
> +        {
> +          rtx res = expand_simple_binop (mode, ASHIFTRT, cop0,
> +                                         GEN_INT (shift),
> +                                         operands[0], 0, OPTAB_DIRECT);
> +          if (res != operands[0])
> +            emit_move_insn (operands[0], res);
> +          return true;
> +        }
> +    }
> +
> +  if (!nonimmediate_operand (cop1, mode))
> +    cop1 = force_reg (mode, cop1);
> +  if (!general_operand (operands[1], data_mode))
> +    operands[1] = force_reg (data_mode, operands[1]);
> +  if (!general_operand (operands[2], data_mode))
> +    operands[2] = force_reg (data_mode, operands[2]);
> +
>    /* XOP supports all of the comparisons on all 128-bit vector int types.  */
>    if (TARGET_XOP
>        && (mode == V16QImode || mode == V8HImode
> --- gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c.jj       2011-12-14 13:05:36.240133846 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c  2011-12-14 13:08:10.288228745 +0100
> @@ -0,0 +1,98 @@
> +#include "tree-vect.h"
> +
> +extern void abort (void);
> +int a[4096];
> +
> +__attribute__((noinline, noclone)) void
> +f1 (int x)
> +{
> +  int i, j;
> +  for (i = 1; i <= x; i++)
> +    {
> +      j = a[i] >> 8;
> +      j = 1 + (j / 2);
> +      a[i] = j << 8;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (int x)
> +{
> +  int i, j;
> +  for (i = 1; i <= x; i++)
> +    {
> +      j = a[i] >> 8;
> +      j = 1 + (j / 16);
> +      a[i] = j << 8;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (int x)
> +{
> +  int i, j;
> +  for (i = 1; i <= x; i++)
> +    {
> +      j = a[i] >> 8;
> +      j = 1 + (j % 2);
> +      a[i] = j << 8;
> +    }
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (int x)
> +{
> +  int i, j;
> +  for (i = 1; i <= x; i++)
> +    {
> +      j = a[i] >> 8;
> +      j = 1 + (j % 16);
> +      a[i] = j << 8;
> +    }
> +}
> +
> +int
> +main ()
> +{
> +  int i;
> +  check_vect ();
> +  for (i = 0; i < 4096; i++)
> +    {
> +      asm ("");
> +      a[i] = (i - 2048) << 8;
> +    }
> +  f1 (4095);
> +  if (a[0] != (-2048 << 8))
> +    abort ();
> +  for (i = 1; i < 4096; i++)
> +    if (a[i] != ((1 + ((i - 2048) / 2)) << 8))
> +      abort ();
> +    else
> +      a[i] = (i - 2048) << 8;
> +  f2 (4095);
> +  if (a[0] != (-2048 << 8))
> +    abort ();
> +  for (i = 1; i < 4096; i++)
> +    if (a[i] != ((1 + ((i - 2048) / 16)) << 8))
> +      abort ();
> +    else
> +      a[i] = (i - 2048) << 8;
> +  f3 (4095);
> +  if (a[0] != (-2048 << 8))
> +    abort ();
> +  for (i = 1; i < 4096; i++)
> +    if (a[i] != ((1 + ((i - 2048) % 2)) << 8))
> +      abort ();
> +    else
> +      a[i] = (i - 2048) << 8;
> +  f4 (4095);
> +  if (a[0] != (-2048 << 8))
> +    abort ();
> +  for (i = 1; i < 4096; i++)
> +    if (a[i] != ((1 + ((i - 2048) % 16)) << 8))
> +      abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" { target vect_condition } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
>
>         Jakub
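
P.S.  As a sanity check of the modulo rewrite described in the pattern's
comment (S9 through S4'), the scalar equivalent for N == 16 would look
like this; a sketch of mine assuming 32-bit int, not code from the patch:

/* b % 16 on a signed int, rewritten branchlessly following the
   S9..S4' sequence of vect_recog_sdivmod_pow2_pattern.  */
int
smod16 (int b)
{
  unsigned int y = b < 0 ? -1U : 0U;  /* S9: all-ones mask iff b < 0 */
  unsigned int z = y >> (32 - 4);     /* S8: keep the low log2 (16) bits */
  int z_t = (int) z;                  /* S7 */
  int w = b + z_t;                    /* S6 */
  int x = w & 15;                     /* S5 */
  return x - z_t;                     /* S4': equals b % 16 */
}

E.g. smod16 (-17) gives ((-17 + 15) & 15) - 15 = 14 - 15 = -1, matching
-17 % 16 == -1.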