On Tue, Dec 13, 2011 at 05:57:40PM +0400, Kirill Yukhin wrote:
> > Let me hack up a quick pattern recognizer for this...
Here it is, untested so far.  On the testcase, doing 2000000
f1+f2+f3+f4 calls in a loop with -O3 -mavx on Sandybridge (so,
vectorized just with 16-byte vectors), I get:

  vanilla                                  0m34.571s
  the tree-vect* parts of this patch only  0m9.013s
  the whole patch                          0m8.824s

The i386 parts are just a small optimization; I guess it could be done
in the vectorizer too (but then we'd have to check whether the
arithmetic/logical right shifts are supported and check costs?), or
perhaps in the generic vcond expander (again, we'd need to check some
costs).  Can you see if it gives a measurable speedup on SPEC bzip2?
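For reference, the sequences the new pattern emits correspond to scalar
code along the following lines (an illustrative sketch only, not part of
the patch; it assumes 32-bit int and arithmetic right shift of negative
values, and the S* labels match the function comment in the patch):

  /* b / 16 for signed b: bias negative values by N - 1 so that the
     arithmetic right shift truncates toward zero like TRUNC_DIV_EXPR.  */
  int
  sdiv_pow2_16 (int b)
  {
    int y = b < 0 ? 15 : 0;           /* S3 */
    int x = b + y;                    /* S2 */
    return x >> 4;                    /* S1' */
  }

  /* b % 16 for signed b, computed with unsigned temporaries.  */
  int
  smod_pow2_16 (int b)
  {
    unsigned y = b < 0 ? -1U : 0U;    /* S9 */
    unsigned zu = y >> (32 - 4);      /* S8: 15 if b < 0, else 0 */
    int z = (int) zu;                 /* S7 */
    int w = b + z;                    /* S6 */
    int x = w & 15;                   /* S5 */
    return x - z;                     /* S4' */
  }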
2011-12-14  Jakub Jelinek  <ja...@redhat.com>

        * tree-vectorizer.h (NUM_PATTERNS): Bump to 10.
        * tree-vect-patterns.c (vect_recog_sdivmod_pow2_pattern): New
        function.
        (vect_vect_recog_func_ptrs): Add it.
        * config/i386/sse.md (vcond<V_256:mode><VI_256:mode>,
        vcond<V_128:mode><VI124_128:mode>, vcond<VI8F_128:mode>v2di): Use
        general_operand instead of nonimmediate_operand for operand 5 and
        no predicate for operands 1 and 2.
        * config/i386/i386.c (ix86_expand_int_vcond): Optimize
        x < 0 ? -1 : 0 and x < 0 ? 1 : 0 into vector arithmetic resp.
        logical shift.

        * gcc.dg/vect/vect-sdivmod-1.c: New test.

--- gcc/tree-vectorizer.h.jj	2011-12-14 08:11:03.592010101 +0100
+++ gcc/tree-vectorizer.h	2011-12-14 08:28:57.006693371 +0100
@@ -929,7 +929,7 @@ extern void vect_slp_transform_bb (basic
    Additional pattern recognition functions can (and will) be added
    in the future.  */
 typedef gimple (* vect_recog_func_ptr) (VEC (gimple, heap) **, tree *, tree *);
-#define NUM_PATTERNS 9
+#define NUM_PATTERNS 10
 void vect_pattern_recog (loop_vec_info);
 
 /* In tree-vectorizer.c.  */
--- gcc/tree-vect-patterns.c.jj	2011-12-02 01:52:27.918883940 +0100
+++ gcc/tree-vect-patterns.c	2011-12-14 11:50:10.439763740 +0100
@@ -53,6 +53,8 @@ static gimple vect_recog_widen_shift_pat
                                                  tree *, tree *);
 static gimple vect_recog_vector_vector_shift_pattern (VEC (gimple, heap) **,
                                                       tree *, tree *);
+static gimple vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **,
+                                               tree *, tree *);
 static gimple vect_recog_mixed_size_cond_pattern (VEC (gimple, heap) **,
                                                   tree *, tree *);
 static gimple vect_recog_bool_pattern (VEC (gimple, heap) **, tree *, tree *);
@@ -64,6 +66,7 @@ static vect_recog_func_ptr vect_vect_rec
        vect_recog_over_widening_pattern,
        vect_recog_widen_shift_pattern,
        vect_recog_vector_vector_shift_pattern,
+       vect_recog_sdivmod_pow2_pattern,
        vect_recog_mixed_size_cond_pattern,
        vect_recog_bool_pattern};
 
@@ -1573,6 +1576,211 @@ vect_recog_vector_vector_shift_pattern (
   return pattern_stmt;
 }
 
+/* Detect a signed division by power of two constant that wouldn't be
+   otherwise vectorized:
+
+   type a_t, b_t;
+
+   S1 a_t = b_t / N;
+
+   where type 'type' is a signed integral type and N is a constant positive
+   power of two.
+
+   Similarly handle signed modulo by power of two constant:
+
+   S4 a_t = b_t % N;
+
+   Input/Output:
+
+   * STMTS: Contains a stmt from which the pattern search begins,
+     i.e. the division stmt.  S1 is replaced by:
+     S3  y_t = b_t < 0 ? N - 1 : 0;
+     S2  x_t = b_t + y_t;
+     S1' a_t = x_t >> log2 (N);
+
+     S4 is replaced by (where *_T temporaries have unsigned type):
+     S9  y_T = b_t < 0 ? -1U : 0U;
+     S8  z_T = y_T >> (sizeof (type_t) * CHAR_BIT - log2 (N));
+     S7  z_t = (type) z_T;
+     S6  w_t = b_t + z_t;
+     S5  x_t = w_t & (N - 1);
+     S4' a_t = x_t - z_t;
+
+   Output:
+
+   * TYPE_IN: The type of the input arguments to the pattern.
+
+   * TYPE_OUT: The type of the output of this pattern.
+
+   * Return value: A new stmt that will be used to replace the division
+     S1 or modulo S4 stmt.  */
+
+static gimple
+vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **stmts,
+                                 tree *type_in, tree *type_out)
+{
+  gimple last_stmt = VEC_pop (gimple, *stmts);
+  gimple_stmt_iterator gsi;
+  tree oprnd0, oprnd1, vectype, itype, cond;
+  gimple pattern_stmt, def_stmt;
+  enum tree_code rhs_code;
+  stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
+  optab optab;
+
+  if (!is_gimple_assign (last_stmt))
+    return NULL;
+
+  rhs_code = gimple_assign_rhs_code (last_stmt);
+  switch (rhs_code)
+    {
+    case TRUNC_DIV_EXPR:
+    case TRUNC_MOD_EXPR:
+      break;
+    default:
+      return NULL;
+    }
+
+  if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
+    return NULL;
+
+  oprnd0 = gimple_assign_rhs1 (last_stmt);
+  oprnd1 = gimple_assign_rhs2 (last_stmt);
+  itype = TREE_TYPE (oprnd0);
+  if (TREE_CODE (oprnd0) != SSA_NAME
+      || TREE_CODE (oprnd1) != INTEGER_CST
+      || TREE_CODE (itype) != INTEGER_TYPE
+      || TYPE_UNSIGNED (itype)
+      || TYPE_PRECISION (itype) != GET_MODE_PRECISION (TYPE_MODE (itype))
+      || !integer_pow2p (oprnd1)
+      || tree_int_cst_sgn (oprnd1) != 1)
+    return NULL;
+
+  vectype = get_vectype_for_scalar_type (itype);
+  if (vectype == NULL_TREE)
+    return NULL;
+
+  /* If the target can handle vectorized division or modulo natively,
+     don't attempt to optimize this.  */
+  optab = optab_for_tree_code (rhs_code, vectype, optab_default);
+  if (optab != NULL)
+    {
+      enum machine_mode vec_mode = TYPE_MODE (vectype);
+      int icode = (int) optab_handler (optab, vec_mode);
+      if (icode != CODE_FOR_nothing
+          || GET_MODE_SIZE (vec_mode) == UNITS_PER_WORD)
+        return NULL;
+    }
+
+  /* Pattern detected.  */
+  if (vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "vect_recog_sdivmod_pow2_pattern: detected: ");
+
+  cond = build2 (LT_EXPR, boolean_type_node, oprnd0, build_int_cst (itype, 0));
+  gsi = gsi_for_stmt (last_stmt);
+  if (rhs_code == TRUNC_DIV_EXPR)
+    {
+      tree var = vect_recog_temp_ssa_var (itype, NULL);
+      def_stmt
+        = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
+                                         fold_build2 (MINUS_EXPR, itype,
+                                                      oprnd1,
+                                                      build_int_cst (itype,
+                                                                     1)),
+                                         build_int_cst (itype, 0));
+      gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+      set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
                                                        NULL));
+      var = vect_recog_temp_ssa_var (itype, NULL);
+      def_stmt
+        = gimple_build_assign_with_ops (PLUS_EXPR, var, oprnd0,
+                                        gimple_assign_lhs (def_stmt));
+      STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
+
+      pattern_stmt
+        = gimple_build_assign_with_ops (RSHIFT_EXPR,
+                                        vect_recog_temp_ssa_var (itype, NULL),
+                                        var,
+                                        build_int_cst (itype,
+                                                       tree_log2 (oprnd1)));
+    }
+  else
+    {
+      tree signmask;
+      tree utype = build_nonstandard_integer_type (TYPE_PRECISION (itype), 1);
+      tree shift = build_int_cst (utype, GET_MODE_BITSIZE (TYPE_MODE (itype))
+                                         - tree_log2 (oprnd1));
+      if (compare_tree_int (oprnd1, 2) == 0)
+        {
+          signmask = vect_recog_temp_ssa_var (itype, NULL);
+          def_stmt
+            = gimple_build_assign_with_ops3 (COND_EXPR, signmask, cond,
+                                             build_int_cst (itype, 1),
+                                             build_int_cst (itype, 0));
+          gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+          set_vinfo_for_stmt (def_stmt,
+                              new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+        }
+      else
+        {
+          tree var = vect_recog_temp_ssa_var (utype, NULL);
+          def_stmt
+            = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
+                                             build_int_cst (utype, -1),
+                                             build_int_cst (utype, 0));
+          gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+          set_vinfo_for_stmt (def_stmt,
+                              new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+          var = vect_recog_temp_ssa_var (utype, NULL);
+          def_stmt
+            = gimple_build_assign_with_ops (RSHIFT_EXPR, var,
+                                            gimple_assign_lhs (def_stmt),
+                                            shift);
+          gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+          set_vinfo_for_stmt (def_stmt,
+                              new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+          signmask = vect_recog_temp_ssa_var (itype, NULL);
+          def_stmt
+            = gimple_build_assign_with_ops (NOP_EXPR, signmask, var,
+                                            NULL_TREE);
+          gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+          set_vinfo_for_stmt (def_stmt,
+                              new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+        }
+      def_stmt
+        = gimple_build_assign_with_ops (PLUS_EXPR,
+                                        vect_recog_temp_ssa_var (itype, NULL),
+                                        oprnd0, signmask);
+      gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+      set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
+                                                       NULL));
+      def_stmt
+        = gimple_build_assign_with_ops (BIT_AND_EXPR,
+                                        vect_recog_temp_ssa_var (itype, NULL),
+                                        gimple_assign_lhs (def_stmt),
+                                        fold_build2 (MINUS_EXPR, itype,
+                                                     oprnd1,
+                                                     build_int_cst (itype,
+                                                                    1)));
+      STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
+
+      pattern_stmt
+        = gimple_build_assign_with_ops (MINUS_EXPR,
+                                        vect_recog_temp_ssa_var (itype, NULL),
+                                        gimple_assign_lhs (def_stmt),
+                                        signmask);
+    }
+
+  if (vect_print_dump_info (REPORT_DETAILS))
+    print_gimple_stmt (vect_dump, pattern_stmt, 0, TDF_SLIM);
+
+  VEC_safe_push (gimple, heap, *stmts, last_stmt);
+
+  *type_in = vectype;
+  *type_out = vectype;
+  return pattern_stmt;
+}
+
 /* Function vect_recog_mixed_size_cond_pattern
 
    Try to find the following pattern:
--- gcc/config/i386/sse.md.jj	2011-12-03 12:22:33.000000000 +0100
+++ gcc/config/i386/sse.md	2011-12-14 13:00:47.695788256 +0100
@@ -6340,9 +6340,9 @@ (define_expand "vcond<V_256:mode><VI_256
         (if_then_else:V_256
           (match_operator 3 ""
             [(match_operand:VI_256 4 "nonimmediate_operand" "")
-             (match_operand:VI_256 5 "nonimmediate_operand" "")])
-          (match_operand:V_256 1 "general_operand" "")
-          (match_operand:V_256 2 "general_operand" "")))]
+             (match_operand:VI_256 5 "general_operand" "")])
+          (match_operand:V_256 1 "" "")
+          (match_operand:V_256 2 "" "")))]
   "TARGET_AVX2
    && (GET_MODE_NUNITS (<V_256:MODE>mode)
        == GET_MODE_NUNITS (<VI_256:MODE>mode))"
@@ -6357,9 +6357,9 @@ (define_expand "vcond<V_128:mode><VI124_
         (if_then_else:V_128
           (match_operator 3 ""
             [(match_operand:VI124_128 4 "nonimmediate_operand" "")
-             (match_operand:VI124_128 5 "nonimmediate_operand" "")])
-          (match_operand:V_128 1 "general_operand" "")
-          (match_operand:V_128 2 "general_operand" "")))]
+             (match_operand:VI124_128 5 "general_operand" "")])
+          (match_operand:V_128 1 "" "")
+          (match_operand:V_128 2 "" "")))]
   "TARGET_SSE2
    && (GET_MODE_NUNITS (<V_128:MODE>mode)
        == GET_MODE_NUNITS (<VI124_128:MODE>mode))"
@@ -6374,9 +6374,9 @@ (define_expand "vcond<VI8F_128:mode>v2di
         (if_then_else:VI8F_128
          (match_operator 3 ""
            [(match_operand:V2DI 4 "nonimmediate_operand" "")
-            (match_operand:V2DI 5 "nonimmediate_operand" "")])
-         (match_operand:VI8F_128 1 "general_operand" "")
-         (match_operand:VI8F_128 2 "general_operand" "")))]
+            (match_operand:V2DI 5 "general_operand" "")])
+         (match_operand:VI8F_128 1 "" "")
+         (match_operand:VI8F_128 2 "" "")))]
   "TARGET_SSE4_2"
 {
   bool ok = ix86_expand_int_vcond (operands);
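(As a scalar illustration of the i386 change below, not part of the
patch: for 32-bit elements, x < 0 ? -1 : 0 is the sign bit replicated
across the word, and x < 0 ? 1 : 0 is the sign bit itself, so each case
reduces to a single right shift; the patch performs the same shifts on
whole vectors.  Assumes arithmetic right shift of negative int.)

  int mask_if_neg (int x) { return x >> 31; }                    /* x < 0 ? -1 : 0 */
  int one_if_neg (int x) { return (int) ((unsigned) x >> 31); }  /* x < 0 ? 1 : 0 */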
--- gcc/config/i386/i386.c.jj	2011-12-14 08:11:01.000000000 +0100
+++ gcc/config/i386/i386.c	2011-12-14 13:01:30.207538465 +0100
@@ -19434,6 +19434,45 @@ ix86_expand_int_vcond (rtx operands[])
   cop0 = operands[4];
   cop1 = operands[5];
 
+  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
+  if ((code == LT || code == GE)
+      && data_mode == mode
+      && cop1 == CONST0_RTX (mode)
+      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
+      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
+      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
+      && (GET_MODE_SIZE (data_mode) == 16
+          || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
+    {
+      rtx negop = operands[2 - (code == LT)];
+      int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
+      if (negop == CONST1_RTX (data_mode))
+        {
+          rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
+                                         operands[0], 1, OPTAB_DIRECT);
+          if (res != operands[0])
+            emit_move_insn (operands[0], res);
+          return true;
+        }
+      else if (GET_MODE_INNER (data_mode) != DImode
+               && vector_all_ones_operand (negop, data_mode))
+        {
+          rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
+                                         operands[0], 0, OPTAB_DIRECT);
+          if (res != operands[0])
+            emit_move_insn (operands[0], res);
+          return true;
+        }
+    }
+
+  if (!nonimmediate_operand (cop1, mode))
+    cop1 = force_reg (mode, cop1);
+  if (!general_operand (operands[1], data_mode))
+    operands[1] = force_reg (data_mode, operands[1]);
+  if (!general_operand (operands[2], data_mode))
+    operands[2] = force_reg (data_mode, operands[2]);
+
   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
   if (TARGET_XOP
       && (mode == V16QImode || mode == V8HImode
--- gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c.jj	2011-12-14 13:05:36.240133846 +0100
+++ gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c	2011-12-14 13:08:10.288228745 +0100
@@ -0,0 +1,98 @@
+#include "tree-vect.h"
+
+extern void abort (void);
+int a[4096];
+
+__attribute__((noinline, noclone)) void
+f1 (int x)
+{
+  int i, j;
+  for (i = 1; i <= x; i++)
+    {
+      j = a[i] >> 8;
+      j = 1 + (j / 2);
+      a[i] = j << 8;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f2 (int x)
+{
+  int i, j;
+  for (i = 1; i <= x; i++)
+    {
+      j = a[i] >> 8;
+      j = 1 + (j / 16);
+      a[i] = j << 8;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+  int i, j;
+  for (i = 1; i <= x; i++)
+    {
+      j = a[i] >> 8;
+      j = 1 + (j % 2);
+      a[i] = j << 8;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+  int i, j;
+  for (i = 1; i <= x; i++)
+    {
+      j = a[i] >> 8;
+      j = 1 + (j % 16);
+      a[i] = j << 8;
+    }
+}
+
+int
+main ()
+{
+  int i;
+  check_vect ();
+  for (i = 0; i < 4096; i++)
+    {
+      asm ("");
+      a[i] = (i - 2048) << 8;
+    }
+  f1 (4095);
+  if (a[0] != (-2048 << 8))
+    abort ();
+  for (i = 1; i < 4096; i++)
+    if (a[i] != ((1 + ((i - 2048) / 2)) << 8))
+      abort ();
+    else
+      a[i] = (i - 2048) << 8;
+  f2 (4095);
+  if (a[0] != (-2048 << 8))
+    abort ();
+  for (i = 1; i < 4096; i++)
+    if (a[i] != ((1 + ((i - 2048) / 16)) << 8))
+      abort ();
+    else
+      a[i] = (i - 2048) << 8;
+  f3 (4095);
+  if (a[0] != (-2048 << 8))
+    abort ();
+  for (i = 1; i < 4096; i++)
+    if (a[i] != ((1 + ((i - 2048) % 2)) << 8))
+      abort ();
+    else
+      a[i] = (i - 2048) << 8;
+  f4 (4095);
+  if (a[0] != (-2048 << 8))
+    abort ();
+  for (i = 1; i < 4096; i++)
+    if (a[i] != ((1 + ((i - 2048) % 16)) << 8))
+      abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" { target vect_condition } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */

        Jakub