Hi,

The following patch implements the vectorization logic for FLOOR_MOD_EXPR
and FLOOR_DIV_EXPR.  According to the logic mentioned in the PR, for signed
operands we have:

  r = x %[fl] y;

is

  r = x % y; if (r && (x ^ y) < 0) r += y;

and

  d = x /[fl] y;

is

  r = x % y; d = x / y; if (r && (x ^ y) < 0) --d;
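As a reference, the identities can be checked with a minimal scalar C sketch
(illustration only, not part of the patch; the floor_mod and floor_div helper
names are mine).  The fixup only fires when the remainder is nonzero and the
operand signs differ, which is exactly when truncating and flooring disagree:

  #include <assert.h>

  /* r = x %[fl] y, built from the truncating %.  */
  static int
  floor_mod (int x, int y)
  {
    int r = x % y;
    if (r != 0 && (x ^ y) < 0)
      r += y;
    return r;
  }

  /* d = x /[fl] y, built from the truncating /.  */
  static int
  floor_div (int x, int y)
  {
    int r = x % y;
    int d = x / y;
    if (r != 0 && (x ^ y) < 0)
      --d;
    return d;
  }

  int
  main (void)
  {
    /* -7 / 3 truncates to -2 with remainder -1; flooring gives -3
       with remainder 2.  */
    assert (floor_div (-7, 3) == -3 && floor_mod (-7, 3) == 2);
    assert (floor_div (7, 3) == 2 && floor_mod (7, 3) == 1);
    /* Exact division needs no fixup even when the signs differ.  */
    assert (floor_div (6, -3) == -2 && floor_mod (6, -3) == 0);
    return 0;
  }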
The first part enables FLOOR_{MOD,DIV}_EXPR in the switch case.  Since the
second operand is always a constant (the check is done at line 4875), we
detect the effectively unsigned case by checking that the first operand has
an unsigned type and that the second operand is greater than zero; in that
case FLOOR_{DIV,MOD}_EXPR is the same as TRUNC_{DIV,MOD}_EXPR, so rhs_code
is simply rewritten.  For signed operands, the logic above is implemented
right after the TRUNC_MOD_EXPR remainder computation, since the remainder
is needed in both cases.  The pseudo code for the vector implementation is
as follows (op0 and op1 are the operands, r is the truncating remainder
op0 % op1, and d is the truncating quotient op0 / op1); a generic-vector
sketch of this sequence follows the diffstat below:

  v1 = op0 ^ op1
  v2 = (v1 < 0)
  v3 = (r != 0)
  v4 = (v3 && v2)
  /* For FLOOR_MOD_EXPR  */
  result = r + (v4 ? op1 : 0)
  /* For FLOOR_DIV_EXPR  */
  result = d - (v4 ? 1 : 0)

Please let me know if this logic is fine.  I also need some more input on
the following:

1. In the integer_pow2p (oprnd1) path, are there any recommendations, or
   would it be fine to emit this same code there?  For now I have skipped
   that path for FLOOR_{DIV,MOD}_EXPR, for correctness.

2. I can write a test case for FLOOR_MOD_EXPR using the modulo intrinsic
   procedure.  Since the PR also talks about FLOOR_DIV_EXPR and
   {CEIL,ROUND}_{MOD,DIV}_EXPR, could someone please suggest source code
   that produces these operators in the gimple?  I am not sure how to feed
   gcc just the updated gimple (I assume this is not currently possible).

	PR vect/119702

gcc:
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Add
	vectorization logic for FLOOR_MOD_EXPR and FLOOR_DIV_EXPR.
---
 gcc/tree-vect-patterns.cc | 83 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 4 deletions(-)
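(Illustration only, not part of the patch: the per-lane computation in the
pseudo code above can be written with GCC's generic vector extensions.
Comparisons on generic vectors yield 0/-1 masks, so masking with & stands in
for the COND_EXPRs the pattern emits; the function names are hypothetical.)

  typedef int v4si __attribute__ ((vector_size (16)));

  /* Per-lane sketch of the FLOOR_MOD_EXPR sequence.  */
  v4si
  floor_mod_v4si (v4si op0, v4si op1)
  {
    v4si r = op0 % op1;     /* truncating remainder  */
    v4si v1 = op0 ^ op1;    /* v1 = op0 ^ op1  */
    v4si v2 = v1 < 0;       /* v2 = (v1 < 0); lanes are -1 or 0  */
    v4si v3 = r != 0;       /* v3 = (r != 0)  */
    v4si v4 = v3 & v2;      /* v4 = (v3 && v2)  */
    return r + (op1 & v4);  /* result = r + (v4 ? op1 : 0)  */
  }

  /* Per-lane sketch of the FLOOR_DIV_EXPR sequence.  */
  v4si
  floor_div_v4si (v4si op0, v4si op1)
  {
    v4si d = op0 / op1;     /* truncating quotient  */
    v4si r = op0 % op1;
    v4si v4 = ((op0 ^ op1) < 0) & (r != 0);
    return d - (v4 & 1);    /* result = d - (v4 ? 1 : 0)  */
  }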
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index b39da1062c0..c5e3c758ef0 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -4862,6 +4862,8 @@ vect_recog_divmod_pattern (vec_info *vinfo,
     case TRUNC_DIV_EXPR:
     case EXACT_DIV_EXPR:
     case TRUNC_MOD_EXPR:
+    case FLOOR_MOD_EXPR:
+    case FLOOR_DIV_EXPR:
       break;
     default:
       return NULL;
@@ -4881,6 +4883,23 @@ vect_recog_divmod_pattern (vec_info *vinfo,
   if (vectype == NULL_TREE)
     return NULL;
 
+  bool unsignedp = TYPE_UNSIGNED (itype) && (tree_int_cst_sgn (oprnd1) > 0);
+
+  if (unsignedp)
+    {
+      switch (rhs_code)
+        {
+        case FLOOR_DIV_EXPR:
+          rhs_code = TRUNC_DIV_EXPR;
+          break;
+        case FLOOR_MOD_EXPR:
+          rhs_code = TRUNC_MOD_EXPR;
+          break;
+        default:
+          break;
+        }
+    }
+
   if (optimize_bb_for_size_p (gimple_bb (last_stmt)))
     {
       /* If the target can handle vectorized division or modulo natively,
@@ -4893,7 +4912,9 @@
     }
 
   prec = TYPE_PRECISION (itype);
-  if (integer_pow2p (oprnd1))
+  if (integer_pow2p (oprnd1)
+      && rhs_code != FLOOR_DIV_EXPR
+      && rhs_code != FLOOR_MOD_EXPR)
     {
       if (TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
         return NULL;
@@ -5315,13 +5336,15 @@
         }
     }
 
-  if (rhs_code == TRUNC_MOD_EXPR)
+  if (rhs_code == TRUNC_MOD_EXPR
+      || rhs_code == FLOOR_MOD_EXPR
+      || rhs_code == FLOOR_DIV_EXPR)
     {
       tree r, t1;
 
       /* We divided.  Now finish by:
-         t1 = q * oprnd1;
-         r = oprnd0 - t1;  */
+         t1 = q * oprnd1;
+         r = oprnd0 - t1;  */
       append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt);
 
       t1 = vect_recog_temp_ssa_var (itype, NULL);
@@ -5330,6 +5353,58 @@
 
       r = vect_recog_temp_ssa_var (itype, NULL);
       pattern_stmt = gimple_build_assign (r, MINUS_EXPR, oprnd0, t1);
+
+      if (rhs_code == FLOOR_MOD_EXPR
+          || rhs_code == FLOOR_DIV_EXPR)
+        {
+          /* r = x %[fl] y;
+             is
+             r = x % y; if (r && (x ^ y) < 0) r += y;  */
+          append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt);
+
+          /* x ^ y  */
+          tree cond_reg = vect_recog_temp_ssa_var (itype, NULL);
+          def_stmt = gimple_build_assign (cond_reg, BIT_XOR_EXPR, oprnd0, oprnd1);
+          append_pattern_def_seq (vinfo, stmt_vinfo, def_stmt);
+
+          /* (x ^ y) < 0  */
+          tree cond_reg2 = vect_recog_temp_ssa_var (boolean_type_node, NULL);
+          def_stmt = gimple_build_assign (cond_reg2, LT_EXPR, cond_reg, build_int_cst (itype, 0));
+          append_pattern_def_seq (vinfo, stmt_vinfo, def_stmt, truth_type_for (vectype), itype);
+
+          /* r != 0  */
+          tree cond_reg3 = vect_recog_temp_ssa_var (boolean_type_node, NULL);
+          def_stmt = gimple_build_assign (cond_reg3, NE_EXPR, r, build_int_cst (itype, 0));
+          append_pattern_def_seq (vinfo, stmt_vinfo, def_stmt, truth_type_for (vectype), itype);
+
+          /* (x ^ y) < 0 && r != 0  */
+          tree cond_reg4 = vect_recog_temp_ssa_var (boolean_type_node, NULL);
+          def_stmt = gimple_build_assign (cond_reg4, BIT_AND_EXPR, cond_reg3, cond_reg2);
+          append_pattern_def_seq (vinfo, stmt_vinfo, def_stmt, truth_type_for (vectype), itype);
+          if (rhs_code == FLOOR_MOD_EXPR)
+            {
+              /* ((x ^ y) < 0 && r != 0) ? y : 0  */
+              tree extr_cond = vect_recog_temp_ssa_var (itype, NULL);
+              def_stmt = gimple_build_assign (extr_cond, COND_EXPR, cond_reg4, oprnd1, build_int_cst (itype, 0));
+              append_pattern_def_seq (vinfo, stmt_vinfo, def_stmt);
+
+              /* r = r + (((x ^ y) < 0 && r != 0) ? y : 0)  */
+              tree floor_mod_r = vect_recog_temp_ssa_var (itype, NULL);
+              pattern_stmt = gimple_build_assign (floor_mod_r, PLUS_EXPR, r, extr_cond);
+            }
+          else if (rhs_code == FLOOR_DIV_EXPR)
+            {
+              /* ((x ^ y) < 0 && r != 0) ? 1 : 0  */
+              tree extr_cond = vect_recog_temp_ssa_var (itype, NULL);
+              def_stmt = gimple_build_assign (extr_cond, COND_EXPR, cond_reg4,
+                                              build_int_cst (itype, 1), build_int_cst (itype, 0));
+              append_pattern_def_seq (vinfo, stmt_vinfo, def_stmt);
+
+              /* q = q - (((x ^ y) < 0 && r != 0) ? 1 : 0)  */
+              tree floor_div_q = vect_recog_temp_ssa_var (itype, NULL);
+              pattern_stmt = gimple_build_assign (floor_div_q, MINUS_EXPR, q, extr_cond);
+            }
+        }
     }
 
   /* Pattern detected.  */
-- 
2.47.3