On Jan 28, 2021, Richard Biener <richard.guent...@gmail.com> wrote:

> That would allow turning back the memset into the original loop (but
> with optimal IVs, etc.).
Is this sort of what you had in mind?

I haven't tested the inline expansion of memset much yet, and that of
memcpy not at all; this is mainly to check that I understood what you
had in mind before I spend further time polishing it.
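To make the motivation concrete, here is a minimal sketch (not part of
the patch; names and bounds are made up for illustration) of the kind
of loop described in the patch below: zeroing a few trailing words
through a word-aligned pointer, with a small, variable trip count:

  /* Zero the N trailing words at P.  N is variable but small (up to 3
     in the TFmode emulation case), and P is word-aligned; ldist
     currently turns this loop into memset (p, 0, n * sizeof *p),
     dropping both facts on targets without a setmemM pattern.  */
  void
  zero_trailing_words (unsigned long *p, unsigned long n)
  {
    for (unsigned long i = 0; i < n; i++)
      p[i] = 0;
  }

The inline expansion the patch generates amounts, roughly, to the
following hand-written C rendition, assuming an 8-byte-aligned
destination and a size known only to be a multiple of 4, i.e. one
extra alignment bit (xbits == 1 in the patch's terms):

  /* Approximation of the emitted GIMPLE for a memset of zero: a loop
     of most-aligned stores, then one conditional store per extra
     alignment bit; the last one needs no pointer or counter updates.
     The casts stand in for the MEM_REFs the patch builds.  */
  void
  expanded_memset_zero (char *ptr, unsigned long bytes)
  {
    while (bytes >= 8)
      {
        *(unsigned long long *) ptr = 0;
        ptr += 8;
        bytes -= 8;
      }
    if (bytes >= 4)
      *(unsigned int *) ptr = 0;
  }

This shape is only used when (szmax >> alctz) + xbits compares below
the applicable CLEAR/SET/MOVE_RATIO; otherwise the builtin call is
emitted as before.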
test builtin ratio for loop distribution

From: Alexandre Oliva <ol...@adacore.com>

The ldist pass turns even very short loops into memset calls.  E.g.,
the TFmode emulation calls end with a loop of up to 3 iterations, to
zero out trailing words, and the loop distribution pass turns them
into calls of the memset builtin.

Though short constant-length memsets are usually dealt with
efficiently, for non-constant-length ones the options are a setmemM
pattern or a function call.

RISC-V doesn't have any setmemM pattern, so the loops above end up
"optimized" into memset calls, incurring not only the overhead of an
explicit call, but also discarding the information the compiler has
about the alignment of the destination, and that the length is a
multiple of the word alignment.

This patch adds, to the loop distribution pass, the ability to perform
inline expansion of bounded variable-length memset and memcpy in ways
that take advantage of known alignments and known factors of the size,
when preexisting *_RATIO macros suggest the inline expansion is
advantageous.


for  gcc/ChangeLog

	* tree-loop-distribution.c: Include builtins.h.
	(generate_memset_builtin): Introduce optional inline expansion
	of bounded variable-sized memset calls.
	(generate_memcpy_builtin): Likewise for memcpy only.
	(loop_distribution::execute): Fix loop structure afterwards.
---
 gcc/tree-loop-distribution.c |  280 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index bb15fd3723fb6..3be7a4c1ac281 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -115,6 +115,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "tree-eh.h"
 #include "gimple-fold.h"
+#include "builtins.h"
 
 
 #define MAX_DATAREFS_NUM \
@@ -1148,6 +1149,23 @@ generate_memset_builtin (class loop *loop, partition *partition)
   /* The new statements will be placed before LOOP.  */
   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
 
+  /* Compute builtin->size range and ctz before it's gimplified;
+     sub-expressions thereof are rewritten in place, so they end up
+     referencing SSA_NAMEs for which we don't have VR info.  */
+  unsigned align = get_pointer_alignment (builtin->dst_base) / BITS_PER_UNIT;
+  unsigned alctz, szctz, xbits;
+  wide_int szmin, szmax;
+  value_range_kind vrk;
+  if (align)
+    {
+      alctz = wi::ctz (align);
+      szctz = MIN (tree_ctz (builtin->size), alctz);
+      xbits = alctz - szctz;
+      vrk = determine_value_range (builtin->size, &szmin, &szmax);
+      if (szmin == szmax)
+	align = 0;
+    }
+
   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
 				       false, GSI_CONTINUE_LINKING);
@@ -1172,6 +1190,127 @@ generate_memset_builtin (class loop *loop, partition *partition)
       val = tem;
     }
 
+  unsigned HOST_WIDE_INT ratio;
+  if (integer_zerop (val))
+    ratio = CLEAR_RATIO (optimize_loop_for_speed_p (loop));
+  else
+    ratio = SET_RATIO (optimize_loop_for_speed_p (loop));
+
+  /* Compare the number of (most-aligned) stores required for szmax
+     bytes with the ratio: a loop that iterates up to szmax >> alctz
+     times, and then one conditional store for each extra bit of
+     alignment that the destination has over the size.  */
+  if (align && vrk == VR_RANGE
+      && wi::ltu_p (wi::rshift (szmax, alctz, UNSIGNED) + xbits, ratio))
+    {
+      gimple *stmt;
+      tree bytes = create_tmp_var_raw (size_type_node, "ldistbytes");
+      tree ptr = create_tmp_var_raw (build_pointer_type (char_type_node),
+				     "ldistptr");
+      tree stval = force_gimple_operand_gsi (&gsi,
+					     fold_convert
+					     (unsigned_char_type_node, val),
+					     true, NULL_TREE, false,
+					     GSI_CONTINUE_LINKING);
+
+      for (unsigned i = 1; i != alctz; i *= 2)
+	{
+	  unsigned bits = i * 2 * BITS_PER_UNIT;
+	  tree type = build_nonstandard_integer_type (bits, true);
+	  stval = fold_convert (type, stval);
+	  tree op2 = fold_build2_loc (partition->loc, LSHIFT_EXPR,
+				      TREE_TYPE (stval), stval,
+				      build_int_cst (type, i * BITS_PER_UNIT));
+	  stval = fold_build2_loc (partition->loc, BIT_IOR_EXPR,
+				   TREE_TYPE (stval), stval, op2);
+	  stval = force_gimple_operand_gsi (&gsi, stval, true, NULL_TREE,
+					    false, GSI_CONTINUE_LINKING);
+	}
+
+      stmt = gimple_build_assign (bytes, nb_bytes);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      stmt = gimple_build_assign (ptr, mem);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      gsi = gsi_start_bb (split_block (gsi_bb (gsi), stmt)->dest);
+
+      for (unsigned i = 0; i <= xbits; i++)
+	{
+	  tree blksz = build_int_cst (size_type_node, align >> i);
+
+	  if (i > 0)
+	    {
+	      unsigned bits = (align >> i) * BITS_PER_UNIT;
+	      tree type = build_nonstandard_integer_type (bits, true);
+	      stval = fold_convert (type, stval);
+	      stval = force_gimple_operand_gsi (&gsi, stval, true, NULL_TREE,
+						false, GSI_CONTINUE_LINKING);
+	    }
+
+	  tree labcond = NULL; // create_artificial_label (partition->loc);
+	  tree labskip = NULL; // create_artificial_label (partition->loc);
+
+	  stmt = gimple_build_cond (GE_EXPR, bytes, blksz, labcond, labskip);
+	  gimple_set_location (stmt, partition->loc);
+	  gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+	  basic_block bbfrom = gsi_bb (gsi);
+	  basic_block bbcond = split_block (bbfrom, stmt)->dest;
+
+	  gsi = gsi_start_bb (bbcond);
+
+	  stmt = gimple_build_assign (fold_build2
+				      (MEM_REF, TREE_TYPE (stval), ptr,
+				       build_int_cst (TREE_TYPE (ptr), 0)),
+				      stval);
+	  gimple_set_location (stmt, partition->loc);
+	  gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+	  if (i == 0 || i < xbits)
+	    {
+	      stmt = gimple_build_assign (ptr,
+					  fold_build2_loc
+					  (partition->loc,
+					   POINTER_PLUS_EXPR,
+					   TREE_TYPE (ptr),
+					   ptr, blksz));
+	      gimple_set_location (stmt, partition->loc);
+	      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+	      stmt = gimple_build_assign (bytes,
+					  fold_build2_loc
+					  (partition->loc,
+					   MINUS_EXPR,
+					   TREE_TYPE (bytes),
+					   bytes, blksz));
+	      gimple_set_location (stmt, partition->loc);
+	      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+	    }
+
+	  basic_block bbskip = split_block (bbcond, stmt)->dest;
+
+	  if (i == 0)
+	    redirect_edge_succ (single_succ_edge (bbcond), bbfrom);
+
+	  single_succ_edge (bbfrom)->flags |= EDGE_TRUE_VALUE;
+	  single_succ_edge (bbfrom)->flags &= ~EDGE_FALLTHRU;
+	  make_edge (bbfrom, bbskip, EDGE_FALSE_VALUE);
+	  set_immediate_dominator (CDI_DOMINATORS, bbskip, bbfrom);
+	}
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file,
+		 "generated memset, inlined with %u-trip loop ctzs %u..%u \n",
+		 unsigned ((wi::rshift (szmax, alctz, UNSIGNED)
+			    + xbits).to_uhwi ()),
+		 alctz, szctz);
+
+      return;
+    }
+
   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
   fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
   gimple_set_location (fn_call, partition->loc);
@@ -1202,6 +1341,27 @@ generate_memcpy_builtin (class loop *loop, partition *partition)
   /* The new statements will be placed before LOOP.  */
   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
 
+  /* Compute builtin->size range and ctz before it's gimplified;
+     sub-expressions thereof are rewritten in place, so they end up
+     referencing SSA_NAMEs for which we don't have VR info.  */
+  unsigned align = (MIN (get_pointer_alignment (builtin->dst_base),
+			 get_pointer_alignment (builtin->src_base))
+		    / BITS_PER_UNIT);
+  unsigned alctz, szctz, xbits;
+  wide_int szmin, szmax;
+  value_range_kind vrk;
+  if (align)
+    {
+      alctz = wi::ctz (align);
+      szctz = MIN (tree_ctz (builtin->size), alctz);
+      xbits = alctz - szctz;
+      vrk = determine_value_range (builtin->size, &szmin, &szmax);
+      /* A call with constant size will be expanded into an unrolled loop
+	 later.  */
+      if (szmin == szmax)
+	align = 0;
+    }
+
   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
 				       false, GSI_CONTINUE_LINKING);
@@ -1211,12 +1371,127 @@ generate_memcpy_builtin (class loop *loop, partition *partition)
       || ! ptr_derefs_may_alias_p (dest, src))
     kind = BUILT_IN_MEMCPY;
   else
-    kind = BUILT_IN_MEMMOVE;
+    {
+      kind = BUILT_IN_MEMMOVE;
+      /* The inlined loop won't handle overlapping memory areas.  */
+      align = 0;
+    }
 
   dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
 				   false, GSI_CONTINUE_LINKING);
   src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
 				  false, GSI_CONTINUE_LINKING);
+
+  unsigned HOST_WIDE_INT ratio;
+  ratio = MOVE_RATIO (optimize_loop_for_speed_p (loop));
+
+  /* Compare the number of (most-aligned) stores required for szmax
+     bytes with the ratio: a loop that iterates up to szmax >> alctz
+     times, and then one conditional store for each extra bit of
+     alignment that the destination has over the size.  */
+  if (align && vrk == VR_RANGE
+      && wi::ltu_p (wi::rshift (szmax, alctz, UNSIGNED) + xbits, ratio))
+    {
+      gimple *stmt;
+      tree bytes = create_tmp_var_raw (size_type_node, "ldistbytes");
+      tree dptr = create_tmp_var_raw (build_pointer_type (char_type_node),
+				      "ldistdptr");
+      tree sptr = create_tmp_var_raw (build_pointer_type (char_type_node),
+				      "ldistsptr");
+
+      stmt = gimple_build_assign (bytes, nb_bytes);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      stmt = gimple_build_assign (dptr, dest);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      stmt = gimple_build_assign (sptr, src);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      gsi = gsi_start_bb (split_block (gsi_bb (gsi), stmt)->dest);
+
+      for (unsigned i = 0; i <= xbits; i++)
+	{
+	  tree blksz = build_int_cst (size_type_node, align >> i);
+	  unsigned bits = (align >> i) * BITS_PER_UNIT;
+	  tree type = build_nonstandard_integer_type (bits, true);
+	  tree val = make_ssa_name (type);
+
+	  stmt = gimple_build_cond (GE_EXPR, bytes, blksz, NULL, NULL);
+	  gimple_set_location (stmt, partition->loc);
+	  gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+	  basic_block bbfrom = gsi_bb (gsi);
+	  basic_block bbcond = split_block (bbfrom, stmt)->dest;
+
+	  gsi = gsi_start_bb (bbcond);
+
+	  stmt = gimple_build_assign (val,
+				      fold_build2
+				      (MEM_REF, TREE_TYPE (val), sptr,
+				       build_int_cst (TREE_TYPE (sptr), 0)));
+	  gimple_set_location (stmt, partition->loc);
+	  gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+	  stmt = gimple_build_assign (fold_build2
+				      (MEM_REF, TREE_TYPE (val), dptr,
+				       build_int_cst (TREE_TYPE (dptr), 0)),
+				      val);
+	  gimple_set_location (stmt, partition->loc);
+	  gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+	  if (i == 0 || i < xbits)
+	    {
+	      stmt = gimple_build_assign (sptr,
+					  fold_build2_loc
+					  (partition->loc,
+					   POINTER_PLUS_EXPR,
+					   TREE_TYPE (sptr),
+					   sptr, blksz));
+	      gimple_set_location (stmt, partition->loc);
+	      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+	      stmt = gimple_build_assign (dptr,
+					  fold_build2_loc
+					  (partition->loc,
+					   POINTER_PLUS_EXPR,
+					   TREE_TYPE (dptr),
+					   dptr, blksz));
+	      gimple_set_location (stmt, partition->loc);
+	      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+	      stmt = gimple_build_assign (bytes,
+					  fold_build2_loc
+					  (partition->loc,
+					   MINUS_EXPR,
+					   TREE_TYPE (bytes),
+					   bytes, blksz));
+	      gimple_set_location (stmt, partition->loc);
+	      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+	    }
+
+	  basic_block bbskip = split_block (bbcond, stmt)->dest;
+
+	  if (i == 0)
+	    redirect_edge_succ (single_succ_edge (bbcond), bbfrom);
+
+	  single_succ_edge (bbfrom)->flags |= EDGE_TRUE_VALUE;
+	  single_succ_edge (bbfrom)->flags &= ~EDGE_FALLTHRU;
+	  make_edge (bbfrom, bbskip, EDGE_FALSE_VALUE);
+	  set_immediate_dominator (CDI_DOMINATORS, bbskip, bbfrom);
+	}
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file,
+		 "generated memcpy, inlined with %u-trip loop ctzs %u..%u \n",
+		 unsigned ((wi::rshift (szmax, alctz, UNSIGNED)
+			    + xbits).to_uhwi ()),
+		 alctz, szctz);
+
+      return;
+    }
+
   fn = build_fold_addr_expr (builtin_decl_implicit (kind));
   fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
   gimple_set_location (fn_call, partition->loc);
@@ -3357,6 +3632,9 @@ loop_distribution::execute (function *fun)
 	  scev_reset_htab ();
 	  mark_virtual_operands_for_renaming (fun);
 	  rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
+	  /* Newly-added loops, for inline expansions of memset/memcpy,
+	     need to be integrated.  */
+	  fix_loop_structure (NULL);
 	}
 
   checking_verify_loop_structure ();


-- 
Alexandre Oliva, happy hacker            https://FSFLA.org/blogs/lxo/
   Free Software Activist                   GNU Toolchain Engineer
        Vim, Vi, Voltei pro Emacs -- GNUlius Caesar