On Jan 28, 2021, Richard Biener <richard.guent...@gmail.com> wrote:

> That would allow turning back the memset into the original loop (but
> with optimal IVs, etc.).

Is this sort of what you had in mind?

I haven't tested the inline expansion of memset much yet; and that of
memcpy, not at all; this really is mainly to check that I understood
what you had in mind before I spend further time polishing it.


test builtin ratio for loop distribution

From: Alexandre Oliva <ol...@adacore.com>

The ldist pass turns even very short loops into memset calls.  E.g.,
the TFmode emulation calls end with a loop of up to 3 iterations, to
zero out trailing words, and the loop distribution pass turns them
into calls of the memset builtin.

Though short constant-length memsets are usually dealt with
efficiently, for non-constant-length ones, the options are setmemM, or
a function call.

RISC-V doesn't have any setmemM pattern, so the loops above end up
"optimized" into memset calls, incurring not only the overhead of an
explicit call, but also discarding the information the compiler has
about the alignment of the destination, and that the length is a
multiple of the word alignment.

This patch adds, to the loop distribution pass, the ability to perform
inline expansion of bounded variable-length memset and memcpy in ways
that take advantage of known alignments and known factors of the size,
when preexisting *_RATIO macros suggest the inline expansion is
advantageous.


for  gcc/ChangeLog

        * tree-loop-distribution.c: Include builtins.h.
        (generate_memset_builtin): Introduce optional inline expansion
        of bounded variable-sized memset calls.
        (generate_memcpy_builtin): Likewise for memcpy only.
        (loop_distribution::execute): Fix loop structure afterwards.
---
 gcc/tree-loop-distribution.c |  280 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index bb15fd3723fb6..3be7a4c1ac281 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -115,6 +115,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "tree-eh.h"
 #include "gimple-fold.h"
+#include "builtins.h"
 
 
 #define MAX_DATAREFS_NUM \
@@ -1148,6 +1149,23 @@ generate_memset_builtin (class loop *loop, partition *partition)
   /* The new statements will be placed before LOOP.  */
   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
 
+  /* Compute builtin->size range and ctz before it's gimplified; sub-expressions
+     thereof are rewritten in place, so they end up referencing SSA_NAMEs for
+     which we don't have VR info.  */
+  unsigned align = get_pointer_alignment (builtin->dst_base) / BITS_PER_UNIT;
+  unsigned alctz, szctz, xbits;
+  wide_int szmin, szmax;
+  value_range_kind vrk;
+  if (align)
+    {
+      alctz = wi::ctz (align);
+      szctz = MIN (tree_ctz (builtin->size), alctz);
+      xbits = alctz - szctz;
+      vrk = determine_value_range (builtin->size, &szmin, &szmax);
+      if (szmin == szmax)
+       align = 0;
+    }
+
   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
                                       false, GSI_CONTINUE_LINKING);
@@ -1172,6 +1190,127 @@ generate_memset_builtin (class loop *loop, partition *partition)
       val = tem;
     }
 
+  unsigned HOST_WIDE_INT ratio;
+  if (integer_zerop (val))
+    ratio = CLEAR_RATIO (optimize_loop_for_speed_p (loop));
+  else
+    ratio = SET_RATIO (optimize_loop_for_speed_p (loop));
+
+  /* Compare the ratio with the number of (most-aligned) required stores needed
+     for szmax bytes, with the ratio: a loop that iterates up to szmax >> alctz,
+     and then one conditional store for each extra bit of alignment that the
+     destination has over the size.  */
+  if (align && vrk == VR_RANGE
+      && wi::ltu_p (wi::rshift (szmax, alctz, UNSIGNED) + xbits, ratio))
+    {
+      gimple *stmt;
+      tree bytes = create_tmp_var_raw (size_type_node, "ldistbytes");
+      tree ptr = create_tmp_var_raw (build_pointer_type (char_type_node),
+                                    "ldistptr");
+      tree stval = force_gimple_operand_gsi (&gsi,
+                                            fold_convert
+                                            (unsigned_char_type_node, val),
+                                            true, NULL_TREE, false,
+                                            GSI_CONTINUE_LINKING);
+
+      for (unsigned i = 1; i != alctz; i *= 2)
+       {
+         unsigned bits = i * 2 * BITS_PER_UNIT;
+         tree type = build_nonstandard_integer_type (bits, true);
+         stval = fold_convert (type, stval);
+         tree op2 = fold_build2_loc (partition->loc, LSHIFT_EXPR,
+                                     TREE_TYPE (stval), stval,
+                                     build_int_cst (type, i * BITS_PER_UNIT));
+         stval = fold_build2_loc (partition->loc, BIT_IOR_EXPR,
+                                  TREE_TYPE (stval), stval, op2);
+         stval = force_gimple_operand_gsi (&gsi, stval, true, NULL_TREE,
+                                           false, GSI_CONTINUE_LINKING);
+       }
+
+      stmt = gimple_build_assign (bytes, nb_bytes);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      stmt = gimple_build_assign (ptr, mem);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      gsi = gsi_start_bb (split_block (gsi_bb (gsi), stmt)->dest);
+
+      for (unsigned i = 0; i <= xbits; i++)
+       {
+         tree blksz = build_int_cst (size_type_node, align >> i);
+
+         if (i > 0)
+           {
+             unsigned bits = (align >> i) * BITS_PER_UNIT;
+             tree type = build_nonstandard_integer_type (bits, true);
+             stval = fold_convert (type, stval);
+             stval = force_gimple_operand_gsi (&gsi, stval, true, NULL_TREE,
+                                               false, GSI_CONTINUE_LINKING);
+           }
+
+         tree labcond = NULL; // create_artificial_label (partition->loc);
+         tree labskip = NULL; // create_artificial_label (partition->loc);
+
+         stmt = gimple_build_cond (GE_EXPR, bytes, blksz, labcond, labskip);
+         gimple_set_location (stmt, partition->loc);
+         gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+         basic_block bbfrom = gsi_bb (gsi);
+         basic_block bbcond = split_block (bbfrom, stmt)->dest;
+
+         gsi = gsi_start_bb (bbcond);
+
+         stmt = gimple_build_assign (fold_build2
+                                     (MEM_REF, TREE_TYPE (stval), ptr,
+                                      build_int_cst (TREE_TYPE (ptr), 0)),
+                                     stval);
+         gimple_set_location (stmt, partition->loc);
+         gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+         if (i == 0 || i < xbits)
+           {
+             stmt = gimple_build_assign (ptr,
+                                         fold_build2_loc
+                                         (partition->loc,
+                                          POINTER_PLUS_EXPR,
+                                          TREE_TYPE (ptr),
+                                          ptr, blksz));
+             gimple_set_location (stmt, partition->loc);
+             gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+             stmt = gimple_build_assign (bytes,
+                                         fold_build2_loc
+                                         (partition->loc,
+                                          MINUS_EXPR,
+                                          TREE_TYPE (bytes),
+                                          bytes, blksz));
+             gimple_set_location (stmt, partition->loc);
+             gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+           }
+
+         basic_block bbskip = split_block (bbcond, stmt)->dest;
+
+         if (i == 0)
+           redirect_edge_succ (single_succ_edge (bbcond), bbfrom);
+
+         single_succ_edge (bbfrom)->flags |= EDGE_TRUE_VALUE;
+         single_succ_edge (bbfrom)->flags &= ~EDGE_FALLTHRU;
+         make_edge (bbfrom, bbskip, EDGE_FALSE_VALUE);
+         set_immediate_dominator (CDI_DOMINATORS, bbskip, bbfrom);
+       }
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+       fprintf (dump_file,
+                "generated memset, inlined with %u-trip loop ctzs %u..%u \n",
+                unsigned((wi::rshift (szmax, alctz, UNSIGNED)
+                          + xbits).to_uhwi ()),
+                alctz, szctz);
+
+      return;
+    }
+
   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
   fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
   gimple_set_location (fn_call, partition->loc);
@@ -1202,6 +1341,27 @@ generate_memcpy_builtin (class loop *loop, partition *partition)
   /* The new statements will be placed before LOOP.  */
   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
 
+  /* Compute builtin->size range and ctz before it's gimplified; sub-expressions
+     thereof are rewritten in place, so they end up referencing SSA_NAMEs for
+     which we don't have VR info.  */
+  unsigned align = (MIN (get_pointer_alignment (builtin->dst_base),
+                        get_pointer_alignment (builtin->src_base))
+                   / BITS_PER_UNIT);
+  unsigned alctz, szctz, xbits;
+  wide_int szmin, szmax;
+  value_range_kind vrk;
+  if (align)
+    {
+      alctz = wi::ctz (align);
+      szctz = MIN (tree_ctz (builtin->size), alctz);
+      xbits = alctz - szctz;
+      vrk = determine_value_range (builtin->size, &szmin, &szmax);
+      /* A call with constant size will be expanded into an unrolled loop
+        later.  */
+      if (szmin == szmax)
+       align = 0;
+    }
+
   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
                                       false, GSI_CONTINUE_LINKING);
@@ -1211,12 +1371,127 @@ generate_memcpy_builtin (class loop *loop, partition *partition)
       || ! ptr_derefs_may_alias_p (dest, src))
     kind = BUILT_IN_MEMCPY;
   else
-    kind = BUILT_IN_MEMMOVE;
+    {
+      kind = BUILT_IN_MEMMOVE;
+      /* The inlined loop won't handle overlapping memory areas.  */
+      align = 0;
+    }
 
   dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
                                   false, GSI_CONTINUE_LINKING);
   src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
                                  false, GSI_CONTINUE_LINKING);
+
+  unsigned HOST_WIDE_INT ratio;
+  ratio = MOVE_RATIO (optimize_loop_for_speed_p (loop));
+
+  /* Compare the ratio with the number of (most-aligned) required stores needed
+     for szmax bytes, with the ratio: a loop that iterates up to szmax >> alctz,
+     and then one conditional store for each extra bit of alignment that the
+     destination has over the size.  */
+  if (align && vrk == VR_RANGE
+      && wi::ltu_p (wi::rshift (szmax, alctz, UNSIGNED) + xbits, ratio))
+    {
+      gimple *stmt;
+      tree bytes = create_tmp_var_raw (size_type_node, "ldistbytes");
+      tree dptr = create_tmp_var_raw (build_pointer_type (char_type_node),
+                                    "ldistdptr");
+      tree sptr = create_tmp_var_raw (build_pointer_type (char_type_node),
+                                    "ldistsptr");
+
+      stmt = gimple_build_assign (bytes, nb_bytes);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      stmt = gimple_build_assign (dptr, dest);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      stmt = gimple_build_assign (sptr, src);
+      gimple_set_location (stmt, partition->loc);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+      gsi = gsi_start_bb (split_block (gsi_bb (gsi), stmt)->dest);
+
+      for (unsigned i = 0; i <= xbits; i++)
+       {
+         tree blksz = build_int_cst (size_type_node, align >> i);
+         unsigned bits = (align >> i) * BITS_PER_UNIT;
+         tree type = build_nonstandard_integer_type (bits, true);
+         tree val = make_ssa_name (type);
+
+         stmt = gimple_build_cond (GE_EXPR, bytes, blksz, NULL, NULL);
+         gimple_set_location (stmt, partition->loc);
+         gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+         basic_block bbfrom = gsi_bb (gsi);
+         basic_block bbcond = split_block (bbfrom, stmt)->dest;
+
+         gsi = gsi_start_bb (bbcond);
+         stmt = gimple_build_assign (val,
+                                     fold_build2
+                                     (MEM_REF, TREE_TYPE (val), sptr,
+                                      build_int_cst (TREE_TYPE (sptr), 0)));
+         gimple_set_location (stmt, partition->loc);
+         gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+         stmt = gimple_build_assign (fold_build2
+                                     (MEM_REF, TREE_TYPE (val), dptr,
+                                      build_int_cst (TREE_TYPE (dptr), 0)),
+                                     val);
+         gimple_set_location (stmt, partition->loc);
+         gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+         if (i == 0 || i < xbits)
+           {
+             stmt = gimple_build_assign (sptr,
+                                         fold_build2_loc
+                                         (partition->loc,
+                                          POINTER_PLUS_EXPR,
+                                          TREE_TYPE (sptr),
+                                          sptr, blksz));
+             gimple_set_location (stmt, partition->loc);
+             gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+             stmt = gimple_build_assign (dptr,
+                                         fold_build2_loc
+                                         (partition->loc,
+                                          POINTER_PLUS_EXPR,
+                                          TREE_TYPE (dptr),
+                                          dptr, blksz));
+             gimple_set_location (stmt, partition->loc);
+             gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+             stmt = gimple_build_assign (bytes,
+                                         fold_build2_loc
+                                         (partition->loc,
+                                          MINUS_EXPR,
+                                          TREE_TYPE (bytes),
+                                          bytes, blksz));
+             gimple_set_location (stmt, partition->loc);
+             gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+           }
+
+         basic_block bbskip = split_block (bbcond, stmt)->dest;
+         if (i == 0)
+           redirect_edge_succ (single_succ_edge (bbcond), bbfrom);
+
+         single_succ_edge (bbfrom)->flags |= EDGE_TRUE_VALUE;
+         single_succ_edge (bbfrom)->flags &= ~EDGE_FALLTHRU;
+         make_edge (bbfrom, bbskip, EDGE_FALSE_VALUE);
+         set_immediate_dominator (CDI_DOMINATORS, bbskip, bbfrom);
+       }
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+       fprintf (dump_file,
+                "generated memcpy, inlined with %u-trip loop ctzs %u..%u \n",
+                unsigned((wi::rshift (szmax, alctz, UNSIGNED)
+                          + xbits).to_uhwi ()),
+                alctz, szctz);
+
+      return;
+    }
+
   fn = build_fold_addr_expr (builtin_decl_implicit (kind));
   fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
   gimple_set_location (fn_call, partition->loc);
@@ -3357,6 +3632,9 @@ loop_distribution::execute (function *fun)
       scev_reset_htab ();
       mark_virtual_operands_for_renaming (fun);
       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
+      /* Newly-added loops, for inline expansions of memset/memcpy, need to be
+        integrated.  */
+      fix_loop_structure (NULL);
     }
 
   checking_verify_loop_structure ();


-- 
Alexandre Oliva, happy hacker  https://FSFLA.org/blogs/lxo/
   Free Software Activist         GNU Toolchain Engineer
        Vim, Vi, Voltei pro Emacs -- GNUlius Caesar

Reply via email to