From: "[email protected]" <[email protected]>

1. Clone the aprefetch pass.  When nontemporal-store optimization is enabled,
   the early clone first performs the nontemporal-store analysis, and the
   vect pass then handles data alignment for the marked stores (see the
   sketch below).
2. Attempt to retrieve the original loop's trip count from within an OpenMP
   outlined function.
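
For context, here is a minimal sketch of the kind of loop this targets
(illustrative only; it is not a test case included in this patch):

    /* a[] is written once and not read again soon, so a nontemporal
       (streaming) store avoids polluting the cache.  */
    void
    scale (double *restrict a, const double *restrict b, long n)
    {
    #pragma omp parallel for schedule(static)
      for (long i = 0; i < n; i++)
        a[i] = 2.0 * b[i];
    }

Such a loop could then be compiled with e.g. "gcc -O3 -fopenmp
-fnon-temporal-store"; -O3 already enables -ftree-loop-vectorize, which the
gate of the new analysis instance of the pass also checks.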

gcc/ChangeLog:

        * common.opt: Add -fnon-temporal-store.
        * passes.def: Add pass_loop_prefetch before pass_if_conversion and
        pass the analysis flag to both instances.
        * tree-ssa-loop-niter.cc
        (get_oldest_def_with_type_conv): New.
        (get_omp_orig_loop_niters): New.
        (traverse_niters_expr_operands): New.
        (get_const_niters_from_omp_outlined): New.
        * tree-ssa-loop-niter.h
        (get_const_niters_from_omp_outlined): New.
        * tree-ssa-loop-prefetch.cc
        (nontemporal_store_p): Add parameter; when set, also try the
        preferred SIMD mode for the scalar type.
        (mark_nontemporal_store): Add parameter to control nontemporal
        store analysis.
        (mark_nontemporal_stores): Ditto.
        (loop_prefetch_arrays): Ditto.
        (tree_ssa_prefetch_arrays): Ditto.
        (pass_loop_prefetch::execute): Ditto.
        (determine_loop_nest_reuse): Attempt to obtain const niters from omp
        outlined fn.
        (unset_nontemporal_move): New.
        (class pass_loop_prefetch): Add clone method and set pass param.
        * tree-vect-data-refs.cc (vect_supportable_dr_alignment):
        Enforce alignment for statements marked as nontemporal.

Signed-off-by: [email protected] <[email protected]>
---
 gcc/common.opt                |   4 +
 gcc/passes.def                |   3 +-
 gcc/tree-ssa-loop-niter.cc    | 151 ++++++++++++++++++++++++++++++++++
 gcc/tree-ssa-loop-niter.h     |   1 +
 gcc/tree-ssa-loop-prefetch.cc | 109 ++++++++++++++++++++----
 gcc/tree-vect-data-refs.cc    |   6 ++
 6 files changed, 258 insertions(+), 16 deletions(-)

diff --git a/gcc/common.opt b/gcc/common.opt
index 50de980615f..f6e9c8314a0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2615,6 +2615,10 @@ fprefetch-loop-arrays
 Common Var(flag_prefetch_loop_arrays) Init(-1) Optimization
 Generate prefetch instructions, if available, for arrays in loops.
 
+fnon-temporal-store
+Common Var(flag_non_temporal_store) Init(0) Optimization
+Generate nontemporal store instructions in loops if available.
+
 fprofile
 Common Var(profile_flag)
 Enable basic program profiling code.
diff --git a/gcc/passes.def b/gcc/passes.def
index fac04cd86c7..140c707b217 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -314,6 +314,7 @@ along with GCC; see the file COPYING3.  If not see
          NEXT_PASS (pass_parallelize_loops, false /* oacc_kernels_p */);
          NEXT_PASS (pass_expand_omp_ssa);
          NEXT_PASS (pass_ch_vect);
+         NEXT_PASS (pass_loop_prefetch, true /* nontemporal_store_analysis_p */);
          NEXT_PASS (pass_if_conversion);
          /* pass_vectorize must immediately follow pass_if_conversion.
             Please do not add any other passes in between.  */
@@ -329,7 +330,7 @@ along with GCC; see the file COPYING3.  If not see
              NEXT_PASS (pass_dse);
          POP_INSERT_PASSES ()
          NEXT_PASS (pass_slp_vectorize);
-         NEXT_PASS (pass_loop_prefetch);
+         NEXT_PASS (pass_loop_prefetch, false);
          /* Run IVOPTs after the last pass that uses data-reference analysis
             as that doesn't handle TARGET_MEM_REFs.  */
          NEXT_PASS (pass_iv_optimize);
diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index 8aa52dc54b0..44aeb0d223e 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -5183,6 +5183,157 @@ estimated_stmt_executions_int (class loop *loop)
   return snit < 0 ? -1 : snit;
 }
 
+/* Strip type conversions to find the oldest definition of T.  E.g. given
+   a = (uint64_t) b, return b when passed a.  */
+
+static tree
+get_oldest_def_with_type_conv (tree t)
+{
+  auto ssa = t;
+  while (ssa && TREE_CODE (ssa) == SSA_NAME)
+    {
+      auto g = SSA_NAME_DEF_STMT (ssa);
+      if (!is_gimple_assign (g))
+       break;
+      auto rhs_code = gimple_assign_rhs_code (g);
+      if (rhs_code != NOP_EXPR && rhs_code != SSA_NAME)
+       break;
+      ssa = gimple_assign_rhs1 (g);
+    }
+  return ssa;
+}
+
+/* Recursively traces the SSA variable OP to obtain the original loop's
+   constant iteration count from an OpenMP outlined function. Only supports
+   cases where the trip count is an INTEGER_CST. The trip count can only be
+   determined when OpenMP static scheduling is used without a specified chunk
+   size. */
+
+static HOST_WIDE_INT
+get_omp_orig_loop_niters (tree op)
+{
+  HOST_WIDE_INT niters = -1;
+  tree rhs1 = NULL_TREE, rhs2 = NULL_TREE, fndecl, arg;
+  gimple *def;
+  gphi *phi;
+  unsigned num_args;
+  enum tree_code code;
+
+  if (!op)
+    return niters;
+
+  if (TREE_CODE (op) != SSA_NAME)
+    return niters;
+
+  def = SSA_NAME_DEF_STMT (op);
+  if (!def)
+    return niters;
+
+  if (gimple_code (def) == GIMPLE_PHI)
+    {
+      /* Only handle PHI nodes in the outermost loop. */
+      if (gimple_bb (def)->loop_father != current_loops->tree_root)
+       return niters;
+
+      phi = as_a<gphi *> (def);
+      num_args = gimple_phi_num_args (phi);
+      if (num_args > 2)
+       return niters;
+
+      /* Recursively trace PHI arguments. */
+      for (unsigned i = 0; i < num_args; i++)
+       {
+         arg = gimple_phi_arg_def (phi, i);
+         niters = get_omp_orig_loop_niters (arg);
+         if (niters != -1)
+           break;
+       }
+      return niters;
+    }
+
+  if (!is_gimple_assign (def))
+    return niters;
+  code = gimple_assign_rhs_code (def);
+  rhs1 = gimple_assign_rhs1 (def);
+
+  if (code == COND_EXPR)
+    return get_omp_orig_loop_niters (gimple_assign_rhs2 (def));
+
+  /* Recursively trace rhs1 if this is not a division/modulo, or if the
+     dividend is not a constant.  */
+  if (code != TRUNC_MOD_EXPR && code != TRUNC_DIV_EXPR)
+    return get_omp_orig_loop_niters (rhs1);
+  else if (TREE_CODE (rhs1) != INTEGER_CST)
+    return get_omp_orig_loop_niters (rhs1);
+
+  /* Find the expression q = n / nthreads or tt = n % nthreads. */
+  rhs2 = gimple_assign_rhs2 (def);
+  rhs2 = get_oldest_def_with_type_conv (rhs2);
+  def = SSA_NAME_DEF_STMT (rhs2);
+  if (def && is_gimple_call (def))
+    {
+      fndecl = gimple_call_fndecl (def);
+      if (fndecl
+         && (fndecl_built_in_p (fndecl, BUILT_IN_OMP_GET_NUM_THREADS)
+             || fndecl_built_in_p (fndecl, BUILT_IN_OMP_GET_THREAD_NUM))
+         && tree_fits_shwi_p (rhs1))
+       return tree_to_shwi (rhs1);
+    }
+  niters = get_omp_orig_loop_niters (rhs2);
+
+  return niters;
+}
+
+/* Recursively traverses all operands of NITERS_EXPR to determine if the
+   expression depends on the number of OpenMP threads, where NITERS_EXPR
+   is typically of the form (upper bound - lower bound + 1) for a thread's
+   task.
+   If a dependency is found, NITERS is updated with the trip count of the
+   original loop corresponding to the OpenMP outlined function. */
+
+static void
+traverse_niters_expr_operands (HOST_WIDE_INT *niters, tree niters_expr)
+{
+  if (!niters_expr)
+    return;
+
+  unsigned n = TREE_OPERAND_LENGTH (niters_expr);
+  if (n == 0)
+    {
+      *niters = get_omp_orig_loop_niters (niters_expr);
+      return;
+    }
+
+  for (unsigned i = 0; i < n; i++)
+    {
+      tree op = TREE_OPERAND (niters_expr, i);
+      if (TREE_CODE (op) == INTEGER_CST)
+       continue;
+
+      if (TREE_CODE (op) == NOP_EXPR)
+       op = TREE_OPERAND (op, 0);
+      traverse_niters_expr_operands (niters, op);
+    }
+}
+
+/* Get the constant niters of the original loop from an OpenMP outlined
+   function. Only supports the case where niters is an INTEGER_CST. */
+
+HOST_WIDE_INT
+get_const_niters_from_omp_outlined (class loop *loop)
+{
+  HOST_WIDE_INT niters = -1;
+  tree niters_expr = number_of_latch_executions (loop);
+  if (TREE_CODE (niters_expr) == COND_EXPR)
+    niters_expr = TREE_OPERAND (niters_expr, 1);
+
+  niters_expr = analyze_scalar_evolution (loop, niters_expr);
+  if (!niters_expr || chrec_contains_undetermined (niters_expr))
+    return niters;
+
+  traverse_niters_expr_operands (&niters, niters_expr);
+  return niters;
+}
+
 /* Sets NIT to the maximum number of executions of the latch of the
    LOOP, plus one.  If we have no reliable estimate, the function returns
    false, otherwise returns true.  */
diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h
index 88e47b9ab94..a564bacf941 100644
--- a/gcc/tree-ssa-loop-niter.h
+++ b/gcc/tree-ssa-loop-niter.h
@@ -38,6 +38,7 @@ extern tree loop_niter_by_eval (class loop *, edge);
 extern tree find_loop_niter_by_eval (class loop *, edge *);
 extern bool estimated_loop_iterations (class loop *, widest_int *);
 extern HOST_WIDE_INT estimated_loop_iterations_int (class loop *);
+extern HOST_WIDE_INT get_const_niters_from_omp_outlined (class loop *);
 extern bool max_loop_iterations (class loop *, widest_int *);
 extern HOST_WIDE_INT max_loop_iterations_int (class loop *);
 extern bool likely_max_loop_iterations (class loop *, widest_int *);
diff --git a/gcc/tree-ssa-loop-prefetch.cc b/gcc/tree-ssa-loop-prefetch.cc
index dad53bfb79b..fd83382905f 100644
--- a/gcc/tree-ssa-loop-prefetch.cc
+++ b/gcc/tree-ssa-loop-prefetch.cc
@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-data-ref.h"
 #include "diagnostic-core.h"
 #include "dbgcnt.h"
+#include "tree-parloops.h"
 
 /* This pass inserts prefetch instructions to optimize cache usage during
    accesses to arrays in loops.  It processes loops sequentially and:
@@ -1245,7 +1246,7 @@ issue_prefetches (struct mem_ref_group *groups,
    can be used.  */
 
 static bool
-nontemporal_store_p (struct mem_ref *ref)
+nontemporal_store_p (struct mem_ref *ref, bool nontemporal_store_analysis_p)
 {
   machine_mode mode;
   enum insn_code code;
@@ -1264,6 +1265,20 @@ nontemporal_store_p (struct mem_ref *ref)
     return false;
 
   code = optab_handler (storent_optab, mode);
+
+  /* If there is no nontemporal store instruction for the current scalar mode,
+     then try to get the preferred SIMD (vector) mode for this scalar type
+     and check if there is a corresponding nontemporal store instruction. */
+  if (nontemporal_store_analysis_p && code == CODE_FOR_nothing
+      && is_a<scalar_mode> (mode))
+    {
+      scalar_mode smode = as_a<scalar_mode> (mode);
+      machine_mode simd_mode = targetm.vectorize.preferred_simd_mode (smode);
+
+      if (VECTOR_MODE_P (simd_mode))
+       code = optab_handler (storent_optab, simd_mode);
+    }
+
   return code != CODE_FOR_nothing;
 }
 
@@ -1271,9 +1286,9 @@ nontemporal_store_p (struct mem_ref *ref)
    and return true.  Otherwise, we return false.  */
 
 static bool
-mark_nontemporal_store (struct mem_ref *ref)
+mark_nontemporal_store (struct mem_ref *ref, bool nontemporal_store_analysis_p)
 {
-  if (!nontemporal_store_p (ref))
+  if (!nontemporal_store_p (ref, nontemporal_store_analysis_p))
     return false;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1343,7 +1358,8 @@ may_use_storent_in_loop_p (class loop *loop)
    references in the loop.  Returns whether we inserted any mfence call.  */
 
 static bool
-mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups)
+mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups,
+                        bool nontemporal_store_analysis_p)
 {
   struct mem_ref *ref;
   bool any = false;
@@ -1353,9 +1369,12 @@ mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups)
 
   for (; groups; groups = groups->next)
     for (ref = groups->refs; ref; ref = ref->next)
-      any |= mark_nontemporal_store (ref);
+      any |= mark_nontemporal_store (ref, nontemporal_store_analysis_p);
 
-  if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
+  /* Avoid inserting excessive fence instructions; insert them only in the
+     aprefetch2 pass.  */
+  if (!nontemporal_store_analysis_p
+      && any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
     {
       emit_mfence_after_loop (loop);
       return true;
@@ -1653,6 +1672,10 @@ determine_loop_nest_reuse (class loop *loop, struct mem_ref_group *refs,
 
       aloop = vloops[i];
       vol = estimated_stmt_executions_int (aloop);
+
+      if (vol == -1 && flag_openmp && parallelized_function_p (cfun->decl))
+       vol = get_const_niters_from_omp_outlined (aloop);
+
       if (vol == -1)
        vol = expected_loop_iterations (aloop);
       volume *= vol;
@@ -1886,7 +1909,8 @@ insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
    to update SSA for virtual operands and LC SSA for a split edge.  */
 
 static bool
-loop_prefetch_arrays (class loop *loop, bool &need_lc_ssa_update)
+loop_prefetch_arrays (class loop *loop, bool &need_lc_ssa_update,
+                      bool nontemporal_store_analysis_p)
 {
   struct mem_ref_group *refs;
   unsigned ahead, ninsns, time, unroll_factor;
@@ -1927,7 +1951,8 @@ loop_prefetch_arrays (class loop *loop, bool &need_lc_ssa_update)
   /* Give up prefetching if the number of memory references in the
      loop is not reasonable based on profitablity and compilation time
      considerations.  */
-  if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
+  if (!nontemporal_store_analysis_p
+      && !mem_ref_count_reasonable_p (ninsns, mem_ref_count))
     goto fail;
 
   /* Step 2: estimate the reuse effects.  */
@@ -1957,11 +1982,18 @@ loop_prefetch_arrays (class loop *loop, bool &need_lc_ssa_update)
 
   /* Prefetching is not likely to be profitable if the instruction to prefetch
      ratio is too small.  */
-  if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
+  if (!nontemporal_store_analysis_p
+      && insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
                                          unroll_factor))
     goto fail;
 
-  need_lc_ssa_update |= mark_nontemporal_stores (loop, refs);
+  need_lc_ssa_update
+    |= mark_nontemporal_stores (loop, refs, nontemporal_store_analysis_p);
+
+  /* With nontemporal store optimization enabled, only the analysis phase is
+     performed during the aprefetch1 pass. */
+  if (nontemporal_store_analysis_p)
+    goto fail;
 
   /* Step 4: what to prefetch?  */
   if (!schedule_prefetches (refs, unroll_factor, ahead))
@@ -1983,10 +2015,33 @@ fail:
   return unrolled;
 }
 
+/* Clear the nontemporal-store flags set by the aprefetch1 pass.  */
+
+static void
+unset_nontemporal_move (void)
+{
+  basic_block bb;
+  gimple_stmt_iterator bsi;
+  gimple *stmt;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
+       {
+         stmt = gsi_stmt (bsi);
+         if (!is_gimple_assign (stmt))
+           continue;
+
+         gassign *assign_stmt = as_a<gassign *> (stmt);
+         if (gimple_assign_nontemporal_move_p (assign_stmt))
+           gimple_assign_set_nontemporal_move (assign_stmt, false);
+       }
+    }
+}
+
 /* Issue prefetch instructions for array references in loops.  */
 
 unsigned int
-tree_ssa_prefetch_arrays (void)
+tree_ssa_prefetch_arrays (bool nontemporal_store_analysis_p)
 {
   bool unrolled = false;
   bool need_lc_ssa_update = false;
@@ -2033,12 +2088,21 @@ tree_ssa_prefetch_arrays (void)
       set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
     }
 
+  /* In the aprefetch1 pass, stmts that analyze successfully are marked as
+     nontemporal stores, but no fence is inserted; the marks only exist so
+     that the vect pass enforces the required alignment.  In the aprefetch2
+     pass, the nontemporal-store flags are cleared and the stmts are
+     re-analyzed now that alignment has been handled.  */
+  if (!nontemporal_store_analysis_p)
+    unset_nontemporal_move ();
+
   for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Processing loop %d:\n", loop->num);
 
-      unrolled |= loop_prefetch_arrays (loop, need_lc_ssa_update);
+      unrolled |= loop_prefetch_arrays (loop, need_lc_ssa_update,
+                                        nontemporal_store_analysis_p);
 
       if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "\n\n");
@@ -2078,15 +2142,30 @@ class pass_loop_prefetch : public gimple_opt_pass
 {
 public:
   pass_loop_prefetch (gcc::context *ctxt)
-    : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
+    : gimple_opt_pass (pass_data_loop_prefetch, ctxt),
+      nontemporal_store_analysis_p (false)
   {}
 
   /* opt_pass methods: */
+  opt_pass *clone () final override { return new pass_loop_prefetch (m_ctxt); }
   bool gate (function *) final override
   {
-    return flag_prefetch_loop_arrays > 0;
+    if (nontemporal_store_analysis_p)
+      return optimize >= 3 && flag_non_temporal_store
+            && (flag_tree_loop_vectorize || cfun->has_force_vectorize_loops);
+    else
+      return flag_prefetch_loop_arrays > 0
+            || (optimize >= 3 && flag_non_temporal_store);
   }
   unsigned int execute (function *) final override;
+  void set_pass_param (unsigned int n, bool param) final override
+  {
+    gcc_assert (n == 0);
+    nontemporal_store_analysis_p = param;
+  }
+
+private:
+  bool nontemporal_store_analysis_p;
 
 }; // class pass_loop_prefetch
 
@@ -2110,7 +2189,7 @@ pass_loop_prefetch::execute (function *fun)
       return 0;
     }
 
-  return tree_ssa_prefetch_arrays ();
+  return tree_ssa_prefetch_arrays (nontemporal_store_analysis_p);
 }
 
 } // anon namespace
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 89a9077af0d..afc86d15cd5 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -6787,6 +6787,12 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
   else if (dr_safe_speculative_read_required (stmt_info))
     return dr_unaligned_unsupported;
 
+  /* Nontemporal (streaming) vector stores require the data address to be
+     aligned to the vector size.  */
+  gassign *assign_stmt = dyn_cast<gassign *> (stmt_info->stmt);
+  if (assign_stmt && gimple_assign_nontemporal_move_p (assign_stmt))
+    return dr_unaligned_unsupported;
+
   if (loop_vinfo)
     {
       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
-- 
2.22.0

