https://gcc.gnu.org/g:896c6c28939f0b1eb6582231d24ea07ce01d071e

commit r15-4986-g896c6c28939f0b1eb6582231d24ea07ce01d071e
Author: Andrew Stubbs <a...@baylibre.com>
Date:   Fri Nov 1 13:53:34 2024 +0000

    openmp: use offload max_vf for chunk_size
    
    The chunk size for SIMD loops should be right for the current device; too
    big allocates too much memory, too small is inefficient.  Getting it wrong
    doesn't actually break anything though.
    
    This patch attempts to choose the optimal setting based on the context.
    Both host-fallback and device will get the same chunk size, but device
    performance is the most important in this case.
    
    gcc/ChangeLog:
    
            * omp-expand.cc (is_in_offload_region): New function.
            (omp_adjust_chunk_size): Add pass-through "offload" parameter.
            (get_ws_args_for): Likewise.
            (determine_parallel_type): Use is_in_offload_region to adjust call
            to get_ws_args_for.
            (expand_omp_for_generic): Likewise.
            (expand_omp_for_static_chunk): Likewise.

Diff:
---
 gcc/omp-expand.cc | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc
index 907fd46a5b26..b0f9d375b6c7 100644
--- a/gcc/omp-expand.cc
+++ b/gcc/omp-expand.cc
@@ -127,6 +127,23 @@ is_combined_parallel (struct omp_region *region)
   return region->is_combined_parallel;
 }
 
+/* Return true if REGION is or is contained within an offload region.  */
+
+static bool
+is_in_offload_region (struct omp_region *region)
+{
+  gimple *entry_stmt = last_nondebug_stmt (region->entry);
+  if (is_gimple_omp (entry_stmt)
+      && is_gimple_omp_offloaded (entry_stmt))
+    return true;
+  else if (region->outer)
+    return is_in_offload_region (region->outer);
+  else
+    return (lookup_attribute ("omp declare target",
+                             DECL_ATTRIBUTES (current_function_decl))
+           != NULL);
+}
+
 /* Given two blocks PAR_ENTRY_BB and WS_ENTRY_BB such that WS_ENTRY_BB
    is the immediate dominator of PAR_ENTRY_BB, return true if there
    are no data dependencies that would prevent expanding the parallel
@@ -207,12 +224,12 @@ workshare_safe_to_combine_p (basic_block ws_entry_bb)
    presence (SIMD_SCHEDULE).  */
 
 static tree
-omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
+omp_adjust_chunk_size (tree chunk_size, bool simd_schedule, bool offload)
 {
   if (!simd_schedule || integer_zerop (chunk_size))
     return chunk_size;
 
-  poly_uint64 vf = omp_max_vf (false);
+  poly_uint64 vf = omp_max_vf (offload);
   if (known_eq (vf, 1U))
     return chunk_size;
 
@@ -228,7 +245,7 @@ omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
    expanded.  */
 
 static vec<tree, va_gc> *
-get_ws_args_for (gimple *par_stmt, gimple *ws_stmt)
+get_ws_args_for (gimple *par_stmt, gimple *ws_stmt, bool offload)
 {
   tree t;
   location_t loc = gimple_location (ws_stmt);
@@ -270,7 +287,7 @@ get_ws_args_for (gimple *par_stmt, gimple *ws_stmt)
       if (fd.chunk_size)
        {
          t = fold_convert_loc (loc, long_integer_type_node, fd.chunk_size);
-         t = omp_adjust_chunk_size (t, fd.simd_schedule);
+         t = omp_adjust_chunk_size (t, fd.simd_schedule, offload);
          ws_args->quick_push (t);
        }
 
@@ -366,7 +383,8 @@ determine_parallel_type (struct omp_region *region)
 
       region->is_combined_parallel = true;
       region->inner->is_combined_parallel = true;
-      region->ws_args = get_ws_args_for (par_stmt, ws_stmt);
+      region->ws_args = get_ws_args_for (par_stmt, ws_stmt,
+                                        is_in_offload_region (region));
     }
 }
 
@@ -3929,6 +3947,7 @@ expand_omp_for_generic (struct omp_region *region,
   tree *counts = NULL;
   int i;
   bool ordered_lastprivate = false;
+  bool offload = is_in_offload_region (region);
 
   gcc_assert (!broken_loop || !in_combined_parallel);
   gcc_assert (fd->iter_type == long_integer_type_node
@@ -4196,7 +4215,7 @@ expand_omp_for_generic (struct omp_region *region,
          if (fd->chunk_size)
            {
              t = fold_convert (fd->iter_type, fd->chunk_size);
-             t = omp_adjust_chunk_size (t, fd->simd_schedule);
+             t = omp_adjust_chunk_size (t, fd->simd_schedule, offload);
              if (sched_arg)
                {
                  if (fd->ordered)
@@ -4240,7 +4259,7 @@ expand_omp_for_generic (struct omp_region *region,
            {
              tree bfn_decl = builtin_decl_explicit (start_fn);
              t = fold_convert (fd->iter_type, fd->chunk_size);
-             t = omp_adjust_chunk_size (t, fd->simd_schedule);
+             t = omp_adjust_chunk_size (t, fd->simd_schedule, offload);
              if (sched_arg)
                t = build_call_expr (bfn_decl, 10, t5, t0, t1, t2, sched_arg,
                                     t, t3, t4, reductions, mem);
@@ -5937,7 +5956,8 @@ expand_omp_for_static_chunk (struct omp_region *region,
   step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
                                   true, NULL_TREE, true, GSI_SAME_STMT);
   tree chunk_size = fold_convert (itype, fd->chunk_size);
-  chunk_size = omp_adjust_chunk_size (chunk_size, fd->simd_schedule);
+  chunk_size = omp_adjust_chunk_size (chunk_size, fd->simd_schedule,
+                                     is_in_offload_region (region));
   chunk_size
     = force_gimple_operand_gsi (&gsi, chunk_size, true, NULL_TREE, true,
                                GSI_SAME_STMT);

Reply via email to