> -----Original Message----- > From: Jakub Jelinek <ja...@redhat.com> > Sent: 28 November 2024 17:39 > To: Prathamesh Kulkarni <prathame...@nvidia.com> > Cc: Andrew Stubbs <a...@baylibre.com>; Richard Biener > <richard.guent...@gmail.com>; Richard Biener <rguent...@suse.de>; > gcc@gcc.gnu.org; Thomas Schwinge <tschwi...@baylibre.com> > Subject: Re: [RFC] Enabling SVE with offloading to nvptx > > External email: Use caution opening links or attachments > > > On Thu, Nov 14, 2024 at 08:29:26AM +0000, Prathamesh Kulkarni wrote: > > + /* True if SIMD loop needs delayed lowering of artefacts like > > + safelen and length of omp simd arrays that depend on target's > > + max_vf. This is true for offloading, when max_vf is computed > > + after > > 2 spaces after ., not just one. Fixed in the attached patch. > > > + streaming out to device. */ > > + unsigned needs_max_vf_lowering: 1; > > + > > /* The number of times to unroll the loop. 0 means no > information given, > > just do what we always do. A value of 1 means do not unroll > the loop. > > A value of USHRT_MAX means unroll with no specific unrolling > factor. 
> > diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc index > > 80fb1843445..c9581e3d92e 100644 > > --- a/gcc/omp-expand.cc > > +++ b/gcc/omp-expand.cc > > @@ -7171,6 +7171,10 @@ expand_omp_simd (struct omp_region *region, > struct omp_for_data *fd) > > loop->latch = cont_bb; > > add_loop (loop, l1_bb->loop_father); > > loop->safelen = safelen_int; > > + loop->needs_max_vf_lowering = is_in_offload_region (region); > > + if (loop->needs_max_vf_lowering) > > + cfun->curr_properties &= ~PROP_gimple_lomp_dev; > > + > > if (simduid) > > { > > loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid); diff > > --git a/gcc/omp-low.cc b/gcc/omp-low.cc index > 70a2c108fbc..0f68876a2bc > > 100644 > > --- a/gcc/omp-low.cc > > +++ b/gcc/omp-low.cc > > @@ -4660,7 +4660,16 @@ lower_rec_simd_input_clauses (tree new_var, > omp_context *ctx, > > } > > else > > { > > - tree atype = build_array_type_nelts (TREE_TYPE (new_var), > sctx->max_vf); > > + poly_uint64 nelts; > > + /* FIXME: If offloading is enabled, and max_vf is poly_int, > avoid > > + using it as length of omp simd array, and use a placeholder > value > > + instead to avoid streaming out poly_int to device. The arrays > > + will be set to correct length later in omp_device_lower pass. > > + */ > > Likewise. I really don't like introducing FIXME comments. Describe > what you want to say, but not with FIXME. Fixed. > > > + if (omp_maybe_offloaded_ctx (ctx) && !sctx- >max_vf.is_constant ()) > > + nelts = INT_MAX; > > I'm not convinced INT_MAX is the right placeholder. > On 32-bit arches whenever TREE_TYPE (new_var) is 2 or more bytes the > array won't fit into address space anymore. > Use some magic value like 64 and say in the comment it really doesn't > matter much, as we'll change it later. > 1 probably wouldn't be best, then I think some undesirable > optimizations could trigger (e.g. ignoring the exact value of the > index). Adjusted to use 64 in the patch. 
> > > + else > > + nelts = sctx->max_vf; > > + tree atype = build_array_type_nelts (TREE_TYPE (new_var), > > + nelts); > > tree avar = create_tmp_var_raw (atype); > > if (TREE_ADDRESSABLE (new_var)) > > TREE_ADDRESSABLE (avar) = 1; > > diff --git a/gcc/omp-offload.cc b/gcc/omp-offload.cc index > > 372b019f9d6..9d6467cc6a6 100644 > > --- a/gcc/omp-offload.cc > > +++ b/gcc/omp-offload.cc > > @@ -2618,6 +2618,75 @@ find_simtpriv_var_op (tree *tp, int > *walk_subtrees, void *) > > return NULL_TREE; > > } > > > > +/* Compute max_vf for target, and accordingly set loop->safelen and > length > > + of omp simd arrays. */ > > + > > +static void > > +adjust_max_vf (function *fun) > > +{ > > + if (!fun->has_simduid_loops) > > + return; > > + > > + poly_uint64 max_vf = omp_max_vf (false); > > + > > + /* FIXME: Since loop->safelen has to be an integer, it's not > always possible > > + to compare against poly_int. For eg 32 and 16+16x are not > comparable at > > + compile-time because 16+16x <= 32 for x < 2, but 16+16x > 32 > for x >= 2. > > + Even if we could get runtime VL based on -mcpu/-march, that > would not be > > + portable across other SVE archs. > > + > > + For now, use constant_lower_bound (max_vf), as a "safer > approximation" to > > + max_vf that avoids these issues, with the downside that it > will be > > + suboptimal max_vf for SVE archs implementing SIMD width > 128 > > + bits. */ > > Again, I don't like FIXMEs and . should be followed by 2 spaces. Fixed. > > > + tree assign_stmt_lhs = gimple_assign_lhs > (assign_stmt); > > + if (TREE_CODE (assign_stmt_lhs) == ARRAY_REF) > > + { > > + tree array = TREE_OPERAND (assign_stmt_lhs, > 0); > > + tree& max = TYPE_MAX_VALUE (TYPE_DOMAIN > (TREE_TYPE (array))); > > + max = size_int (loop->safelen - 1); > > + relayout_decl (array); > > This is definitely wrong, please don't do that. 
> ARRAY_TYPEs are shared, see build_array_type_1, so by forcefully > changing the type in place, you don't change just one particular array > (e.g. if it is int[INT_MAX]), but all user arrays as well. > You need to create a new array type instead, using > build_array_type_nelts. > See what shrink_simd_arrays does. Fixed, thanks. > > Why do you need that calculate_dominance_info (CDI_DOMINATORS); ? Because get_loop_body needs dominated_by_p. For eg without calculating dominance info, it results in the following ICE for libgomp.c/pr81778.c:
0x2415d63 internal_error(char const*, ...) ../../gcc/gcc/diagnostic-global-context.cc:517 0x7dd1ab fancy_abort(char const*, int, char const*) ../../gcc/gcc/diagnostic.cc:1696 0xa7a877 dominated_by_p(cdi_direction, basic_block_def const*, basic_block_def const*) ../../gcc/gcc/dominance.cc:1130 0xa7a877 dominated_by_p(cdi_direction, basic_block_def const*, basic_block_def const*) ../../gcc/gcc/dominance.cc:1125 0x9d9227 dfs_enumerate_from(basic_block_def*, int, bool (*)(basic_block_def const*, void const*), basic_block_def**, int, void const*) ../../gcc/gcc/cfganal.cc:1588 0x9f7ac7 get_loop_body_with_size(loop const*, basic_block_def**, unsigned int) ../../gcc/gcc/cfgloop.cc:872 0x9f7ac7 get_loop_body(loop const*) ../../gcc/gcc/cfgloop.cc:901 0xe197fb adjust_max_vf ../../gcc/gcc/omp-offload.cc:2655 > I wonder if it wouldn't be better to reuse tree-vectorizer.cc > infrastructure here, note_simd_array_uses in particular, perhaps add a > helper function in tree-vectorizer.cc for that. AFAIU, note_simd_array_uses builds a map from omp simd array -> simduid used for indexing it, but we need map from omp simd array -> SIMD loop it's used in (for setting length to loop->safelen), and I am not sure if note_simd_array_uses would help with that ? IIUC, shrink_simd_arrays also seems to use a separate map simduid_to_vf, for setting length of omp simd array to desired vf. In the patch, I have left it as-is to "collect" omp simd arrays by walking over their uses in the SIMD loop, and set length to loop->safelen. Does that look OK ? > > > @@ -2755,7 +2826,10 @@ execute_omp_device_lower () > > rhs = build_int_cst (type, vf); > > break; > > case IFN_GOMP_MAX_VF: > > - rhs = build_int_cst (type, omp_max_vf (false)); > > + /* Use constant_lower_bound, to keep max_vf consistent > with > > + adjut_max_vf above. */ > > s/adjut/adjust/ Fixed. 
Thanks, Prathamesh > > > + rhs = build_int_cst (type, > > + constant_lower_bound (omp_max_vf > > + (false))); > > break; > > case IFN_GOMP_SIMT_ORDERED_PRED: > > rhs = vf == 1 ? integer_zero_node : NULL_TREE; > > > Jakub
Delay lowering safelen if offloading is enabled. gcc/ChangeLog: * cfgloop.h (loop): New bitfield needs_max_vf_lowering. * omp-expand.cc (expand_omp_simd): Set loop->needs_max_vf_lowering to result of is_in_offload_region. * omp-low.cc (lower_rec_simd_input_clauses): Set a placeholder value for length of omp simd array if offloading is enabled and max_vf is non-constant poly_int. * omp-offload.cc (adjust_max_vf): New function. (execute_omp_device_lower): Call adjust_max_vf, and use constant_lower_bound on result of omp_max_vf while replacing call to .GOMP_MAX_VF. Signed-off-by: Prathamesh Kulkarni <prathame...@nvidia.com> diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index 30b5e40d0d9..4e986e8857d 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -233,6 +233,12 @@ public: flag_finite_loops or similar pragmas state. */ unsigned finite_p : 1; + /* True if SIMD loop needs delayed lowering of artefacts like + safelen and length of omp simd arrays that depend on target's + max_vf. This is true for offloading, when max_vf is computed after + streaming out to device. */ + unsigned needs_max_vf_lowering: 1; + /* The number of times to unroll the loop. 0 means no information given, just do what we always do. A value of 1 means do not unroll the loop. A value of USHRT_MAX means unroll with no specific unrolling factor. 
diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc index 8eb6f6f718c..33661c0f0f3 100644 --- a/gcc/omp-expand.cc +++ b/gcc/omp-expand.cc @@ -7170,6 +7170,10 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) loop->latch = cont_bb; add_loop (loop, l1_bb->loop_father); loop->safelen = safelen_int; + loop->needs_max_vf_lowering = is_in_offload_region (region); + if (loop->needs_max_vf_lowering) + cfun->curr_properties &= ~PROP_gimple_lomp_dev; + if (simduid) { loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid); diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc index 33d81604cbe..503f32c7f57 100644 --- a/gcc/omp-low.cc +++ b/gcc/omp-low.cc @@ -4664,7 +4664,17 @@ lower_rec_simd_input_clauses (tree new_var, omp_context *ctx, } else { - tree atype = build_array_type_nelts (TREE_TYPE (new_var), sctx->max_vf); + poly_uint64 nelts; + /* If offloading is enabled, and max_vf is poly_int, avoid + using it as length of omp simd array, and use a placeholder value + instead to avoid streaming out poly_int to device. The choice of + placeholder value (64) doesn't really matter since the arrays + will be set to correct length later in omp_device_lower pass. */ + if (omp_maybe_offloaded_ctx (ctx) && !sctx->max_vf.is_constant ()) + nelts = 64; + else + nelts = sctx->max_vf; + tree atype = build_array_type_nelts (TREE_TYPE (new_var), nelts); tree avar = create_tmp_var_raw (atype); if (TREE_ADDRESSABLE (new_var)) TREE_ADDRESSABLE (avar) = 1; diff --git a/gcc/omp-offload.cc b/gcc/omp-offload.cc index f16a2921b4b..f22c51ba221 100644 --- a/gcc/omp-offload.cc +++ b/gcc/omp-offload.cc @@ -2617,6 +2617,77 @@ find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *) return NULL_TREE; } +/* Compute max_vf for target, and accordingly set loop->safelen and length + of omp simd arrays. 
*/ + +static void +adjust_max_vf (function *fun) +{ + if (!fun->has_simduid_loops) + return; + + poly_uint64 max_vf = omp_max_vf (false); + + /* Since loop->safelen has to be an integer, it's not always possible + to compare against poly_int. For eg 32 and 16+16x are not comparable at + compile-time because 16+16x <= 32 for x < 2, but 16+16x > 32 for x >= 2. + Even if we could get runtime VL based on -mcpu/-march, that would not be + portable across other SVE archs. + + For now, use constant_lower_bound (max_vf), as a "safer approximation" to + max_vf that avoids these issues, with the downside that it will be + suboptimal max_vf for SVE archs implementing SIMD width > 128 bits. */ + + uint64_t max_vf_int; + if (!max_vf.is_constant (&max_vf_int)) + max_vf_int = constant_lower_bound (max_vf); + + calculate_dominance_info (CDI_DOMINATORS); + for (auto loop: loops_list (fun, 0)) + { + if (!loop->needs_max_vf_lowering) + continue; + + if (loop->safelen > max_vf_int) + loop->safelen = max_vf_int; + + basic_block *bbs = get_loop_body (loop); + for (unsigned i = 0; i < loop->num_nodes; i++) + for (auto gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gcall *stmt = dyn_cast<gcall *> (gsi_stmt (gsi)); + tree lhs = NULL_TREE; + + if (stmt + && gimple_call_internal_p (stmt) + && (gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE) + && ((lhs = gimple_call_lhs (stmt)) != NULL_TREE)) + { + imm_use_iterator use_iter; + gimple *use_stmt; + + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, lhs) + { + gassign *assign_stmt = dyn_cast<gassign *> (use_stmt); + if (!assign_stmt) + continue; + + tree assign_stmt_lhs = gimple_assign_lhs (assign_stmt); + if (TREE_CODE (assign_stmt_lhs) == ARRAY_REF) + { + tree array = TREE_OPERAND (assign_stmt_lhs, 0); + tree elt_type = TREE_TYPE (TREE_TYPE (array)); + tree atype = build_array_type_nelts (elt_type, + loop->safelen); + TREE_TYPE (array) = atype; + relayout_decl (array); + } + } + } + } + } +} + /* Cleanup uses of SIMT 
placeholder internal functions: on non-SIMT targets, VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT @@ -2626,6 +2697,8 @@ find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *) static unsigned int execute_omp_device_lower () { + adjust_max_vf (cfun); + int vf = targetm.simt.vf ? targetm.simt.vf () : 1; bool regimplify = false; basic_block bb; @@ -2754,7 +2827,10 @@ execute_omp_device_lower () rhs = build_int_cst (type, vf); break; case IFN_GOMP_MAX_VF: - rhs = build_int_cst (type, omp_max_vf (false)); + /* Use constant_lower_bound, to keep max_vf consistent with + adjust_max_vf above. */ + rhs = build_int_cst (type, + constant_lower_bound (omp_max_vf (false))); break; case IFN_GOMP_SIMT_ORDERED_PRED: rhs = vf == 1 ? integer_zero_node : NULL_TREE;