> -----Original Message-----
> From: Richard Biener <rguent...@suse.de>
> Sent: 21 October 2024 12:45
> To: Prathamesh Kulkarni <prathame...@nvidia.com>
> Cc: gcc@gcc.gnu.org; Thomas Schwinge <tschwi...@baylibre.com>; Jakub Jelinek <ja...@redhat.com>
> Subject: RE: [RFC] Enabling SVE with offloading to nvptx
>
> On Fri, 18 Oct 2024, Prathamesh Kulkarni wrote:
>
> > > -----Original Message-----
> > > From: Richard Biener <rguent...@suse.de>
> > > Sent: 17 October 2024 19:18
> > > To: Prathamesh Kulkarni <prathame...@nvidia.com>
> > > Cc: gcc@gcc.gnu.org; Thomas Schwinge <tschwi...@baylibre.com>
> > > Subject: RE: [RFC] Enabling SVE with offloading to nvptx
> > >
> > > On Thu, 17 Oct 2024, Prathamesh Kulkarni wrote:
> > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <rguent...@suse.de>
> > > > > Sent: 16 October 2024 13:05
> > > > > To: Prathamesh Kulkarni <prathame...@nvidia.com>
> > > > > Cc: gcc@gcc.gnu.org; Thomas Schwinge <tschwi...@baylibre.com>
> > > > > Subject: Re: [RFC] Enabling SVE with offloading to nvptx
> > > > >
> > > > > On Tue, 15 Oct 2024, Prathamesh Kulkarni wrote:
> > > > >
> > > > > > Hi,
> > > > > > Testing libgomp with SVE enabled (-mcpu=generic+sve2) results in
> > > > > > ~60 UNRESOLVED errors with the following error message:
> > > > > >
> > > > > > lto1: fatal error: degree of 'poly_int' exceeds 'NUM_POLY_INT_COEFFS'
> > > > > > compilation terminated.
> > > > > > nvptx mkoffload: fatal error:
> > > > > > ../../install/bin/aarch64-unknown-linux-gnu-accel-nvptx-none-gcc
> > > > > > returned 1 exit status compilation terminated.
> > > > > >
> > > > > > This behaviour can be reproduced with the following simple test-case
> > > > > > with -fopenmp -foffload=nvptx-none -mcpu=generic+sve2:
> > > > > >
> > > > > > #define N 1000
> > > > > > int main ()
> > > > > > {
> > > > > >   int i;
> > > > > >   int A[N] = {0}, B[N] = {0};
> > > > > >
> > > > > >   #pragma omp target map(i), map(tofrom: A), map(from: B)
> > > > > >   #pragma omp simd
> > > > > >   for (i = 0; i < N; i++)
> > > > > >     A[i] = A[i] + B[i];
> > > > > >   return A[0];
> > > > > > }
> > > > > >
> > > > > > The omplower pass lowers the above loop to the following:
> > > > > >
> > > > > >   D.4576 = .GOMP_USE_SIMT ();
> > > > > >   if (D.4576 != 0) goto <D.4577>; else goto <D.4578>;
> > > > > >   <D.4577>:
> > > > > >   {
> > > > > >     unsigned int D.4586;
> > > > > >     unsigned int D.4587;
> > > > > >     int D.4588;
> > > > > >     void * simduid.5;
> > > > > >     void * .omp_simt.6;
> > > > > >     int D.4596;
> > > > > >     _Bool D.4597;
> > > > > >     int D.4598;
> > > > > >     unsigned int D.4599;
> > > > > >     int D.4600;
> > > > > >     int D.4601;
> > > > > >     int * D.4602;
> > > > > >     int i [value-expr: D.4588];
> > > > > >     int i.0;
> > > > > >
> > > > > >     simduid.5 = .GOMP_SIMT_ENTER (simduid.5, &D.4588);
> > > > > >     .omp_simt.6 = .GOMP_SIMT_ENTER_ALLOC (simduid.5);
> > > > > >     D.4587 = 0;
> > > > > >     i.0 = 0;
> > > > > >     #pragma omp simd safelen(32) _simduid_(simduid.5) _simt_ linear(i.0:1) linear(i:1)
> > > > > >     for (i.0 = 0; i.0 < 1000; i.0 = i.0 + 1)
> > > > > >     ...
> > > > > >   }
> > > > > >   goto <D.4579>;
> > > > > >   <D.4578>:
> > > > > >   {
> > > > > >     unsigned int D.4603;
> > > > > >     unsigned int D.4604;
> > > > > >     int D.4605[0:POLY_INT_CST [15, 16]];
> > > > > >     void * simduid.7;
> > > > > >     unsigned int D.4612;
> > > > > >     int * D.4613;
> > > > > >     int D.4614;
> > > > > >     int i [value-expr: D.4605[D.4604]];
> > > > > >     int i.0;
> > > > > >
> > > > > >     D.4604 = 0;
> > > > > >     i.0 = 0;
> > > > > >     #pragma omp simd safelen(POLY_INT_CST [16, 16]) _simduid_(simduid.7) linear(i.0:1) linear(i:1)
> > > > > >     ...
> > > > > >   }
> > > > > >   <D.4579>:
> > > > > >   ...
> > > > > >
> > > > > > For offloading to a SIMT-based device like nvptx, scan_omp_simd
> > > > > > duplicates the lowering of the simd pragma into an if-else, where
> > > > > > the if-part contains the SIMT code-path and the else-part contains
> > > > > > the simd code-path. In lower_rec_simd_input_clauses, max_vf is set
> > > > > > to 16+16x for the above case, as determined by omp_max_vf, and that
> > > > > > becomes the length of the omp simd array:
> > > > > > int D.4605[0:POLY_INT_CST [15, 16]];
> > > > > >
> > > > > > The issue here is that the function containing the above if-else
> > > > > > condition gets streamed out to LTO bytecode, including the simd
> > > > > > code-path and the omp simd array, whose domain is
> > > > > > [0:POLY_INT_CST [15, 16]], and thus we get the above error while
> > > > > > streaming-in POLY_INT_CST in lto_input_ts_poly_tree_pointers on the
> > > > > > device side.
> > > > > >
> > > > > > Note that the simd code-path is essentially dead code on nvptx,
> > > > > > since .GOMP_USE_SIMT () resolves to 1 during the omp_device_lower
> > > > > > pass, and later optimization passes (ccp2) remove the dead code-path
> > > > > > and unused omp simd arrays while compiling to device. So in this
> > > > > > case, we aren't really mapping POLY_INT_CST from host to device, but
> > > > > > it gets streamed out to device as an artefact of omp simd lowering.
> > > > > >
> > > > > > I suppose a proper fix here would be to (somehow) defer lowering of
> > > > > > omp pragma simd until after streaming out to device, so the device
> > > > > > only sees the simt code-path, and the host only sees the simd
> > > > > > code-path? Or perhaps clone each function in the offload region, one
> > > > > > for host and one for the SIMT device, and only stream the device
> > > > > > versions to avoid streaming out host-specific IR changes?
> > > > >
> > > > > There is currently no way to have the host compiler query offload
> > > > > target capabilities, so the only true fix is to delay OMP SIMD
> > > > > lowering to the target.
> > > > Um, I thought we could use omp_max_simt_vf from the host to query if
> > > > the offload target is SIMT?
> > > > The function essentially iterates through OFFLOAD_TARGET_NAMES and
> > > > returns non-zero for nvptx.
> > > >
> > > > > Are we dependent on the early optimization pipeline being run on the
> > > > > host to produce the offload IL? There are some oddball OACC passes in
> > > > > pass_ipa_oacc.
> > > > >
> > > > > That said, I'd probably try to produce clones with unlowered IL and
> > > > > skip those clones from all processing from that point and resume in
> > > > > the offload compiler.
> > > > >
> > > > > > I thought of the following approaches as workarounds:
> > > > >
> > > > > I don't think any workaround will fly in the end.
> > > > > Can't you simply force SVE to be off for offload clones on the host
> > > > > side and force OMP lowering with ADVSIMD only?
> > > > Would it be correct to set:
> > > > sctx.max_vf = constant_lower_bound (omp_max_vf ())
> > > >
> > > > if the function is offloaded and omp_max_vf returns a non-constant
> > > > poly_int, to force max_vf to be VLS, which will avoid VLA vectorization
> > > > as in the attached patch?
> > > >
> > > > Or should we modify the autovectorize_vector_modes hook to return VLS
> > > > modes for offloaded functions?
> > >
> > > Can't you simply put a target(march=armv8.3a) (or even more basic ISA)
> > > on the OMP target clones? Or is the lowering happening before outlining?
> > AFAIU, scan_omp_target in omp-lower creates a "child function" but doesn't
> > outline it.
> > The actual outlining seems to be happening during the omp-expand pass.
> >
> > > In that case maybe we want to switch the host target into
> > > "offload-lowering-mode" (effectively switching to a basic ISA)?
> > Sorry, I didn't understand -- do you mean we should enforce a basic ISA
> > for all the functions, including host-only ones?
>
> No, not for host-only.
>
> > Since the issue stems from having host details leaked into the IL during
> > omp-lowering (and omp-expand), I was wondering if we should use default
> > versions of the hooks if offloading is enabled in omp-lower and omp-expand,
> > which should avoid host details creeping into the offload IL, something
> > like in the attached patch?
>
> Ick. No. I think we should avoid lowering parts applied when the pieces are
> not outlined yet and ensure those offload functions have a "basic ISA"
> (which the target needs to define) - the target already has control over the
> function decl built (IIRC) so it can attach required target attributes (and
> we can ensure those are dropped on the offload compiler side again).

Hi Richard,
Thanks for the suggestions!
From what I understand, omp expansion proceeds in depth-first order, by expanding innermost regions first and then progressively outlining them.
I tried disabling SIMD lowering during omp-lowering, but that triggered several errors during omp-expand, so I didn't pursue it further.
I briefly experimented with adding a new target hook, offload_mode, which would set the host ISA to "offload mode" for current_function_decl (the function containing the offload region) when starting to lower the offload region, and reset the host ISA on current_function_decl after leaving it (essentially enabling/disabling TARGET_FL_SVE in isa_flags in the AArch64 backend), but I wasn't sure that was entirely correct, and dropped it.

> The alternative would be to go with something along your patch in this
> mail but do
>
> poly_uint64
> -omp_max_vf (void)
> +omp_max_vf (bool offload)
> {
> ...
>   if (offload)
>     get_me_magic_value_for_configured_offload_device ()
>
> but iff we're doing the lowering before outlining then if we have
> configured two different offload devices we'll have to come up with a
> max_vf that's suitable for all offload devices? I think this shows at
> least this part of the lowering is done in the wrong place?
>
> Maybe we can have the max_vf "symbolic"? Like .IFN_MAX_VF () and
> lower that on the offload side only?

AFAIU, there are three things that depend on max_vf:
(1) Setting loop->safelen
(2) Setting the length of omp simd arrays
(3) Computing chunk_size for the schedule clause (with simd modifier)

We can't use the result of an internal function for loop->safelen, since it's a compile-time artefact. And for the array length, (at least) the sra pass seems to assume that TYPE_MIN/TYPE_MAX are INTEGER_CST.
From prepare_iteration_over_array_elts:

  tree minidx = TYPE_MIN_VALUE (TYPE_DOMAIN (type));
  gcc_assert (TREE_CODE (minidx) == INTEGER_CST);
  tree maxidx = TYPE_MAX_VALUE (TYPE_DOMAIN (type));

  /* Skip (some) zero-length arrays; others have MAXIDX == MINIDX - 1.  */
  if (!maxidx)
    return false;
  gcc_assert (TREE_CODE (maxidx) == INTEGER_CST);

The attached patch:
(a) Uses a placeholder value (INT_MAX) for max_vf, which gets assigned to loop->safelen and to the length of the omp simd array, if offloading is enabled.
(b) For computing chunk_size, using INT_MAX resulted in chunk_size being constant-propagated (and harder to recover later), so I added a new internal function .GOMP_SIMD_MAX_VF, whose result is assigned to chunk_size.
(c) Adds a new pass, pass_omp_adjust_max_vf, just before omp_device_lower, to assign the correct max_vf to the above three artefacts and fold away .GOMP_SIMD_MAX_VF (or should I piggy-back it on some other pass?)

For the following contrived test:

#include <stdlib.h>
#define N 1000

int A[N];
int B[N];

int main ()
{
  int i;
  int sum = 0;

  #pragma omp target map(sum), map(A), map(B), map(i)
  #pragma omp teams distribute parallel for simd reduction(+:sum) schedule(simd:static, 5)
  for (i = 0; i < N; i++)
    sum += A[i] * B[i];

  return sum;
}

With the patch, the omp-expand dump shows the length of the omp simd arrays set to INT_MAX when offloading is enabled:

  int D.5382[2147483647];
  int D.5378[2147483647];

and the following computation for chunk_size:

  D.5353 = .GOMP_SIMD_MAX_VF ();
  D.5354 = D.5353 + 4;
  D.5355 = .GOMP_SIMD_MAX_VF ();
  D.5356 = -D.5355;
  D.5357 = D.5354 & D.5356;
  D.5358 = D.5348 - D.5347;

And after the omp_adjust_max_vf pass, the dump shows the correct max_vf assigned to the length of the omp simd arrays and to chunk_size on the host side:

  int D.5382[0:POLY_INT_CST [15, 16]];
  int D.5378[0:POLY_INT_CST [15, 16]];
  ...
  _38 = POLY_INT_CST [16, 16];
  _39 = _38 + 4;
  _40 = POLY_INT_CST [16, 16];
  _41 = -_40;
  _42 = _39 & _41;
  _43 = _35 - _34;

and it would make similar adjustments for SIMD-based devices.
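To cross-check the arithmetic above: the sequence rounds the chunk size of 5 up to a multiple of max_vf, i.e. (chunk + max_vf - 1) & -max_vf. A minimal stand-alone C model of that computation (plain C with invented names, not the GCC internals themselves, and assuming max_vf is a power of two, as the bit-mask trick requires):

/* Model of the chunk-size rounding shown in the dump above.  */
#include <assert.h>

static unsigned long
round_chunk_to_vf (unsigned long chunk, unsigned long max_vf)
{
  /* (chunk + max_vf - 1) & -max_vf rounds chunk up to a multiple of
     max_vf; with chunk == 5 this is the "max_vf + 4" seen in the
     D.5354/_39 computations.  */
  return (chunk + max_vf - 1) & -max_vf;
}

int
main (void)
{
  assert (round_chunk_to_vf (5, 16) == 16);  /* schedule(simd:static, 5), VF 16.  */
  assert (round_chunk_to_vf (5, 1) == 5);    /* VF 1: identity.  */
  assert (round_chunk_to_vf (17, 16) == 32);
  return 0;
}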
For SIMT devices, the patch explicitly sets max_vf to 1, to fold .GOMP_SIMD_MAX_VF and shrink the omp simd arrays (although I guess that doesn't really matter, since the simd code-path would be dead code?).

I had a couple of questions:

(1) With the patch, I am seeing a lot of errors of the form "multiple dump files found". For example:

  libgomp.c++/../libgomp.c-c++-common/target-is-initial-host-2.c: multiple dump files found
  UNRESOLVED: libgomp.c++/../libgomp.c-c++-common/target-is-initial-host-2.c scan-nvptx-none-offload-tree-dump-times optimized "omp_is_initial_device" 1

The error seems to come from scandump.exp:glob-dump-file:

  if { $num_files > 1 } {
    verbose -log "$testcase: multiple dump files found"
  }

This seems to happen because the compiler is passed -fdump-tree-optimized -foffload-options=-fdump-tree-optimized, which results in two optimized dump files. I am not sure, though, why the patch specifically triggers this?

(2) To check if we're in an offload region in omp_adjust_chunk_size, the patch defines a new function, enclosing_target_region_p, which keeps walking region->outer till it reaches a GIMPLE_OMP_TARGET region (or NULL). While it seems to work, I was wondering if there is a better way to do this? Another option would be to keep a static variable target_nesting_level, which is incremented/decremented before/after each call to omp_expand_target, similar to the one in omp-lower.cc? I tried using omp_maybe_offloaded from omp_adjust_chunk_size, but that didn't seem to work.

Does the patch look in the right direction?

Thanks,
Prathamesh

> Richard.
>
> > Thanks,
> > Prathamesh
> >
> > > > Thanks,
> > > > Prathamesh
> > > > >
> > > > > Richard.
> > > > >
> > > > > > [1] Set sctx.max_vf to constant_lower_bound (omp_max_vf ()) in
> > > > > > lower_rec_simd_input_clauses, if the function is going to be
> > > > > > offloaded and omp_max_vf returns a non-constant poly_int. For the
> > > > > > above case, it sets max_vf to 16 instead of 16+16x, which seems to
> > > > > > resolve the issue, but it'd use a suboptimal max VF for the host?
> > > > > > This is done in patch p-283-2.txt.
> > > > > >
> > > > > > However, with clean trunk it still seems to use max_vf = 16 after
> > > > > > disabling the above error. The vect dump shows:
> > > > > >
> > > > > > (compute_affine_dependence
> > > > > >   ref_a: (*_25)[i.0_51], stmt_a: _26 = (*_25)[i.0_51];
> > > > > >   ref_b: (*_23)[i.0_51], stmt_b: (*_23)[i.0_51] = _27;
> > > > > > ) -> dependence analysis failed
> > > > > > foo.c:10:13: note: dependence distance = 0.
> > > > > > foo.c:10:13: note: dependence distance == 0 between (*_23)[i.0_51] and (*_23)[i.0_51]
> > > > > > foo.c:10:13: missed: bad data dependence.
> > > > > > foo.c:10:13: note: ***** Analysis failed with vector mode VNx4SI
> > > > > >
> > > > > > This seems to happen because loop->safelen is set to 16 by taking
> > > > > > MIN (constant_lower_bound (16+16x), INT_MAX) in expand_omp_simd:
> > > > > >
> > > > > >   if (!poly_int_tree_p (safelen, &val))
> > > > > >     safelen_int = 0;
> > > > > >   else
> > > > > >     safelen_int = MIN (constant_lower_bound (val), INT_MAX);
> > > > > >
> > > > > > and it fails to vectorize with VLA vectors, because max_vf == 16
> > > > > > and min_vf == 4+4x, resulting in bad data dependence due to:
> > > > > >
> > > > > >   if (max_vf != MAX_VECTORIZATION_FACTOR
> > > > > >       && maybe_lt (max_vf, min_vf))
> > > > > >     return opt_result::failure_at (vect_location, "bad data dependence.\n");
> > > > > >
> > > > > > If safelen was (somehow) set to 16+16x, I guess it could have used
> > > > > > VF = 4+4x and vectorized with VLA vectors, but I suppose that's a
> > > > > > separate issue?
> > > > > >
> > > > > > [2] Since the issue seems to be only with streaming out the length
> > > > > > of the omp simd array when it's a POLY_INT_CST, could we perhaps
> > > > > > use a placeholder length during omp lowering and compute the
> > > > > > correct length after streaming out, so POLY_INT_CST doesn't get
> > > > > > leaked into the bytecode? The attached patch p-283-3.txt follows
> > > > > > this approach by using the bogus length INT_MAX in
> > > > > > lower_rec_simd_input_clauses if offloading to a SIMT device and
> > > > > > max_vf is a non-constant poly_int, and later computing the correct
> > > > > > length at the beginning of the vect pass by setting it to
> > > > > > omp_max_vf (), but I am not sure if this is entirely correct.
> > > > > > I am assuming that creating an omp simd array of bogus length will
> > > > > > not be an issue for nvptx, since it will never get referenced and
> > > > > > will eventually be removed by remove_unused_locals? If it'd not be
> > > > > > a good idea to rely on the pass pipeline to eliminate the simd
> > > > > > code-path and omp simd array while compiling to device, it could
> > > > > > possibly be done during the omp_device_lower pass itself?
> > > > > >
> > > > > > [3] While streaming-in POLY_INT_CST, avoid emitting an error
> > > > > > immediately if the degree of the POLY_INT_CST exceeds the accel's
> > > > > > NUM_POLY_INT_COEFFS, to ignore POLY_INT_CSTs that may potentially
> > > > > > occur on a dead-code path, and instead mark it as error_mark_node.
> > > > > > For the above case, since the POLY_INT_CST appears on a dead-code
> > > > > > path, streaming a POLY_INT_CST with a higher degree than the
> > > > > > accel's NUM_POLY_INT_COEFFS would be "harmless". Then detect
> > > > > > invalid POLY_INT_CSTs in the expand pass (if they survive till
> > > > > > that point) and emit the above error, but I'm not sure if that'd
> > > > > > be the right place? This is done in p-283-4.txt.
> > > > > >
> > > > > > All three patches fix the UNRESOLVED tests due to the POLY_INT_CST
> > > > > > streaming error in the libgomp testsuite with -mcpu=generic+sve2.
> > > > > > (Although it introduces a strange FAIL for data-5.f90, which I am
> > > > > > investigating.)
> > > > > > I would be grateful for suggestions on how to proceed.
> > > > > >
> > > > > > Signed-off-by: Prathamesh Kulkarni <prathame...@nvidia.com>
> > > > > >
> > > > > > Thanks,
> > > > > > Prathamesh
> > > > >
> > > > > --
> > > > > Richard Biener <rguent...@suse.de>
> > > > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > > > Nuernberg, Germany;
> > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > > Nuernberg)
> > >
> > > --
> > > Richard Biener <rguent...@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
>
> --
> Richard Biener <rguent...@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
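To make the INT_MAX placeholder convention used by the patch below concrete, here is a small stand-alone C model (invented names, not the GCC internals) of how the array-length fix-up recognizes the placeholder domain [0 : INT_MAX - 1] of an omp simd array and narrows it to max_vf:

/* Model of the omp simd array fix-up: omp-low emits the placeholder
   length INT_MAX, so the pass looks for a domain maximum of
   INT_MAX - 1 and rewrites it to max_vf - 1.  */
#include <limits.h>
#include <stdio.h>

struct array_domain { long long min, max; };  /* stand-in for TYPE_DOMAIN  */

static void
adjust_simd_array_len (struct array_domain *dom, unsigned long max_vf)
{
  if (dom->max == (long long) INT_MAX - 1)  /* placeholder from omp-low  */
    dom->max = (long long) max_vf - 1;      /* real length is max_vf  */
}

int
main (void)
{
  struct array_domain host = { 0, INT_MAX - 1 };
  struct array_domain simt = { 0, INT_MAX - 1 };
  adjust_simd_array_len (&host, 16);  /* e.g. constant part of 16+16x  */
  adjust_simd_array_len (&simt, 1);   /* SIMT: simd path is dead code  */
  printf ("host domain [%lld:%lld], simt domain [%lld:%lld]\n",
          host.min, host.max, simt.min, simt.max);  /* [0:15], [0:0]  */
  return 0;
}

(In the patch itself the host value stays a POLY_INT_CST, as the dumps above show; the scalar 16 here stands in for its constant component.)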
Delay computing max_vf when offloading is enabled.

gcc/ChangeLog:
	* internal-fn.cc (expand_GOMP_SIMD_MAX_VF): New function.
	* internal-fn.def (GOMP_SIMD_MAX_VF): New entry.
	* omp-expand.cc (enclosing_target_region_p): New function.
	(omp_adjust_chunk_size): New parameter offload.  If offload is
	true, build call_expr for internal function GOMP_SIMD_MAX_VF.
	(get_ws_args_for): New parameter offload, and pass it to
	omp_adjust_chunk_size.
	(determine_parallel_type): Call enclosing_target_region_p and
	pass its result to get_ws_args_for.
	(expand_omp_for_generic): Call enclosing_target_region_p and
	pass its result to omp_adjust_chunk_size.
	(expand_omp_for_static_chunk): Likewise.
	* omp-low.cc (lower_rec_simd_input_clauses): Set sctx.max_vf to
	INT_MAX if offloading is enabled.
	* omp-offload.cc (class pass_omp_adjust_max_vf): Define new pass.
	(make_pass_omp_adjust_max_vf): New function.
	* passes.def: Add entry for pass_omp_adjust_max_vf.
	* tree-pass.h (make_pass_omp_adjust_max_vf): Declare.

Signed-off-by: Prathamesh Kulkarni <prathame...@nvidia.com>

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index d89a04fe412..dd15facc43c 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -662,6 +662,14 @@ expand_GOMP_SIMD_ORDERED_END (internal_fn, gcall *)
   gcc_unreachable ();
 }
 
+/* This should get folded in the omp_adjust_max_vf pass.  */
+
+static void
+expand_GOMP_SIMD_MAX_VF (internal_fn, gcall *)
+{
+  gcc_unreachable ();
+}
+
 /* This should get expanded in the sanopt pass.  */
 
 static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 23b4ab02b30..0b756a3034e 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -469,6 +469,7 @@ DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMD_ORDERED_START, ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (GOMP_SIMD_ORDERED_END, ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMD_MAX_VF, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (LOOP_VECTORIZED, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (LOOP_DIST_ALIAS, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (ANNOTATE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc
index 9ff9553c3ea..6c20e7922cc 100644
--- a/gcc/omp-expand.cc
+++ b/gcc/omp-expand.cc
@@ -126,6 +126,17 @@ is_combined_parallel (struct omp_region *region)
   return region->is_combined_parallel;
 }
 
+/* Return true if REGION is enclosed in an omp target region.  */
+
+static bool
+enclosing_target_region_p (struct omp_region *region)
+{
+  for (omp_region *r = region; r; r = r->outer)
+    if (r->type == GIMPLE_OMP_TARGET)
+      return true;
+  return false;
+}
+
 /* Given two blocks PAR_ENTRY_BB and WS_ENTRY_BB such that WS_ENTRY_BB
    is the immediate dominator of PAR_ENTRY_BB, return true if there
    are no data dependencies that would prevent expanding the parallel
@@ -206,20 +217,30 @@ workshare_safe_to_combine_p (basic_block ws_entry_bb)
    presence (SIMD_SCHEDULE).  */
 
 static tree
-omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
+omp_adjust_chunk_size (tree chunk_size, bool simd_schedule, bool offload)
 {
   if (!simd_schedule || integer_zerop (chunk_size))
     return chunk_size;
 
-  poly_uint64 vf = omp_max_vf ();
-  if (known_eq (vf, 1U))
-    return chunk_size;
-
   tree type = TREE_TYPE (chunk_size);
+  tree max_vf;
+
+  if (offload)
+    max_vf = build_call_expr_internal_loc (input_location,
+					   IFN_GOMP_SIMD_MAX_VF, type, 0);
+  else
+    {
+      poly_uint64 vf = omp_max_vf ();
+      if (known_eq (vf, 1U))
+	return chunk_size;
+      max_vf = build_int_cst (type, vf);
+    }
+
   chunk_size = fold_build2 (PLUS_EXPR, type, chunk_size,
-			    build_int_cst (type, vf - 1));
+			    fold_build2 (MINUS_EXPR, type,
+					 max_vf, build_one_cst (type)));
   return fold_build2 (BIT_AND_EXPR, type, chunk_size,
-		      build_int_cst (type, -vf));
+		      fold_build1 (NEGATE_EXPR, type, max_vf));
 }
 
 /* Collect additional arguments needed to emit a combined
@@ -227,7 +248,7 @@ omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
    expanded.  */
 
 static vec<tree, va_gc> *
-get_ws_args_for (gimple *par_stmt, gimple *ws_stmt)
+get_ws_args_for (gimple *par_stmt, gimple *ws_stmt, bool offload)
 {
   tree t;
   location_t loc = gimple_location (ws_stmt);
@@ -269,7 +290,7 @@ get_ws_args_for (gimple *par_stmt, gimple *ws_stmt)
       if (fd.chunk_size)
 	{
 	  t = fold_convert_loc (loc, long_integer_type_node, fd.chunk_size);
-	  t = omp_adjust_chunk_size (t, fd.simd_schedule);
+	  t = omp_adjust_chunk_size (t, fd.simd_schedule, offload);
 	  ws_args->quick_push (t);
 	}
 
@@ -365,7 +386,8 @@ determine_parallel_type (struct omp_region *region)
       region->is_combined_parallel = true;
       region->inner->is_combined_parallel = true;
-      region->ws_args = get_ws_args_for (par_stmt, ws_stmt);
+      region->ws_args = get_ws_args_for (par_stmt, ws_stmt,
+					 enclosing_target_region_p (region));
     }
 }
@@ -4195,7 +4217,8 @@ expand_omp_for_generic (struct omp_region *region,
       if (fd->chunk_size)
 	{
 	  t = fold_convert (fd->iter_type, fd->chunk_size);
-	  t = omp_adjust_chunk_size (t, fd->simd_schedule);
+	  t = omp_adjust_chunk_size (t, fd->simd_schedule,
+				     enclosing_target_region_p (region));
 	  if (sched_arg)
 	    {
 	      if (fd->ordered)
@@ -4239,7 +4262,8 @@ expand_omp_for_generic (struct omp_region *region,
 	{
 	  tree bfn_decl = builtin_decl_explicit (start_fn);
 	  t = fold_convert (fd->iter_type, fd->chunk_size);
-	  t = omp_adjust_chunk_size (t, fd->simd_schedule);
+	  t = omp_adjust_chunk_size (t, fd->simd_schedule,
+				     enclosing_target_region_p (region));
 	  if (sched_arg)
 	    t = build_call_expr (bfn_decl, 10, t5, t0, t1, t2, sched_arg,
 				 t, t3, t4, reductions, mem);
@@ -5936,7 +5960,8 @@ expand_omp_for_static_chunk (struct omp_region *region,
   step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
 				   true, NULL_TREE, true, GSI_SAME_STMT);
   tree chunk_size = fold_convert (itype, fd->chunk_size);
-  chunk_size = omp_adjust_chunk_size (chunk_size, fd->simd_schedule);
+  chunk_size = omp_adjust_chunk_size (chunk_size, fd->simd_schedule,
+				      enclosing_target_region_p (region));
   chunk_size
     = force_gimple_operand_gsi (&gsi, chunk_size, true, NULL_TREE, true,
 				GSI_SAME_STMT);
diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc
index da2051b0279..ea1fa920005 100644
--- a/gcc/omp-low.cc
+++ b/gcc/omp-low.cc
@@ -4588,7 +4588,13 @@ lower_rec_simd_input_clauses (tree new_var, omp_context *ctx,
 {
   if (known_eq (sctx->max_vf, 0U))
     {
-      sctx->max_vf = sctx->is_simt ? omp_max_simt_vf () : omp_max_vf ();
+      if (sctx->is_simt)
+	sctx->max_vf = omp_max_simt_vf ();
+      else if (omp_maybe_offloaded_ctx (ctx))
+	sctx->max_vf = INT_MAX;
+      else
+	sctx->max_vf = omp_max_vf ();
+
       if (maybe_gt (sctx->max_vf, 1U))
 	{
 	  tree c = omp_find_clause (gimple_omp_for_clauses (ctx->stmt),
diff --git a/gcc/omp-offload.cc b/gcc/omp-offload.cc
index 934fbd80bdd..1a6bd9bd02d 100644
--- a/gcc/omp-offload.cc
+++ b/gcc/omp-offload.cc
@@ -2944,3 +2944,105 @@ make_pass_omp_target_link (gcc::context *ctxt)
 {
   return new pass_omp_target_link (ctxt);
 }
+
+namespace {
+
+const pass_data pass_data_omp_adjust_max_vf =
+{
+  GIMPLE_PASS, /* type */
+  "ompadjustmaxvf", /* name */
+  OPTGROUP_OMP, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  PROP_ssa, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_omp_adjust_max_vf : public gimple_opt_pass
+{
+public:
+  pass_omp_adjust_max_vf (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_omp_adjust_max_vf, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *fun) final override
+  {
+    return offloading_function_p (fun->decl);
+  }
+
+  unsigned execute (function *fun) final override;
+};
+
+/* When offloading is enabled, we do not immediately compute
+   max_vf during omp_lower because it may differ between devices,
+   and instead delay lowering by using a placeholder value INT_MAX.
+   max_vf is used for three things:
+   (a) Setting loop->safelen.
+   (b) Setting length of omp simd arrays.
+   (c) Computing chunk size of schedule clause.
+   This pass assigns appropriate values to the above three artefacts.  */
+
+unsigned
+pass_omp_adjust_max_vf::execute (function *fun)
+{
+  if (!fun->has_simduid_loops)
+    return 0;
+
+  /* For SIMT targets, the simd code-path is dead code, so just
+     use a placeholder value 1 to fold .GOMP_SIMD_MAX_VF,
+     and shrink omp simd array length from INT_MAX.  */
+  poly_uint64 max_vf = targetm.simt.vf ? 1 : omp_max_vf ();
+
+  /* Set correct safelen.  */
+
+  for (auto loop: loops_list (fun, 0))
+    if (loop->simduid && loop->safelen == INT_MAX)
+      loop->safelen = MIN (constant_lower_bound (max_vf), INT_MAX);
+
+  /* Set correct length of omp simd arrays.  */
+
+  for (auto decl: fun->local_decls)
+    if (TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE
+	&& lookup_attribute ("omp simd array", DECL_ATTRIBUTES (decl)))
+      {
+	tree& max = TYPE_MAX_VALUE (TYPE_DOMAIN (TREE_TYPE (decl)));
+	if (TREE_CODE (max) == INTEGER_CST
+	    && wi::eq_p (wi::to_widest (max), INT_MAX - 1))
+	  {
+	    max = size_int (max_vf - 1);
+	    relayout_decl (decl);
+	  }
+      }
+
+  /* Replace calls to .GOMP_SIMD_MAX_VF () with max_vf.
+     The call is built when computing chunk size for schedule clause.
+     See omp_adjust_chunk_size.  */
+
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, fun)
+    for (auto gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	gcall *call_stmt = dyn_cast<gcall *> (gsi_stmt (gsi));
+	if (call_stmt
+	    && gimple_call_internal_p (call_stmt, IFN_GOMP_SIMD_MAX_VF))
+	  {
+	    tree lhs = gimple_call_lhs (call_stmt);
+	    tree rhs = build_int_cst (integer_type_node, max_vf);
+	    gassign *new_stmt = gimple_build_assign (lhs, rhs);
+	    gsi_replace (&gsi, new_stmt, true);
+	  }
+      }
+
+  return 0;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_omp_adjust_max_vf (gcc::context *ctxt)
+{
+  return new pass_omp_adjust_max_vf (ctxt);
+}
diff --git a/gcc/passes.def b/gcc/passes.def
index 7d01227eed1..bbe6ac5d54f 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -190,6 +190,7 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_oacc_loop_designation);
       NEXT_PASS (pass_omp_oacc_neuter_broadcast);
       NEXT_PASS (pass_oacc_device_lower);
+      NEXT_PASS (pass_omp_adjust_max_vf);
       NEXT_PASS (pass_omp_device_lower);
       NEXT_PASS (pass_omp_target_link);
       NEXT_PASS (pass_adjust_alignment);
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index a928cbe4557..d4082bf8bf2 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -435,6 +435,7 @@ extern gimple_opt_pass *make_pass_diagnose_omp_blocks (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_omp_adjust_max_vf (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_oacc_loop_designation (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt);
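And a matching stand-alone C sketch (again invented names, not the GCC internals) of the safelen fix-up performed by the pass, mirroring the MIN (constant_lower_bound (val), INT_MAX) clamp quoted earlier from expand_omp_simd:

/* Model of the safelen fix-up: the placeholder INT_MAX set by omp-low
   is replaced by the lower bound of max_vf; anything else is left
   alone.  */
#include <limits.h>
#include <assert.h>

static int
adjust_safelen (int safelen, unsigned long max_vf_lower_bound)
{
  if (safelen == INT_MAX)  /* placeholder from omp-low  */
    return max_vf_lower_bound < (unsigned long) INT_MAX
	   ? (int) max_vf_lower_bound
	   : INT_MAX;
  return safelen;
}

int
main (void)
{
  assert (adjust_safelen (INT_MAX, 16) == 16);  /* host: constant_lower_bound (16+16x)  */
  assert (adjust_safelen (INT_MAX, 1) == 1);    /* SIMT: simd path is dead code  */
  assert (adjust_safelen (32, 16) == 32);       /* explicit safelen(32) kept  */
  return 0;
}

The third assert reflects that the pass only rewrites the INT_MAX placeholder, so an explicit user-specified safelen would be left untouched.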