> -----Original Message-----
> From: Richard Biener <[email protected]>
> Sent: 29 October 2024 16:46
> To: Prathamesh Kulkarni <[email protected]>
> Cc: Richard Biener <[email protected]>; [email protected]; Thomas
> Schwinge <[email protected]>; Jakub Jelinek <[email protected]>
> Subject: Re: [RFC] Enabling SVE with offloading to nvptx
>
> External email: Use caution opening links or attachments
>
>
> On Mon, Oct 28, 2024 at 1:52 PM Prathamesh Kulkarni via Gcc
> <[email protected]> wrote:
> >
> > > -----Original Message-----
> > > From: Richard Biener <[email protected]>
> > > Sent: 21 October 2024 12:45
> > > To: Prathamesh Kulkarni <[email protected]>
> > > Cc: [email protected]; Thomas Schwinge <[email protected]>;
> Jakub
> > > Jelinek <[email protected]>
> > > Subject: RE: [RFC] Enabling SVE with offloading to nvptx
> > >
> > > External email: Use caution opening links or attachments
> > >
> > >
> > > On Fri, 18 Oct 2024, Prathamesh Kulkarni wrote:
> > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <[email protected]>
> > > > > Sent: 17 October 2024 19:18
> > > > > To: Prathamesh Kulkarni <[email protected]>
> > > > > Cc: [email protected]; Thomas Schwinge <[email protected]>
> > > > > Subject: RE: [RFC] Enabling SVE with offloading to nvptx
> > > > >
> > > > > External email: Use caution opening links or attachments
> > > > >
> > > > >
> > > > > On Thu, 17 Oct 2024, Prathamesh Kulkarni wrote:
> > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Richard Biener <[email protected]>
> > > > > > > Sent: 16 October 2024 13:05
> > > > > > > To: Prathamesh Kulkarni <[email protected]>
> > > > > > > Cc: [email protected]; Thomas Schwinge
> > > > > > > <[email protected]>
> > > > > > > Subject: Re: [RFC] Enabling SVE with offloading to nvptx
> > > > > > >
> > > > > > > External email: Use caution opening links or attachments
> > > > > > >
> > > > > > >
> > > > > > > On Tue, 15 Oct 2024, Prathamesh Kulkarni wrote:
> > > > > > >
> > > > > > > > Hi,
> > > > > > > > Testing libgomp with SVE enabled (-mcpu=generic+sve2),
> > > results
> > > > > in
> > > > > > > > ~60
> > > > > > > UNRESOLVED errors with following error message:
> > > > > > > >
> > > > > > > > lto1: fatal error: degree of 'poly_int' exceeds
> > > > > 'NUM_POLY_INT_COEFFS'
> > > > > > > > compilation terminated.
> > > > > > > > nvptx mkoffload: fatal error:
> > > > > > > > ../../install/bin/aarch64-unknown-linux-gnu-accel-nvptx-
> > > none-g
> > > > > > > > cc
> > > > > > > returned 1 exit status compilation terminated.
> > > > > > > >
> > > > > > > > This behaviour can be reproduced with the following
> simple
> > > > > > > > test-case
> > > > > > > with -fopenmp -foffload=nvptx-none -mcpu=generic+sve2:
> > > > > > > >
> > > > > > > > #define N 1000
> > > > > > > > int main ()
> > > > > > > > {
> > > > > > > > int i;
> > > > > > > > int A[N] = {0}, B[N] = {0};
> > > > > > > >
> > > > > > > > #pragma omp target map(i), map(tofrom: A), map(from:
> B)
> > > > > > > > #pragma omp simd
> > > > > > > > for (i = 0; i < N; i++)
> > > > > > > > A[i] = A[i] + B[i];
> > > > > > > > return A[0];
> > > > > > > > }
> > > > > > > >
> > > > > > > > omplower pass lowers the above loop to the following:
> > > > > > > >
> > > > > > > > D.4576 = .GOMP_USE_SIMT ();
> > > > > > > > if (D.4576 != 0) goto <D.4577>; else
> goto
> > > > > <D.4578>;
> > > > > > > > <D.4577>:
> > > > > > > > {
> > > > > > > > unsigned int D.4586;
> > > > > > > > unsigned int D.4587;
> > > > > > > > int D.4588;
> > > > > > > > void * simduid.5;
> > > > > > > > void * .omp_simt.6;
> > > > > > > > int D.4596;
> > > > > > > > _Bool D.4597;
> > > > > > > > int D.4598;
> > > > > > > > unsigned int D.4599;
> > > > > > > > int D.4600;
> > > > > > > > int D.4601;
> > > > > > > > int * D.4602;
> > > > > > > > int i [value-expr: D.4588];
> > > > > > > > int i.0;
> > > > > > > >
> > > > > > > > simduid.5 = .GOMP_SIMT_ENTER
> (simduid.5,
> > > > > &D.4588);
> > > > > > > > .omp_simt.6 = .GOMP_SIMT_ENTER_ALLOC
> > > > > (simduid.5);
> > > > > > > > D.4587 = 0;
> > > > > > > > i.0 = 0;
> > > > > > > > #pragma omp simd safelen(32)
> > > > > > > > _simduid_(simduid.5)
> > > > > > > _simt_ linear(i.0:1) linear(i:1)
> > > > > > > > for (i.0 = 0; i.0 < 1000; i.0 = i.0 +
> 1)
> > > > > > > > ...
> > > > > > > > }
> > > > > > > > goto <D.4579>;
> > > > > > > > <D.4578>:
> > > > > > > > {
> > > > > > > > unsigned int D.4603;
> > > > > > > > unsigned int D.4604;
> > > > > > > > int D.4605[0:POLY_INT_CST [15, 16]];
> > > > > > > > void * simduid.7;
> > > > > > > > unsigned int D.4612;
> > > > > > > > int * D.4613;
> > > > > > > > int D.4614;
> > > > > > > > int i [value-expr: D.4605[D.4604]];
> > > > > > > > int i.0;
> > > > > > > >
> > > > > > > > D.4604 = 0;
> > > > > > > > i.0 = 0;
> > > > > > > > #pragma omp simd safelen(POLY_INT_CST
> > > > > > > > [16,
> > > > > 16])
> > > > > > > _simduid_(simduid.7) linear(i.0:1) linear(i:1)
> > > > > > > > ...
> > > > > > > > }
> > > > > > > > <D.4579>:
> > > > > > > > ...
> > > > > > > >
> > > > > > > > For offloading to SIMT based device like nvptx,
> > > scan_omp_simd
> > > > > > > > duplicates lowering of simd pragma into if-else where
> the
> > > if-
> > > > > part
> > > > > > > > contains simt code-path, and else-part contains simd
> code-
> > > path.
> > > > > In
> > > > > > > lower_rec_simd_input_clauses, max_vf is set to 16+16x for
> > > > > > > the
> > > > > above
> > > > > > > case as determined by omp_max_vf, and that becomes length
> of
> > > the
> > > > > omp
> > > > > > > simd
> > > > > > > array:
> > > > > > > > int D.4605[0:POLY_INT_CST [15, 16]];
> > > > > > > >
> > > > > > > > The issue here is that, the function containing above
> if-
> > > else
> > > > > > > > condition gets streamed out to LTO bytecode including
> the
> > > simd
> > > > > > > > code-
> > > > > > > path and the omp simd array, whose domain is
> > > [0:POLY_INT_CST[15,
> > > > > > > 16]], and thus we get the above error while streaming-in
> > > > > > > POLY_INT_CST in lto_input_ts_poly_tree_pointers on device
> > > side.
> > > > > > > >
> > > > > > > > Note that, the simd code-path is essentially dead-code
> on
> > > > > > > > nvptx, since
> > > > > > > > .GOMP_USE_SIMT() resolves to 1 during omp_device_lower
> > > > > > > > pass, and later optimization passes (ccp2) remove the
> > > > > > > > dead-code
> > > path
> > > > > > > > and unused omp
> > > > > > > simd arrays while compiling to device. So in this case, we
> > > > > > > aren't really mapping POLY_INT_CST from host to device,
> but
> > > > > > > it gets streamed out to device as an artefact of omp simd
> > > lowering.
> > > > > > > >
> > > > > > > > I suppose a proper fix here would be to (somehow) defer
> > > > > > > > lowering of omp pragma simd after streaming out to
> device,
> > > so
> > > > > > > > the device only sees simt code-path, and the host only
> > > > > > > > sees simd code path
> > > > > ?
> > > > > > > > Or perhaps
> > > > > > > clone each function in offload region, one for host and
> one
> > > for
> > > > > SIMT
> > > > > > > device, and only stream the device versions to avoid
> > > > > > > streaming out
> > > > > > > host- specific IR changes ?
> > > > > > >
> > > > > > > There is currently no way to have the host compiler query
> > > > > > > offload target capabilities so the only true fix is to
> delay
> > > OMP
> > > > > > > SIMD lowering to the target.
> > > > > > Um, I thought we could use omp_max_simt_vf from host to
> query
> > > > > > if the
> > > > > offload target is SIMT ?
> > > > > > The function essentially iterates thru OFFLOAD_TARGET_NAMES
> > > > > > and
> > > > > returns non-zero for nvptx.
> > > > > > >
> > > > > > > Are we dependent on the early optimization pipeline being
> > > > > > > run
> > > on
> > > > > the
> > > > > > > host to produce the offload IL? There's some oddball OACC
> > > > > > > passes
> > > > > in
> > > > > > > pass_ipa_oacc.
> > > > > > >
> > > > > > > That said, I'd probably try to produce clones with
> unlowered
> > > IL
> > > > > and
> > > > > > > skip those clones from all processing from that point and
> > > resume
> > > > > in
> > > > > > > the offload compiler.
> > > > > > >
> > > > > > > > I thought of following approaches as workarounds:
> > > > > > >
> > > > > > > I don't think any workaround will fly in the end. Can't
> you
> > > > > simply
> > > > > > > force SVE to be off for offload clones on the host side
> and
> > > > > > > force OMP lowering with ADVSIMD only?
> > > > > > Would it be correct to set:
> > > > > > sctx.max_vf = constant_lower_bound (omp_max_vf ())
> > > > > >
> > > > > > if function is offloaded and omp_max_vf returns non-constant
> > > > > poly_int,
> > > > > > to force max_vf to be VLS, which will avoid VLA
> vectorization
> > > > > > as in
> > > > > the attached patch ?
> > > > > >
> > > > > > Or should we modify autovectorize_vector_modes hook to
> return
> > > VLS
> > > > > > modes for offloaded functions ?
> > > > >
> > > > > Can't you simply put a target(march=armv8.3a) (or even more
> > > > > basic
> > > > > ISA) on the OMP target clones? Or is the lowering happening
> > > before
> > > > > outlining?
> > > > AFAIU, scan_omp_target in omp-lower creates a "child function"
> but
> > > doesn't outline it.
> > > > The actual outlining seems to be happening during omp-expand
> pass.
> > > >
> > > > > In that case maybe we want to switch the host target into
> > > "offload-
> > > > > lowering-mode" (effectively switching to a basic ISA)?
> > > > Sorry, I didn't understand -- Do you mean we should enforce
> basic
> > > ISA for all the functions including host-only ones ?
> > >
> > > No, not for host-only.
> > >
> > > > Since the issue stems from having host details leaked into IL
> > > > during omp-lowering (and omp-expand), I was wondering if we
> should
> > > > use default versions of the hooks if offloading is enabled in
> > > > omp-lower
> > > and omp-expand, which should avoid host details creeping into
> > > offload IL, sth like in the attached patch ?
> > >
> > > Ick. No. I think we should avoid lowering parts applied when the
> > > pieces are not outlined yet and ensure those offload functions
> have
> > > a "basic ISA" (which the target needs to define) - the target
> > > already has control over the function decl built (IIRC) so it can
> > > attach required target attributes (and we can ensure those are
> > > dropped on the offload compiler side again).
> > Hi Richard,
> > Thanks for the suggestions!
> >
> > From what I understand, omp expansion proceeds in depth-first order,
> > by expanding innermost regions first and then progressively
> outlining
> > them. I tried disabling SIMD lowering during omp-lowering but that
> triggered several errors during omp-expand, and didn't attempt to
> pursue it further.
> >
> > I briefly experimented with adding a new target hook offload_mode,
> > that will set host ISA to "offload mode", for current_function_decl
> > (function containing offload region) when starting to lower offload
> > region, and reset the host ISA on current_function_decl after
> leaving it (essentially enabling/disabling TARGET_FL_SVE in isa_flags
> in AArch64 backend), but not sure if that was entirely correct, and
> dropped it.
> > >
> > > The alternative would be to go with something along your patch in
> > > this mail but do
> > >
> > > poly_uint64
> > > -omp_max_vf (void)
> > > +omp_max_vf (bool offload)
> > > {
> > > ...
> > > if (offload)
> > > get_me_magic_value_for_configured_offload_device ()
> > >
> > > but iff we're doing the lowering before outlining then if we have
> > > configured two different offload devices we'll have to come up
> with
> > > a max_vf that's suitable for all offload devices? I think this
> > > shows a least this part of the lowering is done in the wrong
> place?
> > >
> > > Maybe we can have the max_vf "symbolic"? Like .IFN_MAX_VF () and
> > > lower that on the offload side only?
> > AFAIU, there are three things that depend on max_vf:
> > (1) Setting loop->safelen
> > (2) Setting length of omp simd arrays
> > (3) Computing chunk_size for schedule clause (with simd modifier)
> >
> > We can't use result of an internal function for loop->safelen since
> it's a compile-time artefact.
>
> True.
>
> > And for array length, (at-least) sra pass seems to assume that
> TYPE_MIN/TYPE_MAX are INTEGER_CST.
> > From prepare_iteration_over_array_elts:
> >
> > tree minidx = TYPE_MIN_VALUE (TYPE_DOMAIN (type));
> > gcc_assert (TREE_CODE (minidx) == INTEGER_CST);
> > tree maxidx = TYPE_MAX_VALUE (TYPE_DOMAIN (type));
> > /* Skip (some) zero-length arrays; others have MAXIDX == MINIDX -
> 1. */
> > if (!maxidx)
> > return false;
> > gcc_assert (TREE_CODE (maxidx) == INTEGER_CST);
>
> The issue might be that local vars cannot be "VLA", instead they would
> be lowered to be allocated by alloca(). That means the assertion is
> technically correct.
>
> > The attached patch:
> > (a) Uses a placeholder value (INT_MAX) for max_vf which gets
> assigned
> > to loop->safelen and length of omp simd array if offloading is
> enabled.
>
> I think it's better to identify the loop that needs "max_vf lowering"
> with a new flag and set a conservative value to max_vf as it could be
> interpreted and used for invalid optimization otherwise.
The attached patch adds a new bitfield needs_max_vf_lowering to loop, and sets
that in expand_omp_simd for loops that need
delayed lowering of safelen and omp simd arrays. The patch defines a new macro
OMP_COMMON_MAX_VF (arbitrarily set to 16),
as a placeholder value for max_vf (instead of INT_MAX), and is later replaced
by appropriate max_vf during omp_adjust_max_vf pass.
Does that look OK ?
>
> For the SIMD array size I have no good suggestions - the uses are very
> constrained though, so I suspect any magic value that's not 0 or 1
> might work.
>
> > (b) For computing chunk_size, using INT_MAX resulted in chunk_size
> > being constant propagated (and harder to recover later), so I added
> a new internal function .GOMP_SIMD_MAX_VF, whose result is assigned to
> chunk_size.
>
> I guess that's OK.
>
> > (c) Adds a new pass pass_omp_adjust_max_vf, just before
> > omp_device_lower, to adjust above 3 artefacts to correctly adjust
> > max_vf, and fold away .GOMP_SIMD_MAX_VF (or should I piggy back it
> on
> > some other pass?)
>
> It does look related enough to pass_omp_device_lower, no?
Well, that's where I put it initially, but the pass has a stronger
condition for gating:
return (!(fun->curr_properties & PROP_gimple_lomp_dev)
|| (flag_openmp
&& (cgraph_node::get (fun->decl)->calls_declare_variant_alt
|| offload_ind_funcs_p)));
Which I am not sure will trigger for every offloaded function on both host and
device ?
I am gating the pass simply on offloading_function_p, since we need to lower
SIMD constructs on both host and device for every offloaded function
that contains SIMD loops.
>
> I hope Jakub can chime in a bit.
>
> > For the following contrived test:
> >
> > #include <stdlib.h>
> >
> > #define N 1000
> > int A[N];
> > int B[N];
> >
> > int main()
> > {
> > int i;
> > int sum = 0;
> >
> > #pragma omp target map(sum), map(A), map(B), map (i)
> > #pragma omp teams distribute parallel for simd reduction(+:sum)
> schedule(simd:static, 5)
> > for (i = 0; i < N; i++)
> > sum += A[i] * B[i];
> > return sum;
> > }
> >
> > With patch, omp expand dump shows length of omp simd arrays set to
> > INT_MAX with offloading
> > enabled:
> > int D.5382[2147483647];
> > int D.5378[2147483647];
> >
> > and following computation for chunk_size:
> > D.5353 = .GOMP_SIMD_MAX_VF ();
> > D.5354 = D.5353 + 4;
> > D.5355 = .GOMP_SIMD_MAX_VF ();
> > D.5356 = -D.5355;
> > D.5357 = D.5354 & D.5356;
> > D.5358 = D.5348 - D.5347;
> >
> > And after omp_adjust_max_vf pass, the dump shows correct max_vf
> > assigned to length of omp simd array and chunk_size on host side:
> >
> > int D.5382[0:POLY_INT_CST [15, 16]];
> > int D.5378[0:POLY_INT_CST [15, 16]];
> > ...
> > _38 = POLY_INT_CST [16, 16];
> > _39 = _38 + 4;
> > _40 = POLY_INT_CST [16, 16];
> > _41 = -_40;
> > _42 = _39 & _41;
> > _43 = _35 - _34;
> >
> > and would make similar adjustments for SIMD based devices.
> > For SIMT devices, the patch explicitly sets max_vf to 1, to fold
> > .GOMP_SIMD_MAX_VF and shrink omp simd array (altho I guess it
> doesn't
> > really matter since the simd code-path would be dead-code?)
> >
> > I had a couple of questions:
> >
> > (1) With patch, I am seeing a lot of errors -- "multiple dump files
> found"
> > For eg:
> > libgomp.c++/../libgomp.c-c++-common/target-is-initial-host-2.c:
> > multiple dump files found
> > UNRESOLVED:
> > libgomp.c++/../libgomp.c-c++-common/target-is-initial-host-2.c
> > scan-nvptx-none-offload-tree-dump-times optimized
> > "omp_is_initial_device" 1
> >
> > The error seems to come from scandump.exp:glob-dump-file:
> >
> > if { $num_files > 1 } {
> > verbose -log "$testcase: multiple dump files found"
> > }
> >
> > This seems to happen because the compiler is passed:
> > -fdump-tree-optimized -foffload-options=-fdump-tree-optimized, which
> results in two optimized dump files. I am not sure tho, why the patch
> specifically triggers this ?
>
> I think the general scan-tree-dump* do not work here, IIRC there was
> work to add offload and lto dump scan variants.
This went away after updating the sources and doing a clean build, I had
possibly screwed up my build dir.
>
> > (2) To check if we're in offload region in omp_adjust_chunk_size,
> the
> > patch defines a new function enclosing_target_region_p, and keeps
> > walking region->outer till it reaches GIMPLE_OMP_TARGET region (or
> > NULL). While it seems to work, I was wondering if there was a better
> > way to do this ? Another option would be to keep a static variable
> target_nesting_level, which is incremented/decremented before/after
> each call to omp_expand_target, similar to one in omp-lower.cc ? I
> tried using omp_maybe_offloaded from omp_adjust_chunk_size, but that
> didn't seem to work.
> >
> > Does the patch look in the right direction ?
>
> I think yes, but I lack the overall OMP lowering picture here (and
> still think we should eventually delay offload target lowering to the
> offload compile and only outline regions on the host)
Right, I initially thought the issue was about if-else lowering of SIMD vs
SIMT, but it's really about host details creeping earlier in offload IL during
omp simd lowering/expansion.
For instance, offloading from AArch64 host with SVE enabled to a SIMD-based
device will also result in the same issue above of mismatched degree of
POLY_INT_CST if device's NUM_POLY_INT_COEFFS < 2 ?
I suppose tho, we don't need to delay all the SIMD lowering constructs after
streaming, but only those parts that are target dependent to avoid
discrepancies in offload IL ?
I grepped thru omp-lower and omp-expand, and it seems to me, in addition to
max_vf, only the alignment clause uses autovectorize_vector_modes,
simd_preferred_mode
and related_mode hooks to determine the biggest alignment supported by the
target for vector types. The attached patch thus also adds a new internal
function for representing alignment,
which is lowered during omp_adjust_max_vf.
Passes libgomp testing for AArch64/nvptx offloading (with and without GPU).
Does the patch look OK ?
Thanks,
Prathamesh
>
> Richard.
>
> > Thanks,
> > Prathamesh
> > >
> > > Richard.
> > >
> > >
> > > > Thanks,
> > > > Prathamesh
> > > > >
> > > > > > Thanks,
> > > > > > Prathamesh
> > > > > > >
> > > > > > > Richard.
> > > > > > >
> > > > > > > > [1] Set sctx.max_vf to constant_lower_bound(omp_max_vf
> ())
> > > in
> > > > > > > > lower_rec_simd_input_clauses, if the function is going
> to
> > > > > > > > be offloaded and omp_max_vf returns non-constant
> poly_int.
> > > > > > > > For
> > > > > above
> > > > > > > > case, it sets
> > > > > > > max_vf to 16 instead of 16+16x which seems to resolve the
> > > issue,
> > > > > but
> > > > > > > it'd use suboptimal max VF for host ? This is done in
> patch
> > > > > > > p-283-
> > > > > 2.txt.
> > > > > > > >
> > > > > > > > However, with clean trunk it still seems to use max_vf =
> > > > > > > > 16
> > > > > after
> > > > > > > disabling the above error.
> > > > > > > > vect dump shows:
> > > > > > > >
> > > > > > > > (compute_affine_dependence
> > > > > > > > ref_a: (*_25)[i.0_51], stmt_a: _26 = (*_25)[i.0_51];
> > > > > > > > ref_b: (*_23)[i.0_51], stmt_b: (*_23)[i.0_51] = _27;
> > > > > > > > ) -> dependence analysis failed
> > > > > > > > foo.c:10:13: note: dependence distance = 0.
> > > > > > > > foo.c:10:13: note: dependence distance == 0 between
> > > > > (*_23)[i.0_51]
> > > > > > > and (*_23)[i.0_51]
> > > > > > > > foo.c:10:13: missed: bad data dependence.
> > > > > > > > foo.c:10:13: note: ***** Analysis failed with vector
> mode
> > > > > VNx4SI
> > > > > > > >
> > > > > > > > This seems to happen because, loop->safelen is set to 16
> > > > > > > > by
> > > > > taking
> > > > > > > > MIN(constant_lower_bound(16+16x), INT_MAX) in
> > > expand_omp_simd:
> > > > > > > >
> > > > > > > > if (!poly_int_tree_p (safelen, &val))
> > > > > > > > safelen_int = 0;
> > > > > > > > else
> > > > > > > > safelen_int = MIN (constant_lower_bound (val),
> > > > > > > > INT_MAX);
> > > > > > > >
> > > > > > > > and fails to vectorize with VLA vectors, because max_vf
> ==
> > > 16
> > > > > and
> > > > > > > min_vf == 4+4x resulting in bad data dependence due to:
> > > > > > > >
> > > > > > > > if (max_vf != MAX_VECTORIZATION_FACTOR
> > > > > > > > && maybe_lt (max_vf, min_vf))
> > > > > > > > return opt_result::failure_at (vect_location, "bad
> > > > > > > > data dependence.\n");
> > > > > > > >
> > > > > > > > If safelen was (somehow) set to 16+16x, I guess it could
> > > have
> > > > > used
> > > > > > > VF=4+4x and vectorized with VLA vectors.
> > > > > > > > but I suppose that's a separate issue ?
> > > > > > > >
> > > > > > > > [2] Since the issue seems to be only with streaming out
> > > length
> > > > > of
> > > > > > > > omp simd array when it's POLY_INT_CST, could we perhaps
> > > > > > > > use
> > > a
> > > > > > > > place holder length during omp lowering and compute the
> > > > > > > > correct length after streaming out, so POLY_INT_CST
> > > > > > > > doesn't get leaked into bytecode ? The
> > > > > > > attached patch p-283-3.txt follows this approach by using
> > > bogus
> > > > > > > length INT_MAX in lower_rec_simd_input_clauses if
> offloading
> > > to
> > > > > SIMT
> > > > > > > device and max_vf is non-constant poly_int, and later
> > > computing
> > > > > the
> > > > > > > correct length in beginning of vect pass by setting it to
> > > > > omp_max_vf
> > > > > > > (), but I am not sure if this is entirely correct.
> > > > > > > > I am assuming that creating omp simd array of bogus
> length
> > > > > > > > will not be an issue for nvptx since it will never get
> > > > > > > > referenced and eventually be removed by
> > > > > > > > remove_unused_locals
> > > ?
> > > > > > > > If it'd not be a good idea to
> > > > > > > rely on the pass pipeline to eliminate simd code-path and
> > > > > > > omp simd array while compiling to device, it could be
> > > > > > > possibly
> > > done
> > > > > > > during omp_lower_device pass itself ?
> > > > > > > >
> > > > > > > > [3] While streaming-in POLY_INT_CST, avoid emitting
> error
> > > > > > > > immediately if degree of POLY_INT_CST exceeds accel's
> > > > > > > > NUM_POLY_INT_COEFFS to ignore POLY_INT_CSTs that may
> > > > > > > > potentially occur on dead-code path, and instead mark it
> > > > > > > > as
> > > error_mark_node.
> > > > > > > > For the above case, since
> > > > > > > POLY_INT_CST appears on dead-code path, streaming
> > > > > > > POLY_INT_CST
> > > > > with
> > > > > > > higher degree than accel's NUM_POLY_INT_COEFFS would be
> > > > > "harmless".
> > > > > > > And detect invalid POLY_INT_CST's in expand pass (if it
> > > survives
> > > > > > > till this point), and emit above error, but not sure if
> > > > > > > that'd be the right place ?
> > > > > > > > This is done in p-283-4.txt.
> > > > > > > >
> > > > > > > > All the three patches fix UNRESOLVED tests due to
> > > POLY_INT_CST
> > > > > > > streaming error in libgomp testsuite with -
> mcpu=generic+sve2.
> > > > > > > > (Altho it introduces a strange FAIL for data-5.f90,
> which
> > > > > > > > I
> > > am
> > > > > > > investigating).
> > > > > > > > I would be grateful for suggestions on how to proceed.
> > > > > > > >
> > > > > > > > Signed-off-by: Prathamesh Kulkarni
> > > > > > > > <[email protected]>
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Prathamesh
> > > > > > > >
> > > > > > >
> > > > > > > --
> > > > > > > Richard Biener <[email protected]> SUSE Software Solutions
> > > > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
> Germany;
> > > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB
> 36809,
> > > AG
> > > > > > > Nuernberg)
> > > > > >
> > > > >
> > > > > --
> > > > > Richard Biener <[email protected]> SUSE Software Solutions
> > > > > Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809,
> AG
> > > > > Nuernberg)
> > > >
> > >
> > > --
> > > Richard Biener <[email protected]>
> > > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > > Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
Delay computing max_vf and alignment when offloading is enabled.
gcc/ChangeLog:
* cfgloop.h (loop): New member needs_max_vf_lowering.
* internal-fn.cc (expand_GOMP_SIMD_MAX_VF): New function.
(expand_GOMP_SIMD_ALIGN): Likewise.
* internal-fn.def (GOMP_SIMD_MAX_VF): New entry.
(GOMP_SIMD_ALIGN): Likewise.
* omp-expand.cc (enclosing_target_region_p): New function.
(omp_adjust_chunk_size): New parameter offload. If offload is true,
build call_expr for internal function GOMP_SIMD_MAX_VF.
(get_ws_args_for): New parameter offload, and pass it to
omp_adjust_chunk_size.
(determine_parallel_type): Call enclosing_target_region_p and pass its
result to get_ws_args_for.
(expand_omp_for_generic): Call enclosing_target_region_p and pass its
result to omp_adjust_chunk_size.
(expand_omp_for_static_chunk): Likewise.
(expand_omp_simd): Set loop->needs_max_vf_lowering to result of
enclosing_target_region_p.
* omp-general.cc (omp_max_vf): New parameter offload.
* omp-general.h (omp_max_vf): Adjust declaration.
* omp-low.cc (omp_clause_aligned_alignment): New parameter offload, and
move most of the function from ...
(build_omp_clause_aligned_alignment): ... to here.
(lower_rec_simd_input_clauses): Call omp_maybe_offloaded_ctx and pass
its result to omp_max_vf.
(lower_rec_input_clauses): Call omp_maybe_offloaded_ctx and pass its
result to omp_clause_aligned_alignment.
* omp-low.h (build_omp_clause_aligned_alignment): Declare.
* omp-offload.cc (class pass_omp_adjust_max_vf): Define new pass.
(make_pass_omp_adjust_max_vf): New function.
* passes.def: Add entry for pass_omp_adjust_max_vf.
* tree-pass.h (make_pass_omp_adjust_max_vf): Declare.
Signed-off-by: Prathamesh Kulkarni <[email protected]>
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 30b5e40d0d9..41a14b60f8d 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -233,6 +233,10 @@ public:
flag_finite_loops or similar pragmas state. */
unsigned finite_p : 1;
+ /* True if SIMD loop is offloaded, and needs lowering of artefacts
+ that are target-dependent. */
+ unsigned needs_max_vf_lowering: 1;
+
/* The number of times to unroll the loop. 0 means no information given,
just do what we always do. A value of 1 means do not unroll the loop.
A value of USHRT_MAX means unroll with no specific unrolling factor.
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index d89a04fe412..68088931a24 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -662,6 +662,22 @@ expand_GOMP_SIMD_ORDERED_END (internal_fn, gcall *)
gcc_unreachable ();
}
+/* This should get folded in omp_adjust_max_vf pass. */
+
+static void
+expand_GOMP_SIMD_MAX_VF (internal_fn, gcall *)
+{
+ gcc_unreachable ();
+}
+
+/* This should get folded in omp_adjust_max_vf pass. */
+
+static void
+expand_GOMP_SIMD_ALIGN (internal_fn, gcall *)
+{
+ gcc_unreachable ();
+}
+
/* This should get expanded in the sanopt pass. */
static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 23b4ab02b30..fef1903e599 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -469,6 +469,8 @@ DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF |
ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (GOMP_SIMD_ORDERED_START, ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (GOMP_SIMD_ORDERED_END, ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMD_MAX_VF, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (GOMP_SIMD_ALIGN, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (LOOP_VECTORIZED, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (LOOP_DIST_ALIAS, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (ANNOTATE, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
diff --git a/gcc/omp-expand.cc b/gcc/omp-expand.cc
index 9ff9553c3ea..c22bd251d97 100644
--- a/gcc/omp-expand.cc
+++ b/gcc/omp-expand.cc
@@ -126,6 +126,17 @@ is_combined_parallel (struct omp_region *region)
return region->is_combined_parallel;
}
+/* Return true if REGION is enclosed in omp target region. */
+
+static bool
+enclosing_target_region_p (struct omp_region *region)
+{
+ for (omp_region *r = region; r; r = r->outer)
+ if (r->type == GIMPLE_OMP_TARGET)
+ return true;
+ return false;
+}
+
/* Given two blocks PAR_ENTRY_BB and WS_ENTRY_BB such that WS_ENTRY_BB
is the immediate dominator of PAR_ENTRY_BB, return true if there
are no data dependencies that would prevent expanding the parallel
@@ -206,20 +217,30 @@ workshare_safe_to_combine_p (basic_block ws_entry_bb)
presence (SIMD_SCHEDULE). */
static tree
-omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
+omp_adjust_chunk_size (tree chunk_size, bool simd_schedule, bool offload)
{
if (!simd_schedule || integer_zerop (chunk_size))
return chunk_size;
- poly_uint64 vf = omp_max_vf ();
- if (known_eq (vf, 1U))
- return chunk_size;
-
tree type = TREE_TYPE (chunk_size);
+ tree max_vf;
+
+ if (offload)
+ max_vf = build_call_expr_internal_loc (input_location,
+ IFN_GOMP_SIMD_MAX_VF, type, 0);
+ else
+ {
+ poly_uint64 vf = omp_max_vf ();
+ if (known_eq (vf, 1U))
+ return chunk_size;
+ max_vf = build_int_cst (type, vf);
+ }
+
chunk_size = fold_build2 (PLUS_EXPR, type, chunk_size,
- build_int_cst (type, vf - 1));
+ fold_build2 (MINUS_EXPR, type,
+ max_vf, build_one_cst (type)));
return fold_build2 (BIT_AND_EXPR, type, chunk_size,
- build_int_cst (type, -vf));
+ fold_build1 (NEGATE_EXPR, type, max_vf));
}
/* Collect additional arguments needed to emit a combined
@@ -227,7 +248,7 @@ omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
expanded. */
static vec<tree, va_gc> *
-get_ws_args_for (gimple *par_stmt, gimple *ws_stmt)
+get_ws_args_for (gimple *par_stmt, gimple *ws_stmt, bool offload)
{
tree t;
location_t loc = gimple_location (ws_stmt);
@@ -269,7 +290,7 @@ get_ws_args_for (gimple *par_stmt, gimple *ws_stmt)
if (fd.chunk_size)
{
t = fold_convert_loc (loc, long_integer_type_node, fd.chunk_size);
- t = omp_adjust_chunk_size (t, fd.simd_schedule);
+ t = omp_adjust_chunk_size (t, fd.simd_schedule, offload);
ws_args->quick_push (t);
}
@@ -365,7 +386,8 @@ determine_parallel_type (struct omp_region *region)
region->is_combined_parallel = true;
region->inner->is_combined_parallel = true;
- region->ws_args = get_ws_args_for (par_stmt, ws_stmt);
+ region->ws_args = get_ws_args_for (par_stmt, ws_stmt,
+ enclosing_target_region_p (region));
}
}
@@ -4195,7 +4217,8 @@ expand_omp_for_generic (struct omp_region *region,
if (fd->chunk_size)
{
t = fold_convert (fd->iter_type, fd->chunk_size);
- t = omp_adjust_chunk_size (t, fd->simd_schedule);
+ t = omp_adjust_chunk_size (t, fd->simd_schedule,
+ enclosing_target_region_p (region));
if (sched_arg)
{
if (fd->ordered)
@@ -4239,7 +4262,8 @@ expand_omp_for_generic (struct omp_region *region,
{
tree bfn_decl = builtin_decl_explicit (start_fn);
t = fold_convert (fd->iter_type, fd->chunk_size);
- t = omp_adjust_chunk_size (t, fd->simd_schedule);
+ t = omp_adjust_chunk_size (t, fd->simd_schedule,
+ enclosing_target_region_p (region));
if (sched_arg)
t = build_call_expr (bfn_decl, 10, t5, t0, t1, t2, sched_arg,
t, t3, t4, reductions, mem);
@@ -5936,7 +5960,8 @@ expand_omp_for_static_chunk (struct omp_region *region,
step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
true, NULL_TREE, true, GSI_SAME_STMT);
tree chunk_size = fold_convert (itype, fd->chunk_size);
- chunk_size = omp_adjust_chunk_size (chunk_size, fd->simd_schedule);
+ chunk_size = omp_adjust_chunk_size (chunk_size, fd->simd_schedule,
+ enclosing_target_region_p (region));
chunk_size
= force_gimple_operand_gsi (&gsi, chunk_size, true, NULL_TREE, true,
GSI_SAME_STMT);
@@ -7136,6 +7161,8 @@ expand_omp_simd (struct omp_region *region, struct
omp_for_data *fd)
loop->latch = cont_bb;
add_loop (loop, l1_bb->loop_father);
loop->safelen = safelen_int;
+ loop->needs_max_vf_lowering = enclosing_target_region_p (region);
+
if (simduid)
{
loop->simduid = OMP_CLAUSE__SIMDUID__DECL (simduid);
diff --git a/gcc/omp-general.cc b/gcc/omp-general.cc
index f4c5f577047..819764542eb 100644
--- a/gcc/omp-general.cc
+++ b/gcc/omp-general.cc
@@ -989,7 +989,7 @@ find_combined_omp_for (tree *tp, int *walk_subtrees, void
*data)
/* Return maximum possible vectorization factor for the target. */
poly_uint64
-omp_max_vf (void)
+omp_max_vf (bool offload)
{
if (!optimize
|| optimize_debug
@@ -998,6 +998,13 @@ omp_max_vf (void)
&& OPTION_SET_P (flag_tree_loop_vectorize)))
return 1;
+ /* If offloading is enabled, just use a conservative placeholder
+ value for max_vf. The actual value will be set during
+ pass_omp_adjust_max_vf. */
+
+ if (offload)
+ return OMP_COMMON_MAX_VF;
+
auto_vector_modes modes;
targetm.vectorize.autovectorize_vector_modes (&modes, true);
if (!modes.is_empty ())
diff --git a/gcc/omp-general.h b/gcc/omp-general.h
index 891f467556e..04fa0b5ba41 100644
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -146,6 +146,15 @@ struct omp_for_data
#define OMP_TS_NAME(t) \
(omp_ts_map[OMP_TS_CODE (t)].name)
+/* FIXME: This is just a placeholder value for max_vf defined arbitrarily, used
+ for setting safelen, and length of omp simd arrays in omplower pass if
+   offloading is enabled.  The actual max_vf for the target will then be
+ computed later during omp_adjust_max_vf pass. The rationale for not using
+ a special value like 0, 1 or INT_MAX is to avoid incorrect transforms
+ happening due to special values. */
+
+#define OMP_COMMON_MAX_VF 16
+
extern tree make_trait_set_selector (enum omp_tss_code, tree, tree);
extern tree make_trait_selector (enum omp_ts_code, tree, tree, tree);
extern tree make_trait_property (tree, tree, tree);
@@ -162,7 +171,7 @@ extern void omp_extract_for_data (gomp_for *for_stmt,
struct omp_for_data *fd,
struct omp_for_data_loop *loops);
extern gimple *omp_build_barrier (tree lhs);
extern tree find_combined_omp_for (tree *, int *, void *);
-extern poly_uint64 omp_max_vf (void);
+extern poly_uint64 omp_max_vf (bool offload = false);
extern int omp_max_simt_vf (void);
extern const char *omp_context_name_list_prop (tree);
extern void omp_construct_traits_to_codes (tree, int, enum tree_code *);
diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc
index da2051b0279..b5d422aa8e0 100644
--- a/gcc/omp-low.cc
+++ b/gcc/omp-low.cc
@@ -4519,16 +4519,11 @@ omp_reduction_init (tree clause, tree type)
OMP_CLAUSE_REDUCTION_CODE (clause), type);
}
-/* Return alignment to be assumed for var in CLAUSE, which should be
- OMP_CLAUSE_ALIGNED. */
+/* Return implementation defined alignment. */
-static tree
-omp_clause_aligned_alignment (tree clause)
+tree
+build_omp_clause_aligned_alignment (void)
{
- if (OMP_CLAUSE_ALIGNED_ALIGNMENT (clause))
- return OMP_CLAUSE_ALIGNED_ALIGNMENT (clause);
-
- /* Otherwise return implementation defined alignment. */
unsigned int al = 1;
opt_scalar_mode mode_iter;
auto_vector_modes modes;
@@ -4561,6 +4556,21 @@ omp_clause_aligned_alignment (tree clause)
return build_int_cst (integer_type_node, al);
}
+/* Return alignment to be assumed for var in CLAUSE, which should be
+ OMP_CLAUSE_ALIGNED. */
+
+static tree
+omp_clause_aligned_alignment (tree clause, bool offload)
+{
+ if (OMP_CLAUSE_ALIGNED_ALIGNMENT (clause))
+ return OMP_CLAUSE_ALIGNED_ALIGNMENT (clause);
+
+ return (offload)
+ ? build_call_expr_internal_loc (input_location,
+ IFN_GOMP_SIMD_ALIGN,
+ integer_type_node, 0)
+ : build_omp_clause_aligned_alignment ();
+}
/* This structure is part of the interface between lower_rec_simd_input_clauses
and lower_rec_input_clauses. */
@@ -4588,7 +4598,9 @@ lower_rec_simd_input_clauses (tree new_var, omp_context
*ctx,
{
if (known_eq (sctx->max_vf, 0U))
{
- sctx->max_vf = sctx->is_simt ? omp_max_simt_vf () : omp_max_vf ();
+ bool offload = omp_maybe_offloaded_ctx (ctx);
+ sctx->max_vf = sctx->is_simt ? omp_max_simt_vf () : omp_max_vf (offload);
+
if (maybe_gt (sctx->max_vf, 1U))
{
tree c = omp_find_clause (gimple_omp_for_clauses (ctx->stmt),
@@ -5106,7 +5118,8 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist,
gimple_seq *dlist,
if (new_var == NULL_TREE)
new_var = maybe_lookup_decl_in_outer_ctx (var, ctx);
x = builtin_decl_explicit (BUILT_IN_ASSUME_ALIGNED);
- tree alarg = omp_clause_aligned_alignment (c);
+ bool offload = omp_maybe_offloaded_ctx (ctx);
+ tree alarg = omp_clause_aligned_alignment (c, offload);
alarg = fold_convert_loc (clause_loc, size_type_node, alarg);
x = build_call_expr_loc (clause_loc, x, 2, new_var, alarg);
x = fold_convert_loc (clause_loc, TREE_TYPE (new_var), x);
@@ -5121,7 +5134,8 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist,
gimple_seq *dlist,
t = maybe_lookup_decl_in_outer_ctx (var, ctx);
t = build_fold_addr_expr_loc (clause_loc, t);
t2 = builtin_decl_explicit (BUILT_IN_ASSUME_ALIGNED);
- tree alarg = omp_clause_aligned_alignment (c);
+ bool offload = omp_maybe_offloaded_ctx (ctx);
+ tree alarg = omp_clause_aligned_alignment (c, offload);
alarg = fold_convert_loc (clause_loc, size_type_node, alarg);
t = build_call_expr_loc (clause_loc, t2, 2, t, alarg);
t = fold_convert_loc (clause_loc, ptype, t);
diff --git a/gcc/omp-low.h b/gcc/omp-low.h
index 425dd448177..d9a36cb1d76 100644
--- a/gcc/omp-low.h
+++ b/gcc/omp-low.h
@@ -26,6 +26,7 @@ extern tree omp_member_access_dummy_var (tree);
extern tree omp_find_combined_for (gimple_stmt_iterator *gsi_p,
bool *handled_ops_p,
struct walk_stmt_info *wi);
+extern tree build_omp_clause_aligned_alignment (void);
#endif /* GCC_OMP_LOW_H */
diff --git a/gcc/omp-offload.cc b/gcc/omp-offload.cc
index 934fbd80bdd..47582711dfd 100644
--- a/gcc/omp-offload.cc
+++ b/gcc/omp-offload.cc
@@ -55,6 +55,7 @@ along with GCC; see the file COPYING3. If not see
#include "context.h"
#include "convert.h"
#include "opts.h"
+#include "omp-low.h"
/* Describe the OpenACC looping structure of a function. The entire
function is held in a 'NULL' loop. */
@@ -2944,3 +2945,122 @@ make_pass_omp_target_link (gcc::context *ctxt)
{
return new pass_omp_target_link (ctxt);
}
+
+namespace {
+
+const pass_data pass_data_omp_adjust_max_vf =
+{
+ GIMPLE_PASS, /* type */
+ "ompadjustmaxvf", /* name */
+ OPTGROUP_OMP, /* optinfo_flags */
+ TV_NONE, /* tv_id */
+ PROP_ssa, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_omp_adjust_max_vf : public gimple_opt_pass
+{
+public:
+ pass_omp_adjust_max_vf (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_omp_adjust_max_vf, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ bool gate (function *fun) final override
+ {
+ return offloading_function_p (fun->decl);
+ }
+
+ unsigned execute (function *fun) final override;
+};
+
+/* When offloading is enabled, we do not immediately compute
+   max_vf during omp_lower because it may differ between devices,
+   and instead delay lowering by using the placeholder value
+   OMP_COMMON_MAX_VF.  max_vf is used for three things:
+ (a) Setting loop->safelen.
+ (b) Setting length of omp simd arrays.
+ (c) Computing chunk size of schedule clause.
+ This pass assigns appropriate values to above three artefacts. */
+
+unsigned
+pass_omp_adjust_max_vf::execute (function *fun)
+{
+ if (!fun->has_simduid_loops)
+ return 0;
+
+  /* For SIMT targets, the simd code-path is dead code, so just
+     use a placeholder value 1 to fold .GOMP_SIMD_MAX_VF, and
+     shrink omp simd array length from OMP_COMMON_MAX_VF.  */
+ poly_uint64 max_vf = targetm.simt.vf ? 1 : omp_max_vf ();
+
+ /* Set correct safelen. */
+
+ for (auto loop: loops_list (fun, 0))
+ if (loop->needs_max_vf_lowering && loop->safelen == OMP_COMMON_MAX_VF)
+ loop->safelen = constant_lower_bound (max_vf);
+
+ /* Set correct length of omp simd arrays. */
+
+ for (auto decl: fun->local_decls)
+ if (TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE
+ && lookup_attribute ("omp simd array", DECL_ATTRIBUTES (decl)))
+ {
+ tree& max = TYPE_MAX_VALUE (TYPE_DOMAIN (TREE_TYPE (decl)));
+ if (TREE_CODE (max) == INTEGER_CST
+ && wi::eq_p (wi::to_widest (max), OMP_COMMON_MAX_VF - 1))
+ {
+ max = size_int (max_vf - 1);
+ relayout_decl (decl);
+ }
+ }
+
+ /* Replace call to .GOMP_SIMD_MAX_VF with max_vf.
+ The call is built when computing chunk size for schedule clause.
+ See omp_adjust_chunk_size.
+
+ Similarly, replace call to .GOMP_SIMD_ALIGN with alignment computed
+ using build_omp_clause_aligned_alignment. */
+
+ basic_block bb;
+ FOR_EACH_BB_FN (bb, fun)
+ for (auto gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gcall *call_stmt = dyn_cast<gcall *> (gsi_stmt (gsi));
+ if (!call_stmt || !gimple_call_internal_p (call_stmt))
+ continue;
+
+ tree rhs = NULL_TREE;
+ switch (gimple_call_internal_fn (call_stmt))
+ {
+ case IFN_GOMP_SIMD_MAX_VF:
+ rhs = build_int_cst (integer_type_node, max_vf);
+ break;
+ case IFN_GOMP_SIMD_ALIGN:
+ rhs = build_omp_clause_aligned_alignment ();
+ break;
+ default:
+ break;
+ }
+
+ if (rhs)
+ {
+ tree lhs = gimple_call_lhs (call_stmt);
+ gassign *new_stmt = gimple_build_assign (lhs, rhs);
+ gsi_replace (&gsi, new_stmt, true);
+ }
+ }
+
+ return 0;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_omp_adjust_max_vf (gcc::context *ctxt)
+{
+ return new pass_omp_adjust_max_vf (ctxt);
+}
diff --git a/gcc/passes.def b/gcc/passes.def
index 7d01227eed1..bbe6ac5d54f 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -190,6 +190,7 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_oacc_loop_designation);
NEXT_PASS (pass_omp_oacc_neuter_broadcast);
NEXT_PASS (pass_oacc_device_lower);
+ NEXT_PASS (pass_omp_adjust_max_vf);
NEXT_PASS (pass_omp_device_lower);
NEXT_PASS (pass_omp_target_link);
NEXT_PASS (pass_adjust_alignment);
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index a928cbe4557..d4082bf8bf2 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -435,6 +435,7 @@ extern gimple_opt_pass *make_pass_diagnose_omp_blocks
(gcc::context *ctxt);
extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_omp_adjust_max_vf (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_oacc_loop_designation (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_omp_oacc_neuter_broadcast (gcc::context
*ctxt);
extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt);