https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85486
Bug ID: 85486 Summary: [og7, nvptx] ref-1.C fails with vector length 128 Product: gcc Version: unknown Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: vries at gcc dot gnu.org Target Milestone: --- Minimal example: ... /* { dg-do run } */ extern "C" int printf (char const *, ...); #pragma acc routine vector void __attribute__((noinline, noclone)) Vector (int *ptr, int n, const int &inc) { #pragma acc loop vector for (unsigned ix = 0; ix < n; ix++) ptr[ix] += inc; } int main (void) { const int n = 32, m=32; int ary[m][n]; unsigned ix, iy; for (ix = m; ix--;) for (iy = n; iy--;) ary[ix][iy] = (1 << 16) + (ix << 8) + iy; int err = 0; #pragma acc parallel copy(ary) vector_length(128) { Vector (&ary[0][0], m * n, (1<<24) - (1<<16)); } for (ix = m; ix--;) for (iy = n; iy--;) if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy)) { printf ("ary[%u][%u] = %x expected(II) %x\n", ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy)); err++; } if (err) { printf ("%d failed\n", err); return 1; } return 0; } ... The problem is that we launch the region with vector length 128: ... //:FUNC_MAP "main$_omp_fn$0", 0x1, 0x1, 0x80 ... but the Vector routine is generate assuming vector length 32: ... .visible .func _Z6VectorPiiRKi (.param.u64 %in_ar0, .param.u32 %in_ar1, .param.u64 %in_ar2) { .reg.u64 %ar0; ld.param.u64 %ar0, [%in_ar0]; .reg.u32 %ar1; ld.param.u32 %ar1, [%in_ar1]; .reg.u64 %ar2; ld.param.u64 %ar2, [%in_ar2]; .reg.u64 %r66; .reg.u64 %r67; .reg.u32 %r68; .reg.u64 %r75; .reg.u64 %r76; .reg.u32 %r77; .reg.u64 %r78; .reg.pred %r79; .reg.u64 %r80; .reg.u32 %r81; .reg.u32 %r82; .reg.u32 %r83; .reg.u64 %r84; .reg.u64 %r85; .reg.u64 %r86; .reg.u64 %r87; .reg.u64 %r88; .reg.u32 %r89; .reg.u32 %r90; .reg.u32 %r91; .reg.pred %r92; .reg.u32 %r93; .reg.u32 %r94; .reg.u32 %r95; .reg.u32 %r96; .reg.pred %r97; { .reg.u32 %x; mov.u32 %x, %tid.x; setp.ne.u32 %r97, %x, 0; } @%r97 bra $L5; mov.u64 %r76, %ar0; mov.u32 %r77, %ar1; mov.u64 %r78, %ar2; // fork 4; $L5: // forked 4; mov.b64 {%r93,%r94}, %r76; shfl.idx.b32 %r93, %r93, 0, 31; shfl.idx.b32 %r94, %r94, 0, 31; mov.b64 %r76, {%r93,%r94}; shfl.idx.b32 %r77, %r77, 0, 31; mov.b64 {%r95,%r96}, %r78; shfl.idx.b32 %r95, %r95, 0, 31; shfl.idx.b32 %r96, %r96, 0, 31; mov.b64 %r78, {%r95,%r96}; mov.u32 %r68, %tid.x; setp.le.s32 %r79, %r77, %r68; @%r79 bra $L2; cvt.s64.s32 %r66, %r68; shl.b64 %r80, %r66, 2; add.u64 %r67, %r76, %r80; add.u32 %r81, %r77, -1; sub.u32 %r82, %r81, %r68; shr.u32 %r83, %r82, 5; cvt.u64.u32 %r84, %r83; shl.b64 %r85, %r84, 5; add.u64 %r86, %r85, %r66; shl.b64 %r87, %r86, 2; add.u64 %r88, %r76, 128; add.u64 %r75, %r87, %r88; $L3: ld.u32 %r90, [%r67]; ld.u32 %r91, [%r78]; add.u32 %r89, %r90, %r91; st.u32 [%r67], %r89; add.u64 %r67, %r67, 128; setp.ne.u64 %r92, %r67, %r75; @%r92 bra $L3; $L2: // joining 4; // join 4; ret; } ...