https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85486
Bug ID: 85486
Summary: [og7, nvptx] ref-1.C fails with vector length 128
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: vries at gcc dot gnu.org
Target Milestone: ---
Minimal example:
...
/* { dg-do run } */
extern "C" int printf (char const *, ...);
#pragma acc routine vector
void __attribute__((noinline, noclone))
Vector (int *ptr, int n, const int &inc)
{
#pragma acc loop vector
for (unsigned ix = 0; ix < n; ix++)
ptr[ix] += inc;
}
int
main (void)
{
const int n = 32, m=32;
int ary[m][n];
unsigned ix, iy;
for (ix = m; ix--;)
for (iy = n; iy--;)
ary[ix][iy] = (1 << 16) + (ix << 8) + iy;
int err = 0;
#pragma acc parallel copy(ary) vector_length(128)
{
Vector (&ary[0][0], m * n, (1<<24) - (1<<16));
}
for (ix = m; ix--;)
for (iy = n; iy--;)
if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
{
printf ("ary[%u][%u] = %x expected(II) %x\n",
ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
err++;
}
if (err)
{
printf ("%d failed\n", err);
return 1;
}
return 0;
}
...
The problem is that we launch the region with vector length 128:
...
//:FUNC_MAP "main$_omp_fn$0", 0x1, 0x1, 0x80
...
but the Vector routine is generate assuming vector length 32:
...
.visible .func _Z6VectorPiiRKi (.param.u64 %in_ar0, .param.u32 %in_ar1,
.param.u64 %in_ar2)
{
.reg.u64 %ar0;
ld.param.u64 %ar0, [%in_ar0];
.reg.u32 %ar1;
ld.param.u32 %ar1, [%in_ar1];
.reg.u64 %ar2;
ld.param.u64 %ar2, [%in_ar2];
.reg.u64 %r66;
.reg.u64 %r67;
.reg.u32 %r68;
.reg.u64 %r75;
.reg.u64 %r76;
.reg.u32 %r77;
.reg.u64 %r78;
.reg.pred %r79;
.reg.u64 %r80;
.reg.u32 %r81;
.reg.u32 %r82;
.reg.u32 %r83;
.reg.u64 %r84;
.reg.u64 %r85;
.reg.u64 %r86;
.reg.u64 %r87;
.reg.u64 %r88;
.reg.u32 %r89;
.reg.u32 %r90;
.reg.u32 %r91;
.reg.pred %r92;
.reg.u32 %r93;
.reg.u32 %r94;
.reg.u32 %r95;
.reg.u32 %r96;
.reg.pred %r97;
{
.reg.u32 %x;
mov.u32 %x, %tid.x;
setp.ne.u32 %r97, %x, 0;
}
@%r97 bra $L5;
mov.u64 %r76, %ar0;
mov.u32 %r77, %ar1;
mov.u64 %r78, %ar2;
// fork 4;
$L5:
// forked 4;
mov.b64 {%r93,%r94}, %r76;
shfl.idx.b32 %r93, %r93, 0, 31;
shfl.idx.b32 %r94, %r94, 0, 31;
mov.b64 %r76, {%r93,%r94};
shfl.idx.b32 %r77, %r77, 0, 31;
mov.b64 {%r95,%r96}, %r78;
shfl.idx.b32 %r95, %r95, 0, 31;
shfl.idx.b32 %r96, %r96, 0, 31;
mov.b64 %r78, {%r95,%r96};
mov.u32 %r68, %tid.x;
setp.le.s32 %r79, %r77, %r68;
@%r79 bra $L2;
cvt.s64.s32 %r66, %r68;
shl.b64 %r80, %r66, 2;
add.u64 %r67, %r76, %r80;
add.u32 %r81, %r77, -1;
sub.u32 %r82, %r81, %r68;
shr.u32 %r83, %r82, 5;
cvt.u64.u32 %r84, %r83;
shl.b64 %r85, %r84, 5;
add.u64 %r86, %r85, %r66;
shl.b64 %r87, %r86, 2;
add.u64 %r88, %r76, 128;
add.u64 %r75, %r87, %r88;
$L3:
ld.u32 %r90, [%r67];
ld.u32 %r91, [%r78];
add.u32 %r89, %r90, %r91;
st.u32 [%r67], %r89;
add.u64 %r67, %r67, 128;
setp.ne.u64 %r92, %r67, %r75;
@%r92 bra $L3;
$L2:
// joining 4;
// join 4;
ret;
}
...