https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85486

            Bug ID: 85486
           Summary: [og7, nvptx] ref-1.C fails with vector length 128
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vries at gcc dot gnu.org
  Target Milestone: ---

Minimal example:
...
/* { dg-do run } */

extern "C" int printf (char const *, ...);

/* OpenACC vector routine: adds the value referenced by INC to each of the
   N ints starting at PTR.  The noinline/noclone attributes keep it as a
   separate out-of-line function instead of letting it be inlined or cloned
   into the caller.  */
#pragma acc routine vector
void __attribute__((noinline, noclone))
Vector (int *ptr, int n, const int &inc)
{
  /* The iterations of this loop are partitioned across the vector
     dimension.  */
  #pragma acc loop vector
  for (unsigned ix = 0; ix < n; ix++)
    ptr[ix] += inc;
}

/* Test driver: fills a 32x32 array with a per-element pattern, calls Vector
   on the whole array from an OpenACC parallel region that requests
   vector_length(128), then checks every element after the region.
   Returns 1 if any element is wrong, 0 otherwise.  */
int
main (void)
{
  const int n = 32, m=32;

  int ary[m][n];
  unsigned ix,  iy;

  /* Initial value of each element encodes its coordinates:
     (1 << 16) + (ix << 8) + iy.  */
  for (ix = m; ix--;)
    for (iy = n; iy--;)
      ary[ix][iy] = (1 << 16) + (ix << 8) + iy;

  int err = 0;

  /* Treat the array as a flat run of m * n ints and add
     (1 << 24) - (1 << 16) to each one, which should turn the initial
     (1 << 16) term into (1 << 24).  */
#pragma acc parallel copy(ary) vector_length(128)
  {
    Vector (&ary[0][0], m * n, (1<<24) - (1<<16));
  }

  /* Every element must now equal (1 << 24) + (ix << 8) + iy; report and
     count each mismatch.  */
  for (ix = m; ix--;)
    for (iy = n; iy--;)
      if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
        {
          printf ("ary[%u][%u] = %x expected(II) %x\n",
                  ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
          err++;
        }

  /* A non-zero exit status signals failure of the run test.  */
  if (err)
    {
      printf ("%d failed\n", err);
      return 1;
     }

  return 0;
}
...

The problem is that we launch the region with vector length 128:
...
//:FUNC_MAP "main$_omp_fn$0", 0x1, 0x1, 0x80                                    
...
but the Vector routine is generated assuming a vector length of 32:
...
// PTX for Vector (int *, int, int const &), per the mangled name.
// %in_ar0 is used below as the array base address, %in_ar1 as the element
// count and %in_ar2 as the address the addend is loaded from.
.visible .func _Z6VectorPiiRKi (.param.u64 %in_ar0, .param.u32 %in_ar1,
.param.u64 %in_ar2)
{
        // Load the three incoming parameters into %ar0..%ar2.
        .reg.u64 %ar0;
        ld.param.u64 %ar0, [%in_ar0];
        .reg.u32 %ar1;
        ld.param.u32 %ar1, [%in_ar1];
        .reg.u64 %ar2;
        ld.param.u64 %ar2, [%in_ar2];
        .reg.u64 %r66;
        .reg.u64 %r67;
        .reg.u32 %r68;
        .reg.u64 %r75;
        .reg.u64 %r76;
        .reg.u32 %r77;
        .reg.u64 %r78;
        .reg.pred %r79;
        .reg.u64 %r80;
        .reg.u32 %r81;
        .reg.u32 %r82;
        .reg.u32 %r83;
        .reg.u64 %r84;
        .reg.u64 %r85;
        .reg.u64 %r86;
        .reg.u64 %r87;
        .reg.u64 %r88;
        .reg.u32 %r89;
        .reg.u32 %r90;
        .reg.u32 %r91;
        .reg.pred %r92;
        .reg.u32 %r93;
        .reg.u32 %r94;
        .reg.u32 %r95;
        .reg.u32 %r96;
        .reg.pred %r97;
        // %r97 = (%tid.x != 0): true for every thread except the one with
        // %tid.x == 0.
        {
                .reg.u32        %x;
                mov.u32 %x, %tid.x;
                setp.ne.u32     %r97, %x, 0;
        }
        // Only the thread with %tid.x == 0 copies the arguments into
        // %r76 (base address), %r77 (count) and %r78 (addend address);
        // all other threads branch straight to $L5.
        @%r97   bra     $L5;
                mov.u64 %r76, %ar0;
                mov.u32 %r77, %ar1;
                mov.u64 %r78, %ar2;
        // fork 4;
$L5:
        // forked 4;
                // Broadcast %r76, %r77 and %r78 from lane 0 to the other lanes.
                // shfl moves 32 bits at a time, so each 64-bit value is split
                // into two halves, shuffled, and reassembled.
                // NOTE(review): shfl exchanges data within a single warp only,
                // so with more than 32 threads in x, lane 0 of the later warps
                // presumably never executed the moves above and broadcasts
                // uninitialized registers -- confirm this is the failure mode.
                mov.b64 {%r93,%r94}, %r76;
                shfl.idx.b32    %r93, %r93, 0, 31;
                shfl.idx.b32    %r94, %r94, 0, 31;
                mov.b64 %r76, {%r93,%r94};
                shfl.idx.b32    %r77, %r77, 0, 31;
                mov.b64 {%r95,%r96}, %r78;
                shfl.idx.b32    %r95, %r95, 0, 31;
                shfl.idx.b32    %r96, %r96, 0, 31;
                mov.b64 %r78, {%r95,%r96};
                // %r68 = this thread's x index; skip the loop entirely when
                // count <= %tid.x (signed compare), i.e. no element to process.
                mov.u32 %r68, %tid.x;
                setp.le.s32     %r79, %r77, %r68;
        @%r79   bra     $L2;
                // %r67 = base + 4 * %tid.x: address of this thread's first
                // element.
                cvt.s64.s32     %r66, %r68;
                shl.b64 %r80, %r66, 2;
                add.u64 %r67, %r76, %r80;
                // %r75 = loop end address:
                //   base + 4 * (32 * ((count - 1 - %tid.x) >> 5) + %tid.x) + 128
                // i.e. one 128-byte step past the last element this thread
                // updates.  The shifts by 5 and the 128-byte step hard-code a
                // stride of 32 elements between a thread's iterations.
                add.u32 %r81, %r77, -1;
                sub.u32 %r82, %r81, %r68;
                shr.u32 %r83, %r82, 5;
                cvt.u64.u32     %r84, %r83;
                shl.b64 %r85, %r84, 5;
                add.u64 %r86, %r85, %r66;
                shl.b64 %r87, %r86, 2;
                add.u64 %r88, %r76, 128;
                add.u64 %r75, %r87, %r88;
$L3:
                // Loop body: load the element and the addend, store their sum
                // back, then advance by 128 bytes (32 four-byte ints) until the
                // end address is reached.
                ld.u32  %r90, [%r67];
                ld.u32  %r91, [%r78];
                add.u32 %r89, %r90, %r91;
                st.u32  [%r67], %r89;
                add.u64 %r67, %r67, 128;
                setp.ne.u64     %r92, %r67, %r75;
        @%r92   bra     $L3;
$L2:
        // joining 4;
        // join 4;
        ret;
}
...

Reply via email to