https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83589
--- Comment #2 from Tom de Vries <vries at gcc dot gnu.org> --- I've minimized mode-transitions.c to: ... #define n 32 int main (void) { int arr_a[n]; #pragma acc parallel copyout(arr_a) num_gangs(1) num_workers(1) vector_length(32) { #pragma acc loop vector for (int m = 0; m < 32; m++) ; #pragma acc loop vector for (int m = 0; m < 32; m++) arr_a[m] = 0; } } ... and the PTX to: ... .version 3.1 .target sm_30 .address_size 64 .entry main$_omp_fn$0 (.param .u64 %in_ar0); .entry main$_omp_fn$0 (.param .u64 %in_ar0) { .reg .u64 %ar0; ld.param.u64 %ar0,[%in_ar0]; .reg .pred %r36; { .reg .u32 %x; mov.u32 %x,%tid.x; setp.ne.u32 %r36,%x,0; } .reg .u64 %r26; mov.u64 %r26,%ar0; @ %r36 bra $L5; $L5: { .reg .u32 %r32; .reg .u32 %r33; mov.b64 {%r32,%r33},%r26; shfl.idx.b32 %r32,%r32,0,31; shfl.idx.b32 %r33,%r33,0,31; mov.b64 %r26,{%r32,%r33}; } ld.u64 %r26,[%r26]; @ %r36 bra $L6; st.u32 [%r26],0; $L6: ret; } ... Either removing: - the broadcast bit, which is an identity operation, or - the redundant branch to $L5 makes the test pass. This looks like another NVIDIA driver problem (with driver version 384.111).