https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952
--- Comment #5 from Tom de Vries <vries at gcc dot gnu.org> ---
For stage4, however, we want a fix without fixing the optimization issue
PR84025, so we have:
...
$ git log --pretty=%s --reverse HEAD^^..HEAD | cat -n
     1  Fix bar.sync position
     2  Verify bar.sync position
...
which results in:
...
// BEGIN PREAMBLE
        .version 3.1
        .target sm_30
        .address_size 64
// END PREAMBLE

// BEGIN FUNCTION DECL: main$_omp_fn$0
.entry main$_omp_fn$0 (.param .u64 %in_ar0);

//:FUNC_MAP "main$_omp_fn$0", 0x1, 0x20, 0x20

// BEGIN VAR DEF: __worker_bcast
.shared .align 8 .u8 __worker_bcast[8];

// BEGIN FUNCTION DEF: main$_omp_fn$0
.entry main$_omp_fn$0 (.param .u64 %in_ar0)
{
        .reg .u64 %ar0;
        ld.param.u64 %ar0,[%in_ar0];
        .reg .u32 %r24;
        .reg .u64 %r25;
        .reg .pred %r26;
        .reg .u64 %r27;
        .reg .u64 %r28;
        .reg .u64 %r29;
        .reg .u64 %r30;
        .reg .u64 %r31;
        .reg .u64 %r32;
        .reg .pred %r33;
        .reg .pred %r34;
        {
                .reg .u32 %y;
                mov.u32 %y,%tid.y;
                setp.ne.u32 %r34,%y,0;
        }
        {
                .reg .u32 %x;
                mov.u32 %x,%tid.x;
                setp.ne.u32 %r33,%x,0;
        }
        @ %r34 bra.uni $L6;
        @ %r33 bra $L7;
                mov.u64 %r25,%ar0;
        // fork 2;
                cvta.shared.u64 %r32,__worker_bcast;
                st.u64 [%r32],%r25;
$L7:
$L6:
        bar.sync 0;
        @ %r33 bra $L5;
        // forked 2;
                cvta.shared.u64 %r31,__worker_bcast;
                ld.u64 %r25,[%r31];
                mov.u32 %r24,%tid.y;
                setp.le.s32 %r26,%r24,9;
        @ %r26 bra $L2;
                bra $L3;
$L2:
                ld.u64 %r27,[%r25];
                cvt.s64.s32 %r28,%r24;
                shl.b64 %r29,%r28,2;
                add.u64 %r30,%r27,%r29;
                st.u32 [%r30],%r24;
$L3:
        // joining 2;
$L5:
        bar.sync 1;
        @ %r34 bra.uni $L8;
        @ %r33 bra $L9;
        // join 2;
$L9:
$L8:
        ret;
}
...