https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85445
--- Comment #2 from Tom de Vries <vries at gcc dot gnu.org> --- (In reply to Tom de Vries from comment #1) > The initialization of the stack is done in thread W0V0, but the stack is > read in WAVA mode, so it's reading uninitialized stack memory in all but > the W0V0 thread. r239736 fixes this by propagating the stack frame from W0V0 to WAVO: ... $ diff -u bad.s ok.s --- bad.s 2018-04-20 09:36:51.122581511 +0200 +++ ok.s 2018-04-20 09:53:01.971786471 +0200 @@ -344,28 +344,66 @@ .reg.u32 %r25; .reg.u64 %r26; .reg.u32 %r28; - .reg.pred %r30; - .reg.pred %r31; + .reg.u64 %r30; + .reg.u64 %r31; + .reg.u64 %r32; + .reg.u32 %r33; + .reg.pred %r34; + .reg.u64 %r35; + .reg.u64 %r36; + .reg.u64 %r37; + .reg.u64 %r38; + .reg.u32 %r39; + .reg.pred %r40; + .reg.u64 %r41; + .reg.pred %r42; + .reg.pred %r43; { .reg.u32 %y; mov.u32 %y, %tid.y; - setp.ne.u32 %r30, %y, 0; + setp.ne.u32 %r42, %y, 0; } { .reg.u32 %x; mov.u32 %x, %tid.x; - setp.ne.u32 %r31, %x, 0; + setp.ne.u32 %r43, %x, 0; } - @%r30 bra.uni $L30; - @%r31 bra $L31; + @%r42 bra.uni $L32; + @%r43 bra $L33; mov.u64 %r24, %ar0; mov.u32 %r25, 65536; st.u32 [%frame], %r25; mov.u32 %r28, 32; ld.u64 %r26, [%r24]; + // fork 14; + cvta.shared.u64 %r36, __worker_bcast; + mov.u64 %r38, %frame; + mov.u32 %r39, 2; + add.u64 %r41, %r36, 0; $L31: -$L30: + add.u32 %r39, %r39, -1; + ld.u64 %r37, [%r38]; + st.u64 [%r41], %r37; + add.u64 %r41, %r41, 8; + setp.ne.u32 %r40, %r39, 0; + add.u64 %r38, %r38, 8; + @%r40 bra.uni $L31; +$L33: +$L32: + bar.sync 0; // forked 14; + cvta.shared.u64 %r30, __worker_bcast; + mov.u64 %r32, %frame; + mov.u32 %r33, 2; + add.u64 %r35, %r30, 0; +$L30: + add.u32 %r33, %r33, -1; + ld.u64 %r31, [%r35]; + add.u64 %r35, %r35, 8; + st.u64 [%r32], %r31; + setp.ne.u32 %r34, %r33, 0; + add.u64 %r32, %r32, 8; + @%r34 bra.uni $L30; { .param.u64 %out_arg1; st.param.u64 [%out_arg1], %r26; @@ -377,6 +415,8 @@ st.param.u64 [%out_arg4], %frame; call _Z6WorkerPiiiRKi, (%out_arg1, %out_arg2, %out_arg3, %out_arg4); } + // 
joining 14; + bar.sync 1; // join 14; ret; } ...