https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85445

--- Comment #2 from Tom de Vries <vries at gcc dot gnu.org> ---
(In reply to Tom de Vries from comment #1)
> The initialization of the stack is done in thread W0V0, but the stack is
> read in  WAVA mode, so it's reading uninitialized stack memory in all but
> the W0V0 thread.

r239736 fixes this by propagating the stack frame from W0V0 to WAVA:
...
$ diff -u bad.s ok.s 
--- bad.s       2018-04-20 09:36:51.122581511 +0200
+++ ok.s        2018-04-20 09:53:01.971786471 +0200
@@ -344,28 +344,66 @@
        .reg.u32 %r25;
        .reg.u64 %r26;
        .reg.u32 %r28;
-       .reg.pred %r30;
-       .reg.pred %r31;
+       .reg.u64 %r30;
+       .reg.u64 %r31;
+       .reg.u64 %r32;
+       .reg.u32 %r33;
+       .reg.pred %r34;
+       .reg.u64 %r35;
+       .reg.u64 %r36;
+       .reg.u64 %r37;
+       .reg.u64 %r38;
+       .reg.u32 %r39;
+       .reg.pred %r40;
+       .reg.u64 %r41;
+       .reg.pred %r42;
+       .reg.pred %r43;
        {
                .reg.u32        %y;
                mov.u32 %y, %tid.y;
-               setp.ne.u32     %r30, %y, 0;
+               setp.ne.u32     %r42, %y, 0;
        }
        {
                .reg.u32        %x;
                mov.u32 %x, %tid.x;
-               setp.ne.u32     %r31, %x, 0;
+               setp.ne.u32     %r43, %x, 0;
        }
-       @%r30   bra.uni $L30;
-       @%r31   bra     $L31;
+       @%r42   bra.uni $L32;
+       @%r43   bra     $L33;
                mov.u64 %r24, %ar0;
                mov.u32 %r25, 65536;
                st.u32  [%frame], %r25;
                mov.u32 %r28, 32;
                ld.u64  %r26, [%r24];
+       // fork 14;
+               cvta.shared.u64 %r36, __worker_bcast;
+               mov.u64 %r38, %frame;
+               mov.u32 %r39, 2;
+               add.u64 %r41, %r36, 0;
 $L31:
-$L30:
+               add.u32 %r39, %r39, -1;
+               ld.u64  %r37, [%r38];
+               st.u64  [%r41], %r37;
+               add.u64 %r41, %r41, 8;
+               setp.ne.u32     %r40, %r39, 0;
+               add.u64 %r38, %r38, 8;
+       @%r40   bra.uni $L31;
+$L33:
+$L32:
+               bar.sync        0;
        // forked 14;
+               cvta.shared.u64 %r30, __worker_bcast;
+               mov.u64 %r32, %frame;
+               mov.u32 %r33, 2;
+               add.u64 %r35, %r30, 0;
+$L30:
+               add.u32 %r33, %r33, -1;
+               ld.u64  %r31, [%r35];
+               add.u64 %r35, %r35, 8;
+               st.u64  [%r32], %r31;
+               setp.ne.u32     %r34, %r33, 0;
+               add.u64 %r32, %r32, 8;
+       @%r34   bra.uni $L30;
        {
                .param.u64 %out_arg1;
                st.param.u64 [%out_arg1], %r26;
@@ -377,6 +415,8 @@
                st.param.u64 [%out_arg4], %frame;
                call _Z6WorkerPiiiRKi, (%out_arg1, %out_arg2, %out_arg3,
%out_arg4);
        }
+       // joining 14;
+               bar.sync        1;
        // join 14;
        ret;
 }
...

Reply via email to