[ was: Re: [og7] Update nvptx_fork/join barrier placement ]
On 03/19/2018 06:02 PM, Tom de Vries wrote:
I've got a tentative patch at
https://gcc.gnu.org/bugzilla/attachment.cgi?id=43707 ( PR84952 -
"[nvptx] bar.sync generated in divergent code" ).
Tested on x86_64 with nvptx accelerator (in combination with a patch
that verifies the positioning of bar.sync).
Committed to stage4 trunk.
[ Recap:
Consider testcase workers.c:
...
int
main (void)
{
int a[10];
#pragma acc parallel loop worker
for (int i = 0; i < 10; i++)
a[i] = i;
return 0;
}
...
At -O2, we generate (edited for readability):
...
// BEGIN PREAMBLE
.version 3.1
.target sm_30
.address_size 64
// END PREAMBLE
// BEGIN FUNCTION DECL: main$_omp_fn$0
.entry main$_omp_fn$0 (.param .u64 %in_ar0);
//:FUNC_MAP "main$_omp_fn$0", 0x1, 0x20, 0x20
// BEGIN VAR DEF: __worker_bcast
.shared .align 8 .u8 __worker_bcast[8];
// BEGIN FUNCTION DEF: main$_omp_fn$0
.entry main$_omp_fn$0 (.param .u64 %in_ar0)
{
.reg .u64 %ar0;
ld.param.u64 %ar0,[%in_ar0];
.reg .u32 %r24;
.reg .u64 %r25;
.reg .pred %r26;
.reg .u64 %r27;
.reg .u64 %r28;
.reg .u64 %r29;
.reg .u64 %r30;
.reg .u64 %r31;
.reg .u64 %r32;
.reg .pred %r33;
.reg .pred %r34;
{
.reg .u32 %y;
mov.u32 %y,%tid.y;
setp.ne.u32 %r34,%y,0;
}
{
.reg .u32 %x;
mov.u32 %x,%tid.x;
setp.ne.u32 %r33,%x,0;
}
@ %r34 bra.uni $L6;
@ %r33 bra $L7;
mov.u64 %r25,%ar0;
// fork 2;
cvta.shared.u64 %r32,__worker_bcast;
st.u64 [%r32],%r25;
$L7:
$L6:
@ %r33 bra $L5;
// forked 2;
bar.sync 0;
cvta.shared.u64 %r31,__worker_bcast;
ld.u64 %r25,[%r31];
mov.u32 %r24,%tid.y;
setp.le.s32 %r26,%r24,9;
@ %r26 bra $L2;
bra $L3;
$L2:
ld.u64 %r27,[%r25];
cvt.s64.s32 %r28,%r24;
shl.b64 %r29,%r28,2;
add.u64 %r30,%r27,%r29;
st.u32 [%r30],%r24;
$L3:
bar.sync 1;
// joining 2;
$L5:
@ %r34 bra.uni $L8;
@ %r33 bra $L9;
// join 2;
$L9:
$L8:
ret;
}
...
The problem is the positioning of the bar.sync insns: both end up inside
the vector-neutering branch "@ %r33 bra $L5", that is, in code that is
executed by only one thread of each warp.
The documentation for bar.sync says:
...
Barriers are executed on a per-warp basis as if all the threads in a
warp are active. Thus, if any thread in a warp executes a bar
instruction, it is as if all the threads in the warp have executed the
bar instruction. All threads in the warp are stalled until the barrier
completes, and the arrival count for the barrier is incremented by the
warp size (not the number of active threads in the warp). In
conditionally executed code, a bar instruction should only be used if it
is known that all threads evaluate the condition identically (the warp
does not diverge).
...
The documentation is somewhat contradictory, in that it first explains
that bar.sync is executed on a per-warp basis (implying that only one
thread in a warp executing it should be fine), but then goes on to state
that it should not be executed in divergent code (implying that all
threads in the warp should execute it).
Either way, the safest form of usage is: don't execute in divergent mode.
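For comparison, in CUDA source terms the unsafe pattern corresponds
roughly to calling __syncthreads (which is implemented with bar.sync 0)
under a warp-divergent condition. A minimal illustrative sketch (the
kernel name is made up; this is not the code generated for the testcase):
...
__global__ void
divergent_barrier (int *a)
{
  if (threadIdx.x == 0)
    /* Only one thread per warp reaches this barrier: the warp has
       diverged, which is exactly what the bar.sync documentation
       warns against.  */
    __syncthreads ();

  /* Safe: every thread of the block reaches this barrier.  */
  __syncthreads ();

  a[threadIdx.x] = threadIdx.x;
}
...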
As is evident from the example above, we do generate bar.sync in
divergent mode, and the patch below fixes that.
With the patch, the difference in the positioning of bar.sync in the
example above is:
...
@@ -42,18 +42,18 @@
st.u64 [%r32], %r25;
$L7:
$L6:
+ bar.sync 0;
@%r33 bra $L5;
// forked 2;
- bar.sync 0;
cvta.shared.u64 %r31, __worker_bcast;
ld.u64 %r25, [%r31];
mov.u32 %r24, %tid.y;
setp.le.s32 %r26, %r24, 9;
@%r26 bra $L2;
$L3:
- bar.sync 1;
// joining 2;
$L5:
+ bar.sync 1;
@%r34 bra.uni $L8;
@%r33 bra $L9;
// join 2;
...
]
Thanks,
- Tom
[nvptx] Fix bar.sync position
2018-03-20 Tom de Vries <t...@codesourcery.com>
PR target/84952
* config/nvptx/nvptx.c (nvptx_single): Don't neuter bar.sync.
(nvptx_process_pars): Emit bar.sync asap and alap.
---
gcc/config/nvptx/nvptx.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index a6f4443..a839988 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -3969,7 +3969,9 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
while (true)
{
/* Find first insn of from block. */
- while (head != BB_END (from) && !INSN_P (head))
+ while (head != BB_END (from)
+ && (!INSN_P (head)
+ || recog_memoized (head) == CODE_FOR_nvptx_barsync))
head = NEXT_INSN (head);
if (from == to)
@@ -4018,6 +4020,7 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
{
default:
break;
+ case CODE_FOR_nvptx_barsync:
case CODE_FOR_nvptx_fork:
case CODE_FOR_nvptx_forked:
case CODE_FOR_nvptx_joining:
@@ -4275,8 +4278,8 @@ nvptx_process_pars (parallel *par)
nvptx_wpropagate (false, par->forked_block, par->forked_insn);
nvptx_wpropagate (true, par->forked_block, par->fork_insn);
/* Insert begin and end synchronizations. */
- emit_insn_after (nvptx_wsync (false), par->forked_insn);
- emit_insn_before (nvptx_wsync (true), par->joining_insn);
+ emit_insn_before (nvptx_wsync (false), par->forked_insn);
+ emit_insn_before (nvptx_wsync (true), par->join_insn);
}
else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
nvptx_vpropagate (par->forked_block, par->forked_insn);