Hi,
When compiling the test-case in the patch, the following ptx code is
generated:
...
$L4:
@ %r91 bra.uni $L24;
selp.u32 %r95,1,0,%r80;
st.shared.u32 [__worker_bcast],%r95;
$L25:
$L24:
@ %r92 bra $L25;
...
There's an eternal loop starting at the last insn, and unsurprisingly
the test-case hangs.
The last insn is a vector neutering branch, which should have been
inserted after the worker neutering branch (the first insn).
In other words, we want:
...
$L4:
@ %r91 bra.uni $L24;
+ @ %r92 bra $L25;
selp.u32 %r95,1,0,%r80;
st.shared.u32 [__worker_bcast],%r95;
$L25:
$L24:
- @ %r92 bra $L25;
...
This minimal stage4 patch fixes this problem. [ I filed a PR85223
"[nvptx] nvptx_single needs rewrite" for a stage1 rewrite of nvptx_single. ]
Build x86_64 with nvptx accelerator, and tested libgomp.
Committed to stage4 trunk.
Thanks,
- Tom
[nvptx] Fix neutering of bb with only cond jump
2018-04-05 Tom de Vries <t...@codesourcery.com>
PR target/85204
* config/nvptx/nvptx.c (nvptx_single): Fix neutering of bb with only
cond jump.
* testsuite/libgomp.oacc-c-c++-common/broadcast-1.c: New test.
---
gcc/config/nvptx/nvptx.c | 6 ++-
.../libgomp.oacc-c-c++-common/broadcast-1.c | 49 ++++++++++++++++++++++
2 files changed, 54 insertions(+), 1 deletion(-)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index b2b150f..a9a3053 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4048,6 +4048,7 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
/* Insert the vector test inside the worker test. */
unsigned mode;
rtx_insn *before = tail;
+ rtx_insn *neuter_start = NULL;
for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
if (GOMP_DIM_MASK (mode) & skip_mask)
{
@@ -4065,7 +4066,10 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
br = gen_br_true (pred, label);
else
br = gen_br_true_uni (pred, label);
- emit_insn_before (br, head);
+ if (neuter_start)
+ neuter_start = emit_insn_after (br, neuter_start);
+ else
+ neuter_start = emit_insn_before (br, head);
LABEL_NUSES (label)++;
if (tail_branch)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-1.c
new file mode 100644
index 0000000..ca0d37b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-1.c
@@ -0,0 +1,49 @@
+/* Ensure that worker-vector state conditional expressions are
+ properly handled by the nvptx backend. */
+
+#include <assert.h>
+#include <math.h>
+
+
+#define N 1024
+
+int A[N][N] ;
+
+void test(int x)
+{
+#pragma acc parallel num_gangs(16) num_workers(4) vector_length(32) copyout(A)
+ {
+#pragma acc loop gang
+ for(int j=0;j<N;j++)
+ {
+ if (x==1)
+ {
+#pragma acc loop worker vector
+ for(int i=0;i<N;i++)
+ A[i][j] = 1;
+ }
+ else
+ {
+#pragma acc loop worker vector
+ for(int i=0;i<N;i++)
+ A[i][j] = -1;
+ }
+ }
+ }
+}
+
+
+int main(void)
+{
+ test (0);
+ for (int i = 0; i < N; i++)
+ for (int j = 0; j < N; j++)
+ assert (A[i][j] == -1);
+
+ test (1);
+ for (int i = 0; i < N; i++)
+ for (int j = 0; j < N; j++)
+ assert (A[i][j] == 1);
+
+ return 0;
+}