Currently, the nvptx backend is only neutering the worker axis when propagating variables used in conditional expressions across the worker and vector axes. That's a problem with the worker-state spill and fill propagation implementation because all of the vector threads in worker 0 all write the the same address location being spilled. As the attached test case demonstrates, this might cause an infinite loop depending on the values in the vector threads being propagated.
This patch fixes this issue by introducing a new worker-vector predicate, so that both the worker and vector threads can be predicated together, not separately. I.e., instead of first neutering worker axis, then neutering the vector axis, this patch uses a single predicate for tid.x == 0 && tid.y == 0. Is this patch ok for trunk? Cesar
2016-10-26 Cesar Philippidis <ce...@codesourcery.com> gcc/ * config/nvptx/nvptx.c (nvptx_single): Use a single predicate for loops partitioned across both worker and vector axes. libgomp/ * testsuite/libgomp.oacc-c-c++-common/broadcast-1.c: New test. diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 7bf5987..4e6ed60 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -3507,11 +3507,38 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) /* Insert the vector test inside the worker test. */ unsigned mode; rtx_insn *before = tail; + rtx wvpred = NULL_RTX; + bool skip_vector = false; + + /* Create a single predicate for loops containing both worker and + vectors. */ + if (cond_branch + && (GOMP_DIM_MASK (GOMP_DIM_WORKER) & mask) + && (GOMP_DIM_MASK (GOMP_DIM_VECTOR) & mask)) + { + rtx regx = gen_reg_rtx (SImode); + rtx regy = gen_reg_rtx (SImode); + rtx tmp = gen_reg_rtx (SImode); + wvpred = gen_reg_rtx (BImode); + + emit_insn_before (gen_oacc_dim_pos (regx, const1_rtx), head); + emit_insn_before (gen_oacc_dim_pos (regy, const2_rtx), head); + emit_insn_before (gen_rtx_SET (tmp, gen_rtx_IOR (SImode, regx, regy)), + head); + emit_insn_before (gen_rtx_SET (wvpred, gen_rtx_NE (BImode, tmp, + const0_rtx)), + head); + + skip_mask &= ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)); + skip_vector = true; + } + for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) if (GOMP_DIM_MASK (mode) & skip_mask) { rtx_code_label *label = gen_label_rtx (); - rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER]; + rtx pred = skip_vector ? wvpred + : cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER]; if (!pred) { diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-1.c new file mode 100644 index 0000000..4dcb60d --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-1.c @@ -0,0 +1,49 @@ +/* Ensure that worker-vector state conditional expressions are + properly handled by the nvptx backend. */ + +#include <assert.h> +#include <math.h> + + +#define N 1024 + +int A[N][N] ; + +void test(int x) +{ +#pragma acc parallel num_gangs(16) num_workers(4) vector_length(32) copyout(A) + { +#pragma acc loop gang + for(int j=0;j<N;j++) + { + if (x==1) + { +#pragma acc loop worker vector + for(int i=0;i<N;i++) + A[i][j] = 1; + } + else + { +#pragma acc loop worker vector + for(int i=0;i<N;i++) + A[i][j] = -1; + } + } + } +} + + +int main(void) +{ + test (0); + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + assert (A[i][j] == -1); + + test (1); + for (int i = 0; i < N; i++) + for (int j = 0; j < N; j++) + assert (A[i][j] == 1); + + return 0; +}