Hi! The default set of routines uses long as the iterator type; if some loops need unsigned long, long long, or unsigned long long, they need to use another implementation (__int128 iterators are not supported). This patch adds those entry points and fixes some issues on the compiler side.
2015-10-02 Jakub Jelinek <ja...@redhat.com> * omp-low.c (expand_omp_ordered_source): Use GOMP_doacross_ull_post instead of GOMP_doacross_post if iter_type is unsigned long long. (expand_omp_ordered_sink): Use GOMP_doacross_ull_wait instead of GOMP_doacross_wait if iter_type is unsigned long long. (expand_omp_for_generic): Fix up expansion if zero_iter1_bb is NULL, but zero_iter2_bb is non-NULL. Never pass the up bool argument to GOMP_loop_ull_doacross_*_start entrypoints. * omp-builtins.def (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_STATIC_START, BUILT_IN_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START, BUILT_IN_GOMP_LOOP_ULL_DOACROSS_GUIDED_START, BUILT_IN_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START, BUILT_IN_GOMP_DOACROSS_ULL_POST, BUILT_IN_GOMP_DOACROSS_ULL_WAIT): New built-ins. * builtin-types.def (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, BT_FN_VOID_ULL_VAR): New. gcc/fortran/ * types.def (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, BT_FN_VOID_ULL_VAR): New. libgomp/ * loop_ull.c (gomp_loop_ull_doacross_static_start, gomp_loop_ull_doacross_dynamic_start, gomp_loop_ull_doacross_guided_start, GOMP_loop_ull_doacross_runtime_start, GOMP_loop_ull_doacross_static_start, GOMP_loop_ull_doacross_dynamic_start, GOMP_loop_ull_doacross_guided_start): New functions. * ordered.c (gomp_doacross_init): Don't initialize boundary if not static scheduling. (gomp_doacross_ull_init, GOMP_doacross_ull_post, GOMP_doacross_ull_wait): New functions. * libgomp.map (GOMP_4.1): Export GOMP_loop_ull_doacross_dynamic_start, GOMP_loop_ull_doacross_guided_start, GOMP_loop_ull_doacross_runtime_start, GOMP_loop_ull_doacross_static_start, GOMP_doacross_ull_post and GOMP_doacross_ull_wait. * libgomp_g.h (GOMP_loop_ull_doacross_guided_start, GOMP_loop_ull_doacross_runtime_start, GOMP_loop_ull_doacross_static_start, GOMP_doacross_ull_post, GOMP_doacross_ull_wait): New prototypes. 
* libgomp.h (struct gomp_doacross_work_share): Add chunk_size_ull, q_ull and boundary_ull fields. (gomp_doacross_ull_init): New prototype. * testsuite/libgomp.c/doacross-2.c: New test. --- gcc/omp-low.c.jj 2015-10-01 13:20:13.000000000 +0200 +++ gcc/omp-low.c 2015-10-02 11:38:40.140982433 +0200 @@ -7071,7 +7071,9 @@ static void expand_omp_ordered_source (gimple_stmt_iterator *gsi, struct omp_for_data *fd, tree *counts, location_t loc) { - enum built_in_function source_ix = BUILT_IN_GOMP_DOACROSS_POST; + enum built_in_function source_ix + = fd->iter_type == long_integer_type_node + ? BUILT_IN_GOMP_DOACROSS_POST : BUILT_IN_GOMP_DOACROSS_ULL_POST; gimple g = gimple_build_call (builtin_decl_explicit (source_ix), 1, build_fold_addr_expr (counts[fd->ordered])); @@ -7086,7 +7088,9 @@ expand_omp_ordered_sink (gimple_stmt_ite tree *counts, tree c, location_t loc) { auto_vec<tree, 10> args; - enum built_in_function sink_ix = BUILT_IN_GOMP_DOACROSS_WAIT; + enum built_in_function sink_ix + = fd->iter_type == long_integer_type_node + ? 
BUILT_IN_GOMP_DOACROSS_WAIT : BUILT_IN_GOMP_DOACROSS_ULL_WAIT; tree t, off, coff = NULL_TREE, deps = OMP_CLAUSE_DECL (c), cond = NULL_TREE; int i; gimple_stmt_iterator gsi2 = *gsi; @@ -7625,11 +7629,11 @@ expand_omp_for_generic (struct omp_regio gsi_prev (&gsi); e = split_block (entry_bb, gsi_stmt (gsi)); entry_bb = e->dest; - make_edge (zero_iter1_bb, entry_bb, EDGE_FALLTHRU); + make_edge (zero_iter2_bb, entry_bb, EDGE_FALLTHRU); gsi = gsi_last_bb (entry_bb); set_immediate_dominator (CDI_DOMINATORS, entry_bb, get_immediate_dominator - (CDI_DOMINATORS, zero_iter1_bb)); + (CDI_DOMINATORS, zero_iter2_bb)); } } if (fd->collapse == 1) @@ -7762,7 +7766,7 @@ expand_omp_for_generic (struct omp_regio t0 = fold_build2 (PLUS_EXPR, fd->iter_type, t0, bias); } } - if (fd->iter_type == long_integer_type_node) + if (fd->iter_type == long_integer_type_node || fd->ordered) { if (fd->chunk_size) { @@ -7801,14 +7805,8 @@ expand_omp_for_generic (struct omp_regio tree bfn_decl = builtin_decl_explicit (start_fn); t = fold_convert (fd->iter_type, fd->chunk_size); t = omp_adjust_chunk_size (t, fd->simd_schedule); - if (fd->ordered) - t = build_call_expr (bfn_decl, 6, t5, t0, t1, t, t3, t4); - else - t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4); + t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4); } - else if (fd->ordered) - t = build_call_expr (builtin_decl_explicit (start_fn), - 5, t5, t0, t1, t3, t4); else t = build_call_expr (builtin_decl_explicit (start_fn), 6, t5, t0, t1, t2, t3, t4); --- gcc/omp-builtins.def.jj 2015-09-24 13:33:02.000000000 +0200 +++ gcc/omp-builtins.def 2015-10-01 17:19:13.565016484 +0200 @@ -197,6 +197,22 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL "GOMP_loop_ull_ordered_runtime_start", BT_FN_BOOL_BOOL_ULL_ULL_ULL_ULLPTR_ULLPTR, ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_STATIC_START, + "GOMP_loop_ull_doacross_static_start", + BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + ATTR_NOTHROW_LEAF_LIST) 
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START, + "GOMP_loop_ull_doacross_dynamic_start", + BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_GUIDED_START, + "GOMP_loop_ull_doacross_guided_start", + BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START, + "GOMP_loop_ull_doacross_runtime_start", + BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, + ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_STATIC_NEXT, "GOMP_loop_ull_static_next", BT_FN_BOOL_ULONGLONGPTR_ULONGLONGPTR, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DYNAMIC_NEXT, "GOMP_loop_ull_dynamic_next", @@ -250,6 +266,10 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_WAIT, "GOMP_doacross_wait", BT_FN_VOID_LONG_VAR, ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_ULL_POST, "GOMP_doacross_ull_post", + BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_ULL_WAIT, "GOMP_doacross_ull_wait", + BT_FN_VOID_ULL_VAR, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_PARALLEL, "GOMP_parallel", BT_FN_VOID_OMPFN_PTR_UINT_UINT, ATTR_NOTHROW_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASK, "GOMP_task", --- gcc/builtin-types.def.jj 2015-09-17 09:24:53.000000000 +0200 +++ gcc/builtin-types.def 2015-10-01 17:25:48.760370499 +0200 @@ -475,6 +475,9 @@ DEF_FUNCTION_TYPE_4 (BT_FN_VOID_SIZE_CON BT_CONST_VOLATILE_PTR, BT_PTR, BT_INT) DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_LONGPTR_LONGPTR_LONGPTR, BT_BOOL, BT_UINT, BT_PTR_LONG, BT_PTR_LONG, BT_PTR_LONG) +DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, + BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_PTR_ULONGLONG, + BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_5 (BT_FN_INT_STRING_INT_SIZE_CONST_STRING_VALIST_ARG, BT_INT, BT_STRING, BT_INT, BT_SIZE, BT_CONST_STRING, @@ -502,6 
+505,9 @@ DEF_FUNCTION_TYPE_5 (BT_FN_VOID_OMPFN_PT DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONGPTR_LONGPTR, BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_PTR_LONG, BT_PTR_LONG) +DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_ULONGLONG, + BT_PTR_ULONGLONG, BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_6 (BT_FN_INT_STRING_SIZE_INT_SIZE_CONST_STRING_VALIST_ARG, BT_INT, BT_STRING, BT_SIZE, BT_INT, BT_SIZE, @@ -578,6 +584,8 @@ DEF_FUNCTION_TYPE_VAR_1 (BT_FN_UINT32_UI BT_UINT32, BT_UINT32) DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_LONG_VAR, BT_VOID, BT_LONG) +DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_ULL_VAR, + BT_VOID, BT_ULONGLONG) DEF_FUNCTION_TYPE_VAR_2 (BT_FN_INT_FILEPTR_CONST_STRING_VAR, BT_INT, BT_FILEPTR, BT_CONST_STRING) --- gcc/fortran/types.def.jj 2015-09-17 09:31:11.000000000 +0200 +++ gcc/fortran/types.def 2015-10-01 17:30:29.856340476 +0200 @@ -156,6 +156,9 @@ DEF_FUNCTION_TYPE_4 (BT_FN_VOID_SIZE_CON BT_CONST_VOLATILE_PTR, BT_PTR, BT_INT) DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_LONGPTR_LONGPTR_LONGPTR, BT_BOOL, BT_UINT, BT_PTR_LONG, BT_PTR_LONG, BT_PTR_LONG) +DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, + BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_PTR_ULONGLONG, + BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_5 (BT_FN_VOID_OMPFN_PTR_UINT_UINT_UINT, BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_UINT, @@ -170,6 +173,9 @@ DEF_FUNCTION_TYPE_5 (BT_FN_VOID_INT_SIZE DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONGPTR_LONGPTR, BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_PTR_LONG, BT_PTR_LONG) +DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_ULONGLONG, + BT_PTR_ULONGLONG, BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_6 (BT_FN_BOOL_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR, BT_BOOL, BT_LONG, BT_LONG, BT_LONG, BT_LONG, @@ -232,6 +238,8 @@ DEF_FUNCTION_TYPE_VAR_0 (BT_FN_VOID_VAR, DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_LONG_VAR, BT_VOID, BT_LONG) +DEF_FUNCTION_TYPE_VAR_1 
(BT_FN_VOID_ULL_VAR, + BT_VOID, BT_ULONGLONG) DEF_FUNCTION_TYPE_VAR_2 (BT_FN_VOID_INT_INT_VAR, BT_VOID, BT_INT, BT_INT) --- libgomp/loop_ull.c.jj 2015-06-11 10:27:48.000000000 +0200 +++ libgomp/loop_ull.c 2015-10-01 17:08:11.969445359 +0200 @@ -299,6 +299,114 @@ GOMP_loop_ull_ordered_runtime_start (boo } } +/* The *_doacross_*_start routines are similar. The only difference is that + this work-share construct is initialized to expect an ORDERED(N) - DOACROSS + section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 + and other COUNTS array elements tell the library number of iterations + in the ordered inner loops. */ + +static bool +gomp_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; + if (gomp_work_share_start (false)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_STATIC, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size); + gomp_work_share_init_done (); + } + + return !gomp_iter_ull_static_next (istart, iend); +} + +static bool +gomp_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + struct gomp_thread *thr = gomp_thread (); + bool ret; + + if (gomp_work_share_start (false)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_DYNAMIC, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size); + gomp_work_share_init_done (); + } + +#if defined HAVE_SYNC_BUILTINS && defined __LP64__ + ret = gomp_iter_ull_dynamic_next (istart, iend); +#else + gomp_mutex_lock (&thr->ts.work_share->lock); + ret = gomp_iter_ull_dynamic_next_locked (istart, iend); + gomp_mutex_unlock (&thr->ts.work_share->lock); +#endif + + return ret; +} + +static bool +gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull 
*istart, + gomp_ull *iend) +{ + struct gomp_thread *thr = gomp_thread (); + bool ret; + + if (gomp_work_share_start (false)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_GUIDED, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size); + gomp_work_share_init_done (); + } + +#if defined HAVE_SYNC_BUILTINS && defined __LP64__ + ret = gomp_iter_ull_guided_next (istart, iend); +#else + gomp_mutex_lock (&thr->ts.work_share->lock); + ret = gomp_iter_ull_guided_next_locked (istart, iend); + gomp_mutex_unlock (&thr->ts.work_share->lock); +#endif + + return ret; +} + +bool +GOMP_loop_ull_doacross_runtime_start (unsigned ncounts, gomp_ull *counts, + gomp_ull *istart, gomp_ull *iend) +{ + struct gomp_task_icv *icv = gomp_icv (false); + switch (icv->run_sched_var) + { + case GFS_STATIC: + return gomp_loop_ull_doacross_static_start (ncounts, counts, + icv->run_sched_chunk_size, + istart, iend); + case GFS_DYNAMIC: + return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, + icv->run_sched_chunk_size, + istart, iend); + case GFS_GUIDED: + return gomp_loop_ull_doacross_guided_start (ncounts, counts, + icv->run_sched_chunk_size, + istart, iend); + case GFS_AUTO: + /* For now map to schedule(static), later on we could play with feedback + driven choice. */ + return gomp_loop_ull_doacross_static_start (ncounts, counts, + 0, istart, iend); + default: + abort (); + } +} + /* The *_next routines are called when the thread completes processing of the iteration block currently assigned to it. 
If the work-share construct is bound directly to a parallel construct, then the iteration @@ -466,6 +574,13 @@ extern __typeof(gomp_loop_ull_ordered_dy extern __typeof(gomp_loop_ull_ordered_guided_start) GOMP_loop_ull_ordered_guided_start __attribute__((alias ("gomp_loop_ull_ordered_guided_start"))); +extern __typeof(gomp_loop_ull_doacross_static_start) GOMP_loop_ull_doacross_static_start + __attribute__((alias ("gomp_loop_ull_doacross_static_start"))); +extern __typeof(gomp_loop_ull_doacross_dynamic_start) GOMP_loop_ull_doacross_dynamic_start + __attribute__((alias ("gomp_loop_ull_doacross_dynamic_start"))); +extern __typeof(gomp_loop_ull_doacross_guided_start) GOMP_loop_ull_doacross_guided_start + __attribute__((alias ("gomp_loop_ull_doacross_guided_start"))); + extern __typeof(gomp_loop_ull_static_next) GOMP_loop_ull_static_next __attribute__((alias ("gomp_loop_ull_static_next"))); extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_dynamic_next @@ -535,6 +650,33 @@ GOMP_loop_ull_ordered_guided_start (bool } bool +GOMP_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + return gomp_loop_ull_doacross_static_start (ncounts, counts, chunk_size, + istart, iend); +} + +bool +GOMP_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, chunk_size, + istart, iend); +} + +bool +GOMP_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + return gomp_loop_ull_doacross_guided_start (ncounts, counts, chunk_size, + istart, iend); +} + +bool GOMP_loop_ull_static_next (gomp_ull *istart, gomp_ull *iend) { return gomp_loop_ull_static_next (istart, iend); --- libgomp/ordered.c.jj 2015-09-24 20:20:32.000000000 +0200 +++ libgomp/ordered.c 2015-10-02 13:21:16.675194039 +0200 @@ -317,7 
+317,6 @@ gomp_doacross_init (unsigned ncounts, lo doacross->elt_sz = elt_sz; doacross->ncounts = ncounts; doacross->flattened = false; - doacross->boundary = 0; doacross->array = (unsigned char *) ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) & ~(uintptr_t) 63); @@ -479,3 +478,296 @@ GOMP_doacross_wait (long first, ...) while (1); __sync_synchronize (); } + +typedef unsigned long long gomp_ull; + +void +gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + struct gomp_work_share *ws = thr->ts.work_share; + unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0; + unsigned long ent, num_ents, elt_sz, shift_sz; + struct gomp_doacross_work_share *doacross; + + if (team == NULL || team->nthreads == 1) + return; + + for (i = 0; i < ncounts; i++) + { + /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */ + if (counts[i] == 0) + return; + + if (num_bits <= MAX_COLLAPSED_BITS) + { + unsigned int this_bits; + if (counts[i] == 1) + this_bits = 1; + else + this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__ + - __builtin_clzll (counts[i] - 1); + if (num_bits + this_bits <= MAX_COLLAPSED_BITS) + { + bits[i] = this_bits; + num_bits += this_bits; + } + else + num_bits = MAX_COLLAPSED_BITS + 1; + } + } + + if (ws->sched == GFS_STATIC) + num_ents = team->nthreads; + else + num_ents = (counts[0] - 1) / chunk_size + 1; + if (num_bits <= MAX_COLLAPSED_BITS) + { + elt_sz = sizeof (unsigned long); + shift_sz = ncounts * sizeof (unsigned int); + } + else + { + if (sizeof (gomp_ull) == sizeof (unsigned long)) + elt_sz = sizeof (gomp_ull) * ncounts; + else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long)) + elt_sz = sizeof (unsigned long) * 2 * ncounts; + else + abort (); + shift_sz = 0; + } + elt_sz = (elt_sz + 63) & ~63UL; + + doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz + + shift_sz); + doacross->chunk_size_ull = chunk_size; + 
doacross->elt_sz = elt_sz; + doacross->ncounts = ncounts; + doacross->flattened = false; + doacross->boundary = 0; + doacross->array = (unsigned char *) + ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) + & ~(uintptr_t) 63); + if (num_bits <= MAX_COLLAPSED_BITS) + { + unsigned int shift_count = 0; + doacross->flattened = true; + for (i = ncounts; i > 0; i--) + { + doacross->shift_counts[i - 1] = shift_count; + shift_count += bits[i - 1]; + } + for (ent = 0; ent < num_ents; ent++) + *(unsigned long *) (doacross->array + ent * elt_sz) = 0; + } + else + for (ent = 0; ent < num_ents; ent++) + memset (doacross->array + ent * elt_sz, '\0', + sizeof (unsigned long) * ncounts); + if (ws->sched == GFS_STATIC && chunk_size == 0) + { + gomp_ull q = counts[0] / num_ents; + gomp_ull t = counts[0] % num_ents; + doacross->boundary_ull = t * (q + 1); + doacross->q_ull = q; + doacross->t = t; + } + ws->doacross = doacross; +} + +/* DOACROSS POST operation. */ + +void +GOMP_doacross_ull_post (gomp_ull *counts) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_work_share *ws = thr->ts.work_share; + struct gomp_doacross_work_share *doacross = ws->doacross; + unsigned long ent; + unsigned int i; + + if (__builtin_expect (doacross == NULL, 0)) + { + __sync_synchronize (); + return; + } + + if (__builtin_expect (ws->sched == GFS_STATIC, 1)) + ent = thr->ts.team_id; + else + ent = counts[0] / doacross->chunk_size_ull; + + if (__builtin_expect (doacross->flattened, 1)) + { + unsigned long *array = (unsigned long *) (doacross->array + + ent * doacross->elt_sz); + gomp_ull flattened + = counts[0] << doacross->shift_counts[0]; + + for (i = 1; i < doacross->ncounts; i++) + flattened |= counts[i] << doacross->shift_counts[i]; + flattened++; + if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE)) + __atomic_thread_fence (MEMMODEL_RELEASE); + else + __atomic_store_n (array, flattened, MEMMODEL_RELEASE); + return; + } + + __atomic_thread_fence (MEMMODEL_ACQUIRE); + if (sizeof 
(gomp_ull) == sizeof (unsigned long)) + { + gomp_ull *array = (gomp_ull *) (doacross->array + + ent * doacross->elt_sz); + + for (i = doacross->ncounts; i-- > 0; ) + { + if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED)) + __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE); + } + } + else + { + unsigned long *array = (unsigned long *) (doacross->array + + ent * doacross->elt_sz); + + for (i = doacross->ncounts; i-- > 0; ) + { + gomp_ull cull = counts[i] + 1UL; + unsigned long c = (unsigned long) cull; + if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED)) + __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE); + c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2); + if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED)) + __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE); + } + } +} + +/* DOACROSS WAIT operation. */ + +void +GOMP_doacross_ull_wait (gomp_ull first, ...) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_work_share *ws = thr->ts.work_share; + struct gomp_doacross_work_share *doacross = ws->doacross; + va_list ap; + unsigned long ent; + unsigned int i; + + if (__builtin_expect (doacross == NULL, 0)) + { + __sync_synchronize (); + return; + } + + if (__builtin_expect (ws->sched == GFS_STATIC, 1)) + { + if (ws->chunk_size_ull == 0) + { + if (first < doacross->boundary_ull) + ent = first / (doacross->q_ull + 1); + else + ent = (first - doacross->boundary_ull) / doacross->q_ull + + doacross->t; + } + else + ent = first / ws->chunk_size_ull % thr->ts.team->nthreads; + } + else + ent = first / doacross->chunk_size_ull; + + if (__builtin_expect (doacross->flattened, 1)) + { + unsigned long *array = (unsigned long *) (doacross->array + + ent * doacross->elt_sz); + gomp_ull flattened = first << doacross->shift_counts[0]; + unsigned long cur; + + va_start (ap, first); + for (i = 1; i < doacross->ncounts; i++) + flattened |= va_arg (ap, gomp_ull) + << doacross->shift_counts[i]; + cur = 
__atomic_load_n (array, MEMMODEL_ACQUIRE); + if (flattened < cur) + { + __atomic_thread_fence (MEMMODEL_RELEASE); + va_end (ap); + return; + } + doacross_spin (array, flattened, cur); + __atomic_thread_fence (MEMMODEL_RELEASE); + va_end (ap); + return; + } + + if (sizeof (gomp_ull) == sizeof (unsigned long)) + { + gomp_ull *array = (gomp_ull *) (doacross->array + + ent * doacross->elt_sz); + do + { + va_start (ap, first); + for (i = 0; i < doacross->ncounts; i++) + { + gomp_ull thisv + = (i ? va_arg (ap, gomp_ull) : first) + 1; + gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED); + if (thisv < cur) + { + i = doacross->ncounts; + break; + } + if (thisv > cur) + break; + } + va_end (ap); + if (i == doacross->ncounts) + break; + cpu_relax (); + } + while (1); + } + else + { + unsigned long *array = (unsigned long *) (doacross->array + + ent * doacross->elt_sz); + do + { + va_start (ap, first); + for (i = 0; i < doacross->ncounts; i++) + { + gomp_ull thisv + = (i ? va_arg (ap, gomp_ull) : first) + 1; + unsigned long t + = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2); + unsigned long cur + = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED); + if (t < cur) + { + i = doacross->ncounts; + break; + } + if (t > cur) + break; + t = thisv; + cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED); + if (t < cur) + { + i = doacross->ncounts; + break; + } + if (t > cur) + break; + } + va_end (ap); + if (i == doacross->ncounts) + break; + cpu_relax (); + } + while (1); + } + __sync_synchronize (); +} --- libgomp/libgomp.map.jj 2015-09-18 18:12:29.000000000 +0200 +++ libgomp/libgomp.map 2015-10-01 17:03:55.649130579 +0200 @@ -280,6 +280,12 @@ GOMP_4.1 { GOMP_loop_doacross_static_start; GOMP_doacross_post; GOMP_doacross_wait; + GOMP_loop_ull_doacross_dynamic_start; + GOMP_loop_ull_doacross_guided_start; + GOMP_loop_ull_doacross_runtime_start; + GOMP_loop_ull_doacross_static_start; + GOMP_doacross_ull_post; + GOMP_doacross_ull_wait; } GOMP_4.0.1; OACC_2.0 { --- 
libgomp/libgomp_g.h.jj 2015-09-24 13:33:32.000000000 +0200 +++ libgomp/libgomp_g.h 2015-10-01 14:15:41.195121635 +0200 @@ -173,12 +173,34 @@ extern bool GOMP_loop_ull_ordered_guided extern bool GOMP_loop_ull_ordered_runtime_next (unsigned long long *, unsigned long long *); +extern bool GOMP_loop_ull_doacross_static_start (unsigned, + unsigned long long *, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_doacross_dynamic_start (unsigned, + unsigned long long *, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_doacross_guided_start (unsigned, + unsigned long long *, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_doacross_runtime_start (unsigned, + unsigned long long *, + unsigned long long *, + unsigned long long *); + /* ordered.c */ extern void GOMP_ordered_start (void); extern void GOMP_ordered_end (void); extern void GOMP_doacross_post (long *); extern void GOMP_doacross_wait (long, ...); +extern void GOMP_doacross_ull_post (unsigned long long *); +extern void GOMP_doacross_ull_wait (unsigned long long, ...); /* parallel.c */ --- libgomp/libgomp.h.jj 2015-09-23 12:25:51.000000000 +0200 +++ libgomp/libgomp.h 2015-10-01 16:24:59.005076396 +0200 @@ -84,10 +84,14 @@ struct gomp_doacross_work_share /* chunk_size copy, as ws->chunk_size is multiplied by incr for GFS_DYNAMIC. */ long chunk_size; + /* Likewise, but for ull implementation. */ + unsigned long long chunk_size_ull; /* For schedule(static,0) this is the number of iterations assigned to the last thread, i.e. number of iterations / number of threads. */ long q; + /* Likewise, but for ull implementation. */ + unsigned long long q_ull; }; /* Size of each array entry (padded to cache line size). */ unsigned long elt_sz; @@ -102,8 +106,12 @@ struct gomp_doacross_work_share /* These two are only used for schedule(static,0). 
*/ /* This one is number of iterations % number of threads. */ long t; - /* And this one is cached t * (q + 1). */ - long boundary; + union { + /* And this one is cached t * (q + 1). */ + long boundary; + /* Likewise, but for the ull implementation. */ + unsigned long long boundary_ull; + }; /* Array of shift counts for each dimension if they can be flattened. */ unsigned int shift_counts[]; }; @@ -683,6 +691,8 @@ extern void gomp_ordered_static_init (vo extern void gomp_ordered_static_next (void); extern void gomp_ordered_sync (void); extern void gomp_doacross_init (unsigned, long *, long); +extern void gomp_doacross_ull_init (unsigned, unsigned long long *, + unsigned long long); /* parallel.c */ --- libgomp/testsuite/libgomp.c/doacross-2.c.jj 2015-10-02 09:36:19.575951751 +0200 +++ libgomp/testsuite/libgomp.c/doacross-2.c 2015-10-02 10:14:47.098868611 +0200 @@ -0,0 +1,225 @@ +extern void abort (void); + +#define N 256 +int a[N], b[N / 16][8][4], c[N / 32][8][8], g[N / 16][8][6]; +volatile int d, e; +volatile unsigned long long f; + +int +main () +{ + unsigned long long i; + int j, k, l, m; + #pragma omp parallel private (l) + { + #pragma omp for schedule(static, 1) ordered (1) nowait + for (i = 1; i < N + f; i++) + { + #pragma omp atomic write + a[i] = 1; + #pragma omp ordered depend(sink: i - 1) + if (i > 1) + { + #pragma omp atomic read + l = a[i - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + a[i] = 2; + if (i < N - 1) + { + #pragma omp atomic read + l = a[i + 1]; + if (l == 3) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + a[i] = 3; + } + #pragma omp for schedule(static, 0) ordered (3) nowait + for (i = 3; i < N / 16 - 1 + f; i++) + for (j = 0; j < 8; j += 2) + for (k = 1; k <= 3; k++) + { + #pragma omp atomic write + b[i][j][k] = 1; + #pragma omp ordered depend(sink: i, j - 2, k - 1) \ + depend(sink: i - 2, j - 2, k + 1) + #pragma omp ordered depend(sink: i - 3, j + 2, k - 2) + if (j >= 2 && k > 1) + { + 
#pragma omp atomic read + l = b[i][j - 2][k - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + b[i][j][k] = 2; + if (i >= 5 && j >= 2 && k < 3) + { + #pragma omp atomic read + l = b[i - 2][j - 2][k + 1]; + if (l < 2) + abort (); + } + if (i >= 6 && j < N / 16 - 3 && k == 3) + { + #pragma omp atomic read + l = b[i - 3][j + 2][k - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + b[i][j][k] = 3; + } +#define A(n) int n; +#define B(n) A(n##0) A(n##1) A(n##2) A(n##3) +#define C(n) B(n##0) B(n##1) B(n##2) B(n##3) +#define D(n) C(n##0) C(n##1) C(n##2) C(n##3) + D(m) +#undef A + #pragma omp for collapse (2) ordered(61) schedule(dynamic, 15) + for (i = 2; i < N / 32 + f; i++) + for (j = 7; j > 1; j--) + for (k = 6; k >= 0; k -= 2) +#define A(n) for (n = 4; n < 5; n++) + D(m) +#undef A + { + #pragma omp atomic write + c[i][j][k] = 1; +#define A(n) ,n +#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321) + #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \ + depend (sink:i - 2, j + 1, k - 4 E(m)) \ + depend(sink: i - 1, j - 2, k - 2 E(m)) + if (k <= 4) + { + l = c[i][j][k + 2]; + if (l < 2) + abort (); + } + #pragma omp atomic write + c[i][j][k] = 2; + if (i >= 4 && j < 7 && k >= 4) + { + l = c[i - 2][j + 1][k - 4]; + if (l < 2) + abort (); + } + if (i >= 3 && j >= 4 && k >= 2) + { + l = c[i - 1][j - 2][k - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend (source) + #pragma omp atomic write + c[i][j][k] = 3; + } + #pragma omp for schedule(static, 0) ordered (3) nowait + for (j = 0; j < N / 16 - 1; j++) + for (k = 0; k < 8; k += 2) + for (i = 3; i <= 5 + f; i++) + { + #pragma omp atomic write + g[j][k][i] = 1; + #pragma omp ordered depend(sink: j, k - 2, i - 1) \ + depend(sink: j - 2, k - 2, i + 1) + #pragma omp ordered depend(sink: j - 3, k + 2, i - 2) + if (k >= 2 && i > 3) + { + #pragma omp atomic read + l = g[j][k - 2][i - 1]; + if (l < 2) + abort (); + } + #pragma omp 
atomic write + g[j][k][i] = 2; + if (j >= 2 && k >= 2 && i < 5) + { + #pragma omp atomic read + l = g[j - 2][k - 2][i + 1]; + if (l < 2) + abort (); + } + if (j >= 3 && k < N / 16 - 3 && i == 5) + { + #pragma omp atomic read + l = g[j - 3][k + 2][i - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + g[j][k][i] = 3; + } + #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k) + for (i = 2; i < f + 3; i++) + for (j = d + 1; j >= 0; j--) + for (k = 0; k < d; k++) + for (l = 0; l < d + 2; l++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l) + if (!e) + abort (); + } + #pragma omp single + { + if (i != 3 || j != -1 || k != 0) + abort (); + i = 8; j = 9; k = 10; + } + #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m) + for (i = 2; i < f + 3; i++) + for (j = d + 1; j >= 0; j--) + for (k = 0; k < d + 2; k++) + for (m = 0; m < d; m++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m) + abort (); + } + #pragma omp single + if (i != 3 || j != -1 || k != 2 || m != 0) + abort (); + #pragma omp for collapse(2) ordered(4) nowait + for (i = 2; i < f + 3; i++) + for (j = d; j > 0; j--) + for (k = 0; k < d + 2; k++) + for (l = 0; l < d + 4; l++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l) + if (!e) + abort (); + } + #pragma omp for nowait + for (i = 0; i < N; i++) + if (a[i] != 3 * (i >= 1)) + abort (); + #pragma omp for collapse(2) private(k) nowait + for (i = 0; i < N / 16; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 4; k++) + if (b[i][j][k] != 3 * (i >= 3 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1)) + abort (); + #pragma omp for collapse(3) nowait + for (i = 0; i < N / 32; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 8; k++) + if (c[i][j][k] != 3 * (i >= 2 && j >= 2 && (k & 1) == 0)) + abort (); + #pragma omp for collapse(2) private(k) nowait + 
for (i = 0; i < N / 16; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 6; k++) + if (g[i][j][k] != 3 * (i < N / 16 - 1 && (j & 1) == 0 && k >= 3)) + abort (); + } + return 0; +} Jakub