Hi! The default set of routines uses long as the iterator type; if some loops need unsigned long, long long, or unsigned long long, they need to use another implementation (__int128 iterators are not supported). This patch adds those entry points and fixes some issues on the compiler side.
2015-10-02 Jakub Jelinek <ja...@redhat.com> * omp-low.c (expand_omp_ordered_source): Use GOMP_doacross_ull_post instead of GOMP_doacross_post if iter_type is unsigned long long. (expand_omp_ordered_sink): Use GOMP_doacross_ull_wait instead of GOMP_doacross_wait if iter_type is unsigned long long. (expand_omp_for_generic): Fix up expansion if zero_iter1_bb is NULL, but zero_iter2_bb is non-NULL. Never pass the up bool argument to GOMP_loop_ull_doacross_*_start entrypoints. * omp-builtins.def (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_STATIC_START, BUILT_IN_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START, BUILT_IN_GOMP_LOOP_ULL_DOACROSS_GUIDED_START, BUILT_IN_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START, BUILT_IN_GOMP_DOACROSS_ULL_POST, BUILT_IN_GOMP_DOACROSS_ULL_WAIT): New built-ins. * builtin-types.def (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, BT_FN_VOID_ULL_VAR): New. gcc/fortran/ * types.def (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, BT_FN_VOID_ULL_VAR): New. libgomp/ * loop_ull.c (gomp_loop_ull_doacross_static_start, gomp_loop_ull_doacross_dynamic_start, gomp_loop_ull_doacross_guided_start, GOMP_loop_ull_doacross_runtime_start, GOMP_loop_ull_doacross_static_start, GOMP_loop_ull_doacross_dynamic_start, GOMP_loop_ull_doacross_guided_start): New functions. * ordered.c (gomp_doacross_init): Don't initialize boundary if not static scheduling. (gomp_doacross_ull_init, GOMP_doacross_ull_post, GOMP_doacross_ull_wait): New functions. * libgomp.map (GOMP_4.1): Export GOMP_loop_ull_doacross_dynamic_start, GOMP_loop_ull_doacross_guided_start, GOMP_loop_ull_doacross_runtime_start, GOMP_loop_ull_doacross_static_start, GOMP_doacross_ull_post and GOMP_doacross_ull_wait. * libgomp_g.h (GOMP_loop_ull_doacross_guided_start, GOMP_loop_ull_doacross_runtime_start, GOMP_loop_ull_doacross_static_start, GOMP_doacross_ull_post, GOMP_doacross_ull_wait): New prototypes. 
* libgomp.h (struct gomp_doacross_work_share): Add chunk_size_ull, q_ull and boundary_ull fields. (gomp_doacross_ull_init): New prototype. * testsuite/libgomp.c/doacross-2.c: New test. --- gcc/omp-low.c.jj 2015-10-01 13:20:13.000000000 +0200 +++ gcc/omp-low.c 2015-10-02 11:38:40.140982433 +0200 @@ -7071,7 +7071,9 @@ static void expand_omp_ordered_source (gimple_stmt_iterator *gsi, struct omp_for_data *fd, tree *counts, location_t loc) { - enum built_in_function source_ix = BUILT_IN_GOMP_DOACROSS_POST; + enum built_in_function source_ix + = fd->iter_type == long_integer_type_node + ? BUILT_IN_GOMP_DOACROSS_POST : BUILT_IN_GOMP_DOACROSS_ULL_POST; gimple g = gimple_build_call (builtin_decl_explicit (source_ix), 1, build_fold_addr_expr (counts[fd->ordered])); @@ -7086,7 +7088,9 @@ expand_omp_ordered_sink (gimple_stmt_ite tree *counts, tree c, location_t loc) { auto_vec<tree, 10> args; - enum built_in_function sink_ix = BUILT_IN_GOMP_DOACROSS_WAIT; + enum built_in_function sink_ix + = fd->iter_type == long_integer_type_node + ? 
BUILT_IN_GOMP_DOACROSS_WAIT : BUILT_IN_GOMP_DOACROSS_ULL_WAIT; tree t, off, coff = NULL_TREE, deps = OMP_CLAUSE_DECL (c), cond = NULL_TREE; int i; gimple_stmt_iterator gsi2 = *gsi; @@ -7625,11 +7629,11 @@ expand_omp_for_generic (struct omp_regio gsi_prev (&gsi); e = split_block (entry_bb, gsi_stmt (gsi)); entry_bb = e->dest; - make_edge (zero_iter1_bb, entry_bb, EDGE_FALLTHRU); + make_edge (zero_iter2_bb, entry_bb, EDGE_FALLTHRU); gsi = gsi_last_bb (entry_bb); set_immediate_dominator (CDI_DOMINATORS, entry_bb, get_immediate_dominator - (CDI_DOMINATORS, zero_iter1_bb)); + (CDI_DOMINATORS, zero_iter2_bb)); } } if (fd->collapse == 1) @@ -7762,7 +7766,7 @@ expand_omp_for_generic (struct omp_regio t0 = fold_build2 (PLUS_EXPR, fd->iter_type, t0, bias); } } - if (fd->iter_type == long_integer_type_node) + if (fd->iter_type == long_integer_type_node || fd->ordered) { if (fd->chunk_size) { @@ -7801,14 +7805,8 @@ expand_omp_for_generic (struct omp_regio tree bfn_decl = builtin_decl_explicit (start_fn); t = fold_convert (fd->iter_type, fd->chunk_size); t = omp_adjust_chunk_size (t, fd->simd_schedule); - if (fd->ordered) - t = build_call_expr (bfn_decl, 6, t5, t0, t1, t, t3, t4); - else - t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4); + t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4); } - else if (fd->ordered) - t = build_call_expr (builtin_decl_explicit (start_fn), - 5, t5, t0, t1, t3, t4); else t = build_call_expr (builtin_decl_explicit (start_fn), 6, t5, t0, t1, t2, t3, t4); --- gcc/omp-builtins.def.jj 2015-09-24 13:33:02.000000000 +0200 +++ gcc/omp-builtins.def 2015-10-01 17:19:13.565016484 +0200 @@ -197,6 +197,22 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL "GOMP_loop_ull_ordered_runtime_start", BT_FN_BOOL_BOOL_ULL_ULL_ULL_ULLPTR_ULLPTR, ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_STATIC_START, + "GOMP_loop_ull_doacross_static_start", + BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + ATTR_NOTHROW_LEAF_LIST) 
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START, + "GOMP_loop_ull_doacross_dynamic_start", + BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_GUIDED_START, + "GOMP_loop_ull_doacross_guided_start", + BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START, + "GOMP_loop_ull_doacross_runtime_start", + BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, + ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_STATIC_NEXT, "GOMP_loop_ull_static_next", BT_FN_BOOL_ULONGLONGPTR_ULONGLONGPTR, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DYNAMIC_NEXT, "GOMP_loop_ull_dynamic_next", @@ -250,6 +266,10 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_WAIT, "GOMP_doacross_wait", BT_FN_VOID_LONG_VAR, ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_ULL_POST, "GOMP_doacross_ull_post", + BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_DOACROSS_ULL_WAIT, "GOMP_doacross_ull_wait", + BT_FN_VOID_ULL_VAR, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_PARALLEL, "GOMP_parallel", BT_FN_VOID_OMPFN_PTR_UINT_UINT, ATTR_NOTHROW_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASK, "GOMP_task", --- gcc/builtin-types.def.jj 2015-09-17 09:24:53.000000000 +0200 +++ gcc/builtin-types.def 2015-10-01 17:25:48.760370499 +0200 @@ -475,6 +475,9 @@ DEF_FUNCTION_TYPE_4 (BT_FN_VOID_SIZE_CON BT_CONST_VOLATILE_PTR, BT_PTR, BT_INT) DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_LONGPTR_LONGPTR_LONGPTR, BT_BOOL, BT_UINT, BT_PTR_LONG, BT_PTR_LONG, BT_PTR_LONG) +DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, + BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_PTR_ULONGLONG, + BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_5 (BT_FN_INT_STRING_INT_SIZE_CONST_STRING_VALIST_ARG, BT_INT, BT_STRING, BT_INT, BT_SIZE, BT_CONST_STRING, @@ -502,6 
+505,9 @@ DEF_FUNCTION_TYPE_5 (BT_FN_VOID_OMPFN_PT DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONGPTR_LONGPTR, BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_PTR_LONG, BT_PTR_LONG) +DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_ULONGLONG, + BT_PTR_ULONGLONG, BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_6 (BT_FN_INT_STRING_SIZE_INT_SIZE_CONST_STRING_VALIST_ARG, BT_INT, BT_STRING, BT_SIZE, BT_INT, BT_SIZE, @@ -578,6 +584,8 @@ DEF_FUNCTION_TYPE_VAR_1 (BT_FN_UINT32_UI BT_UINT32, BT_UINT32) DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_LONG_VAR, BT_VOID, BT_LONG) +DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_ULL_VAR, + BT_VOID, BT_ULONGLONG) DEF_FUNCTION_TYPE_VAR_2 (BT_FN_INT_FILEPTR_CONST_STRING_VAR, BT_INT, BT_FILEPTR, BT_CONST_STRING) --- gcc/fortran/types.def.jj 2015-09-17 09:31:11.000000000 +0200 +++ gcc/fortran/types.def 2015-10-01 17:30:29.856340476 +0200 @@ -156,6 +156,9 @@ DEF_FUNCTION_TYPE_4 (BT_FN_VOID_SIZE_CON BT_CONST_VOLATILE_PTR, BT_PTR, BT_INT) DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_LONGPTR_LONGPTR_LONGPTR, BT_BOOL, BT_UINT, BT_PTR_LONG, BT_PTR_LONG, BT_PTR_LONG) +DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR, + BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_PTR_ULONGLONG, + BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_5 (BT_FN_VOID_OMPFN_PTR_UINT_UINT_UINT, BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_UINT, @@ -170,6 +173,9 @@ DEF_FUNCTION_TYPE_5 (BT_FN_VOID_INT_SIZE DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONGPTR_LONGPTR, BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_PTR_LONG, BT_PTR_LONG) +DEF_FUNCTION_TYPE_5 (BT_FN_BOOL_UINT_ULLPTR_ULL_ULLPTR_ULLPTR, + BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_ULONGLONG, + BT_PTR_ULONGLONG, BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_6 (BT_FN_BOOL_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR, BT_BOOL, BT_LONG, BT_LONG, BT_LONG, BT_LONG, @@ -232,6 +238,8 @@ DEF_FUNCTION_TYPE_VAR_0 (BT_FN_VOID_VAR, DEF_FUNCTION_TYPE_VAR_1 (BT_FN_VOID_LONG_VAR, BT_VOID, BT_LONG) +DEF_FUNCTION_TYPE_VAR_1 
(BT_FN_VOID_ULL_VAR, + BT_VOID, BT_ULONGLONG) DEF_FUNCTION_TYPE_VAR_2 (BT_FN_VOID_INT_INT_VAR, BT_VOID, BT_INT, BT_INT) --- libgomp/loop_ull.c.jj 2015-06-11 10:27:48.000000000 +0200 +++ libgomp/loop_ull.c 2015-10-01 17:08:11.969445359 +0200 @@ -299,6 +299,114 @@ GOMP_loop_ull_ordered_runtime_start (boo } } +/* The *_doacross_*_start routines are similar. The only difference is that + this work-share construct is initialized to expect an ORDERED(N) - DOACROSS + section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 + and other COUNTS array elements tell the library number of iterations + in the ordered inner loops. */ + +static bool +gomp_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; + if (gomp_work_share_start (false)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_STATIC, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size); + gomp_work_share_init_done (); + } + + return !gomp_iter_ull_static_next (istart, iend); +} + +static bool +gomp_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + struct gomp_thread *thr = gomp_thread (); + bool ret; + + if (gomp_work_share_start (false)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_DYNAMIC, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size); + gomp_work_share_init_done (); + } + +#if defined HAVE_SYNC_BUILTINS && defined __LP64__ + ret = gomp_iter_ull_dynamic_next (istart, iend); +#else + gomp_mutex_lock (&thr->ts.work_share->lock); + ret = gomp_iter_ull_dynamic_next_locked (istart, iend); + gomp_mutex_unlock (&thr->ts.work_share->lock); +#endif + + return ret; +} + +static bool +gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull 
*istart, + gomp_ull *iend) +{ + struct gomp_thread *thr = gomp_thread (); + bool ret; + + if (gomp_work_share_start (false)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_GUIDED, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size); + gomp_work_share_init_done (); + } + +#if defined HAVE_SYNC_BUILTINS && defined __LP64__ + ret = gomp_iter_ull_guided_next (istart, iend); +#else + gomp_mutex_lock (&thr->ts.work_share->lock); + ret = gomp_iter_ull_guided_next_locked (istart, iend); + gomp_mutex_unlock (&thr->ts.work_share->lock); +#endif + + return ret; +} + +bool +GOMP_loop_ull_doacross_runtime_start (unsigned ncounts, gomp_ull *counts, + gomp_ull *istart, gomp_ull *iend) +{ + struct gomp_task_icv *icv = gomp_icv (false); + switch (icv->run_sched_var) + { + case GFS_STATIC: + return gomp_loop_ull_doacross_static_start (ncounts, counts, + icv->run_sched_chunk_size, + istart, iend); + case GFS_DYNAMIC: + return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, + icv->run_sched_chunk_size, + istart, iend); + case GFS_GUIDED: + return gomp_loop_ull_doacross_guided_start (ncounts, counts, + icv->run_sched_chunk_size, + istart, iend); + case GFS_AUTO: + /* For now map to schedule(static), later on we could play with feedback + driven choice. */ + return gomp_loop_ull_doacross_static_start (ncounts, counts, + 0, istart, iend); + default: + abort (); + } +} + /* The *_next routines are called when the thread completes processing of the iteration block currently assigned to it. 
If the work-share construct is bound directly to a parallel construct, then the iteration @@ -466,6 +574,13 @@ extern __typeof(gomp_loop_ull_ordered_dy extern __typeof(gomp_loop_ull_ordered_guided_start) GOMP_loop_ull_ordered_guided_start __attribute__((alias ("gomp_loop_ull_ordered_guided_start"))); +extern __typeof(gomp_loop_ull_doacross_static_start) GOMP_loop_ull_doacross_static_start + __attribute__((alias ("gomp_loop_ull_doacross_static_start"))); +extern __typeof(gomp_loop_ull_doacross_dynamic_start) GOMP_loop_ull_doacross_dynamic_start + __attribute__((alias ("gomp_loop_ull_doacross_dynamic_start"))); +extern __typeof(gomp_loop_ull_doacross_guided_start) GOMP_loop_ull_doacross_guided_start + __attribute__((alias ("gomp_loop_ull_doacross_guided_start"))); + extern __typeof(gomp_loop_ull_static_next) GOMP_loop_ull_static_next __attribute__((alias ("gomp_loop_ull_static_next"))); extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_dynamic_next @@ -535,6 +650,33 @@ GOMP_loop_ull_ordered_guided_start (bool } bool +GOMP_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + return gomp_loop_ull_doacross_static_start (ncounts, counts, chunk_size, + istart, iend); +} + +bool +GOMP_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, chunk_size, + istart, iend); +} + +bool +GOMP_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, gomp_ull *istart, + gomp_ull *iend) +{ + return gomp_loop_ull_doacross_guided_start (ncounts, counts, chunk_size, + istart, iend); +} + +bool GOMP_loop_ull_static_next (gomp_ull *istart, gomp_ull *iend) { return gomp_loop_ull_static_next (istart, iend); --- libgomp/ordered.c.jj 2015-09-24 20:20:32.000000000 +0200 +++ libgomp/ordered.c 2015-10-02 13:21:16.675194039 +0200 @@ -317,7 
+317,6 @@ gomp_doacross_init (unsigned ncounts, lo doacross->elt_sz = elt_sz; doacross->ncounts = ncounts; doacross->flattened = false; - doacross->boundary = 0; doacross->array = (unsigned char *) ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) & ~(uintptr_t) 63); @@ -479,3 +478,296 @@ GOMP_doacross_wait (long first, ...) while (1); __sync_synchronize (); } + +typedef unsigned long long gomp_ull; + +void +gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + struct gomp_work_share *ws = thr->ts.work_share; + unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0; + unsigned long ent, num_ents, elt_sz, shift_sz; + struct gomp_doacross_work_share *doacross; + + if (team == NULL || team->nthreads == 1) + return; + + for (i = 0; i < ncounts; i++) + { + /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */ + if (counts[i] == 0) + return; + + if (num_bits <= MAX_COLLAPSED_BITS) + { + unsigned int this_bits; + if (counts[i] == 1) + this_bits = 1; + else + this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__ + - __builtin_clzll (counts[i] - 1); + if (num_bits + this_bits <= MAX_COLLAPSED_BITS) + { + bits[i] = this_bits; + num_bits += this_bits; + } + else + num_bits = MAX_COLLAPSED_BITS + 1; + } + } + + if (ws->sched == GFS_STATIC) + num_ents = team->nthreads; + else + num_ents = (counts[0] - 1) / chunk_size + 1; + if (num_bits <= MAX_COLLAPSED_BITS) + { + elt_sz = sizeof (unsigned long); + shift_sz = ncounts * sizeof (unsigned int); + } + else + { + if (sizeof (gomp_ull) == sizeof (unsigned long)) + elt_sz = sizeof (gomp_ull) * ncounts; + else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long)) + elt_sz = sizeof (unsigned long) * 2 * ncounts; + else + abort (); + shift_sz = 0; + } + elt_sz = (elt_sz + 63) & ~63UL; + + doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz + + shift_sz); + doacross->chunk_size_ull = chunk_size; + 
doacross->elt_sz = elt_sz; + doacross->ncounts = ncounts; + doacross->flattened = false; + doacross->boundary = 0; + doacross->array = (unsigned char *) + ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) + & ~(uintptr_t) 63); + if (num_bits <= MAX_COLLAPSED_BITS) + { + unsigned int shift_count = 0; + doacross->flattened = true; + for (i = ncounts; i > 0; i--) + { + doacross->shift_counts[i - 1] = shift_count; + shift_count += bits[i - 1]; + } + for (ent = 0; ent < num_ents; ent++) + *(unsigned long *) (doacross->array + ent * elt_sz) = 0; + } + else + for (ent = 0; ent < num_ents; ent++) + memset (doacross->array + ent * elt_sz, '\0', + sizeof (unsigned long) * ncounts); + if (ws->sched == GFS_STATIC && chunk_size == 0) + { + gomp_ull q = counts[0] / num_ents; + gomp_ull t = counts[0] % num_ents; + doacross->boundary_ull = t * (q + 1); + doacross->q_ull = q; + doacross->t = t; + } + ws->doacross = doacross; +} + +/* DOACROSS POST operation. */ + +void +GOMP_doacross_ull_post (gomp_ull *counts) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_work_share *ws = thr->ts.work_share; + struct gomp_doacross_work_share *doacross = ws->doacross; + unsigned long ent; + unsigned int i; + + if (__builtin_expect (doacross == NULL, 0)) + { + __sync_synchronize (); + return; + } + + if (__builtin_expect (ws->sched == GFS_STATIC, 1)) + ent = thr->ts.team_id; + else + ent = counts[0] / doacross->chunk_size_ull; + + if (__builtin_expect (doacross->flattened, 1)) + { + unsigned long *array = (unsigned long *) (doacross->array + + ent * doacross->elt_sz); + gomp_ull flattened + = counts[0] << doacross->shift_counts[0]; + + for (i = 1; i < doacross->ncounts; i++) + flattened |= counts[i] << doacross->shift_counts[i]; + flattened++; + if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE)) + __atomic_thread_fence (MEMMODEL_RELEASE); + else + __atomic_store_n (array, flattened, MEMMODEL_RELEASE); + return; + } + + __atomic_thread_fence (MEMMODEL_ACQUIRE); + if (sizeof 
(gomp_ull) == sizeof (unsigned long)) + { + gomp_ull *array = (gomp_ull *) (doacross->array + + ent * doacross->elt_sz); + + for (i = doacross->ncounts; i-- > 0; ) + { + if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED)) + __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE); + } + } + else + { + unsigned long *array = (unsigned long *) (doacross->array + + ent * doacross->elt_sz); + + for (i = doacross->ncounts; i-- > 0; ) + { + gomp_ull cull = counts[i] + 1UL; + unsigned long c = (unsigned long) cull; + if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED)) + __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE); + c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2); + if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED)) + __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE); + } + } +} + +/* DOACROSS WAIT operation. */ + +void +GOMP_doacross_ull_wait (gomp_ull first, ...) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_work_share *ws = thr->ts.work_share; + struct gomp_doacross_work_share *doacross = ws->doacross; + va_list ap; + unsigned long ent; + unsigned int i; + + if (__builtin_expect (doacross == NULL, 0)) + { + __sync_synchronize (); + return; + } + + if (__builtin_expect (ws->sched == GFS_STATIC, 1)) + { + if (ws->chunk_size_ull == 0) + { + if (first < doacross->boundary_ull) + ent = first / (doacross->q_ull + 1); + else + ent = (first - doacross->boundary_ull) / doacross->q_ull + + doacross->t; + } + else + ent = first / ws->chunk_size_ull % thr->ts.team->nthreads; + } + else + ent = first / doacross->chunk_size_ull; + + if (__builtin_expect (doacross->flattened, 1)) + { + unsigned long *array = (unsigned long *) (doacross->array + + ent * doacross->elt_sz); + gomp_ull flattened = first << doacross->shift_counts[0]; + unsigned long cur; + + va_start (ap, first); + for (i = 1; i < doacross->ncounts; i++) + flattened |= va_arg (ap, gomp_ull) + << doacross->shift_counts[i]; + cur = 
__atomic_load_n (array, MEMMODEL_ACQUIRE); + if (flattened < cur) + { + __atomic_thread_fence (MEMMODEL_RELEASE); + va_end (ap); + return; + } + doacross_spin (array, flattened, cur); + __atomic_thread_fence (MEMMODEL_RELEASE); + va_end (ap); + return; + } + + if (sizeof (gomp_ull) == sizeof (unsigned long)) + { + gomp_ull *array = (gomp_ull *) (doacross->array + + ent * doacross->elt_sz); + do + { + va_start (ap, first); + for (i = 0; i < doacross->ncounts; i++) + { + gomp_ull thisv + = (i ? va_arg (ap, gomp_ull) : first) + 1; + gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED); + if (thisv < cur) + { + i = doacross->ncounts; + break; + } + if (thisv > cur) + break; + } + va_end (ap); + if (i == doacross->ncounts) + break; + cpu_relax (); + } + while (1); + } + else + { + unsigned long *array = (unsigned long *) (doacross->array + + ent * doacross->elt_sz); + do + { + va_start (ap, first); + for (i = 0; i < doacross->ncounts; i++) + { + gomp_ull thisv + = (i ? va_arg (ap, gomp_ull) : first) + 1; + unsigned long t + = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2); + unsigned long cur + = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED); + if (t < cur) + { + i = doacross->ncounts; + break; + } + if (t > cur) + break; + t = thisv; + cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED); + if (t < cur) + { + i = doacross->ncounts; + break; + } + if (t > cur) + break; + } + va_end (ap); + if (i == doacross->ncounts) + break; + cpu_relax (); + } + while (1); + } + __sync_synchronize (); +} --- libgomp/libgomp.map.jj 2015-09-18 18:12:29.000000000 +0200 +++ libgomp/libgomp.map 2015-10-01 17:03:55.649130579 +0200 @@ -280,6 +280,12 @@ GOMP_4.1 { GOMP_loop_doacross_static_start; GOMP_doacross_post; GOMP_doacross_wait; + GOMP_loop_ull_doacross_dynamic_start; + GOMP_loop_ull_doacross_guided_start; + GOMP_loop_ull_doacross_runtime_start; + GOMP_loop_ull_doacross_static_start; + GOMP_doacross_ull_post; + GOMP_doacross_ull_wait; } GOMP_4.0.1; OACC_2.0 { --- 
libgomp/libgomp_g.h.jj 2015-09-24 13:33:32.000000000 +0200 +++ libgomp/libgomp_g.h 2015-10-01 14:15:41.195121635 +0200 @@ -173,12 +173,34 @@ extern bool GOMP_loop_ull_ordered_guided extern bool GOMP_loop_ull_ordered_runtime_next (unsigned long long *, unsigned long long *); +extern bool GOMP_loop_ull_doacross_static_start (unsigned, + unsigned long long *, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_doacross_dynamic_start (unsigned, + unsigned long long *, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_doacross_guided_start (unsigned, + unsigned long long *, + unsigned long long, + unsigned long long *, + unsigned long long *); +extern bool GOMP_loop_ull_doacross_runtime_start (unsigned, + unsigned long long *, + unsigned long long *, + unsigned long long *); + /* ordered.c */ extern void GOMP_ordered_start (void); extern void GOMP_ordered_end (void); extern void GOMP_doacross_post (long *); extern void GOMP_doacross_wait (long, ...); +extern void GOMP_doacross_ull_post (unsigned long long *); +extern void GOMP_doacross_ull_wait (unsigned long long, ...); /* parallel.c */ --- libgomp/libgomp.h.jj 2015-09-23 12:25:51.000000000 +0200 +++ libgomp/libgomp.h 2015-10-01 16:24:59.005076396 +0200 @@ -84,10 +84,14 @@ struct gomp_doacross_work_share /* chunk_size copy, as ws->chunk_size is multiplied by incr for GFS_DYNAMIC. */ long chunk_size; + /* Likewise, but for ull implementation. */ + unsigned long long chunk_size_ull; /* For schedule(static,0) this is the number of iterations assigned to the last thread, i.e. number of iterations / number of threads. */ long q; + /* Likewise, but for ull implementation. */ + unsigned long long q_ull; }; /* Size of each array entry (padded to cache line size). */ unsigned long elt_sz; @@ -102,8 +106,12 @@ struct gomp_doacross_work_share /* These two are only used for schedule(static,0). 
*/ /* This one is number of iterations % number of threads. */ long t; - /* And this one is cached t * (q + 1). */ - long boundary; + union { + /* And this one is cached t * (q + 1). */ + long boundary; + /* Likewise, but for the ull implementation. */ + unsigned long long boundary_ull; + }; /* Array of shift counts for each dimension if they can be flattened. */ unsigned int shift_counts[]; }; @@ -683,6 +691,8 @@ extern void gomp_ordered_static_init (vo extern void gomp_ordered_static_next (void); extern void gomp_ordered_sync (void); extern void gomp_doacross_init (unsigned, long *, long); +extern void gomp_doacross_ull_init (unsigned, unsigned long long *, + unsigned long long); /* parallel.c */ --- libgomp/testsuite/libgomp.c/doacross-2.c.jj 2015-10-02 09:36:19.575951751 +0200 +++ libgomp/testsuite/libgomp.c/doacross-2.c 2015-10-02 10:14:47.098868611 +0200 @@ -0,0 +1,225 @@ +extern void abort (void); + +#define N 256 +int a[N], b[N / 16][8][4], c[N / 32][8][8], g[N / 16][8][6]; +volatile int d, e; +volatile unsigned long long f; + +int +main () +{ + unsigned long long i; + int j, k, l, m; + #pragma omp parallel private (l) + { + #pragma omp for schedule(static, 1) ordered (1) nowait + for (i = 1; i < N + f; i++) + { + #pragma omp atomic write + a[i] = 1; + #pragma omp ordered depend(sink: i - 1) + if (i > 1) + { + #pragma omp atomic read + l = a[i - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + a[i] = 2; + if (i < N - 1) + { + #pragma omp atomic read + l = a[i + 1]; + if (l == 3) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + a[i] = 3; + } + #pragma omp for schedule(static, 0) ordered (3) nowait + for (i = 3; i < N / 16 - 1 + f; i++) + for (j = 0; j < 8; j += 2) + for (k = 1; k <= 3; k++) + { + #pragma omp atomic write + b[i][j][k] = 1; + #pragma omp ordered depend(sink: i, j - 2, k - 1) \ + depend(sink: i - 2, j - 2, k + 1) + #pragma omp ordered depend(sink: i - 3, j + 2, k - 2) + if (j >= 2 && k > 1) + { + 
#pragma omp atomic read + l = b[i][j - 2][k - 1]; + if (l < 2) + abort (); + } + #pragma omp atomic write + b[i][j][k] = 2; + if (i >= 5 && j >= 2 && k < 3) + { + #pragma omp atomic read + l = b[i - 2][j - 2][k + 1]; + if (l < 2) + abort (); + } + if (i >= 6 && j < N / 16 - 3 && k == 3) + { + #pragma omp atomic read + l = b[i - 3][j + 2][k - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + b[i][j][k] = 3; + } +#define A(n) int n; +#define B(n) A(n##0) A(n##1) A(n##2) A(n##3) +#define C(n) B(n##0) B(n##1) B(n##2) B(n##3) +#define D(n) C(n##0) C(n##1) C(n##2) C(n##3) + D(m) +#undef A + #pragma omp for collapse (2) ordered(61) schedule(dynamic, 15) + for (i = 2; i < N / 32 + f; i++) + for (j = 7; j > 1; j--) + for (k = 6; k >= 0; k -= 2) +#define A(n) for (n = 4; n < 5; n++) + D(m) +#undef A + { + #pragma omp atomic write + c[i][j][k] = 1; +#define A(n) ,n +#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321) + #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \ + depend (sink:i - 2, j + 1, k - 4 E(m)) \ + depend(sink: i - 1, j - 2, k - 2 E(m)) + if (k <= 4) + { + l = c[i][j][k + 2]; + if (l < 2) + abort (); + } + #pragma omp atomic write + c[i][j][k] = 2; + if (i >= 4 && j < 7 && k >= 4) + { + l = c[i - 2][j + 1][k - 4]; + if (l < 2) + abort (); + } + if (i >= 3 && j >= 4 && k >= 2) + { + l = c[i - 1][j - 2][k - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend (source) + #pragma omp atomic write + c[i][j][k] = 3; + } + #pragma omp for schedule(static, 0) ordered (3) nowait + for (j = 0; j < N / 16 - 1; j++) + for (k = 0; k < 8; k += 2) + for (i = 3; i <= 5 + f; i++) + { + #pragma omp atomic write + g[j][k][i] = 1; + #pragma omp ordered depend(sink: j, k - 2, i - 1) \ + depend(sink: j - 2, k - 2, i + 1) + #pragma omp ordered depend(sink: j - 3, k + 2, i - 2) + if (k >= 2 && i > 3) + { + #pragma omp atomic read + l = g[j][k - 2][i - 1]; + if (l < 2) + abort (); + } + #pragma omp 
atomic write + g[j][k][i] = 2; + if (j >= 2 && k >= 2 && i < 5) + { + #pragma omp atomic read + l = g[j - 2][k - 2][i + 1]; + if (l < 2) + abort (); + } + if (j >= 3 && k < N / 16 - 3 && i == 5) + { + #pragma omp atomic read + l = g[j - 3][k + 2][i - 2]; + if (l < 2) + abort (); + } + #pragma omp ordered depend(source) + #pragma omp atomic write + g[j][k][i] = 3; + } + #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k) + for (i = 2; i < f + 3; i++) + for (j = d + 1; j >= 0; j--) + for (k = 0; k < d; k++) + for (l = 0; l < d + 2; l++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l) + if (!e) + abort (); + } + #pragma omp single + { + if (i != 3 || j != -1 || k != 0) + abort (); + i = 8; j = 9; k = 10; + } + #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m) + for (i = 2; i < f + 3; i++) + for (j = d + 1; j >= 0; j--) + for (k = 0; k < d + 2; k++) + for (m = 0; m < d; m++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m) + abort (); + } + #pragma omp single + if (i != 3 || j != -1 || k != 2 || m != 0) + abort (); + #pragma omp for collapse(2) ordered(4) nowait + for (i = 2; i < f + 3; i++) + for (j = d; j > 0; j--) + for (k = 0; k < d + 2; k++) + for (l = 0; l < d + 4; l++) + { + #pragma omp ordered depend (source) + #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l) + if (!e) + abort (); + } + #pragma omp for nowait + for (i = 0; i < N; i++) + if (a[i] != 3 * (i >= 1)) + abort (); + #pragma omp for collapse(2) private(k) nowait + for (i = 0; i < N / 16; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 4; k++) + if (b[i][j][k] != 3 * (i >= 3 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1)) + abort (); + #pragma omp for collapse(3) nowait + for (i = 0; i < N / 32; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 8; k++) + if (c[i][j][k] != 3 * (i >= 2 && j >= 2 && (k & 1) == 0)) + abort (); + #pragma omp for collapse(2) private(k) nowait + 
for (i = 0; i < N / 16; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 6; k++) + if (g[i][j][k] != 3 * (i < N / 16 - 1 && (j & 1) == 0 && k >= 3)) + abort (); + } + return 0; +} Jakub