I've committed this to the gomp4 branch. It expands the acc_on_device builtin
earlier, in the new oacc_xform pass. This will allow more optimization earlier on.
The existing expansion point now only needs to deal with the host-side case.
nathan
2015-08-02 Nathan Sidwell <nat...@codesourcery.com>
gcc/
* omp-low.c (oacc_xform_on_device): New function.
(execute_oacc_transform): Use get_oacc_fn_attrib. Call
oacc_xform_on_device.
* builtins.c (expand_builtin_acc_on_device): Only expect to be
expanded on host compiler.
libgcc/
* config/nvptx/gomp-acc_on_device.c: Include gomp-constants.h.
(acc_on_device): Code directly here.
libgomp/
* openacc.h (acc_on_device): Take int and explain why.
* oacc-init.c (acc_on_device): Likewise.
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c (revision 226462)
+++ gcc/omp-low.c (working copy)
@@ -14510,29 +14510,65 @@ make_pass_late_lower_omp (gcc::context *
return new pass_late_lower_omp (ctxt);
}
+/* Transform an acc_on_device call. The std requires this folded at
+ compile time for constant operands. We always fold it. In an
+ offloaded function we're never 'none'. We cannot detect
+ host_nonshm here, as that's a dynamic feature of the runtime.
+ However, users shouldn't be using host_nonshm anyway, only the
+ test harness. */
+
+static void
+oacc_xform_on_device (gimple_stmt_iterator *gsi, gimple stmt)
+{
+ tree arg = gimple_call_arg (stmt, 0);
+ unsigned val = GOMP_DEVICE_HOST;
+
+#ifdef ACCEL_COMPILER
+ val = GOMP_DEVICE_NOT_HOST;
+#endif
+ tree result = build2 (EQ_EXPR, boolean_type_node, arg,
+ build_int_cst (integer_type_node, val));
+#ifdef ACCEL_COMPILER
+ {
+ tree dev = build2 (EQ_EXPR, boolean_type_node, arg,
+ build_int_cst (integer_type_node,
+ ACCEL_COMPILER_acc_device));
+ result = build2 (TRUTH_OR_EXPR, boolean_type_node, result, dev);
+ }
+#endif
+ result = fold_convert (integer_type_node, result);
+ tree lhs = gimple_call_lhs (stmt);
+ gimple_seq replace = NULL;
+
+ push_gimplify_context (true);
+ gimplify_assign (lhs, result, &replace);
+ pop_gimplify_context (NULL);
+ gsi_replace_with_seq (gsi, replace, false);
+}
+
/* Main entry point for oacc transformations which run on the device
- compiler. */
+ compiler after LTO, so we know what the target device is at this
+ point (including the host fallback). */
static unsigned int
execute_oacc_transform ()
{
basic_block bb;
- gimple_stmt_iterator gsi;
- gimple stmt;
- if (!lookup_attribute ("oacc function",
- DECL_ATTRIBUTES (current_function_decl)))
+ if (!get_oacc_fn_attrib (current_function_decl))
return 0;
-
FOR_ALL_BB_FN (bb, cfun)
{
- gsi = gsi_start_bb (bb);
-
- while (!gsi_end_p (gsi))
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+ !gsi_end_p (gsi); gsi_next (&gsi))
{
- stmt = gsi_stmt (gsi);
- gsi_next (&gsi);
+ gimple stmt = gsi_stmt (gsi);
+
+ /* acc_on_device must be evaluated at compile time for
+ constant arguments. */
+ if (gimple_call_builtin_p (stmt, BUILT_IN_ACC_ON_DEVICE))
+ oacc_xform_on_device (&gsi, stmt);
}
}
Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c (revision 226462)
+++ gcc/builtins.c (working copy)
@@ -5880,43 +5880,39 @@ expand_stack_save (void)
}
-/* Expand OpenACC acc_on_device.
-
- This has to happen late (that is, not in early folding; expand_builtin_*,
- rather than fold_builtin_*), as we have to act differently for host and
- acceleration device (ACCEL_COMPILER conditional). */
+/* Expand OpenACC acc_on_device. This is expanded in the openacc
+ transform pass, but if the user has this outside of an offloaded
+ region, we'll find it here. In that case we must be host or none. */
static rtx
expand_builtin_acc_on_device (tree exp, rtx target)
{
+#ifdef ACCEL_COMPILER
+ gcc_unreachable ();
+#else
+ gcc_assert (!get_oacc_fn_attrib (current_function_decl));
+
if (!validate_arglist (exp, INTEGER_TYPE, VOID_TYPE))
return NULL_RTX;
tree arg = CALL_EXPR_ARG (exp, 0);
-
- /* Return (arg == v1 || arg == v2) ? 1 : 0. */
- machine_mode v_mode = TYPE_MODE (TREE_TYPE (arg));
- rtx v = expand_normal (arg), v1, v2;
-#ifdef ACCEL_COMPILER
- v1 = GEN_INT (GOMP_DEVICE_NOT_HOST);
- v2 = GEN_INT (ACCEL_COMPILER_acc_device);
-#else
- v1 = GEN_INT (GOMP_DEVICE_NONE);
- v2 = GEN_INT (GOMP_DEVICE_HOST);
-#endif
+ rtx val = expand_normal (arg);
machine_mode target_mode = TYPE_MODE (integer_type_node);
if (!target || !register_operand (target, target_mode))
target = gen_reg_rtx (target_mode);
emit_move_insn (target, const1_rtx);
rtx_code_label *done_label = gen_label_rtx ();
- do_compare_rtx_and_jump (v, v1, EQ, false, v_mode, NULL_RTX,
+ do_compare_rtx_and_jump (val, GEN_INT (GOMP_DEVICE_HOST), EQ,
+ false, GET_MODE (val), NULL_RTX,
NULL, done_label, PROB_EVEN);
- do_compare_rtx_and_jump (v, v2, EQ, false, v_mode, NULL_RTX,
+ do_compare_rtx_and_jump (val, GEN_INT (GOMP_DEVICE_NONE), EQ,
+ false, GET_MODE (val), NULL_RTX,
NULL, done_label, PROB_EVEN);
emit_move_insn (target, const0_rtx);
emit_label (done_label);
return target;
+#endif
}
/* Expand a thread-id/thread-count builtin for OpenACC. */
Index: libgcc/config/nvptx/gomp-acc_on_device.c
===================================================================
--- libgcc/config/nvptx/gomp-acc_on_device.c (revision 226462)
+++ libgcc/config/nvptx/gomp-acc_on_device.c (working copy)
@@ -1,6 +1,14 @@
-int acc_on_device(int d)
+#include "gomp-constants.h"
+
+/* For when the builtin is explicitly disabled. */
+int acc_on_device (int d)
{
- return __builtin_acc_on_device(d);
+ /* We can't use the builtin itself here, because that only expands
+ to device-like things inside offloaded compute regions, which
+ this isn't. Even though it'll be executed on the device --
+ unless someone builds a host-side PTX compiler, which would be
+ very strange. */
+ return d == GOMP_DEVICE_NOT_HOST || d == GOMP_DEVICE_NVIDIA_PTX;
}
int acc_on_device_h_(int *d)
Index: libgomp/openacc.h
===================================================================
--- libgomp/openacc.h (revision 226462)
+++ libgomp/openacc.h (working copy)
@@ -78,7 +78,11 @@ void acc_wait_all (void) __GOACC_NOTHROW
void acc_wait_all_async (int) __GOACC_NOTHROW;
void acc_init (acc_device_t) __GOACC_NOTHROW;
void acc_shutdown (acc_device_t) __GOACC_NOTHROW;
-int acc_on_device (acc_device_t) __GOACC_NOTHROW;
+/* Library function declaration. Although it should take an
+ acc_device_t argument, that causes problems with matching the
+ builtin, which takes an int (to avoid declaring the enumeration
+ inside the compiler). */
+int acc_on_device (int) __GOACC_NOTHROW;
void *acc_malloc (size_t) __GOACC_NOTHROW;
void acc_free (void *) __GOACC_NOTHROW;
/* Some of these would be more correct with const qualifiers, but
Index: libgomp/oacc-init.c
===================================================================
--- libgomp/oacc-init.c (revision 226462)
+++ libgomp/oacc-init.c (working copy)
@@ -632,10 +632,14 @@ acc_set_device_num (int ord, acc_device_
ialias (acc_set_device_num)
+/* The compiler always attempts to expand acc_on_device, but if the
+ user disables the builtin, or calls it via a pointer, we have this
+ version. */
+
int
-acc_on_device (acc_device_t dev)
+acc_on_device (int dev)
{
- /* Just rely on the compiler builtin. */
+ /* It is safe to use the compiler builtin, as we're the host. */
return __builtin_acc_on_device (dev);
}