Hi Nathan!

On Mon, 17 Aug 2015 15:30:16 -0400, Nathan Sidwell <[email protected]> wrote:
> I've committed this patch to add a new pair of internal functions. These will
> be used in implementing reductions.
>
> They'll be emitted around reduction finalization, and implement the locking
> required for the general case of combining reduction values. They may be
> transformed in the oacc_xform pass, and the default behaviour is to delete them,
> if there is no RTL expander. For PTX we delete them if they are at the vector
> level.
>
> This avoids needing machine-specific builtins to expand to, and thus should
> result in less backend code duplication.

With the __builtin_nvptx_lock and __builtin_nvptx_unlock builtins
removed, should the gcc.target/nvptx/spinlock-1.c and
gcc.target/nvptx/spinlock-2.c test cases then be removed, too, or should
these be re-written differently?
For reference:
$ grep ^ gcc/testsuite/gcc.target/nvptx/spinlock-*.c
gcc/testsuite/gcc.target/nvptx/spinlock-1.c:/* { dg-do compile } */
gcc/testsuite/gcc.target/nvptx/spinlock-1.c:void Foo ()
gcc/testsuite/gcc.target/nvptx/spinlock-1.c:{
gcc/testsuite/gcc.target/nvptx/spinlock-1.c: __builtin_nvptx_lock (0);
gcc/testsuite/gcc.target/nvptx/spinlock-1.c: __builtin_nvptx_unlock (0);
gcc/testsuite/gcc.target/nvptx/spinlock-1.c:}
gcc/testsuite/gcc.target/nvptx/spinlock-1.c:
gcc/testsuite/gcc.target/nvptx/spinlock-1.c:
gcc/testsuite/gcc.target/nvptx/spinlock-1.c:/* { dg-final { scan-assembler-times ".atom.global.cas.b32" 2 } } */
gcc/testsuite/gcc.target/nvptx/spinlock-1.c:/* { dg-final { scan-assembler ".global .u32 __global_lock;" } } */
gcc/testsuite/gcc.target/nvptx/spinlock-1.c:/* { dg-final { scan-assembler-not ".shared .u32 __shared_lock;" } } */
gcc/testsuite/gcc.target/nvptx/spinlock-2.c:/* { dg-do compile } */
gcc/testsuite/gcc.target/nvptx/spinlock-2.c:void Foo ()
gcc/testsuite/gcc.target/nvptx/spinlock-2.c:{
gcc/testsuite/gcc.target/nvptx/spinlock-2.c: __builtin_nvptx_lock (1);
gcc/testsuite/gcc.target/nvptx/spinlock-2.c: __builtin_nvptx_unlock (1);
gcc/testsuite/gcc.target/nvptx/spinlock-2.c:}
gcc/testsuite/gcc.target/nvptx/spinlock-2.c:
gcc/testsuite/gcc.target/nvptx/spinlock-2.c:/* { dg-final { scan-assembler-times ".atom.shared.cas.b32" 2 } } */
gcc/testsuite/gcc.target/nvptx/spinlock-2.c:/* { dg-final { scan-assembler ".shared .u32 __shared_lock;" } } */
gcc/testsuite/gcc.target/nvptx/spinlock-2.c:/* { dg-final { scan-assembler-not ".global .u32 __global_lock;" } } */
> 2015-08-17 Nathan Sidwell <[email protected]>
>
> * target.def (lock_unlock): New GOACC hook.
> * targhooks.h (default_goacc_lock_unlock): Declare.
> * doc/tm.texi.in (TARGET_GOACC_LOCK_UNLOCK): Add.
> * doc/tm.texi: Rebuilt.
> * internal-fn.def (GOACC_LOCK, GOACC_UNLOCK): New.
> * internal-fn.c (expand_GOACC_LOCK, expand_GOACC_UNLOCK): New.
> * omp-low.c (execute_oacc_transform): Add lock/unlock handling.
> (default_goacc_lock_unlock): New.
> * config/nvptx/nvptx-protos.h (nvptx_expand_oacc_lock_unlock): Declare.
> * config/nvptx/nvptx.md (UNSPECV_UNLOCK): Delete.
> (oacc_lock, oacc_unlock): New expanders.
> (nvptx_spinlock, nvptx_spinunlock): Use UNSPECV_LOCK.
> * config/nvptx/nvptx.c (nvptx_expand_oacc_lock_unlock): New.
> (nvptx_expand_lock_unlock): Delete.
> (nvptx_expand_lock, nvptx_expand_unlock): Delete.
> (nvptx_expand_work_red_addr): Fixup address generation.
> (enum nvptx_types): Delete NT_VOID_UINT.
> (builtins): Delete nvptx_lock and nvptx_unlock.
> (nvptx_init_builtins): Adjust.
> (nvptx_xform_lock_unlock): New.
> (TARGET_GOACC_LOCK_UNLOCK): Override.
>
> Index: gcc/config/nvptx/nvptx-protos.h
> ===================================================================
> --- gcc/config/nvptx/nvptx-protos.h (revision 226951)
> +++ gcc/config/nvptx/nvptx-protos.h (working copy)
> @@ -34,6 +34,7 @@ extern const char *nvptx_section_for_dec
> #ifdef RTX_CODE
> extern void nvptx_expand_oacc_fork (rtx);
> extern void nvptx_expand_oacc_join (rtx);
> +extern void nvptx_expand_oacc_lock_unlock (rtx, bool);
> extern void nvptx_expand_call (rtx, rtx);
> extern rtx nvptx_expand_compare (rtx);
> extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
> Index: gcc/config/nvptx/nvptx.md
> ===================================================================
> --- gcc/config/nvptx/nvptx.md (revision 226951)
> +++ gcc/config/nvptx/nvptx.md (working copy)
> @@ -61,7 +61,6 @@
>
> (define_c_enum "unspecv" [
> UNSPECV_LOCK
> - UNSPECV_UNLOCK
> UNSPECV_CAS
> UNSPECV_XCHG
> UNSPECV_BARSYNC
> @@ -1366,6 +1365,26 @@
> return asms[INTVAL (operands[1])];
> })
>
> +(define_expand "oacc_lock"
> + [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")
> + (match_operand:SI 1 "const_int_operand" "")]
> + UNSPECV_LOCK)]
> + ""
> +{
> + nvptx_expand_oacc_lock_unlock (operands[0], true);
> + DONE;
> +})
> +
> +(define_expand "oacc_unlock"
> + [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")
> + (match_operand:SI 1 "const_int_operand" "")]
> + UNSPECV_LOCK)]
> + ""
> +{
> + nvptx_expand_oacc_lock_unlock (operands[0], false);
> + DONE;
> +})
> +
> (define_insn "nvptx_fork"
> [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
> UNSPECV_FORK)]
> @@ -1576,7 +1595,7 @@
> [(parallel
> [(unspec_volatile [(match_operand:SI 0 "memory_operand" "m")
> (match_operand:SI 1 "const_int_operand" "i")]
> - UNSPECV_UNLOCK)
> + UNSPECV_LOCK)
> (match_operand:SI 2 "register_operand" "=R")
> (match_operand:BI 3 "register_operand" "=R")
> (label_ref (match_operand 4 "" ""))])]
> @@ -1586,7 +1605,7 @@
> (define_insn "nvptx_spinunlock"
> [(unspec_volatile [(match_operand:SI 0 "memory_operand" "m")
> (match_operand:SI 1 "const_int_operand" "i")]
> - UNSPECV_UNLOCK)
> + UNSPECV_LOCK)
> (match_operand:SI 2 "register_operand" "=R")]
> ""
> "atom%R1.exch.b32 %2,%0,0;")
> Index: gcc/config/nvptx/nvptx.c
> ===================================================================
> --- gcc/config/nvptx/nvptx.c (revision 226951)
> +++ gcc/config/nvptx/nvptx.c (working copy)
> @@ -1164,6 +1164,39 @@ nvptx_expand_oacc_join (rtx mode)
> emit_insn (gen_nvptx_joining (mode));
> }
>
> +/* Expander for reduction locking and unlocking. We expect SRC to be
> + gang or worker level. */
> +
> +void
> +nvptx_expand_oacc_lock_unlock (rtx src, bool lock)
> +{
> + unsigned HOST_WIDE_INT kind;
> + rtx pat;
> +
> + kind = INTVAL (src) == GOMP_DIM_GANG ? LOCK_GLOBAL : LOCK_SHARED;
> + lock_used[kind] = true;
> +
> + rtx mem = gen_rtx_MEM (SImode, lock_syms[kind]);
> + rtx space = GEN_INT (lock_space[kind]);
> + rtx barrier = gen_nvptx_membar (GEN_INT (lock_level[kind]));
> + rtx tmp = gen_reg_rtx (SImode);
> +
> + if (!lock)
> + emit_insn (barrier);
> + if (lock)
> + {
> + rtx_code_label *label = gen_label_rtx ();
> +
> + LABEL_NUSES (label)++;
> + pat = gen_nvptx_spinlock (mem, space, tmp, gen_reg_rtx (BImode), label);
> + }
> + else
> + pat = gen_nvptx_spinunlock (mem, space, tmp);
> + emit_insn (pat);
> + if (lock)
> + emit_insn (barrier);
> +}
> +
> /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
> objects. */
>
> @@ -3306,62 +3339,6 @@ nvptx_expand_shuffle_down (tree exp, rtx
> return target;
> }
>
> -/* Expander for locking and unlocking. */
> -static rtx
> -nvptx_expand_lock_unlock (tree exp, bool lock)
> -{
> - rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
> - NULL_RTX, SImode, EXPAND_NORMAL);
> - unsigned HOST_WIDE_INT kind;
> - rtx pat;
> -
> - kind = GET_CODE (src) == CONST_INT ? INTVAL (src) : LOCK_MAX;
> - if (kind >= LOCK_MAX)
> - error ("builtin %D requires constant argument less than %u",
> - get_callee_fndecl (exp), LOCK_MAX);
> - lock_used[kind] = true;
> -
> - rtx mem = gen_rtx_MEM (SImode, lock_syms[kind]);
> - rtx space = GEN_INT (lock_space[kind]);
> - rtx barrier = gen_nvptx_membar (GEN_INT (lock_level[kind]));
> -
> - if (!lock)
> - emit_insn (barrier);
> - if (lock)
> - {
> - rtx_code_label *label = gen_label_rtx ();
> -
> - LABEL_NUSES (label)++;
> - pat = gen_nvptx_spinlock (mem, space,
> - gen_reg_rtx (SImode), gen_reg_rtx (BImode),
> - label);
> - }
> - else
> - pat = gen_nvptx_spinunlock (mem, space, gen_reg_rtx (SImode));
> - emit_insn (pat);
> - if (lock)
> - emit_insn (barrier);
> - return const0_rtx;
> -}
> -
> -/* Lock expander. */
> -
> -static rtx
> -nvptx_expand_lock (tree exp, rtx ARG_UNUSED (target),
> - machine_mode ARG_UNUSED (mode), int ARG_UNUSED (ignore))
> -{
> - return nvptx_expand_lock_unlock (exp, true);
> -}
> -
> -/* Unlock expander. */
> -
> -static rtx
> -nvptx_expand_unlock (tree exp, rtx ARG_UNUSED (target),
> - machine_mode ARG_UNUSED (mode), int ARG_UNUSED (ignore))
> -{
> - return nvptx_expand_lock_unlock (exp, false);
> -}
> -
> /* Worker reduction address expander. */
> static rtx
> nvptx_expand_work_red_addr (tree exp, rtx target,
> @@ -3413,12 +3390,16 @@ nvptx_expand_work_red_addr (tree exp, rt
> /* Return offset into worker reduction array. */
> unsigned offset = loop.vars[ix].second;
>
> - rtx addr = gen_reg_rtx (Pmode);
> - emit_move_insn (addr,
> - gen_rtx_PLUS (Pmode, worker_red_sym, GEN_INT (offset)));
> + emit_insn (gen_rtx_SET (target, worker_red_sym));
> +
> + if (offset)
> + emit_insn (gen_rtx_SET (target,
> + gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));
> +
> emit_insn (gen_rtx_SET (target,
> - gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
> + gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
> UNSPEC_FROM_SHARED)));
> +
> return target;
> }
>
> @@ -3428,7 +3409,6 @@ enum nvptx_types
> NT_ULL_ULL_INT,
> NT_FLT_FLT_INT,
> NT_DBL_DBL_INT,
> - NT_VOID_UINT,
> NT_UINTPTR_UINT_UINT,
> NT_ULLPTR_UINT_UINT,
> NT_FLTPTR_UINT_UINT,
> @@ -3446,8 +3426,6 @@ static const struct builtin_description
> nvptx_expand_shuffle_down},
> {"__builtin_nvptx_shuffle_downd", NT_DBL_DBL_INT,
> nvptx_expand_shuffle_down},
> - {"__builtin_nvptx_lock", NT_VOID_UINT, nvptx_expand_lock},
> - {"__builtin_nvptx_unlock", NT_VOID_UINT, nvptx_expand_unlock},
> {"__builtin_nvptx_work_red_addr", NT_UINTPTR_UINT_UINT,
> nvptx_expand_work_red_addr},
> {"__builtin_nvptx_work_red_addrll", NT_ULLPTR_UINT_UINT,
> @@ -3492,9 +3470,6 @@ nvptx_init_builtins (void)
> types[NT_DBL_DBL_INT]
> = build_function_type_list (double_type_node, double_type_node,
> integer_type_node, NULL_TREE);
> - types[NT_VOID_UINT]
> - = build_function_type_list (void_type_node, unsigned_type_node, NULL_TREE);
> -
> types[NT_UINTPTR_UINT_UINT]
> = build_function_type_list (build_pointer_type (unsigned_type_node),
> unsigned_type_node, unsigned_type_node,
> @@ -3628,6 +3603,20 @@ nvptx_xform_fork_join (gimple_stmt_itera
>
> return false;
> }
> +
> +/* Check lock & unlock. We only need the gang- & worker-level ones. */
> +
> +static bool
> +nvptx_xform_lock_unlock (gimple_stmt_iterator *ARG_UNUSED (gsi),
> + gimple stmt,
> + const int *ARG_UNUSED (dims),
> + bool ARG_UNUSED (is_fork))
> +{
> + tree arg = gimple_call_arg (stmt, 0);
> +
> + return TREE_INT_CST_LOW (arg) > GOMP_DIM_WORKER;
> +}
>
> #undef TARGET_OPTION_OVERRIDE
> #define TARGET_OPTION_OVERRIDE nvptx_option_override
> @@ -3732,6 +3721,9 @@ nvptx_xform_fork_join (gimple_stmt_itera
> #undef TARGET_GOACC_FORK_JOIN
> #define TARGET_GOACC_FORK_JOIN nvptx_xform_fork_join
>
> +#undef TARGET_GOACC_LOCK_UNLOCK
> +#define TARGET_GOACC_LOCK_UNLOCK nvptx_xform_lock_unlock
> +
> struct gcc_target targetm = TARGET_INITIALIZER;
>
> #include "gt-nvptx.h"
> Index: gcc/targhooks.h
> ===================================================================
> --- gcc/targhooks.h (revision 226951)
> +++ gcc/targhooks.h (working copy)
> @@ -111,6 +111,8 @@ extern bool default_goacc_validate_dims
> extern unsigned default_goacc_dim_limit (unsigned);
> extern bool default_goacc_fork_join (gimple_stmt_iterator *, gimple,
> const int [], bool);
> +extern bool default_goacc_lock_unlock (gimple_stmt_iterator *, gimple,
> + const int [], bool);
>
> /* These are here, and not in hooks.[ch], because not all users of
> hooks.h include tm.h, and thus we don't have CUMULATIVE_ARGS. */
> Index: gcc/target.def
> ===================================================================
> --- gcc/target.def (revision 226951)
> +++ gcc/target.def (working copy)
> @@ -1670,6 +1670,15 @@ default hook returns true, if there is n
> bool, (gimple_stmt_iterator *, gimple, const int[], bool),
> default_goacc_fork_join)
>
> +DEFHOOK
> +(lock_unlock,
> +"This hook should convert IFN_GOACC_LOCK and IFN_GOACC_UNLOCK function\n\
> +calls to target-specific gimple. It is executed during the oacc_xform\n\
> +pass. It should return true, if the functions should be deleted. The\n\
> +default hook returns true, if there is no RTL expanders for them.",
> +bool, (gimple_stmt_iterator *, gimple, const int[], bool),
> +default_goacc_lock_unlock)
> +
> HOOK_VECTOR_END (goacc)
>
> /* Functions relating to vectorization. */
> Index: gcc/internal-fn.def
> ===================================================================
> --- gcc/internal-fn.def (revision 226951)
> +++ gcc/internal-fn.def (working copy)
> @@ -83,3 +83,9 @@ DEF_INTERNAL_FN (GOACC_JOIN, ECF_NOTHROW
> single INTEGER_CST argument. */
> DEF_INTERNAL_FN (GOACC_DIM_SIZE, ECF_CONST | ECF_NOTHROW | ECF_LEAF, ".")
> DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".")
> +
> +/* LOCK and UNLOCK operate a mutex used for reductions. The first
> + argument is the compute dimension of the reduction and the second
> + argument is a loop identifer. */
> +DEF_INTERNAL_FN (GOACC_LOCK, ECF_NOTHROW | ECF_LEAF, "..")
> +DEF_INTERNAL_FN (GOACC_UNLOCK, ECF_NOTHROW | ECF_LEAF, "..")
> Index: gcc/omp-low.c
> ===================================================================
> --- gcc/omp-low.c (revision 226951)
> +++ gcc/omp-low.c (working copy)
> @@ -14743,19 +14743,24 @@ execute_oacc_transform ()
> {
> default: break;
>
> + case IFN_GOACC_DIM_POS:
> case IFN_GOACC_DIM_SIZE:
> - oacc_xform_dim (&gsi, stmt, dims, false);
> + oacc_xform_dim (&gsi, stmt, dims,
> + ifn_code == IFN_GOACC_DIM_POS);
> break;
>
> - case IFN_GOACC_DIM_POS:
> - oacc_xform_dim (&gsi, stmt, dims, true);
> - break;
> + case IFN_GOACC_LOCK:
> + case IFN_GOACC_UNLOCK:
> + if (targetm.goacc.lock_unlock
> + (&gsi, stmt, dims, ifn_code == IFN_GOACC_LOCK))
> + goto remove;
>
> case IFN_GOACC_FORK:
> case IFN_GOACC_JOIN:
> if (targetm.goacc.fork_join
> (&gsi, stmt, dims, ifn_code == IFN_GOACC_FORK))
> {
> + remove:
> replace_uses_by (gimple_vdef (stmt),
> gimple_vuse (stmt));
> gsi_remove (&gsi, true);
> @@ -14814,7 +14819,6 @@ default_goacc_fork_join (gimple_stmt_ite
> gimple ARG_UNUSED (stmt),
> const int *ARG_UNUSED (dims), bool is_fork)
> {
> - /* If there is no expander, we can delete the functions. */
> if (is_fork)
> {
> #ifndef HAVE_oacc_fork
> @@ -14827,6 +14831,31 @@ default_goacc_fork_join (gimple_stmt_ite
> return true;
> #endif
> }
> +
> + return false;
> +}
> +
> +/* Default lock/unlock early expander. Delete the function calls if
> + there is no RTL expander. */
> +
> +bool
> +default_goacc_lock_unlock (gimple_stmt_iterator *ARG_UNUSED (gsi),
> + gimple ARG_UNUSED (stmt),
> + const int*ARG_UNUSED (dims),
> + bool is_lock)
> +{
> + if (is_lock)
> + {
> +#ifndef HAVE_oacc_lock
> + return true;
> +#endif
> + }
> + else
> + {
> +#ifndef HAVE_oacc_unlock
> + return true;
> +#endif
> + }
>
> return false;
> }
> Index: gcc/internal-fn.c
> ===================================================================
> --- gcc/internal-fn.c (revision 226951)
> +++ gcc/internal-fn.c (working copy)
> @@ -2025,6 +2025,32 @@ expand_GOACC_DIM_POS (gcall *ARG_UNUSED
> #endif
> }
>
> +static void
> +expand_GOACC_LOCK (gcall *ARG_UNUSED (stmt))
> +{
> +#ifdef HAVE_oacc_lock
> + rtx dim = expand_normal (gimple_call_arg (stmt, 0));
> + rtx id = expand_normal (gimple_call_arg (stmt, 1));
> +
> + emit_insn (gen_oacc_lock (dim, id));
> +#else
> + gcc_unreachable ();
> +#endif
> +}
> +
> +static void
> +expand_GOACC_UNLOCK (gcall *ARG_UNUSED (stmt))
> +{
> +#ifdef HAVE_oacc_unlock
> + rtx dim = expand_normal (gimple_call_arg (stmt, 0));
> + rtx id = expand_normal (gimple_call_arg (stmt, 1));
> +
> + emit_insn (gen_oacc_unlock (dim, id));
> +#else
> + gcc_unreachable ();
> +#endif
> +}
> +
> /* Routines to expand each internal function, indexed by function number.
> Each routine has the prototype:
>
> Index: gcc/doc/tm.texi
> ===================================================================
> --- gcc/doc/tm.texi (revision 226951)
> +++ gcc/doc/tm.texi (working copy)
> @@ -5760,6 +5760,13 @@ pass. It should return true, if the fun
> default hook returns true, if there is no RTL expanders for them.
> @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_GOACC_LOCK_UNLOCK (gimple_stmt_iterator *@var{}, @var{gimple}, const @var{int[]}, @var{bool})
> +This hook should convert IFN_GOACC_LOCK and IFN_GOACC_UNLOCK function
> +calls to target-specific gimple. It is executed during the oacc_xform
> +pass. It should return true, if the functions should be deleted. The
> +default hook returns true, if there is no RTL expanders for them.
> +@end deftypefn
> +
> @node Anchored Addresses
> @section Anchored Addresses
> @cindex anchored addresses
> Index: gcc/doc/tm.texi.in
> ===================================================================
> --- gcc/doc/tm.texi.in (revision 226951)
> +++ gcc/doc/tm.texi.in (working copy)
> @@ -4251,6 +4251,8 @@ address; but often a machine-dependent
>
> @hook TARGET_GOACC_FORK_JOIN
>
> +@hook TARGET_GOACC_LOCK_UNLOCK
> +
> @node Anchored Addresses
> @section Anchored Addresses
> @cindex anchored addresses
Grüße,
Thomas
signature.asc
Description: PGP signature
