I've committed this to the gomp4 branch.
The vector neutering code already has machinery to generate DI/DF shuffles from
the underlying SI instruction. This patch generalizes that machinery and changes
the shuffle-down machinery to use it. Less code duplication - yay! I also added
a DF mode shuffle down, as that was missing.
nathan
2015-07-30 Nathan Sidwell <nat...@acm.org>
gcc/
* config/nvptx/nvptx.md (UNSPEC_BROADCAST, UNSPECV_SHFL_DOWN):
Replace with ...
(UNSPEC_SHUFFLE): ... this.
(nvptx_broadcast<mode>): Replace with ...
(nvptx_shuffle<mode>): ... this.
(thread_shuffle_down<mode>, thread_shuffle_downdi): Delete.
* config/nvptx/nvptx.c (SHUFFLE_UP, SHUFFLE_DOWN, SHUFFLE_BFLY,
SHUFFLE_IDX): New defines.
(nvptx_gen_shuffle): Break out of nvptx_gen_vcast and generalize.
(nvptx_gen_vcast): Use nvptx_gen_shuffle.
(nvptx_print_operand): Add 'S' case.
(nvptx_cannot_copy_insn_p): Adjust.
(nvptx_expand_shuffle_down): New builtin expander for shuffles.
(enum nvptx_types): Add NT_DBL_DBL_INT case.
(struct builtin_description): Use ptr to fn for expander. Remove
icode and num_args.
(builtins): Adjust.
(nvptx_init_builtins): Adjust.
(nvptx_expand_builtin): Invoke builtin-specific expander function.
Index: gcc/config/nvptx/nvptx.md
===================================================================
--- gcc/config/nvptx/nvptx.md (revision 226377)
+++ gcc/config/nvptx/nvptx.md (working copy)
@@ -55,7 +55,7 @@
UNSPEC_BIT_CONV
- UNSPEC_BROADCAST
+ UNSPEC_SHUFFLE
UNSPEC_BR_UNIFIED
])
@@ -70,8 +70,6 @@
UNSPECV_FORKED
UNSPECV_JOINING
UNSPECV_JOIN
-
- UNSPECV_SHFL_DOWN
])
(define_attr "subregs_ok" "false,true"
@@ -1410,46 +1408,15 @@
})
;; only 32-bit shuffles exist.
-(define_insn "nvptx_broadcast<mode>"
+(define_insn "nvptx_shuffle<mode>"
[(set (match_operand:BITS 0 "nvptx_register_operand" "")
(unspec:BITS
- [(match_operand:BITS 1 "nvptx_register_operand" "")]
- UNSPEC_BROADCAST))]
+ [(match_operand:BITS 1 "nvptx_register_operand" "")
+ (match_operand:SI 2 "nvptx_nonmemory_operand" "")
+ (match_operand:SI 3 "const_int_operand" "")]
+ UNSPEC_SHUFFLE))]
""
- "%.\\tshfl.idx.b32\\t%0, %1, 0, 31;")
-
-(define_insn "thread_shuffle_down<mode>"
- [(set (match_operand:BITS 0 "nvptx_register_operand" "")
- (unspec_volatile:BITS [(match_operand:SI 1 "nvptx_register_operand" "")
- (match_operand:SI 2 "nvptx_nonmemory_operand" "")]
- UNSPECV_SHFL_DOWN))]
- ""
- "%.\\tshfl.down.b32\\t%0, %1, %2, 31;")
-
-(define_expand "thread_shuffle_downdi"
- [(set (match_operand:DI 0 "nvptx_register_operand" "")
- (unspec_volatile:DI [(match_operand:DI 1 "nvptx_register_operand" "")
- (match_operand:SI 2 "nvptx_nonmemory_operand" "")]
- UNSPECV_SHFL_DOWN))]
- ""
-{
- rtx t = gen_reg_rtx (DImode);
- emit_insn (gen_lshrdi3 (t, operands[1], GEN_INT (32)));
- rtx op0 = force_reg (SImode, gen_lowpart (SImode, t));
- rtx op1 = force_reg (SImode, gen_lowpart (SImode, operands[1]));
- rtx targ0 = gen_reg_rtx (SImode);
- rtx targ1 = gen_reg_rtx (SImode);
- emit_insn (gen_thread_shuffle_downsi (targ0, op0, operands[2]));
- emit_insn (gen_thread_shuffle_downsi (targ1, op1, operands[2]));
- rtx t2 = gen_reg_rtx (DImode);
- rtx t3 = gen_reg_rtx (DImode);
- emit_insn (gen_extendsidi2 (t2, targ0));
- emit_insn (gen_extendsidi2 (t3, targ1));
- rtx t4 = gen_reg_rtx (DImode);
- emit_insn (gen_ashldi3 (t4, t2, GEN_INT (32)));
- emit_insn (gen_iordi3 (operands[0], t3, t4));
- DONE;
-})
+ "%.\\tshfl.%S3.b32\\t%0, %1, %2, 31;")
;; extract parts of a 64 bit object into 2 32-bit ints
(define_insn "unpack<mode>si2"
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c (revision 226377)
+++ gcc/config/nvptx/nvptx.c (working copy)
@@ -64,6 +64,11 @@
/* This file should be included last. */
#include "target-def.h"
+#define SHUFFLE_UP 0
+#define SHUFFLE_DOWN 1
+#define SHUFFLE_BFLY 2
+#define SHUFFLE_IDX 3
+
/* Record the function decls we've written, and the libfuncs and function
decls corresponding to them. */
static std::stringstream func_decls;
@@ -1132,17 +1137,17 @@ nvptx_gen_pack (rtx dst, rtx src0, rtx s
across the vectors of a single warp. */
static rtx
-nvptx_gen_vcast (rtx reg)
+nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
{
rtx res;
- switch (GET_MODE (reg))
+ switch (GET_MODE (dst))
{
case SImode:
- res = gen_nvptx_broadcastsi (reg, reg);
+ res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
break;
case SFmode:
- res = gen_nvptx_broadcastsf (reg, reg);
+ res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
break;
case DImode:
case DFmode:
@@ -1151,10 +1156,10 @@ nvptx_gen_vcast (rtx reg)
rtx tmp1 = gen_reg_rtx (SImode);
start_sequence ();
- emit_insn (nvptx_gen_unpack (tmp0, tmp1, reg));
- emit_insn (nvptx_gen_vcast (tmp0));
- emit_insn (nvptx_gen_vcast (tmp1));
- emit_insn (nvptx_gen_pack (reg, tmp0, tmp1));
+ emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
+ emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
+ emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
+ emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
res = get_insns ();
end_sequence ();
}
@@ -1164,21 +1169,29 @@ nvptx_gen_vcast (rtx reg)
rtx tmp = gen_reg_rtx (SImode);
start_sequence ();
- emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
- emit_insn (nvptx_gen_vcast (tmp));
- emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
+ emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
+ emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
+ emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
res = get_insns ();
end_sequence ();
}
break;
- case HImode:
- case QImode:
- default:debug_rtx (reg);gcc_unreachable ();
+ default:
+ gcc_unreachable ();
}
return res;
}
+/* Generate an instruction or sequence to broadcast register REG
+ across the vectors of a single warp. */
+
+static rtx
+nvptx_gen_vcast (rtx reg)
+{
+ return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
+}
+
/* Structure used when generating a worker-level spill or fill. */
struct wcast_data_t
@@ -1862,6 +1875,7 @@ nvptx_print_operand_address (FILE *file,
A -- print an address space identifier for a MEM
c -- print an opcode suffix for a comparison operator, including a type code
f -- print a full reg even for something that must always be split
+ S -- print a shuffle kind
t -- print a type opcode suffix, promoting QImode to 32 bits
T -- print a type size in bits
u -- print a type opcode suffix without promotions. */
@@ -1913,6 +1927,15 @@ nvptx_print_operand (FILE *file, rtx x,
fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
break;
+ case 'S':
+ {
+ unsigned kind = UINTVAL (x);
+ static const char *const kinds[] =
+ {"up", "down", "bfly", "idx"};
+ fprintf (file, "%s", kinds[kind]);
+ }
+ break;
+
case 'T':
fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
break;
@@ -2996,8 +3019,8 @@ nvptx_cannot_copy_insn_p (rtx_insn *insn
{
switch (recog_memoized (insn))
{
- case CODE_FOR_nvptx_broadcastsi:
- case CODE_FOR_nvptx_broadcastsf:
+ case CODE_FOR_nvptx_shufflesi:
+ case CODE_FOR_nvptx_shufflesf:
case CODE_FOR_nvptx_barsync:
case CODE_FOR_nvptx_fork:
case CODE_FOR_nvptx_forked:
@@ -3101,11 +3124,39 @@ nvptx_file_end (void)
}
}
+/* Expander for the shuffle down builtins. */
+static rtx
+nvptx_expand_shuffle_down (tree exp, rtx target, machine_mode mode, int ignore)
+{
+ if (ignore)
+ return target;
+
+ if (! target)
+ target = gen_reg_rtx (mode);
+
+ rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
+ NULL_RTX, mode, EXPAND_NORMAL);
+ if (!REG_P (src))
+ src = copy_to_mode_reg (mode, src);
+
+ rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
+ NULL_RTX, SImode, EXPAND_NORMAL);
+ if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
+ idx = copy_to_mode_reg (SImode, idx);
+
+ rtx pat = nvptx_gen_shuffle (target, src, idx, SHUFFLE_DOWN);
+ if (pat)
+ emit_insn (pat);
+
+ return target;
+}
+
enum nvptx_types
{
NT_UINT_UINT_INT,
NT_ULL_ULL_INT,
NT_FLT_FLT_INT,
+ NT_DBL_DBL_INT,
NT_MAX
};
@@ -3113,19 +3164,20 @@ enum nvptx_types
struct builtin_description
{
const char *name;
- enum insn_code icode;
unsigned short type;
- unsigned short num_args;
+ rtx (*expander) (tree, rtx, machine_mode, int);
};
static const struct builtin_description builtins[] =
{
- {"__builtin_nvptx_shuffle_down", CODE_FOR_thread_shuffle_downsi,
- NT_UINT_UINT_INT, 2},
- {"__builtin_nvptx_shuffle_downf", CODE_FOR_thread_shuffle_downsf,
- NT_FLT_FLT_INT, 2},
- { "__builtin_nvptx_shuffle_downll", CODE_FOR_thread_shuffle_downdi,
- NT_ULL_ULL_INT, 2},
+ {"__builtin_nvptx_shuffle_down", NT_UINT_UINT_INT,
+ nvptx_expand_shuffle_down},
+ {"__builtin_nvptx_shuffle_downll", NT_ULL_ULL_INT,
+ nvptx_expand_shuffle_down},
+ {"__builtin_nvptx_shuffle_downf", NT_FLT_FLT_INT,
+ nvptx_expand_shuffle_down},
+ {"__builtin_nvptx_shuffle_downd", NT_DBL_DBL_INT,
+ nvptx_expand_shuffle_down},
};
#define NVPTX_BUILTIN_MAX (sizeof (builtins) / sizeof (builtins[0]))
@@ -3159,6 +3211,9 @@ nvptx_init_builtins (void)
types[NT_FLT_FLT_INT]
= build_function_type_list (float_type_node, float_type_node,
integer_type_node, NULL_TREE);
+ types[NT_DBL_DBL_INT]
+ = build_function_type_list (double_type_node, double_type_node,
+ integer_type_node, NULL_TREE);
for (ix = 0; ix != NVPTX_BUILTIN_MAX; ix++)
nvptx_builtin_decls[ix]
@@ -3180,34 +3235,8 @@ nvptx_expand_builtin (tree exp, rtx targ
{
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
const struct builtin_description *d = &builtins[DECL_FUNCTION_CODE (fndecl)];
- unsigned icode = d->icode;
- rtx operands[2]; /* maxium operands */
- unsigned ix;
- machine_mode tmode = insn_data[icode].operand[0].mode;
-
- if (ignore)
- return target;
-
- if (! target
- || mode != tmode
- || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
- target = gen_reg_rtx (tmode);
-
- for (ix = d->num_args; ix--;)
- {
- machine_mode m = insn_data[icode].operand[ix + 1].mode;
- rtx op = expand_expr (CALL_EXPR_ARG (exp, ix),
- NULL_RTX, VOIDmode, EXPAND_NORMAL);
- if (! (*insn_data[icode].operand[ix + 1].predicate) (op, m))
- op = copy_to_mode_reg (m, op);
- operands[ix] = op;
- }
- rtx pat = GEN_FCN (icode) (target, operands[0], operands[1]);
- if (pat)
- emit_insn (pat);
-
- return target;
+ return d->expander (exp, target, mode, ignore);
}
#undef TARGET_OPTION_OVERRIDE