This patch uses IFN_COND_* to vectorise conditionally-executed,
potentially-trapping arithmetic, such as most floating-point
ops with -ftrapping-math. E.g.:
  if (cond) { ... x = a + b; ... }

becomes:

  ...
  x = IFN_COND_ADD (cond, a, b);
  ...
When this transformation is done on its own, the value of x for
!cond isn't important.
However, the patch also looks for the equivalent of:
  y = cond ? x : a;
in which the "then" value is the result of the conditionally-executed
operation and the "else" value is the first operand of that operation.
This "else" value is the one guaranteed by IFN_COND_* and so we can
replace y with x.
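To make that concrete (a rough sketch rather than code taken verbatim
from the patch), a conditional update such as:

  tmp = a[i];
  if (cond[i])
    tmp += b[i];
  x[i] = tmp;

is if-converted to something like:

  tmp = a[i];
  sum = IFN_COND_ADD (mask, tmp, b[i]);  /* mask ? tmp + b[i] : tmp */
  res = mask ? sum : tmp;
  x[i] = res;

Here the COND_EXPR that replaced the PHI has "else" value tmp, which is
also the first operand of the IFN_COND_ADD, so res can simply be
replaced by sum.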
The patch also adds new conditional functions for multiplication
and division, which previously weren't needed. This enables an
extra fully-masked reduction (of dubious value) in gcc.dg/vect/pr53773.c.
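For example, the new tests below include loops of the form:

  x[i] = pred[i] != 1 ? y[i] / z[i] : y[i];

which can now be if-converted to IFN_COND_DIV and vectorised on SVE as
a predicated SDIV/UDIV (or FDIV for floating-point types), usually with
no separate SEL needed to reinstate y[i] in the inactive lanes.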
Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
and x86_64-linux-gnu. OK to install?
Richard
2018-05-16 Richard Sandiford <[email protected]>
gcc/
* internal-fn.def (IFN_COND_MUL, IFN_COND_DIV, IFN_COND_MOD): New
internal functions.
* internal-fn.h (vectorized_internal_fn_supported_p): Declare.
* internal-fn.c (FOR_EACH_CODE_MAPPING): Handle IFN_COND_MUL,
IFN_COND_DIV and IFN_COND_MOD.
(get_conditional_internal_fn): Handle RDIV_EXPR.
(can_interpret_as_conditional_op_p): Use RDIV_EXPR for floating-point
divisions.
(internal_fn_mask_index): Handle conditional internal functions.
(vectorized_internal_fn_supported_p): New function.
* optabs.def (cond_smul_optab, cond_sdiv_optab, cond_smod_optab)
(cond_udiv_optab, cond_umod_optab): New optabs.
* tree-if-conv.c: Include internal-fn.h.
(any_pred_load_store): Replace with...
(need_to_predicate): ...this new variable.
(redundant_ssa_names): New variable.
(ifcvt_can_use_mask_load_store): Move initial checks to...
(ifcvt_can_predicate): ...this new function. Handle tree codes
for which a conditional internal function exists.
(if_convertible_gimple_assign_stmt_p): Use ifcvt_can_predicate
instead of ifcvt_can_use_mask_load_store. Update after variable
name change.
(predicate_load_or_store): New function, split out from
predicate_mem_writes.
(check_redundant_cond_expr, predicate_rhs_code): New functions.
(predicate_mem_writes): Rename to...
(predicate_statements): ...this. Use predicate_load_or_store
and predicate_rhs_code.
(combine_blocks, tree_if_conversion): Update after above name changes.
(ifcvt_local_dce): Handle redundant_ssa_names.
* tree-vect-patterns.c (vect_recog_mask_conversion_pattern): Handle
general conditional functions.
* tree-vect-stmts.c (vectorizable_call): Likewise.
* config/aarch64/aarch64-sve.md (cond_<optab><mode>): New pattern
for SVE_COND_INT2_SD_OP.
* config/aarch64/iterators.md (UNSPEC_COND_MUL, UNSPEC_COND_SDIV)
(UNSPEC_COND_UDIV): New unspecs.
(SVE_COND_INT2_OP): Include UNSPEC_COND_MUL.
(SVE_COND_INT2_SD_OP): New int iterator.
(SVE_COND_FP2_OP): Include UNSPEC_COND_MUL and UNSPEC_COND_SDIV.
(optab, sve_int_op): Handle UNSPEC_COND_MUL, UNSPEC_COND_SDIV
and UNSPEC_COND_UDIV.
(sve_fp_op): Handle UNSPEC_COND_MUL and UNSPEC_COND_SDIV.
gcc/testsuite/
* gcc.dg/vect/pr53773.c: Do not expect a scalar tail when using
fully-masked loops with a fixed vector length.
* gcc.target/aarch64/sve/cond_arith_1.c: New test.
* gcc.target/aarch64/sve/cond_arith_1_run.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_2.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_2_run.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_3.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_3_run.c: Likewise.
Index: gcc/internal-fn.def
===================================================================
--- gcc/internal-fn.def 2018-05-16 11:06:14.191592902 +0100
+++ gcc/internal-fn.def 2018-05-16 11:06:14.513574219 +0100
@@ -149,6 +149,11 @@ DEF_INTERNAL_OPTAB_FN (COND_FNMA_REV, EC
DEF_INTERNAL_OPTAB_FN (COND_ADD, ECF_CONST, cond_add, cond_binary)
DEF_INTERNAL_OPTAB_FN (COND_SUB, ECF_CONST, cond_sub, cond_binary)
+DEF_INTERNAL_OPTAB_FN (COND_MUL, ECF_CONST, cond_smul, cond_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_DIV, ECF_CONST, first,
+ cond_sdiv, cond_udiv, cond_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_MOD, ECF_CONST, first,
+ cond_smod, cond_umod, cond_binary)
DEF_INTERNAL_SIGNED_OPTAB_FN (COND_MIN, ECF_CONST, first,
cond_smin, cond_umin, cond_binary)
DEF_INTERNAL_SIGNED_OPTAB_FN (COND_MAX, ECF_CONST, first,
Index: gcc/internal-fn.h
===================================================================
--- gcc/internal-fn.h 2018-05-16 11:06:14.191592902 +0100
+++ gcc/internal-fn.h 2018-05-16 11:06:14.513574219 +0100
@@ -206,4 +206,6 @@ extern void expand_internal_call (gcall
extern void expand_internal_call (internal_fn, gcall *);
extern void expand_PHI (internal_fn, gcall *);
+extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
+
#endif
Index: gcc/internal-fn.c
===================================================================
--- gcc/internal-fn.c 2018-05-16 11:06:14.191592902 +0100
+++ gcc/internal-fn.c 2018-05-16 11:06:14.513574219 +0100
@@ -3208,6 +3208,9 @@ #define DEF_INTERNAL_FN(CODE, FLAGS, FNS
#define FOR_EACH_CODE_MAPPING(T) \
T (PLUS_EXPR, IFN_COND_ADD) \
T (MINUS_EXPR, IFN_COND_SUB) \
+ T (MULT_EXPR, IFN_COND_MUL) \
+ T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
+ T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
T (MIN_EXPR, IFN_COND_MIN) \
T (MAX_EXPR, IFN_COND_MAX) \
T (BIT_AND_EXPR, IFN_COND_AND) \
@@ -3229,13 +3232,16 @@ get_conditional_internal_fn (tree_code c
#define CASE(CODE, IFN) case CODE: return IFN;
FOR_EACH_CODE_MAPPING(CASE)
#undef CASE
+ case RDIV_EXPR:
+ return IFN_COND_DIV;
default:
return IFN_LAST;
}
}
/* If IFN implements the conditional form of a tree code, return that
- tree code, otherwise return ERROR_MARK. */
+ tree code, otherwise return ERROR_MARK. If the codes for integer
+ and floating-point operations are different, return the integer one. */
static tree_code
conditional_internal_fn_code (internal_fn ifn)
@@ -3285,13 +3291,19 @@ can_interpret_as_conditional_op_p (gimpl
tree_code code = conditional_internal_fn_code (ifn);
if (code != ERROR_MARK)
{
- *code_out = code;
*cond_out = gimple_call_arg (call, 0);
if (integer_truep (*cond_out))
*cond_out = NULL_TREE;
unsigned int nargs = gimple_call_num_args (call) - 1;
for (unsigned int i = 0; i < 3; ++i)
ops[i] = i < nargs ? gimple_call_arg (call, i + 1) : NULL_TREE;
+
+ /* CODE is set for integer operations. Adjust it if
+ floating-point ones are different. */
+ if (code == TRUNC_DIV_EXPR && FLOAT_TYPE_P (TREE_TYPE (ops[0])))
+ code = RDIV_EXPR;
+
+ *code_out = code;
return true;
}
}
@@ -3362,6 +3374,10 @@ internal_fn_mask_index (internal_fn fn)
{
switch (fn)
{
+ case IFN_COND_FMA_REV:
+ case IFN_COND_FNMA_REV:
+ return 0;
+
case IFN_MASK_LOAD:
case IFN_MASK_LOAD_LANES:
case IFN_MASK_STORE:
@@ -3375,7 +3391,7 @@ internal_fn_mask_index (internal_fn fn)
return 4;
default:
- return -1;
+ return conditional_internal_fn_code (fn) != ERROR_MARK ? 0 : -1;
}
}
@@ -3440,6 +3456,26 @@ expand_internal_call (gcall *stmt)
expand_internal_call (gimple_call_internal_fn (stmt), stmt);
}
+/* If TYPE is a vector type, return true if IFN is a direct internal
+ function that is supported for that type. If TYPE is a scalar type,
+ return true if IFN is a direct internal function that is supported for
+ the target's preferred vector version of TYPE. */
+
+bool
+vectorized_internal_fn_supported_p (internal_fn ifn, tree type)
+{
+ scalar_mode smode;
+ if (!VECTOR_TYPE_P (type) && is_a <scalar_mode> (TYPE_MODE (type), &smode))
+ {
+ machine_mode vmode = targetm.vectorize.preferred_simd_mode (smode);
+ if (VECTOR_MODE_P (vmode))
+ type = build_vector_type_for_mode (type, vmode);
+ }
+
+ return (VECTOR_MODE_P (TYPE_MODE (type))
+ && direct_internal_fn_supported_p (ifn, type, OPTIMIZE_FOR_SPEED));
+}
+
void
expand_PHI (internal_fn, gcall *)
{
Index: gcc/optabs.def
===================================================================
--- gcc/optabs.def 2018-05-16 11:06:14.191592902 +0100
+++ gcc/optabs.def 2018-05-16 11:06:14.513574219 +0100
@@ -222,8 +222,13 @@ OPTAB_D (notcc_optab, "not$acc")
OPTAB_D (movcc_optab, "mov$acc")
OPTAB_D (cond_add_optab, "cond_add$a")
OPTAB_D (cond_sub_optab, "cond_sub$a")
+OPTAB_D (cond_smul_optab, "cond_mul$a")
OPTAB_D (cond_fma_rev_optab, "cond_fma_rev$a")
OPTAB_D (cond_fnma_rev_optab, "cond_fnma_rev$a")
+OPTAB_D (cond_sdiv_optab, "cond_div$a")
+OPTAB_D (cond_smod_optab, "cond_mod$a")
+OPTAB_D (cond_udiv_optab, "cond_udiv$a")
+OPTAB_D (cond_umod_optab, "cond_umod$a")
OPTAB_D (cond_and_optab, "cond_and$a")
OPTAB_D (cond_ior_optab, "cond_ior$a")
OPTAB_D (cond_xor_optab, "cond_xor$a")
Index: gcc/tree-if-conv.c
===================================================================
--- gcc/tree-if-conv.c 2018-05-16 11:06:14.191592902 +0100
+++ gcc/tree-if-conv.c 2018-05-16 11:06:14.517573987 +0100
@@ -116,15 +116,18 @@ Software Foundation; either version 3, o
#include "builtins.h"
#include "params.h"
#include "cfganal.h"
+#include "internal-fn.h"
/* Only handle PHIs with no more arguments unless we are asked to by
simd pragma. */
#define MAX_PHI_ARG_NUM \
((unsigned) PARAM_VALUE (PARAM_MAX_TREE_IF_CONVERSION_PHI_ARGS))
-/* Indicate if new load/store that needs to be predicated is introduced
- during if conversion. */
-static bool any_pred_load_store;
+/* True if we've converted a statement that was only executed when some
+ condition C was true, and if for correctness we need to predicate the
+ statement to ensure that it is a no-op when C is false. See
+ predicate_statements for the kinds of predication we support. */
+static bool need_to_predicate;
/* Indicate if there are any complicated PHIs that need to be handled in
if-conversion. Complicated PHI has more than two arguments and can't
@@ -193,6 +196,9 @@ innermost_loop_behavior_hash::equal (con
/* Hash table to store <base reference, DR> pairs. */
static hash_map<tree_operand_hash, data_reference_p> *baseref_DR_map;
+/* List of redundant SSA names: the first should be replaced by the second. */
+static vec< std::pair<tree, tree> > redundant_ssa_names;
+
/* Structure used to predicate basic blocks. This is attached to the
->aux field of the BBs in the loop to be if-converted. */
struct bb_predicate {
@@ -919,19 +925,10 @@ ifcvt_memrefs_wont_trap (gimple *stmt, v
static bool
ifcvt_can_use_mask_load_store (gimple *stmt)
{
- tree lhs, ref;
- machine_mode mode;
- basic_block bb = gimple_bb (stmt);
- bool is_load;
-
- if (!(flag_tree_loop_vectorize || bb->loop_father->force_vectorize)
- || bb->loop_father->dont_vectorize
- || !gimple_assign_single_p (stmt)
- || gimple_has_volatile_ops (stmt))
- return false;
-
/* Check whether this is a load or store. */
- lhs = gimple_assign_lhs (stmt);
+ tree lhs = gimple_assign_lhs (stmt);
+ bool is_load;
+ tree ref;
if (gimple_store_p (stmt))
{
if (!is_gimple_val (gimple_assign_rhs1 (stmt)))
@@ -952,7 +949,7 @@ ifcvt_can_use_mask_load_store (gimple *s
/* Mask should be integer mode of the same size as the load/store
mode. */
- mode = TYPE_MODE (TREE_TYPE (lhs));
+ machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
if (!int_mode_for_mode (mode).exists () || VECTOR_MODE_P (mode))
return false;
@@ -962,6 +959,32 @@ ifcvt_can_use_mask_load_store (gimple *s
return false;
}
+/* Return true if STMT could be converted from an operation that is
+ unconditional to one that is conditional on a bb predicate mask. */
+
+static bool
+ifcvt_can_predicate (gimple *stmt)
+{
+ basic_block bb = gimple_bb (stmt);
+
+ if (!(flag_tree_loop_vectorize || bb->loop_father->force_vectorize)
+ || bb->loop_father->dont_vectorize
+ || gimple_has_volatile_ops (stmt))
+ return false;
+
+ if (gimple_assign_single_p (stmt))
+ return ifcvt_can_use_mask_load_store (stmt);
+
+ tree_code code = gimple_assign_rhs_code (stmt);
+ tree lhs_type = TREE_TYPE (gimple_assign_lhs (stmt));
+ tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
+ if (!types_compatible_p (lhs_type, rhs_type))
+ return false;
+ internal_fn cond_fn = get_conditional_internal_fn (code);
+ return (cond_fn != IFN_LAST
+ && vectorized_internal_fn_supported_p (cond_fn, lhs_type));
+}
+
/* Return true when STMT is if-convertible.
GIMPLE_ASSIGN statement is not if-convertible if,
@@ -1006,10 +1029,10 @@ if_convertible_gimple_assign_stmt_p (gim
|| ! ifcvt_memrefs_wont_trap (stmt, refs))
&& gimple_could_trap_p (stmt))
{
- if (ifcvt_can_use_mask_load_store (stmt))
+ if (ifcvt_can_predicate (stmt))
{
gimple_set_plf (stmt, GF_PLF_2, true);
- any_pred_load_store = true;
+ need_to_predicate = true;
return true;
}
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1020,7 +1043,7 @@ if_convertible_gimple_assign_stmt_p (gim
/* When if-converting stores force versioning, likewise if we
ended up generating store data races. */
if (gimple_vdef (stmt))
- any_pred_load_store = true;
+ need_to_predicate = true;
return true;
}
@@ -2052,7 +2075,7 @@ insert_gimplified_predicates (loop_p loo
stmts = bb_predicate_gimplified_stmts (bb);
if (stmts)
{
- if (any_pred_load_store)
+ if (need_to_predicate)
{
/* Insert the predicate of the BB just after the label,
as the if-conversion of memory writes will use this
@@ -2080,7 +2103,7 @@ insert_gimplified_predicates (loop_p loo
}
}
-/* Helper function for predicate_mem_writes. Returns index of existent
+/* Helper function for predicate_statements. Returns index of existent
mask if it was created for given SIZE and -1 otherwise. */
static int
@@ -2094,6 +2117,126 @@ mask_exists (int size, vec<int> vec)
return -1;
}
+/* Helper function for predicate_statements. STMT is a memory read or
+ write and it needs to be predicated by MASK. Return a statement
+ that does so. */
+
+static gimple *
+predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask)
+{
+ gcall *new_stmt;
+
+ tree lhs = gimple_assign_lhs (stmt);
+ tree rhs = gimple_assign_rhs1 (stmt);
+ tree ref = TREE_CODE (lhs) == SSA_NAME ? rhs : lhs;
+ mark_addressable (ref);
+ tree addr = force_gimple_operand_gsi (gsi, build_fold_addr_expr (ref),
+ true, NULL_TREE, true, GSI_SAME_STMT);
+ tree ptr = build_int_cst (reference_alias_ptr_type (ref),
+ get_object_alignment (ref));
+ /* Copy points-to info if possible. */
+ if (TREE_CODE (addr) == SSA_NAME && !SSA_NAME_PTR_INFO (addr))
+ copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr),
+ ref);
+ if (TREE_CODE (lhs) == SSA_NAME)
+ {
+ new_stmt
+ = gimple_build_call_internal (IFN_MASK_LOAD, 3, addr,
+ ptr, mask);
+ gimple_call_set_lhs (new_stmt, lhs);
+ gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+ }
+ else
+ {
+ new_stmt
+ = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
+ mask, rhs);
+ gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+ gimple_set_vdef (new_stmt, gimple_vdef (stmt));
+ SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt;
+ }
+ gimple_call_set_nothrow (new_stmt, true);
+ return new_stmt;
+}
+
+/* STMT uses OP_LHS. Check whether it has the form:
+
+ ... = OP_MASK ? OP_LHS : X;
+
+ Return X if so, otherwise return null. OP_MASK is an SSA_NAME that is
+ known to have value OP_COND. */
+
+static tree
+check_redundant_cond_expr (gimple *stmt, tree op_mask, tree op_cond,
+ tree op_lhs)
+{
+ gassign *assign = dyn_cast <gassign *> (stmt);
+ if (!assign || gimple_assign_rhs_code (assign) != COND_EXPR)
+ return NULL_TREE;
+
+ tree use_cond = gimple_assign_rhs1 (assign);
+ tree if_true = gimple_assign_rhs2 (assign);
+ tree if_false = gimple_assign_rhs3 (assign);
+
+ if ((use_cond == op_mask || operand_equal_p (use_cond, op_cond, 0))
+ && if_true == op_lhs)
+ return if_false;
+
+ return NULL_TREE;
+}
+
+/* Helper function for predicate_statements. STMT is a potentially-trapping
+ arithmetic operation that needs to be predicated by MASK, an SSA_NAME that
+ has value COND. Return a statement that does so. */
+
+static gimple *
+predicate_rhs_code (gassign *stmt, tree mask, tree cond)
+{
+ tree lhs = gimple_assign_lhs (stmt);
+ tree_code code = gimple_assign_rhs_code (stmt);
+ unsigned int nops = gimple_num_ops (stmt);
+
+ /* Construct the arguments to the conditional internal function. */
+ auto_vec<tree, 8> args;
+ args.safe_grow (nops);
+ args[0] = mask;
+ for (unsigned int i = 1; i < nops; ++i)
+ args[i] = gimple_op (stmt, i);
+
+ /* Look for uses of the result to see whether they are COND_EXPRs that can
+ be folded into the conditional call, swapping arguments 1 and 2 if
+ necessary. */
+ imm_use_iterator imm_iter;
+ gimple *use_stmt;
+ bool can_swap_p = commutative_tree_code (code);
+ FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+ {
+ tree which = check_redundant_cond_expr (use_stmt, mask, cond, lhs);
+ if (can_swap_p && which == args[2])
+ std::swap (args[1], args[2]);
+ if (which == args[1])
+ {
+ /* We have:
+
+ LHS = IFN_COND (MASK, ARGS[1], ...);
+ X = MASK ? LHS : ARGS[1];
+
+ which makes X equivalent to LHS. */
+ tree use_lhs = gimple_assign_lhs (use_stmt);
+ redundant_ssa_names.safe_push (std::make_pair (use_lhs, lhs));
+ can_swap_p = false;
+ }
+ }
+
+ /* Create and insert the call. */
+ internal_fn cond_fn = get_conditional_internal_fn (code);
+ gcall *new_stmt = gimple_build_call_internal_vec (cond_fn, args);
+ gimple_call_set_lhs (new_stmt, lhs);
+ gimple_call_set_nothrow (new_stmt, true);
+
+ return new_stmt;
+}
+
/* Predicate each write to memory in LOOP.
This function transforms control flow constructs containing memory
@@ -2158,7 +2301,7 @@ mask_exists (int size, vec<int> vec)
| goto bb_1
| end_bb_4
- predicate_mem_writes is then predicating the memory write as follows:
+ predicate_statements is then predicating the memory write as follows:
| bb_0
| i = 0
@@ -2202,7 +2345,7 @@ mask_exists (int size, vec<int> vec)
*/
static void
-predicate_mem_writes (loop_p loop)
+predicate_statements (loop_p loop)
{
unsigned int i, orig_loop_num_nodes = loop->num_nodes;
auto_vec<int, 1> vect_sizes;
@@ -2214,7 +2357,6 @@ predicate_mem_writes (loop_p loop)
basic_block bb = ifc_bbs[i];
tree cond = bb_predicate (bb);
bool swap;
- gimple *stmt;
int index;
if (is_true_predicate (cond))
@@ -2232,7 +2374,8 @@ predicate_mem_writes (loop_p loop)
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
{
- if (!gimple_assign_single_p (stmt = gsi_stmt (gsi)))
+ gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
+ if (!stmt)
;
else if (is_false_predicate (cond)
&& gimple_vdef (stmt))
@@ -2245,19 +2388,13 @@ predicate_mem_writes (loop_p loop)
else if (gimple_plf (stmt, GF_PLF_2))
{
tree lhs = gimple_assign_lhs (stmt);
- tree rhs = gimple_assign_rhs1 (stmt);
- tree ref, addr, ptr, mask;
- gcall *new_stmt;
+ tree mask;
+ gimple *new_stmt;
gimple_seq stmts = NULL;
machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
/* We checked before setting GF_PLF_2 that an equivalent
integer mode exists. */
int bitsize = GET_MODE_BITSIZE (mode).to_constant ();
- ref = TREE_CODE (lhs) == SSA_NAME ? rhs : lhs;
- mark_addressable (ref);
- addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (ref),
- true, NULL_TREE, true,
- GSI_SAME_STMT);
if (!vect_sizes.is_empty ()
&& (index = mask_exists (bitsize, vect_sizes)) != -1)
/* Use created mask. */
@@ -2285,30 +2422,10 @@ predicate_mem_writes (loop_p loop)
vect_sizes.safe_push (bitsize);
vect_masks.safe_push (mask);
}
- ptr = build_int_cst (reference_alias_ptr_type (ref),
- get_object_alignment (ref));
- /* Copy points-to info if possible. */
- if (TREE_CODE (addr) == SSA_NAME && !SSA_NAME_PTR_INFO (addr))
- copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr),
- ref);
- if (TREE_CODE (lhs) == SSA_NAME)
- {
- new_stmt
- = gimple_build_call_internal (IFN_MASK_LOAD, 3, addr,
- ptr, mask);
- gimple_call_set_lhs (new_stmt, lhs);
- gimple_set_vuse (new_stmt, gimple_vuse (stmt));
- }
+ if (gimple_assign_single_p (stmt))
+ new_stmt = predicate_load_or_store (&gsi, stmt, mask);
else
- {
- new_stmt
- = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
- mask, rhs);
- gimple_set_vuse (new_stmt, gimple_vuse (stmt));
- gimple_set_vdef (new_stmt, gimple_vdef (stmt));
- SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt;
- }
- gimple_call_set_nothrow (new_stmt, true);
+ new_stmt = predicate_rhs_code (stmt, mask, cond);
gsi_replace (&gsi, new_stmt, true);
}
@@ -2392,8 +2509,8 @@ combine_blocks (struct loop *loop)
insert_gimplified_predicates (loop);
predicate_all_scalar_phis (loop);
- if (any_pred_load_store)
- predicate_mem_writes (loop);
+ if (need_to_predicate)
+ predicate_statements (loop);
/* Merge basic blocks: first remove all the edges in the loop,
except for those from the exit block. */
@@ -2733,6 +2850,12 @@ ifcvt_local_dce (basic_block bb)
enum gimple_code code;
use_operand_p use_p;
imm_use_iterator imm_iter;
+ std::pair <tree, tree> *name_pair;
+ unsigned int i;
+
+ FOR_EACH_VEC_ELT (redundant_ssa_names, i, name_pair)
+ replace_uses_by (name_pair->first, name_pair->second);
+ redundant_ssa_names.release ();
worklist.create (64);
/* Consider all phi as live statements. */
@@ -2833,7 +2956,7 @@ tree_if_conversion (struct loop *loop)
again:
rloop = NULL;
ifc_bbs = NULL;
- any_pred_load_store = false;
+ need_to_predicate = false;
any_complicated_phi = false;
/* Apply more aggressive if-conversion when loop or its outer loop were
@@ -2854,7 +2977,7 @@ tree_if_conversion (struct loop *loop)
|| !dbg_cnt (if_conversion_tree))
goto cleanup;
- if ((any_pred_load_store || any_complicated_phi)
+ if ((need_to_predicate || any_complicated_phi)
&& ((!flag_tree_loop_vectorize && !loop->force_vectorize)
|| loop->dont_vectorize))
goto cleanup;
@@ -2864,7 +2987,7 @@ tree_if_conversion (struct loop *loop)
Either version this loop, or if the pattern is right for outer-loop
vectorization, version the outer loop. In the latter case we will
still if-convert the original inner loop. */
- if (any_pred_load_store
+ if (need_to_predicate
|| any_complicated_phi
|| flag_tree_loop_if_convert != 1)
{
Index: gcc/tree-vect-patterns.c
===================================================================
--- gcc/tree-vect-patterns.c 2018-05-16 11:06:14.191592902 +0100
+++ gcc/tree-vect-patterns.c 2018-05-16 11:06:14.517573987 +0100
@@ -3955,64 +3955,67 @@ vect_recog_mask_conversion_pattern (vec<
/* Check for MASK_LOAD ans MASK_STORE calls requiring mask conversion. */
if (is_gimple_call (last_stmt)
- && gimple_call_internal_p (last_stmt)
- && (gimple_call_internal_fn (last_stmt) == IFN_MASK_STORE
- || gimple_call_internal_fn (last_stmt) == IFN_MASK_LOAD))
+ && gimple_call_internal_p (last_stmt))
{
gcall *pattern_stmt;
- bool load = (gimple_call_internal_fn (last_stmt) == IFN_MASK_LOAD);
- if (load)
+ internal_fn ifn = gimple_call_internal_fn (last_stmt);
+ int mask_argno = internal_fn_mask_index (ifn);
+ if (mask_argno < 0)
+ return NULL;
+
+ bool store_p = internal_store_fn_p (ifn);
+ if (store_p)
{
- lhs = gimple_call_lhs (last_stmt);
- vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs));
+ int rhs_index = internal_fn_stored_value_index (ifn);
+ tree rhs = gimple_call_arg (last_stmt, rhs_index);
+ vectype1 = get_vectype_for_scalar_type (TREE_TYPE (rhs));
}
else
{
- rhs2 = gimple_call_arg (last_stmt, 3);
- vectype1 = get_vectype_for_scalar_type (TREE_TYPE (rhs2));
+ lhs = gimple_call_lhs (last_stmt);
+ vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs));
}
- rhs1 = gimple_call_arg (last_stmt, 2);
- rhs1_type = search_type_for_mask (rhs1, vinfo);
- if (!rhs1_type)
+ tree mask_arg = gimple_call_arg (last_stmt, mask_argno);
+ tree mask_arg_type = search_type_for_mask (mask_arg, vinfo);
+ if (!mask_arg_type)
return NULL;
- vectype2 = get_mask_type_for_scalar_type (rhs1_type);
+ vectype2 = get_mask_type_for_scalar_type (mask_arg_type);
if (!vectype1 || !vectype2
|| known_eq (TYPE_VECTOR_SUBPARTS (vectype1),
TYPE_VECTOR_SUBPARTS (vectype2)))
return NULL;
- tmp = build_mask_conversion (rhs1, vectype1, stmt_vinfo, vinfo);
+ tmp = build_mask_conversion (mask_arg, vectype1, stmt_vinfo, vinfo);
- if (load)
+ auto_vec<tree, 8> args;
+ unsigned int nargs = gimple_call_num_args (last_stmt);
+ args.safe_grow (nargs);
+ for (unsigned int i = 0; i < nargs; ++i)
+ args[i] = ((int) i == mask_argno
+ ? tmp
+ : gimple_call_arg (last_stmt, i));
+ pattern_stmt = gimple_build_call_internal_vec (ifn, args);
+
+ if (!store_p)
{
lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL);
- pattern_stmt
- = gimple_build_call_internal (IFN_MASK_LOAD, 3,
- gimple_call_arg (last_stmt, 0),
- gimple_call_arg (last_stmt, 1),
- tmp);
gimple_call_set_lhs (pattern_stmt, lhs);
}
- else
- pattern_stmt
- = gimple_build_call_internal (IFN_MASK_STORE, 4,
- gimple_call_arg (last_stmt, 0),
- gimple_call_arg (last_stmt, 1),
- tmp,
- gimple_call_arg (last_stmt, 3));
-
gimple_call_set_nothrow (pattern_stmt, true);
pattern_stmt_info = new_stmt_vec_info (pattern_stmt, vinfo);
set_vinfo_for_stmt (pattern_stmt, pattern_stmt_info);
- STMT_VINFO_DATA_REF (pattern_stmt_info)
- = STMT_VINFO_DATA_REF (stmt_vinfo);
- STMT_VINFO_DR_WRT_VEC_LOOP (pattern_stmt_info)
- = STMT_VINFO_DR_WRT_VEC_LOOP (stmt_vinfo);
- DR_STMT (STMT_VINFO_DATA_REF (stmt_vinfo)) = pattern_stmt;
+ if (STMT_VINFO_DATA_REF (stmt_vinfo))
+ {
+ STMT_VINFO_DATA_REF (pattern_stmt_info)
+ = STMT_VINFO_DATA_REF (stmt_vinfo);
+ STMT_VINFO_DR_WRT_VEC_LOOP (pattern_stmt_info)
+ = STMT_VINFO_DR_WRT_VEC_LOOP (stmt_vinfo);
+ DR_STMT (STMT_VINFO_DATA_REF (stmt_vinfo)) = pattern_stmt;
+ }
*type_out = vectype1;
*type_in = vectype1;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c 2018-05-16 11:06:14.191592902 +0100
+++ gcc/tree-vect-stmts.c 2018-05-16 11:06:14.518573929 +0100
@@ -3016,7 +3016,8 @@ vectorizable_call (gimple *gs, gimple_st
int ndts = 3;
gimple *new_stmt = NULL;
int ncopies, j;
- vec<tree> vargs = vNULL;
+ auto_vec<tree, 8> vargs;
+ auto_vec<tree, 8> orig_vargs;
enum { NARROW, NONE, WIDEN } modifier;
size_t i, nargs;
tree lhs;
@@ -3059,18 +3060,34 @@ vectorizable_call (gimple *gs, gimple_st
return false;
/* Ignore the argument of IFN_GOMP_SIMD_LANE, it is magic. */
- if (gimple_call_internal_p (stmt)
- && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE)
+ combined_fn cfn = gimple_call_combined_fn (stmt);
+ if (cfn == CFN_GOMP_SIMD_LANE)
{
nargs = 0;
rhs_type = unsigned_type_node;
}
+ int mask_opno = -1;
+ if (internal_fn_p (cfn))
+ mask_opno = internal_fn_mask_index (as_internal_fn (cfn));
+
for (i = 0; i < nargs; i++)
{
tree opvectype;
op = gimple_call_arg (stmt, i);
+ if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt[i], &opvectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "use not simple.\n");
+ return false;
+ }
+
+ /* Skip the mask argument to an internal function. This operand
+ has been converted via a pattern if necessary. */
+ if ((int) i == mask_opno)
+ continue;
/* We can only handle calls with arguments of the same type. */
if (rhs_type
@@ -3084,14 +3101,6 @@ vectorizable_call (gimple *gs, gimple_st
if (!rhs_type)
rhs_type = TREE_TYPE (op);
- if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt[i], &opvectype))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "use not simple.\n");
- return false;
- }
-
if (!vectype_in)
vectype_in = opvectype;
else if (opvectype
@@ -3149,7 +3158,6 @@ vectorizable_call (gimple *gs, gimple_st
to vectorize other operations in the loop. */
fndecl = NULL_TREE;
internal_fn ifn = IFN_LAST;
- combined_fn cfn = gimple_call_combined_fn (stmt);
tree callee = gimple_call_fndecl (stmt);
/* First try using an internal function. */
@@ -3213,6 +3221,7 @@ vectorizable_call (gimple *gs, gimple_st
needs to be generated. */
gcc_assert (ncopies >= 1);
+ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
@@ -3226,7 +3235,13 @@ vectorizable_call (gimple *gs, gimple_st
add_stmt_cost (stmt_info->vinfo->target_cost_data, ncopies / 2,
vec_promote_demote, stmt_info, 0, vect_body);
}
-
+ if (loop_vinfo && mask_opno >= 0)
+ {
+ unsigned int nvectors = (slp_node
+ ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
+ : ncopies);
+ vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out);
+ }
return true;
}
@@ -3239,25 +3254,24 @@ vectorizable_call (gimple *gs, gimple_st
scalar_dest = gimple_call_lhs (stmt);
vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
+ bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+
prev_stmt_info = NULL;
if (modifier == NONE || ifn != IFN_LAST)
{
tree prev_res = NULL_TREE;
+ vargs.safe_grow (nargs);
+ orig_vargs.safe_grow (nargs);
for (j = 0; j < ncopies; ++j)
{
/* Build argument list for the vectorized call. */
- if (j == 0)
- vargs.create (nargs);
- else
- vargs.truncate (0);
-
if (slp_node)
{
auto_vec<vec<tree> > vec_defs (nargs);
vec<tree> vec_oprnds0;
for (i = 0; i < nargs; i++)
- vargs.quick_push (gimple_call_arg (stmt, i));
+ vargs[i] = gimple_call_arg (stmt, i);
vect_get_slp_defs (vargs, slp_node, &vec_defs);
vec_oprnds0 = vec_defs[0];
@@ -3272,6 +3286,9 @@ vectorizable_call (gimple *gs, gimple_st
}
if (modifier == NARROW)
{
+ /* We don't define any narrowing conditional functions
+ at present. */
+ gcc_assert (mask_opno < 0);
tree half_res = make_ssa_name (vectype_in);
gcall *call
= gimple_build_call_internal_vec (ifn, vargs);
@@ -3290,6 +3307,17 @@ vectorizable_call (gimple *gs, gimple_st
}
else
{
+ if (mask_opno >= 0 && masked_loop_p)
+ {
+ unsigned int vec_num = vec_oprnds0.length ();
+ /* Always true for SLP. */
+ gcc_assert (ncopies == 1);
+ tree mask = vect_get_loop_mask (gsi, masks, vec_num,
+ vectype_out, i);
+ vargs[mask_opno] = prepare_load_store_mask
+ (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
+ }
+
gcall *call;
if (ifn != IFN_LAST)
call = gimple_build_call_internal_vec (ifn, vargs);
@@ -3319,17 +3347,22 @@ vectorizable_call (gimple *gs, gimple_st
vec_oprnd0
= vect_get_vec_def_for_operand (op, stmt);
else
- {
- vec_oprnd0 = gimple_call_arg (new_stmt, i);
- vec_oprnd0
- = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
- }
+ vec_oprnd0
+ = vect_get_vec_def_for_stmt_copy (dt[i], orig_vargs[i]);
+
+ orig_vargs[i] = vargs[i] = vec_oprnd0;
+ }
- vargs.quick_push (vec_oprnd0);
+ if (mask_opno >= 0 && masked_loop_p)
+ {
+ tree mask = vect_get_loop_mask (gsi, masks, ncopies,
+ vectype_out, j);
+ vargs[mask_opno]
+ = prepare_load_store_mask (TREE_TYPE (mask), mask,
+ vargs[mask_opno], gsi);
}
- if (gimple_call_internal_p (stmt)
- && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE)
+ if (cfn == CFN_GOMP_SIMD_LANE)
{
tree cst = build_index_vector (vectype_out, j * nunits_out, 1);
tree new_var
@@ -3341,6 +3374,9 @@ vectorizable_call (gimple *gs, gimple_st
}
else if (modifier == NARROW)
{
+ /* We don't define any narrowing conditional functions at
+ present. */
+ gcc_assert (mask_opno < 0);
tree half_res = make_ssa_name (vectype_in);
gcall *call = gimple_build_call_internal_vec (ifn, vargs);
gimple_call_set_lhs (call, half_res);
@@ -3380,6 +3416,8 @@ vectorizable_call (gimple *gs, gimple_st
}
else if (modifier == NARROW)
{
+ /* We don't define any narrowing conditional functions at present. */
+ gcc_assert (mask_opno < 0);
for (j = 0; j < ncopies; ++j)
{
/* Build argument list for the vectorized call. */
Index: gcc/config/aarch64/aarch64-sve.md
===================================================================
--- gcc/config/aarch64/aarch64-sve.md 2018-05-16 11:06:14.191592902 +0100
+++ gcc/config/aarch64/aarch64-sve.md 2018-05-16 11:06:14.511574335 +0100
@@ -1769,6 +1769,17 @@ (define_insn "cond_<optab><mode>"
"<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
)
+(define_insn "cond_<optab><mode>"
+ [(set (match_operand:SVE_SDI 0 "register_operand" "=w")
+ (unspec:SVE_SDI
+ [(match_operand:<VPRED> 1 "register_operand" "Upl")
+ (match_operand:SVE_SDI 2 "register_operand" "0")
+ (match_operand:SVE_SDI 3 "register_operand" "w")]
+ SVE_COND_INT2_SD_OP))]
+ "TARGET_SVE"
+ "<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
+)
+
;; Set operand 0 to the last active element in operand 3, or to tied
;; operand 1 if no elements are active.
(define_insn "fold_extract_last_<mode>"
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md 2018-05-16 11:06:14.191592902 +0100
+++ gcc/config/aarch64/iterators.md 2018-05-16 11:06:14.512574277 +0100
@@ -442,6 +442,9 @@ (define_c_enum "unspec"
UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md.
UNSPEC_COND_ADD ; Used in aarch64-sve.md.
UNSPEC_COND_SUB ; Used in aarch64-sve.md.
+ UNSPEC_COND_MUL ; Used in aarch64-sve.md.
+ UNSPEC_COND_SDIV ; Used in aarch64-sve.md.
+ UNSPEC_COND_UDIV ; Used in aarch64-sve.md.
UNSPEC_COND_SMAX ; Used in aarch64-sve.md.
UNSPEC_COND_UMAX ; Used in aarch64-sve.md.
UNSPEC_COND_SMIN ; Used in aarch64-sve.md.
@@ -1502,13 +1505,17 @@ (define_int_iterator UNPACK_UNSIGNED [UN
(define_int_iterator MUL_HIGHPART [UNSPEC_SMUL_HIGHPART UNSPEC_UMUL_HIGHPART])
(define_int_iterator SVE_COND_INT2_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB
+ UNSPEC_COND_MUL
UNSPEC_COND_SMAX UNSPEC_COND_UMAX
UNSPEC_COND_SMIN UNSPEC_COND_UMIN
UNSPEC_COND_AND
UNSPEC_COND_ORR
UNSPEC_COND_EOR])
-(define_int_iterator SVE_COND_FP2_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB])
+(define_int_iterator SVE_COND_INT2_SD_OP [UNSPEC_COND_SDIV UNSPEC_COND_UDIV])
+
+(define_int_iterator SVE_COND_FP2_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB
+ UNSPEC_COND_MUL UNSPEC_COND_SDIV])
(define_int_iterator SVE_COND_FP3_OP [UNSPEC_COND_FMLA UNSPEC_COND_FMLS])
@@ -1541,6 +1548,9 @@ (define_int_attr optab [(UNSPEC_ANDF "an
(UNSPEC_XORV "xor")
(UNSPEC_COND_ADD "add")
(UNSPEC_COND_SUB "sub")
+ (UNSPEC_COND_MUL "mul")
+ (UNSPEC_COND_SDIV "div")
+ (UNSPEC_COND_UDIV "udiv")
(UNSPEC_COND_SMAX "smax")
(UNSPEC_COND_UMAX "umax")
(UNSPEC_COND_SMIN "smin")
@@ -1759,6 +1769,9 @@ (define_int_attr cmp_op [(UNSPEC_COND_LT
(define_int_attr sve_int_op [(UNSPEC_COND_ADD "add")
(UNSPEC_COND_SUB "sub")
+ (UNSPEC_COND_MUL "mul")
+ (UNSPEC_COND_SDIV "sdiv")
+ (UNSPEC_COND_UDIV "udiv")
(UNSPEC_COND_SMAX "smax")
(UNSPEC_COND_UMAX "umax")
(UNSPEC_COND_SMIN "smin")
@@ -1769,5 +1782,7 @@ (define_int_attr sve_int_op [(UNSPEC_CON
(define_int_attr sve_fp_op [(UNSPEC_COND_ADD "fadd")
(UNSPEC_COND_SUB "fsub")
+ (UNSPEC_COND_MUL "fmul")
+ (UNSPEC_COND_SDIV "fdiv")
(UNSPEC_COND_FMLA "fmla")
(UNSPEC_COND_FMLS "fmls")])
Index: gcc/testsuite/gcc.dg/vect/pr53773.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/pr53773.c 2018-05-16 11:06:14.191592902 +0100
+++ gcc/testsuite/gcc.dg/vect/pr53773.c 2018-05-16 11:06:14.515574103 +0100
@@ -14,5 +14,8 @@ foo (int integral, int decimal, int powe
return integral+decimal;
}
-/* { dg-final { scan-tree-dump-times "\\* 10" 2 "optimized" } } */
+/* We can avoid a scalar tail when using fully-masked loops with a fixed
+ vector length. */
+/* { dg-final { scan-tree-dump-times "\\* 10" 2 "optimized" { target { { ! vect_fully_masked } || vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "\\* 10" 0 "optimized" { target { vect_fully_masked && { ! vect_variable_length } } } } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c
===================================================================
--- /dev/null 2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c 2018-05-16 11:06:14.515574103 +0100
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, NAME, OP) \
+ void __attribute__ ((noinline, noclone)) \
+ test_##TYPE##_##NAME (TYPE *__restrict x, \
+ TYPE *__restrict y, \
+ TYPE *__restrict z, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i]; \
+ }
+
+#define TEST_INT_TYPE(TYPE) \
+ TEST (TYPE, div, /)
+
+#define TEST_FP_TYPE(TYPE) \
+ TEST (TYPE, add, +) \
+ TEST (TYPE, sub, -) \
+ TEST (TYPE, mul, *) \
+ TEST (TYPE, div, /)
+
+#define TEST_ALL \
+ TEST_INT_TYPE (int8_t) \
+ TEST_INT_TYPE (uint8_t) \
+ TEST_INT_TYPE (int16_t) \
+ TEST_INT_TYPE (uint16_t) \
+ TEST_INT_TYPE (int32_t) \
+ TEST_INT_TYPE (uint32_t) \
+ TEST_INT_TYPE (int64_t) \
+ TEST_INT_TYPE (uint64_t) \
+ TEST_FP_TYPE (float) \
+ TEST_FP_TYPE (double)
+
+TEST_ALL
+
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */ \
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */ \
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
+/* At present we don't vectorize the uint8_t or uint16_t loops because the
+ division is done directly in the narrow type, rather than being widened
+ to int first. */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* We fail to optimize away the SEL for the int8_t and int16_t loops,
+ because the 32-bit result is converted before selection. */
+/* { dg-final { scan-assembler-times {\tsel\t} 2 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c
===================================================================
--- /dev/null 2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c 2018-05-16 11:06:14.516574045 +0100
@@ -0,0 +1,33 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_arith_1.c"
+
+#define N 99
+
+#undef TEST
+#define TEST(TYPE, NAME, OP) \
+ { \
+ TYPE x[N], y[N], z[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ y[i] = i * i; \
+ z[i] = ((i + 2) % 3) * (i + 1); \
+ pred[i] = i % 3; \
+ } \
+ test_##TYPE##_##NAME (x, y, z, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = i % 3 != 1 ? y[i] OP z[i] : y[i]; \
+ if (x[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL
+ return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2.c
===================================================================
--- /dev/null 2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2.c 2018-05-16 11:06:14.516574045 +0100
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint.h>
+
+#define TEST(DATA_TYPE, PRED_TYPE, NAME, OP) \
+ void __attribute__ ((noinline, noclone)) \
+ test_##DATA_TYPE##_##PRED_TYPE##_##NAME (DATA_TYPE *__restrict x, \
+ DATA_TYPE *__restrict y, \
+ DATA_TYPE *__restrict z, \
+ PRED_TYPE *__restrict pred, \
+ int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i]; \
+ }
+
+#define TEST_INT_TYPE(DATA_TYPE, PRED_TYPE) \
+ TEST (DATA_TYPE, PRED_TYPE, div, /)
+
+#define TEST_FP_TYPE(DATA_TYPE, PRED_TYPE) \
+ TEST (DATA_TYPE, PRED_TYPE, add, +) \
+ TEST (DATA_TYPE, PRED_TYPE, sub, -) \
+ TEST (DATA_TYPE, PRED_TYPE, mul, *) \
+ TEST (DATA_TYPE, PRED_TYPE, div, /)
+
+#define TEST_ALL \
+ TEST_INT_TYPE (int32_t, int8_t) \
+ TEST_INT_TYPE (uint32_t, int8_t) \
+ TEST_INT_TYPE (int32_t, int16_t) \
+ TEST_INT_TYPE (uint32_t, int16_t) \
+ TEST_INT_TYPE (int64_t, int8_t) \
+ TEST_INT_TYPE (uint64_t, int8_t) \
+ TEST_INT_TYPE (int64_t, int16_t) \
+ TEST_INT_TYPE (uint64_t, int16_t) \
+ TEST_INT_TYPE (int64_t, int32_t) \
+ TEST_INT_TYPE (uint64_t, int32_t) \
+ TEST_FP_TYPE (float, int8_t) \
+ TEST_FP_TYPE (float, int16_t) \
+ TEST_FP_TYPE (double, int8_t) \
+ TEST_FP_TYPE (double, int16_t) \
+ TEST_FP_TYPE (double, int32_t)
+
+TEST_ALL
+
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2_run.c
===================================================================
--- /dev/null 2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2_run.c 2018-05-16 11:06:14.516574045 +0100
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_arith_2.c"
+
+#define N 99
+
+#undef TEST
+#define TEST(DATA_TYPE, PRED_TYPE, NAME, OP) \
+ { \
+ DATA_TYPE x[N], y[N], z[N]; \
+ PRED_TYPE pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ y[i] = i * i; \
+ z[i] = ((i + 2) % 3) * (i + 1); \
+ pred[i] = i % 3; \
+ } \
+ test_##DATA_TYPE##_##PRED_TYPE##_##NAME (x, y, z, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ DATA_TYPE expected = i % 3 != 1 ? y[i] OP z[i] : y[i]; \
+ if (x[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL
+ return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c
===================================================================
--- /dev/null 2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c 2018-05-16 11:06:14.516574045 +0100
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, NAME, OP) \
+ void __attribute__ ((noinline, noclone)) \
+ test_##TYPE##_##NAME (TYPE *__restrict x, \
+ TYPE *__restrict y, \
+ TYPE *__restrict z, \
+ TYPE *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ x[i] = pred[i] != 1 ? y[i] OP z[i] : 1; \
+ }
+
+#define TEST_INT_TYPE(TYPE) \
+ TEST (TYPE, div, /)
+
+#define TEST_FP_TYPE(TYPE) \
+ TEST (TYPE, add, +) \
+ TEST (TYPE, sub, -) \
+ TEST (TYPE, mul, *) \
+ TEST (TYPE, div, /)
+
+#define TEST_ALL \
+ TEST_INT_TYPE (int8_t) \
+ TEST_INT_TYPE (uint8_t) \
+ TEST_INT_TYPE (int16_t) \
+ TEST_INT_TYPE (uint16_t) \
+ TEST_INT_TYPE (int32_t) \
+ TEST_INT_TYPE (uint32_t) \
+ TEST_INT_TYPE (int64_t) \
+ TEST_INT_TYPE (uint64_t) \
+ TEST_FP_TYPE (float) \
+ TEST_FP_TYPE (double)
+
+TEST_ALL
+
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */ \
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */ \
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
+/* At present we don't vectorize the uint8_t or uint16_t loops because the
+ division is done directly in the narrow type, rather than being widened
+ to int first. */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c
===================================================================
--- /dev/null 2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c 2018-05-16 11:06:14.516574045 +0100
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_arith_3.c"
+
+#define N 99
+
+#undef TEST
+#define TEST(TYPE, NAME, OP) \
+ { \
+ TYPE x[N], y[N], z[N], pred[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ x[i] = -1; \
+ y[i] = i * i; \
+ z[i] = ((i + 2) % 3) * (i + 1); \
+ pred[i] = i % 3; \
+ } \
+ test_##TYPE##_##NAME (x, y, z, pred, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = i % 3 != 1 ? y[i] OP z[i] : 1; \
+ if (x[i] != expected) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int
+main (void)
+{
+ TEST_ALL
+ return 0;
+}