The following adds vectorizer support for reduc_sbool_{and,ior,xor}_scal
in the epilogue of bool reductions.
* config/i386/sse.md (reduc_sbool_and_scal_qi): New dummy expander, for testing only.
* tree-vectorizer.h (reduction_fn_for_scalar_code): Add
optional vector type argument.
* tree-vect-loop.cc (reduction_fn_for_scalar_code): When a
mask vector type is specified, return the corresponding
MASK functions for AND, IOR and XOR.
(vect_create_epilog_for_reduction): Pun masks to an
integer vector type only when we do not support direct mask
reduction.
(vectorizable_reduction): Prefer direct mask reduction over
integer vector reduction.
---
gcc/config/i386/sse.md | 11 ++++++
gcc/tree-vect-loop.cc | 90 +++++++++++++++++++++++++-----------------
gcc/tree-vectorizer.h | 3 +-
3 files changed, 66 insertions(+), 38 deletions(-)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8b28c8edb19..7f3361f8781 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4013,6 +4013,17 @@
DONE;
})
+(define_expand "reduc_sbool_and_scal_qi"
+ [(and:QI
+ (match_operand:QI 0 "register_operand")
+ (match_operand:QI 1 "register_operand")
+ (match_operand:SI 2 "const_0_to_255_operand"))]
+ "TARGET_AVX512F"
+{
+ emit_move_insn (operands[0], operands[1]);
+ DONE;
+})
+
(define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>"
[(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
(unspec:VFH_AVX512VL
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index f656437ea5c..f523b264dfc 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3244,7 +3244,8 @@ fold_left_reduction_fn (code_helper code, internal_fn
*reduc_fn)
Return FALSE if CODE currently cannot be vectorized as reduction. */
bool
-reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
+reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn,
+ tree vectype)
{
if (code.is_tree_code ())
switch (tree_code (code))
@@ -3262,15 +3263,18 @@ reduction_fn_for_scalar_code (code_helper code,
internal_fn *reduc_fn)
return true;
case BIT_AND_EXPR:
- *reduc_fn = IFN_REDUC_AND;
+ *reduc_fn = ((vectype && VECTOR_BOOLEAN_TYPE_P (vectype))
+ ? IFN_REDUC_SBOOL_AND : IFN_REDUC_AND);
return true;
case BIT_IOR_EXPR:
- *reduc_fn = IFN_REDUC_IOR;
+ *reduc_fn = ((vectype && VECTOR_BOOLEAN_TYPE_P (vectype))
+ ? IFN_REDUC_SBOOL_IOR : IFN_REDUC_IOR);
return true;
case BIT_XOR_EXPR:
- *reduc_fn = IFN_REDUC_XOR;
+ *reduc_fn = ((vectype && VECTOR_BOOLEAN_TYPE_P (vectype))
+ ? IFN_REDUC_SBOOL_XOR : IFN_REDUC_XOR);
return true;
case MULT_EXPR:
@@ -5559,9 +5563,12 @@ vect_create_epilog_for_reduction (loop_vec_info
loop_vinfo,
/* Shouldn't be used beyond this point. */
exit_bb = nullptr;
- /* For the actual reduction work on a bool data vector instead of a
- mask vector. */
- if (VECTOR_BOOLEAN_TYPE_P (vectype))
+ /* If we are operating on a mask vector and do not support direct mask
+ reduction, work on a bool data vector instead of a mask vector. */
+ if (VECTOR_BOOLEAN_TYPE_P (vectype)
+ && reduc_fn != IFN_REDUC_SBOOL_AND
+ && reduc_fn != IFN_REDUC_SBOOL_IOR
+ && reduc_fn != IFN_REDUC_SBOOL_XOR)
{
gcc_assert (reduc_inputs.length () == 1);
vectype = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
@@ -7295,29 +7302,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
- /* For now see to implement the epilogue reduction on a bool data,
- not the mask type. */
- tree orig_vectype_out = vectype_out;
- if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
- {
- vectype_out
- = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
- TREE_TYPE (vectype_out),
- TYPE_VECTOR_SUBPARTS
- (orig_vectype_out));
- if (!vectype_out
- || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_out),
- TYPE_VECTOR_SUBPARTS (orig_vectype_out))
- || !expand_vec_cond_expr_p (vectype_out, orig_vectype_out))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "cannot turn mask into bool data vector for "
- "reduction epilogue.\n");
- return false;
- }
- }
-
reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
if (reduction_type == TREE_CODE_REDUCTION)
{
@@ -7383,6 +7367,29 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
return false;
}
+ /* See if we can convert a mask vector to a corresponding bool data vector
+ to perform the epilogue reduction. */
+ tree alt_vectype_out = NULL_TREE;
+ if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
+ {
+ alt_vectype_out
+ = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
+ TREE_TYPE (vectype_out),
+ TYPE_VECTOR_SUBPARTS
+ (vectype_out));
+ if (!alt_vectype_out
+ || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
+ TYPE_VECTOR_SUBPARTS (vectype_out))
+ || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "cannot turn mask into bool data vector for "
+ "reduction epilogue.\n");
+ alt_vectype_out = NULL_TREE;
+ }
+ }
+
internal_fn reduc_fn = IFN_LAST;
if (reduction_type == TREE_CODE_REDUCTION
|| reduction_type == FOLD_LEFT_REDUCTION
@@ -7391,17 +7398,28 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
{
if (reduction_type == FOLD_LEFT_REDUCTION
? fold_left_reduction_fn (orig_code, &reduc_fn)
- : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
+ : reduction_fn_for_scalar_code (orig_code, &reduc_fn, vectype_out))
{
if (reduc_fn != IFN_LAST
&& !direct_internal_fn_supported_p (reduc_fn, vectype_out,
OPTIMIZE_FOR_SPEED))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "reduc op not supported by target.\n");
+ if (reduction_type != FOLD_LEFT_REDUCTION
+ && alt_vectype_out
+ && reduction_fn_for_scalar_code (orig_code, &reduc_fn,
+ alt_vectype_out)
+ && reduc_fn != IFN_LAST
+ && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
+ OPTIMIZE_FOR_SPEED))
+ ;
+ else
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "reduc op not supported by target.\n");
- reduc_fn = IFN_LAST;
+ reduc_fn = IFN_LAST;
+ }
}
}
else
@@ -7438,8 +7456,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
return false;
}
- vectype_out = orig_vectype_out;
-
/* For SLP reductions, see if there is a neutral value we can use. */
tree neutral_op = NULL_TREE;
tree initial_value = NULL_TREE;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4785cbdd61d..f5827fd26f5 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2658,7 +2658,8 @@ extern tree vect_gen_loop_len_mask (loop_vec_info,
gimple_stmt_iterator *,
unsigned int);
extern gimple_seq vect_gen_len (tree, tree, tree, tree);
extern vect_reduc_info info_for_reduction (loop_vec_info, slp_tree);
-extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
+extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *,
+ tree = NULL_TREE);
/* Drive for loop transformation stage. */
extern class loop *vect_transform_loop (loop_vec_info, gimple *);
--
2.51.0