Hi,
Given below test case,
int foo (unsigned short a[], unsigned int x)
{
unsigned int i;
for (i = 0; i < 1000; i++)
{
x = a[i];
a[i] = (unsigned short)(x >= 32768 ? x - 32768 : 0);
}
return x;
}
it now can be vectorized on AArch64, but generated assembly is way from optimal:
.L4:
ldr q4, [x3, x1]
add w2, w2, 1
cmp w2, w0
ushll v1.4s, v4.4h, 0
ushll2 v0.4s, v4.8h, 0
add v3.4s, v1.4s, v6.4s
add v2.4s, v0.4s, v6.4s
cmhi v1.4s, v1.4s, v5.4s
cmhi v0.4s, v0.4s, v5.4s
and v1.16b, v3.16b, v1.16b
and v0.16b, v2.16b, v0.16b
xtn v2.4h, v1.4s
xtn2 v2.8h, v0.4s
str q2, [x3, x1]
add x1, x1, 16
bcc .L4
The vectorized loop has 15 instructions, which can be greatly simplified by
turning cond_expr into max_expr, as below:
.L4:
ldr q1, [x3, x1]
add w2, w2, 1
cmp w2, w0
umax v0.8h, v1.8h, v2.8h
add v0.8h, v0.8h, v2.8h
str q0, [x3, x1]
add x1, x1, 16
bcc .L4
This patch addresses the issue by adding new vectorization pattern.
Bootstrap and test on x86_64 and AArch64. Is it OK?
Thanks,
bin
2016-10-11 Bin Cheng <bin.ch...@arm.com>
* tree-vect-patterns.c (vect_recog_min_max_modify_pattern): New.
(vect_vect_recog_func_ptrs): New element for above pattern.
* tree-vectorizer.h (NUM_PATTERNS): Increase by 1.
gcc/testsuite/ChangeLog
2016-10-11 Bin Cheng <bin.ch...@arm.com>
* gcc.dg/vect/vect-umax-modify-pattern.c: New test.
* gcc.dg/vect/vect-umin-modify-pattern.c: New test.
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 7e6e45d..b502498 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -65,6 +65,8 @@ static gimple *vect_recog_divmod_pattern (vec<gimple *> *,
static gimple *vect_recog_mult_pattern (vec<gimple *> *,
tree *, tree *);
+static gimple *vect_recog_min_max_modify_pattern (vec<gimple *> *,
+ tree *, tree *);
static gimple *vect_recog_mixed_size_cond_pattern (vec<gimple *> *,
tree *, tree *);
static gimple *vect_recog_bool_pattern (vec<gimple *> *, tree *, tree *);
@@ -91,6 +93,7 @@ static vect_recog_func
vect_vect_recog_func_ptrs[NUM_PATTERNS] = {
{ vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
{ vect_recog_divmod_pattern, "divmod" },
{ vect_recog_mult_pattern, "mult" },
+ { vect_recog_min_max_modify_pattern, "min_max_modify" },
{ vect_recog_mixed_size_cond_pattern, "mixed_size_cond" },
{ vect_recog_bool_pattern, "bool" },
{ vect_recog_mask_conversion_pattern, "mask_conversion" }
@@ -2964,6 +2967,262 @@ vect_recog_divmod_pattern (vec<gimple *> *stmts,
return pattern_stmt;
}
+/* Function vect_recog_min_max_modify_pattern
+
+ Try to find the following pattern:
+
+ type x_t, iftmp1_t, iftmp2_t;
+ TYPE a_T;
+ loop:
+ S1 a_T = (TYPE)x_t;
+ S2 iftmp1_t = x_t - CONST1;
+ S3 iftmp2_t = (a_T > (TYPE)CONST2) ? iftmp1_t : (CONST2 - CONST1);
+
+ Or
+
+ loop:
+ S1 a_T = (TYPE)x_t;
+ S2 iftmp1_t = x_t + CONST1;
+ S3 iftmp2_t = (a_T < (TYPE)CONST2) ? iftmp1_t : (CONST2 + CONST1);
+
+ where both type 'TYPE' and 'type' are unsigned integral types and
+ TYPE is larger than type in prevision. All constant operands need
+ to fit into 'type'.
+
+ Input:
+
+ * LAST_STMT: A stmt from which the pattern search begins.
+
+ Output:
+
+ * TYPE_IN: The type of the input arguments to the pattern.
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the pattern,
+ as well as an additional def_stmt is generated.
+
+ patt_5 = MAX_EXPR <x_t, CONST2>;
+ iftmp2_t = patt_5 - CONST1;
+
+ Or
+
+ patt_5 = MIN_EXPR <x_t, CONST2>;
+ iftmp2_t = patt_5 + CONST1; */
+
+static gimple *
+vect_recog_min_max_modify_pattern (vec<gimple *> *stmts, tree *type_in,
+ tree *type_out)
+{
+ gimple *last_stmt = (*stmts)[0];
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt), def_stmt_info;
+ gimple *pattern_stmt, *def_stmt;
+ vec_info *vinfo = stmt_vinfo->vinfo;
+ enum tree_code code, p_code;
+ tree op0 = NULL_TREE, op1, p_op1, orig_type;
+ bool promotion;
+
+ if (!is_gimple_assign (last_stmt)
+ || gimple_assign_rhs_code (last_stmt) != COND_EXPR
+ || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_internal_def)
+ return NULL;
+
+ tree cond_expr = gimple_assign_rhs1 (last_stmt);
+ if (!COMPARISON_CLASS_P (cond_expr))
+ return NULL;
+
+ enum tree_code cond_code = TREE_CODE (cond_expr);
+ if (cond_code != LE_EXPR && cond_code != LT_EXPR
+ && cond_code != GE_EXPR && cond_code != GT_EXPR)
+ return NULL;
+
+ tree then_clause = gimple_assign_rhs2 (last_stmt);
+ tree else_clause = gimple_assign_rhs3 (last_stmt);
+ tree type = TREE_TYPE (then_clause);
+ if (!INTEGRAL_TYPE_P (type) || !TYPE_OVERFLOW_WRAPS (type))
+ return NULL;
+
+ tree vectype = get_vectype_for_scalar_type (type);
+ if (vectype == NULL_TREE)
+ return NULL;
+
+ /* Make sure the constant operand appears in else clause. */
+ if (TREE_CODE (then_clause) == INTEGER_CST)
+ {
+ cond_code = invert_tree_comparison (cond_code, false);
+ std::swap (then_clause, else_clause);
+ }
+ if (TREE_CODE (then_clause) != SSA_NAME
+ || TREE_CODE (else_clause) != INTEGER_CST)
+ return NULL;
+
+ tree cond_op0 = TREE_OPERAND (cond_expr, 0);
+ tree cond_op1 = TREE_OPERAND (cond_expr, 1);
+ tree cond_type = TREE_TYPE (cond_op0);
+ if (!INTEGRAL_TYPE_P (cond_type) || TREE_CODE (cond_op1) != INTEGER_CST)
+ return NULL;
+
+ /* Handle simple boundary cases. Below code snippet:
+
+ type x_t;
+ signed type a_T;
+
+ a_T = (signed type)x_t;
+ r = (a_t OP 0) ? then_clause : else_clause;
+
+ in which OP is either "<" or ">=". It equals to:
+
+ type x_t;
+ signed type a_T;
+
+ a_T = (signed type)x_t;
+ r = (x_t OP TYPE_MAX_VALUE (type)) ? then_clause : else_clause;
+
+ in which OP is ">" or "<=". */
+ if (tree_nop_conversion_p (type, cond_type)
+ && TYPE_UNSIGNED (type) && !TYPE_UNSIGNED (cond_type)
+ && TREE_CODE (cond_op0) == SSA_NAME && integer_zerop (cond_op1)
+ && type_conversion_p (cond_op0, last_stmt, false,
+ &orig_type, &def_stmt, &promotion)
+ && orig_type && types_compatible_p (type, orig_type))
+ {
+ if (cond_code == LT_EXPR)
+ cond_code = GT_EXPR;
+ else if (cond_code == GE_EXPR)
+ cond_code = LE_EXPR;
+ else
+ return NULL;
+
+ cond_op0 = gimple_assign_rhs1 (def_stmt);
+ cond_op1 = TYPE_MAX_VALUE (cond_type);
+ cond_type = orig_type;
+ }
+
+ /* Check if MAX_EXPR/MIN_EXPR is supported. Do it here so we can early
+ out if operator is not supported. */
+ if (cond_code == GE_EXPR || cond_code == GT_EXPR)
+ code = MAX_EXPR;
+ else if (cond_code == LE_EXPR || cond_code == LT_EXPR)
+ code = MIN_EXPR;
+ else
+ return NULL;
+ optab optab = optab_for_tree_code (code, vectype, optab_default);
+ if (!optab || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
+ return NULL;
+
+ /* Check if cond_op0 is converted from type. */
+ if (TYPE_PRECISION (cond_type) > TYPE_PRECISION (type))
+ {
+ if (!int_fits_type_p (cond_op1, type))
+ return NULL;
+ else
+ cond_op1 = fold_convert (type, cond_op1);
+
+ orig_type = NULL_TREE;
+ /* If the conversion is folded in cond_expr. */
+ if (CONVERT_EXPR_CODE_P (TREE_CODE (cond_op0)))
+ {
+ cond_op0 = TREE_OPERAND (cond_op0, 0);
+ orig_type = TREE_TYPE (cond_op0);
+ }
+ /* Or not. */
+ else if (type_conversion_p (cond_op0, last_stmt, false,
+ &orig_type, &def_stmt, &promotion))
+ {
+ cond_op0 = gimple_assign_rhs1 (def_stmt);
+ }
+
+ if (!orig_type || !types_compatible_p (type, orig_type))
+ return NULL;
+ }
+ /* Only unsigned type supported so far. */
+ if (!TYPE_UNSIGNED (type) || !TYPE_UNSIGNED (cond_type))
+ return NULL;
+
+ def_stmt = SSA_NAME_DEF_STMT (then_clause);
+ if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
+ return NULL;
+ enum tree_code then_code = gimple_assign_rhs_code (def_stmt);
+ if (then_code != PLUS_EXPR && then_code != MINUS_EXPR)
+ return NULL;
+
+ tree then_op0 = gimple_assign_rhs1 (def_stmt);
+ tree then_op1 = gimple_assign_rhs2 (def_stmt);
+ if (TREE_CODE (then_op1) != INTEGER_CST
+ || !operand_equal_p (cond_op0, then_op0, 0))
+ return NULL;
+
+ /* MAX_EXPR case. */
+ if (cond_code == GE_EXPR || cond_code == GT_EXPR)
+ {
+ if (then_code == PLUS_EXPR)
+ then_op1 = fold_build1 (NEGATE_EXPR, type, then_op1);
+
+ op1 = fold_build2 (PLUS_EXPR, type, else_clause, then_op1);
+ if (wi::to_widest (cond_op1) != wi::to_widest (op1)
+ /* Boundary cases need more care, for example:
+
+ cond_op0 > (CONST2 - 1) ? then_clause : CONST2 - CONST1
+
+ equals to:
+
+ cond_op0 >= (CONST2) ? then_clause : CONST2 - CONST1
+
+ In these cases, cond_op0 and cond_op1 have the same value,
+ as well as then_clause and else_clause. It doesn't matter
+ which value is taken. */
+ && !(cond_code == GT_EXPR
+ && (wi::to_widest (cond_op1) + 1) == wi::to_widest (op1))
+ && !(cond_code == GE_EXPR
+ && (wi::to_widest (cond_op1) - 1) == wi::to_widest (op1)))
+ return NULL;
+
+ op0 = then_op0;
+ code = MAX_EXPR;
+ p_code = MINUS_EXPR;
+ p_op1 = then_op1;
+ }
+ /* MIN_EXPR case. */
+ if (cond_code == LE_EXPR || cond_code == LT_EXPR)
+ {
+ if (then_code == MINUS_EXPR)
+ then_op1 = fold_build1 (NEGATE_EXPR, type, then_op1);
+
+ op1 = fold_build2 (MINUS_EXPR, type, else_clause, then_op1);
+ if (wi::to_widest (cond_op1) != wi::to_widest (op1)
+ && !(cond_code == LT_EXPR
+ && (wi::to_widest (cond_op1) - 1) == wi::to_widest (op1))
+ && !(cond_code == LE_EXPR
+ && (wi::to_widest (cond_op1) + 1) == wi::to_widest (op1)))
+ return NULL;
+
+ op0 = then_op0;
+ code = MIN_EXPR;
+ p_code = PLUS_EXPR;
+ p_op1 = then_op1;
+ }
+ gcc_assert (op0 != NULL_TREE);
+
+ def_stmt = gimple_build_assign (vect_recog_temp_ssa_var (type, NULL),
+ code, op0, op1);
+ pattern_stmt = gimple_build_assign (vect_recog_temp_ssa_var (type, NULL),
+ p_code, gimple_assign_lhs (def_stmt),
+ p_op1);
+
+ new_pattern_def_seq (stmt_vinfo, def_stmt);
+ def_stmt_info = new_stmt_vec_info (def_stmt, vinfo);
+ set_vinfo_for_stmt (def_stmt, def_stmt_info);
+ STMT_VINFO_VECTYPE (def_stmt_info) = vectype;
+ *type_in = vectype;
+ *type_out = vectype;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vect_recog_min_max_modify_pattern: detected:\n");
+
+ return pattern_stmt;
+}
+
/* Function vect_recog_mixed_size_cond_pattern
Try to find the following pattern:
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 29ef676..e8f93ea 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1180,7 +1180,7 @@ extern bool is_simple_and_all_uses_invariant (gimple *,
loop_vec_info);
Additional pattern recognition functions can (and will) be added
in the future. */
typedef gimple *(* vect_recog_func_ptr) (vec<gimple *> *, tree *, tree *);
-#define NUM_PATTERNS 14
+#define NUM_PATTERNS 15
void vect_pattern_recog (vec_info *);
/* In tree-vectorizer.c. */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-umax-modify-pattern.c
b/gcc/testsuite/gcc.dg/vect/vect-umax-modify-pattern.c
new file mode 100644
index 0000000..87ac36b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-umax-modify-pattern.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+int foo1 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x >= 32768 ? x - 32768 : 0);
+ }
+ return x;
+}
+
+int foo2 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x > 32768 ? x - 32768 : 0);
+ }
+ return x;
+}
+
+int foo3 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x > 1000 ? x - 1000 : 0);
+ }
+ return x;
+}
+
+int foo4 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x >= 2 ? x - 32768 : 32770);
+ }
+ return x;
+}
+/* { dg-final { scan-tree-dump-times "MAX_EXPR " 4 "optimized" { xfail
vect_no_int_min_max } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-umin-modify-pattern.c
b/gcc/testsuite/gcc.dg/vect/vect-umin-modify-pattern.c
new file mode 100644
index 0000000..85f8bbc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-umin-modify-pattern.c
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+int foo1 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x <= 32768 ? x + 32768 : 0);
+ }
+ return x;
+}
+
+int foo2 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x < 32768 ? x + 32768 : 0);
+ }
+ return x;
+}
+
+int foo3 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x < 1000 ? x - 1000 : 0);
+ }
+ return x;
+}
+
+int foo4 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x <= 2 ? x + 999 : 1001);
+ }
+ return x;
+}
+
+/* { dg-final { scan-tree-dump-times "MIN_EXPR " 4 "optimized" { xfail
vect_no_int_min_max } } } */