Hi,
Take subroutine "DACOP" from spec2k/200.sixtrack as an example: the loop
needs to be versioned for vectorization because of possible aliasing. The
alias checks for the data references look like:
//pair 1
dr_a:
(Data Ref:
bb: 8
stmt: _92 = da.cc[_27];
ref: da.cc[_27];
)
dr_b:
(Data Ref:
bb: 8
stmt: da.cc[_93] = _92;
ref: da.cc[_93];
)
//pair 2
dr_a:
(Data Ref:
bb: 8
stmt: pretmp_29 = da.i2[_27];
ref: da.i2[_27];
)
dr_b:
(Data Ref:
bb: 8
stmt: da.i2[_93] = pretmp_29;
ref: da.i2[_93];
)
//pair 3
dr_a:
(Data Ref:
bb: 8
stmt: pretmp_28 = da.i1[_27];
ref: da.i1[_27];
)
dr_b:
(Data Ref:
bb: 8
stmt: da.i1[_93] = pretmp_28;
ref: da.i1[_93];
)
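For reference, a hypothetical C analogue of the copy loop (the struct, field
and bound names here are mine, for illustration only; the real code is
Fortran) would be:

/* Copy between possibly-overlapping slices of the same arrays; src and
   dst correspond to _27 and _93 in the dump above.  Whether the slices
   overlap is unknown at compile time, hence the runtime alias checks.  */
for (int i = 0; i < n; i++)
  {
    int src = ia + i, dst = ib + i;
    da.cc[dst] = da.cc[src];
    da.i2[dst] = da.i2[src];
    da.i1[dst] = da.i1[src];
  }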
The code generated for the alias checks is as below:
<bb 23>:
# iy_186 = PHI <_413(22), 2(2)>
# ivtmp_1050 = PHI <ivtmp_1049(22), 512(2)>
_155 = iy_186 + -2;
_156 = _155 * 516;
_241 = iy_186 + -1;
_242 = _241 * 516;
_328 = iy_186 * 516;
_413 = iy_186 + 1;
_414 = _413 * 516;
_499 = iy_186 + 2;
_500 = _499 * 516;
_998 = iy_186 * 516;
_997 = (sizetype) _998;
_996 = _997 + 6;
_995 = _996 * 4;
_994 = global_Output.2_16 + _995;
_993 = iy_186 * 516;
_992 = (long unsigned int) _993;
_991 = _992 * 4;
_990 = _991 + 18446744073709547488;
_989 = global_Input.0_153 + _990;
_886 = _989 >= _994;
_885 = iy_186 * 516;
_884 = (sizetype) _885;
_883 = _884 + 1040;
_882 = _883 * 4;
_881 = global_Input.0_153 + _882;
_880 = (sizetype) _998;
_879 = _880 + 2;
_878 = _879 * 4;
_877 = global_Output.2_16 + _878;
_876 = _877 >= _881;
_875 = _876 | _886;
_874 = iy_186 * 516;
_873 = (sizetype) _874;
_872 = _873 + 514;
_871 = _872 * 4;
_870 = global_Output.2_16 + _871;
_869 = local_Filter_33 >= _870;
_868 = local_Filter_33 + 100;
_867 = (sizetype) _874;
_866 = _867 + 2;
_865 = _866 * 4;
_864 = global_Output.2_16 + _865;
_863 = _864 >= _868;
_862 = _863 | _869;
_861 = _862 & _875;
if (_861 != 0)
goto <bb 7>;
else
goto <bb 4>;
It contains quite a lot of redundant computation. The root cause is that the
vectorizer simply translates the alias checks into comparisons of full address
expressions, and the CSE opportunities are obscured by the folder. This patch
improves function vect_create_cond_for_alias_checks by simplifying the
comparison: it compares the DR_BASE_ADDRESS/DR_INIT parts of both data
references at compilation time. It also simplifies the condition:

  (addr_a_min + addr_a_length) <= addr_b_min
  || (addr_b_min + addr_b_length) <= addr_a_min

into the form below:

  cond_expr = addr_b_min - addr_a_min
  cond_expr >= addr_a_length || cond_expr <= -addr_b_length

when the comparison is done in a signed type. This can be further simplified
by the folder if addr_a_length and addr_b_length are equal or constant, which
is quite common.
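To illustrate the equivalence, here is a minimal C sketch of the two forms of
the disjointness test (function and parameter names are mine, for illustration
only; it assumes both lengths are non-negative and the subtraction does not
overflow the signed type):

#include <stdbool.h>
#include <stdint.h>

/* Original form: segments [a_min, a_min + len_a) and
   [b_min, b_min + len_b) do not overlap.  */
static bool
no_overlap_orig (intptr_t a_min, intptr_t len_a,
                 intptr_t b_min, intptr_t len_b)
{
  return a_min + len_a <= b_min || b_min + len_b <= a_min;
}

/* Simplified form: one subtraction, then two comparisons against the
   segment lengths.  a_min + len_a <= b_min becomes diff >= len_a, and
   b_min + len_b <= a_min becomes diff <= -len_b.  When len_a and len_b
   are equal constants, both comparisons share one constant, which is
   what exposes the further folding mentioned above.  */
static bool
no_overlap_simplified (intptr_t a_min, intptr_t len_a,
                       intptr_t b_min, intptr_t len_b)
{
  intptr_t diff = b_min - a_min;
  return diff >= len_a || diff <= -len_b;
}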
I looked into the generated assembly; this patch does introduce small
regressions in some cases, but overall I think it's good. Bootstrapped and
tested on x86_64 and AArch64. Is it OK?
Thanks,
bin
2016-06-08 Bin Cheng <bin.ch...@arm.com>
* tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): New
parameter.  Simplify alias check conditions at compilation time.
(vect_loop_versioning): Pass the new argument to the above function.
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 438458e..b38a6e4 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2211,17 +2211,22 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
Output:
COND_EXPR - conditional expression.
+ COND_EXPR_STMT_LIST - statements needed to construct the conditional
+ expression.
The returned COND_EXPR is the conditional expression to be used in the if
statement that controls which version of the loop gets executed at runtime.
*/
void
-vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
+vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
+ tree * cond_expr,
+ gimple_seq *cond_expr_stmt_list)
{
vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
tree part_cond_expr;
+ gimple_seq seq;
/* Create expression
((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
@@ -2237,21 +2242,17 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
for (size_t i = 0, s = comp_alias_ddrs.length (); i < s; ++i)
{
+ enum tree_code code;
+ tree type_a, type_b, type_offset = ssizetype;
const dr_with_seg_len& dr_a = comp_alias_ddrs[i].first;
const dr_with_seg_len& dr_b = comp_alias_ddrs[i].second;
tree segment_length_a = dr_a.seg_len;
tree segment_length_b = dr_b.seg_len;
tree addr_base_a = DR_BASE_ADDRESS (dr_a.dr);
tree addr_base_b = DR_BASE_ADDRESS (dr_b.dr);
+ tree init_a = DR_INIT (dr_a.dr), init_b = DR_INIT (dr_b.dr);
tree offset_a = DR_OFFSET (dr_a.dr), offset_b = DR_OFFSET (dr_b.dr);
- offset_a = fold_build2 (PLUS_EXPR, TREE_TYPE (offset_a),
- offset_a, DR_INIT (dr_a.dr));
- offset_b = fold_build2 (PLUS_EXPR, TREE_TYPE (offset_b),
- offset_b, DR_INIT (dr_b.dr));
- addr_base_a = fold_build_pointer_plus (addr_base_a, offset_a);
- addr_base_b = fold_build_pointer_plus (addr_base_b, offset_b);
-
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
@@ -2262,31 +2263,140 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
dump_printf (MSG_NOTE, "\n");
}
- tree seg_a_min = addr_base_a;
- tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
/* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
- [a, a+12) */
+ [a, a+12). */
if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
{
tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr)));
- seg_a_min = fold_build_pointer_plus (seg_a_max, unit_size);
- seg_a_max = fold_build_pointer_plus (addr_base_a, unit_size);
+ init_a = fold_build2 (PLUS_EXPR, type_offset, init_a,
+ fold_convert (type_offset, unit_size));
}
-
- tree seg_b_min = addr_base_b;
- tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
{
tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b.dr)));
- seg_b_min = fold_build_pointer_plus (seg_b_max, unit_size);
- seg_b_max = fold_build_pointer_plus (addr_base_b, unit_size);
+ init_b = fold_build2 (PLUS_EXPR, type_offset, init_b,
+ fold_convert (type_offset, unit_size));
}
- part_cond_expr =
- fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
- fold_build2 (LE_EXPR, boolean_type_node, seg_a_max, seg_b_min),
- fold_build2 (LE_EXPR, boolean_type_node, seg_b_max, seg_a_min));
+ /* No need to consider init parts if they are equal in both DRs. */
+ if (wi::to_widest (init_a) != wi::to_widest (init_b))
+ {
+ if (wi::to_widest (init_a) < wi::to_widest (init_b))
+ {
+ init_b = fold_build2 (MINUS_EXPR, type_offset, init_b, init_a);
+ init_a = fold_convert (type_offset, integer_zero_node);
+ }
+ else
+ {
+ init_a = fold_build2 (MINUS_EXPR, type_offset, init_a, init_b);
+ init_b = fold_convert (type_offset, integer_zero_node);
+ }
+
+ offset_a = fold_build2 (PLUS_EXPR, type_offset, offset_a, init_a);
+ offset_b = fold_build2 (PLUS_EXPR, type_offset, offset_b, init_b);
+ }
+
+ /* No need to consider base parts if they are equal in both DRs. */
+ if (operand_equal_p (addr_base_a, addr_base_b, 0))
+ {
+ code = PLUS_EXPR;
+ type_a = type_offset;
+ type_b = type_offset;
+ addr_base_a = offset_a;
+ addr_base_b = offset_b;
+ }
+ else
+ {
+ type_offset = sizetype;
+ code = POINTER_PLUS_EXPR;
+ type_a = TREE_TYPE (addr_base_a);
+ type_b = TREE_TYPE (addr_base_b);
+ addr_base_a = fold_build_pointer_plus (addr_base_a, offset_a);
+ addr_base_b = fold_build_pointer_plus (addr_base_b, offset_b);
+ }
+
+ segment_length_a = fold_convert (type_offset, segment_length_a);
+ segment_length_b = fold_convert (type_offset, segment_length_b);
+
+ addr_base_a = force_gimple_operand (addr_base_a, &seq, true, NULL);
+ gimple_seq_add_seq (cond_expr_stmt_list, seq);
+ addr_base_b = force_gimple_operand (addr_base_b, &seq, true, NULL);
+ gimple_seq_add_seq (cond_expr_stmt_list, seq);
+
+ tree seg_a_min;
+ tree seg_a_max;
+ if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
+ {
+ seg_a_min = fold_build2 (code, type_a,
+ addr_base_a, segment_length_a);
+ seg_a_max = addr_base_a;
+ /* We need the absolute value of the segment length later.  */
+ segment_length_a = fold_build2 (MINUS_EXPR, type_offset,
+ integer_zero_node, segment_length_a);
+ }
+ else
+ {
+ seg_a_min = addr_base_a;
+ seg_a_max = fold_build2 (code, type_a,
+ addr_base_a, segment_length_a);
+ }
+
+ tree seg_b_min;
+ tree seg_b_max;
+ if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
+ {
+ seg_b_min = fold_build2 (code, type_b,
+ addr_base_b, segment_length_b);
+ seg_b_max = addr_base_b;
+ /* We need the absolute value of the segment length later.  */
+ segment_length_b = fold_build2 (MINUS_EXPR, type_offset,
+ integer_zero_node, segment_length_b);
+ }
+ else
+ {
+ seg_b_min = addr_base_b;
+ seg_b_max = fold_build2 (code, type_b,
+ addr_base_b, segment_length_b);
+ }
+
+ if (!TYPE_UNSIGNED (type_offset))
+ {
+ /* If the comparison is done in a signed type, the condition
+
+ (seg_a_max <= seg_b_min || seg_b_max <= seg_a_min)
+
+ can be simplified into:
+
+ cond_expr = seg_b_min - seg_a_min;
+ (cond_expr >= segment_length_a
+ || cond_expr <= -segment_length_b)
+
+ It can be further simplified by the folder if
+
+ (segment_length_a == segment_length_b)
+
+ holds, which is common. */
+ tree diff = fold_build2 (MINUS_EXPR, type_offset,
+ seg_b_min, seg_a_min);
+ diff = force_gimple_operand (diff, &seq, true, NULL);
+ gimple_seq_add_seq (cond_expr_stmt_list, seq);
+ segment_length_b = fold_build2 (MINUS_EXPR, type_offset,
+ integer_zero_node, segment_length_b);
+ tree expr1 = fold_build2 (GE_EXPR, boolean_type_node,
+ diff, segment_length_a);
+ tree expr2 = fold_build2 (LE_EXPR, boolean_type_node,
+ diff, segment_length_b);
+ part_cond_expr = fold_build2 (TRUTH_OR_EXPR,
+ boolean_type_node, expr1, expr2);
+ }
+ else
+ {
+ part_cond_expr =
+ fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+ fold_build2 (LE_EXPR, boolean_type_node, seg_a_max, seg_b_min),
+ fold_build2 (LE_EXPR, boolean_type_node, seg_b_max, seg_a_min));
+ }
if (*cond_expr)
*cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
@@ -2356,7 +2466,8 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
&cond_expr_stmt_list);
if (version_alias)
- vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
+ vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
+ &cond_expr_stmt_list);
cond_expr = force_gimple_operand_1 (cond_expr, &gimplify_stmt_list,
is_gimple_condexpr, NULL_TREE);