On Tue, 27 Aug 2019 at 17:29, Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Richard Biener <richard.guent...@gmail.com> writes:
> > On Tue, Aug 27, 2019 at 11:58 AM Richard Sandiford
> > <richard.sandif...@arm.com> wrote:
> >> ifcvt produces:
> >>
> >>   <bb 3> [local count: 1063004407]:
> >>   # i_34 = PHI <i_30(10), 0(21)>
> >>   # ivtmp_5 = PHI <ivtmp_6(10), 100(21)>
> >>   _1 = (long unsigned int) i_34;
> >>   _2 = _1 * 2;
> >>   _3 = a_23(D) + _2;
> >>   _4 = *_3;
> >>   _7 = b_24(D) + _2;
> >>   _49 = _4 > 0;
> >>   _8 = .MASK_LOAD (_7, 16B, _49);
> >>   _12 = _4 > 0;
> >>   _13 = _8 > 0;
> >>   _9 = _12 & _13;
> >>   _10 = _4 > 0;
> >>   _11 = _8 > 0;
> >>   _27 = ~_11;
> >>   _15 = _10 & _27;
> >>   _14 = c_25(D) + _2;
> >>   iftmp.0_26 = .MASK_LOAD (_14, 16B, _15);
> >>   iftmp.0_19 = _9 ? _4 : iftmp.0_26;
> >>   _17 = x_28(D) + _2;
> >>   _50 = _4 > 0;
> >>   .MASK_STORE (_17, 16B, _50, iftmp.0_19);
> >>   i_30 = i_34 + 1;
> >>   ivtmp_6 = ivtmp_5 - 1;
> >>   if (ivtmp_6 != 0)
> >>     goto <bb 10>; [98.99%]
> >>   else
> >>     goto <bb 9>; [1.01%]
> >>
> >>   <bb 10> [local count: 1052266994]:
> >>   goto <bb 3>; [100.00%]
> >>
> >> which has 4 copies of _4 > 0 (a[i] > 0) and 2 copies of _8 > 0
> >> (b[i] > 0).
> >
> > Huh.  if-conversion does
> >
> >   /* Now all statements are if-convertible.  Combine all the basic
> >      blocks into one huge basic block doing the if-conversion
> >      on-the-fly.  */
> >   combine_blocks (loop);
> >
> >   /* Delete dead predicate computations.  */
> >   ifcvt_local_dce (loop->header);
> >
> >   /* Perform local CSE, this esp. helps the vectorizer analysis if loads
> >      and stores are involved.  CSE only the loop body, not the entry
> >      PHIs, those are to be kept in sync with the non-if-converted copy.
> >      ???  We'll still keep dead stores though.  */
> >   exit_bbs = BITMAP_ALLOC (NULL);
> >   bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
> >   bitmap_set_bit (exit_bbs, loop->latch->index);
> >   todo |= do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs);
> >
> > which should remove those redundant _4 > 0 checks.  In fact when I
> > run this on x86_64 with -mavx512bw I see
> >
> >   <bb 3> [local count: 1063004407]:
> >   # i_25 = PHI <i_20(9), 0(21)>
> >   # ivtmp_24 = PHI <ivtmp_12(9), 100(21)>
> >   _1 = (long unsigned int) i_25;
> >   _2 = _1 * 2;
> >   _3 = a_14(D) + _2;
> >   _4 = *_3;
> >   _5 = b_15(D) + _2;
> >   _49 = _4 > 0;
> >   _6 = .MASK_LOAD (_5, 16B, _49);
> >   _22 = _6 > 0;
> >   _28 = ~_22;
> >   _29 = _28 & _49;
> >   _7 = c_16(D) + _2;
> >   iftmp.0_17 = .MASK_LOAD (_7, 16B, _29);
> >   iftmp.0_10 = _29 ? iftmp.0_17 : _4;
> >   _8 = x_18(D) + _2;
> >   .MASK_STORE (_8, 16B, _49, iftmp.0_10);
> >   i_20 = i_25 + 1;
> >   ivtmp_12 = ivtmp_24 - 1;
> >   if (ivtmp_12 != 0)
> >
> > after if-conversion (that should be the case already on the GCC 9 branch).
>
> Gah, sorry for the noise.  Turns out I still had a local change that was
> trying to poke the patch into doing something wrong.  Will try to check
> my facts more carefully next time.
>
> The redundant pattern statements I was thinking of come from
> vect_recog_mask_conversion_pattern, but I guess that isn't so
> interesting here.
>
> So yeah, let's drop this whole vn thing for now...
The attached version drops VN and uses operand_equal_p for the
comparison.  Does it look OK?
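For reference, here is a hand-written sketch (not an actual dump; the
SSA names are made up) of what the caching is meant to change in the
vectorized loop body, for two accesses guarded by the same a[i] > 0
condition.  Without the cache, each masked access ANDs its mask with
the loop mask again:

  vec_mask_and_1 = mask__49 & loop_mask_52;
  vect__8 = .MASK_LOAD (vectp_b, 16B, vec_mask_and_1);
  ...
  vec_mask_and_2 = mask__49 & loop_mask_52;   <-- redundant
  .MASK_STORE (vectp_x, 16B, vec_mask_and_2, vect_iftmp);

With the map keyed on (a[i] > 0, loop_mask_52), the second call to
prepare_load_store_mask hits the cache and reuses the first AND:

  vec_mask_and_1 = mask__49 & loop_mask_52;
  vect__8 = .MASK_LOAD (vectp_b, 16B, vec_mask_and_1);
  ...
  .MASK_STORE (vectp_x, 16B, vec_mask_and_1, vect_iftmp);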
Thanks,
Prathamesh
>
> Thanks,
> Richard
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
index 5c04bcdb3f5..a1b0667dab5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
@@ -15,5 +15,9 @@ f (double *restrict a, double *restrict b, double *restrict c,
     }
 }
 
-/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+/* See https://gcc.gnu.org/ml/gcc-patches/2019-08/msg01644.html
+   for XFAILing the below test.  */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tfmad\t} } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5..0fc7171d7ea 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -8609,6 +8609,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       basic_block bb = bbs[i];
       stmt_vec_info stmt_info;
 
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+	loop_vinfo->cond_to_vec_mask = new cond_vmask_map_type (8);
+
       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 	   gsi_next (&si))
 	{
@@ -8717,6 +8720,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 		}
 	    }
 	}
+
+      if (loop_vinfo->cond_to_vec_mask)
+	{
+	  delete loop_vinfo->cond_to_vec_mask;
+	  loop_vinfo->cond_to_vec_mask = 0;
+	}
     }				/* BBs in loop */
 
   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1e2dfe5d22d..862206b3256 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1989,17 +1989,31 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 
 static tree
 prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
-			 gimple_stmt_iterator *gsi)
+			 gimple_stmt_iterator *gsi, tree mask,
+			 cond_vmask_map_type *cond_to_vec_mask)
 {
   gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
   if (!loop_mask)
     return vec_mask;
 
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
+
+  tree *slot = 0;
+  if (cond_to_vec_mask)
+    {
+      cond_vmask_key cond (mask, loop_mask);
+      slot = &cond_to_vec_mask->get_or_insert (cond);
+      if (*slot)
+	return *slot;
+    }
+
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
 					  vec_mask, loop_mask);
   gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
+
+  if (slot)
+    *slot = and_res;
   return and_res;
 }
 
@@ -3514,8 +3528,10 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	      gcc_assert (ncopies == 1);
 	      tree mask = vect_get_loop_mask (gsi, masks, vec_num,
 					      vectype_out, i);
+	      tree scalar_mask = gimple_call_arg (gsi_stmt (*gsi), mask_opno);
 	      vargs[mask_opno] = prepare_load_store_mask
-		(TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
+		(TREE_TYPE (mask), mask, vargs[mask_opno], gsi,
+		 scalar_mask, vinfo->cond_to_vec_mask);
 	    }
@@ -3564,9 +3580,11 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	    {
 	      tree mask = vect_get_loop_mask (gsi, masks, ncopies,
 					      vectype_out, j);
+	      tree scalar_mask = gimple_call_arg (gsi_stmt (*gsi), mask_opno);
 	      vargs[mask_opno]
 		= prepare_load_store_mask (TREE_TYPE (mask), mask,
-					   vargs[mask_opno], gsi);
+					   vargs[mask_opno], gsi,
+					   scalar_mask, vinfo->cond_to_vec_mask);
 	    }
 
 	  if (cfn == CFN_GOMP_SIMD_LANE)
@@ -8109,7 +8127,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 					      vectype, j);
 	      if (vec_mask)
 		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+						      vec_mask, gsi, mask,
+						      vinfo->cond_to_vec_mask);
 
 	      gcall *call;
 	      if (final_mask)
@@ -8163,7 +8182,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 						  vectype, vec_num * j + i);
 	      if (vec_mask)
 		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+						      vec_mask, gsi, mask,
+						      vinfo->cond_to_vec_mask);
 
 	      if (memory_access_type == VMAT_GATHER_SCATTER)
 		{
@@ -9304,7 +9324,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 					      vectype, j);
 	      if (vec_mask)
 		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+						      vec_mask, gsi, mask,
+						      vinfo->cond_to_vec_mask);
 
 	      gcall *call;
 	      if (final_mask)
@@ -9355,7 +9376,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 						  vectype, vec_num * j + i);
 	      if (vec_mask)
 		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+						      vec_mask, gsi, mask,
+						      vinfo->cond_to_vec_mask);
 
 	      if (i > 0)
 		dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
@@ -9975,6 +9997,38 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   /* Handle cond expr.  */
   for (j = 0; j < ncopies; j++)
     {
+      tree vec_mask = NULL_TREE;
+
+      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && TREE_CODE_CLASS (TREE_CODE (cond_expr)) == tcc_comparison
+	  && loop_vinfo->cond_to_vec_mask)
+	{
+	  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+	  if (masks)
+	    {
+	      tree loop_mask = vect_get_loop_mask (gsi, masks,
+						   ncopies, vectype, j);
+
+	      cond_vmask_key cond (cond_expr, loop_mask);
+	      tree *slot = loop_vinfo->cond_to_vec_mask->get (cond);
+	      if (slot && *slot)
+		vec_mask = *slot;
+	      else
+		{
+		  cond.cond_ops.code
+		    = invert_tree_comparison (cond.cond_ops.code, true);
+		  slot = loop_vinfo->cond_to_vec_mask->get (cond);
+		  if (slot && *slot)
+		    {
+		      vec_mask = *slot;
+		      tree tmp = then_clause;
+		      then_clause = else_clause;
+		      else_clause = tmp;
+		    }
+		}
+	    }
+	}
+
       stmt_vec_info new_stmt_info = NULL;
       if (j == 0)
 	{
@@ -10054,6 +10108,8 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 
 	  if (masked)
 	    vec_compare = vec_cond_lhs;
+	  else if (vec_mask)
+	    vec_compare = vec_mask;
 	  else
 	    {
 	      vec_cond_rhs = vec_oprnds1[i];
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index dc181524744..fb404d4b91e 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -461,7 +461,8 @@ vec_info::vec_info (vec_info::vec_kind kind_in, void *target_cost_data_in,
 		    vec_info_shared *shared_)
   : kind (kind_in),
     shared (shared_),
-    target_cost_data (target_cost_data_in)
+    target_cost_data (target_cost_data_in),
+    cond_to_vec_mask (0)
 {
   stmt_vec_infos.create (50);
 }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c..81629361f14 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -26,6 +26,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
 #include "tree-data-ref.h"
 #include "tree-hash-traits.h"
 #include "target.h"
+#include "hash-map.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -193,6 +194,81 @@ public:
   poly_uint64 min_value;
 };
 
+struct cond_vmask_key
+{
+  cond_vmask_key (tree t, tree loop_mask_)
+    : cond_ops (t), loop_mask (loop_mask_)
+  {}
+
+  hashval_t hash () const
+  {
+    inchash::hash h;
+    h.add_int (cond_ops.code);
+    h.add_int (TREE_HASH (cond_ops.op0));
+    h.add_int (TREE_HASH (cond_ops.op1));
+    h.add_int (TREE_HASH (loop_mask));
+    return h.end ();
+  }
+
+  void mark_empty ()
+  {
+    loop_mask = NULL_TREE;
+  }
+
+  bool is_empty ()
+  {
+    return loop_mask == NULL_TREE;
+  }
+
+  tree_cond_ops cond_ops;
+  tree loop_mask;
+};
+
+inline bool operator== (const cond_vmask_key& c1, const cond_vmask_key& c2)
+{
+  return c1.loop_mask == c2.loop_mask
+	 && c1.cond_ops == c2.cond_ops;
+}
+
+struct cond_vmask_key_traits
+{
+  typedef cond_vmask_key value_type;
+  typedef cond_vmask_key compare_type;
+
+  static inline hashval_t hash (value_type v)
+  {
+    return v.hash ();
+  }
+
+  static inline bool equal (value_type existing, value_type candidate)
+  {
+    return existing == candidate;
+  }
+
+  static inline void mark_empty (value_type& v)
+  {
+    v.mark_empty ();
+  }
+
+  static inline bool is_empty (value_type v)
+  {
+    return v.is_empty ();
+  }
+
+  static void mark_deleted (value_type&) {}
+
+  static inline bool is_deleted (value_type)
+  {
+    return false;
+  }
+
+  static inline void remove (value_type &) {}
+};
+
+typedef hash_map<cond_vmask_key, tree,
+		 simple_hashmap_traits <cond_vmask_key_traits, tree> >
+  cond_vmask_map_type;
+
 /* Vectorizer state shared between different analyses like
    vector sizes of the same CFG region.  */
 class vec_info_shared {
@@ -255,6 +331,8 @@ public:
   /* Cost data used by the target cost model.  */
   void *target_cost_data;
 
+  cond_vmask_map_type *cond_to_vec_mask;
+
 private:
   stmt_vec_info new_stmt_vec_info (gimple *stmt);
   void set_vinfo_for_stmt (gimple *, stmt_vec_info);
diff --git a/gcc/tree.c b/gcc/tree.c
index 8f80012c6e8..32a8fcf1eb8 100644
--- a/gcc/tree.c
+++ b/gcc/tree.c
@@ -15204,6 +15204,44 @@ max_object_size (void)
   return TYPE_MAX_VALUE (ptrdiff_type_node);
 }
 
+/* If T is a comparison, or an SSA name defined by a comparison
+   statement, extract its operands.
+   Otherwise return <NE_EXPR, T, 0>.  */
+
+tree_cond_ops::tree_cond_ops (tree t)
+{
+  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
+    {
+      this->code = TREE_CODE (t);
+      this->op0 = TREE_OPERAND (t, 0);
+      this->op1 = TREE_OPERAND (t, 1);
+      return;
+    }
+
+  else if (TREE_CODE (t) == SSA_NAME)
+    {
+      gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t));
+      if (stmt)
+	{
+	  tree_code code = gimple_assign_rhs_code (stmt);
+	  if (TREE_CODE_CLASS (code) == tcc_comparison)
+	    {
+	      this->code = code;
+	      this->op0 = gimple_assign_rhs1 (stmt);
+	      this->op1 = gimple_assign_rhs2 (stmt);
+	      return;
+	    }
+	}
+
+      this->code = NE_EXPR;
+      this->op0 = t;
+      this->op1 = build_zero_cst (TREE_TYPE (t));
+    }
+
+  else
+    gcc_unreachable ();
+}
+
 #if CHECKING_P
 
 namespace selftest {
diff --git a/gcc/tree.h b/gcc/tree.h
index 94dbb95a78a..e6d6e9541c3 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -6141,4 +6141,25 @@ public:
   operator location_t () const { return m_combined_loc; }
 };
 
+struct tree_cond_ops
+{
+  tree_code code;
+  tree op0;
+  tree op1;
+
+  tree_cond_ops (tree);
+};
+
+/* ??? Not sure whether it's a good idea to include fold-const.h
+   only for operand_equal_p.  */
+extern bool operand_equal_p (const_tree, const_tree, unsigned int);
+
+inline bool
+operator== (const tree_cond_ops& o1, const tree_cond_ops &o2)
+{
+  return o1.code == o2.code
+	 && operand_equal_p (o1.op0, o2.op0, 0)
+	 && operand_equal_p (o1.op1, o2.op1, 0);
+}
+
 #endif /* GCC_TREE_H */
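
For reviewers, a minimal sketch of how vectorizable_condition is meant
to consume the cache (simplified from the hunk above; the helper name
and this factoring are hypothetical, for illustration only):

/* Look up COND_EXPR under LOOP_MASK in MAP.  On a miss, retry with the
   inverted comparison; if that form was cached, the cached mask selects
   the opposite arm, so tell the caller to swap the then/else clauses.  */

static tree
lookup_cond_vec_mask (cond_vmask_map_type *map, tree cond_expr,
		      tree loop_mask, bool *swap_clauses)
{
  *swap_clauses = false;
  cond_vmask_key cond (cond_expr, loop_mask);
  tree *slot = map->get (cond);
  if (slot && *slot)
    return *slot;

  /* A mask cached for the inverse condition is equivalent modulo
     swapping the arms of the COND_EXPR.  */
  cond.cond_ops.code = invert_tree_comparison (cond.cond_ops.code, true);
  slot = map->get (cond);
  if (slot && *slot)
    {
      *swap_clauses = true;
      return *slot;
    }
  return NULL_TREE;
}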