On Tue, 27 Aug 2019 at 21:14, Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Richard should have the final say, but some comments...
>
> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> > index 1e2dfe5d22d..862206b3256 100644
> > --- a/gcc/tree-vect-stmts.c
> > +++ b/gcc/tree-vect-stmts.c
> > @@ -1989,17 +1989,31 @@ check_load_store_masking (loop_vec_info loop_vinfo,
> > tree vectype,
> >
> >  static tree
> >  prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
> > -                         gimple_stmt_iterator *gsi)
> > +                         gimple_stmt_iterator *gsi, tree mask,
> > +                         cond_vmask_map_type *cond_to_vec_mask)
>
> "scalar_mask" might be a better name.  But maybe we should key off the
> vector mask after all, now that we're relying on the code having no
> redundancies.
>
> Passing the vinfo would be better than passing the cond_vmask_map_type
> directly.
> >  {
> >    gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
> >    if (!loop_mask)
> >      return vec_mask;
> >
> >    gcc_assert (TREE_TYPE (loop_mask) == mask_type);
> > +
> > +  tree *slot = 0;
> > +  if (cond_to_vec_mask)
>
> The pointer should never be null in this context.

Disabling the check for NULL results in a segfault with cond_arith_4.c,
because we reach prepare_load_store_mask via vect_schedule_slp, called
from here in vect_transform_loop:

  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo);
    }
which runs before the bb processing loop.
>
> > +    {
> > +      cond_vmask_key cond (mask, loop_mask);
> > +      slot = &cond_to_vec_mask->get_or_insert (cond);
> > +      if (*slot)
> > +        return *slot;
> > +    }
> > +
> >    tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
> >    gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
> >                                            vec_mask, loop_mask);
> >    gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
> > +
> > +  if (slot)
> > +    *slot = and_res;
> >    return and_res;
> >  }
> > [...]
> > @@ -9975,6 +9997,38 @@ vectorizable_condition (stmt_vec_info stmt_info,
> > gimple_stmt_iterator *gsi,
> >    /* Handle cond expr.  */
> >    for (j = 0; j < ncopies; j++)
> >      {
> > +      tree vec_mask = NULL_TREE;
> > +
> > +      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> Nit: one condition per line when the whole thing doesn't fit on a single line.
> > +          && TREE_CODE_CLASS (TREE_CODE (cond_expr)) == tcc_comparison
> Why restrict this to embedded comparisons?  It should work for separate
> comparisons too.
> > +          && loop_vinfo->cond_to_vec_mask)
> This should always be nonnull given the above.
> > +        {
> > +          vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> > +          if (masks)
> This is never null.
> > +            {
> > +              tree loop_mask = vect_get_loop_mask (gsi, masks,
> > +                                                   ncopies, vectype, j);
> > +
> > +              cond_vmask_key cond (cond_expr, loop_mask);
> > +              tree *slot = loop_vinfo->cond_to_vec_mask->get (cond);
> > +              if (slot && *slot)
> > +                vec_mask = *slot;
> > +              else
> > +                {
> > +                  cond.cond_ops.code
> > +                    = invert_tree_comparison (cond.cond_ops.code, true);
> > +                  slot = loop_vinfo->cond_to_vec_mask->get (cond);
> > +                  if (slot && *slot)
> > +                    {
> > +                      vec_mask = *slot;
> > +                      tree tmp = then_clause;
> > +                      then_clause = else_clause;
> > +                      else_clause = tmp;
> Can use std::swap.
> > +                    }
> > +                }
> > +            }
> > +        }
> > +
> >        stmt_vec_info new_stmt_info = NULL;
> >        if (j == 0)
> >          {
> > @@ -10054,6 +10108,8 @@ vectorizable_condition (stmt_vec_info stmt_info,
> > gimple_stmt_iterator *gsi,
> >
> >           if (masked)
> >             vec_compare = vec_cond_lhs;
> > +          else if (vec_mask)
> > +            vec_compare = vec_mask;
> If we do drop the comparison check above, this should win over "masked".
> > @@ -193,6 +194,81 @@ public:
> >    poly_uint64 min_value;
> >  };
> >
> > +struct cond_vmask_key
> I'm no good at naming things, but since "vmask" doesn't occur elsewhere
> in target-independent code, how about "vec_masked_cond_key"?
> > +{
> > +  cond_vmask_key (tree t, tree loop_mask_)
> > +    : cond_ops (t), loop_mask (loop_mask_)
> > +  {}
> > +
> > +  hashval_t hash () const
> > +  {
> > +    inchash::hash h;
> > +    h.add_int (cond_ops.code);
> > +    h.add_int (TREE_HASH (cond_ops.op0));
> > +    h.add_int (TREE_HASH (cond_ops.op1));
> These two need to use inchash::add_expr, since you're hashing for
> operand_equal_p.
> > +    h.add_int (TREE_HASH (loop_mask));
> > +    return h.end ();
> > +  }
> > +
> > +  void mark_empty ()
> > +  {
> > +    loop_mask = NULL_TREE;
> > +  }
> > +
> > +  bool is_empty ()
> > +  {
> > +    return loop_mask == NULL_TREE;
> > +  }
> > +
> > +  tree_cond_ops cond_ops;
> > +  tree loop_mask;
> > +};
> > +
> > +inline bool operator== (const cond_vmask_key& c1, const cond_vmask_key& c2)
> > +{
> > +  return c1.loop_mask == c2.loop_mask
> > +         && c1.cond_ops == c2.cond_ops;
> Multi-line expressions should be in brackets (or just put this one on
> a single line).
>
> > +}
> > +
> > +struct cond_vmask_key_traits
> Might as well make this:
>
>   template<>
>   struct default_hash_traits<cond_vmask_key>
>
> and then you can drop the third template parameter from hash_map.
> > +{
> > +  typedef cond_vmask_key value_type;
> > +  typedef cond_vmask_key compare_type;
> > +
> > +  static inline hashval_t hash (value_type v)
> > +  {
> > +    return v.hash ();
> > +  }
> > +
> > +  static inline bool equal (value_type existing, value_type candidate)
> > +  {
> > +    return existing == candidate;
> > +  }
> > +
> > +  static inline void mark_empty (value_type& v)
> > +  {
> > +    v.mark_empty ();
> > +  }
> > +
> > +  static inline bool is_empty (value_type v)
> > +  {
> > +    return v.is_empty ();
> > +  }
> Making hash (), mark_empty () and is_empty () forward to cond_vmask_key
> functions of the same name seems unnecessary.  I think we should just put
> the implementation here and not define the functions in cond_vmask_key
> itself.
> > +
> > +  static void mark_deleted (value_type&) {}
> > +
> > +  static inline bool is_deleted (value_type)
> > +  {
> > +    return false;
> > +  }
> > +
> > +  static inline void remove (value_type &) {}
> > +};
> > +
> > +typedef hash_map<cond_vmask_key, tree,
> > +                 simple_hashmap_traits <cond_vmask_key_traits, tree> >
> > +  cond_vmask_map_type;
> > +
> >  /* Vectorizer state shared between different analyses like vector sizes
> >     of the same CFG region.  */
> >  class vec_info_shared {
> > @@ -255,6 +331,8 @@ public:
> >    /* Cost data used by the target cost model.  */
> >    void *target_cost_data;
> >
> > +  cond_vmask_map_type *cond_to_vec_mask;
> > +
> >  private:
> >    stmt_vec_info new_stmt_vec_info (gimple *stmt);
> >    void set_vinfo_for_stmt (gimple *, stmt_vec_info);
> > diff --git a/gcc/tree.c b/gcc/tree.c
> > index 8f80012c6e8..32a8fcf1eb8 100644
> > --- a/gcc/tree.c
> > +++ b/gcc/tree.c
> > @@ -15204,6 +15204,44 @@ max_object_size (void)
> >    return TYPE_MAX_VALUE (ptrdiff_type_node);
> >  }
> >
> > +/* If code(T) is comparison op or def of comparison stmt,
> > +   extract it's operands.
> > +   Else return <NE_EXPR, T, 0>.  */
> > +
> > +tree_cond_ops::tree_cond_ops (tree t)
> > +{
> > +  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
> > +    {
> > +      this->code = TREE_CODE (t);
> > +      this->op0 = TREE_OPERAND (t, 0);
> > +      this->op1 = TREE_OPERAND (t, 1);
> > +      return;
> > +    }
> > +
> > +  else if (TREE_CODE (t) == SSA_NAME)
> Better as just an "if", given the early return above.
> > +    {
> > +      gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t));
> > +      if (stmt)
> > +        {
> > +          tree_code code = gimple_assign_rhs_code (stmt);
> > +          if (TREE_CODE_CLASS (code) == tcc_comparison)
> > +            {
> > +              this->code = code;
> > +              this->op0 = gimple_assign_rhs1 (stmt);
> > +              this->op1 = gimple_assign_rhs2 (stmt);
> > +              return;
> > +            }
> > +        }
> > +
> > +      this->code = NE_EXPR;
> > +      this->op0 = t;
> > +      this->op1 = build_zero_cst (TREE_TYPE (t));
> Think we should use this as the default for non-SSA_NAMEs too,
> rather than assert below.
> > +    }
> > +
> > +  else
> > +    gcc_unreachable ();
> > +}
> > +
> >  #if CHECKING_P
> >
> >  namespace selftest {
> > diff --git a/gcc/tree.h b/gcc/tree.h
> > index 94dbb95a78a..e6d6e9541c3 100644
> > --- a/gcc/tree.h
> > +++ b/gcc/tree.h
> > @@ -6141,4 +6141,25 @@ public:
> >    operator location_t () const { return m_combined_loc; }
> >  };
> >
> > +struct tree_cond_ops
> > +{
> > +  tree_code code;
> > +  tree op0;
> > +  tree op1;
> > +
> > +  tree_cond_ops (tree);
> > +};
> > +
> > +/* ??? Not sure if it's good idea to include fold-const.h
> > +   only for operand_equal_p ?  */
> Maybe put the new stuff in fold-const.h?
> > +extern bool operand_equal_p (const_tree, const_tree, unsigned int);
> > +
> > +inline bool
> > +operator== (const tree_cond_ops& o1, const tree_cond_ops &o2)
> > +{
> > +  return o1.code == o2.code
> > +         && operand_equal_p (o1.op0, o2.op0, 0)
> > +         && operand_equal_p (o1.op1, o2.op1, 0);
> Multi-line expression should be enclosed in brackets.

Thanks for the suggestions, I tried addressing these in the attached patch.
Does it look OK?

Thanks,
Prathamesh

> Thanks,
> Richard
>
> > +}
> > +
> >  #endif /* GCC_TREE_H */
>
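To spell out the idea the patch implements, in case it helps review: each
"vec_mask & loop_mask" result is keyed on the (comparison code, operands,
loop mask) tuple, so a masked load/store and a later VEC_COND_EXPR with the
same, or inverted, condition reuse one mask instead of emitting a second
BIT_AND_EXPR.  Below is a rough standalone sketch of the scheme; it is an
illustrative analogue only (std::map stands in for GCC's hash_map, strings
stand in for trees and SSA names, and every name in it is invented):

#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <utility>

enum cmp_code { LT_EXPR, GE_EXPR };

/* Stand-in for invert_tree_comparison on this tiny enum.  */
static cmp_code
invert_cmp (cmp_code c)
{
  return c == LT_EXPR ? GE_EXPR : LT_EXPR;
}

/* (code, op0, op1, loop_mask) -> cached "vec_mask & loop_mask" name.  */
typedef std::tuple<int, std::string, std::string, std::string> cond_key;
static std::map<cond_key, std::string> cond_to_vec_mask;

/* Mimics prepare_load_store_mask: "emit" the AND once, then reuse it.  */
static std::string
prepare_mask (cmp_code code, const std::string &op0, const std::string &op1,
              const std::string &loop_mask)
{
  cond_key k (code, op0, op1, loop_mask);
  std::map<cond_key, std::string>::iterator it = cond_to_vec_mask.find (k);
  if (it != cond_to_vec_mask.end ())
    return it->second;          /* cache hit: no second BIT_AND_EXPR */
  std::string and_res = "vec_mask_and." + std::to_string (cond_to_vec_mask.size ());
  std::cout << and_res << " = cmp (" << op0 << ", " << op1 << ") & "
            << loop_mask << "\n";
  cond_to_vec_mask[k] = and_res;
  return and_res;
}

/* Mimics the vectorizable_condition lookup: try the condition as-is,
   then inverted (swapping the arms), before computing a fresh mask.  */
static std::string
cond_mask (cmp_code code, const std::string &op0, const std::string &op1,
           const std::string &loop_mask, std::string &then_c, std::string &else_c)
{
  std::map<cond_key, std::string>::iterator it
    = cond_to_vec_mask.find (cond_key (code, op0, op1, loop_mask));
  if (it != cond_to_vec_mask.end ())
    return it->second;
  it = cond_to_vec_mask.find (cond_key (invert_cmp (code), op0, op1, loop_mask));
  if (it != cond_to_vec_mask.end ())
    {
      std::swap (then_c, else_c);   /* inverted mask selects flipped arms */
      return it->second;
    }
  return prepare_mask (code, op0, op1, loop_mask);
}

int
main ()
{
  std::string t = "x", e = "y";
  prepare_mask (LT_EXPR, "a", "b", "loop_mask_8");   /* masked store's mask */
  std::cout << cond_mask (GE_EXPR, "a", "b", "loop_mask_8", t, e)
            << " selects then=" << t << ", else=" << e << "\n";
}

Running this "emits" one AND for the store and then reuses it for the
inverted condition, with the then/else arms swapped, which is exactly the
fmla_2.c situation the patch targets.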
diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index 0bd68b5e2d4..fbe2dd74375 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -14769,6 +14769,41 @@ tree_nonzero_bits (const_tree t)
   return wi::shwi (-1, TYPE_PRECISION (TREE_TYPE (t)));
 }
 
+/* If code(T) is a comparison op or the def of a comparison stmt,
+   extract its operands.
+   Else return <NE_EXPR, T, 0>.  */
+
+tree_cond_ops::tree_cond_ops (tree t)
+{
+  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
+    {
+      this->code = TREE_CODE (t);
+      this->op0 = TREE_OPERAND (t, 0);
+      this->op1 = TREE_OPERAND (t, 1);
+      return;
+    }
+
+  if (TREE_CODE (t) == SSA_NAME)
+    {
+      gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t));
+      if (stmt)
+        {
+          tree_code code = gimple_assign_rhs_code (stmt);
+          if (TREE_CODE_CLASS (code) == tcc_comparison)
+            {
+              this->code = code;
+              this->op0 = gimple_assign_rhs1 (stmt);
+              this->op1 = gimple_assign_rhs2 (stmt);
+              return;
+            }
+        }
+    }
+
+  this->code = NE_EXPR;
+  this->op0 = t;
+  this->op1 = build_zero_cst (TREE_TYPE (t));
+}
+
 #if CHECKING_P
 
 namespace selftest {
diff --git a/gcc/fold-const.h b/gcc/fold-const.h
index 54c850a3ee1..cb51a24b073 100644
--- a/gcc/fold-const.h
+++ b/gcc/fold-const.h
@@ -212,4 +212,22 @@ extern tree fold_build_pointer_plus_hwi_loc (location_t loc, tree ptr, HOST_WIDE
 #define fold_build_pointer_plus_hwi(p,o) \
         fold_build_pointer_plus_hwi_loc (UNKNOWN_LOCATION, p, o)
 
+
+struct tree_cond_ops
+{
+  tree_code code;
+  tree op0;
+  tree op1;
+
+  tree_cond_ops (tree);
+};
+
+inline bool
+operator== (const tree_cond_ops& o1, const tree_cond_ops &o2)
+{
+  return (o1.code == o2.code
+          && operand_equal_p (o1.op0, o2.op0, 0)
+          && operand_equal_p (o1.op1, o2.op1, 0));
+}
+
 #endif // GCC_FOLD_CONST_H
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
index 5c04bcdb3f5..a1b0667dab5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
@@ -15,5 +15,9 @@ f (double *restrict a, double *restrict b, double *restrict c,
     }
 }
 
-/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+/* See https://gcc.gnu.org/ml/gcc-patches/2019-08/msg01644.html
+   for XFAILing the below test.  */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tfmad\t} } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5..a0062edb9d8 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -8609,6 +8609,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       basic_block bb = bbs[i];
       stmt_vec_info stmt_info;
 
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+        loop_vinfo->vec_masked_cond_map = new vec_masked_cond_map_type (8);
+
       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
            gsi_next (&si))
         {
@@ -8717,6 +8720,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                 }
             }
         }
+
+      if (loop_vinfo->vec_masked_cond_map)
+        {
+          delete loop_vinfo->vec_masked_cond_map;
+          loop_vinfo->vec_masked_cond_map = 0;
+        }
     }                           /* BBs in loop */
 
   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1e2dfe5d22d..24c4d638da9 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1989,17 +1989,29 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 
 static tree
 prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
-                         gimple_stmt_iterator *gsi)
+                         gimple_stmt_iterator *gsi, tree mask,
+                         vec_info *vinfo)
 {
   gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
   if (!loop_mask)
     return vec_mask;
 
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
+
+  tree *slot = 0;
+  vec_masked_cond_key cond (mask, loop_mask);
+  if (vinfo->vec_masked_cond_map)
+    slot = &vinfo->vec_masked_cond_map->get_or_insert (cond);
+  if (slot && *slot)
+    return *slot;
+
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
                                           vec_mask, loop_mask);
   gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
+
+  if (slot)
+    *slot = and_res;
   return and_res;
 }
 
@@ -3514,8 +3526,10 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                   gcc_assert (ncopies == 1);
                   tree mask = vect_get_loop_mask (gsi, masks, vec_num,
                                                   vectype_out, i);
+                  tree scalar_mask = gimple_call_arg (gsi_stmt (*gsi), mask_opno);
                   vargs[mask_opno] = prepare_load_store_mask
-                    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
+                    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi,
+                     scalar_mask, vinfo);
                 }
 
               gcall *call;
@@ -3564,9 +3578,11 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
             {
               tree mask = vect_get_loop_mask (gsi, masks, ncopies,
                                               vectype_out, j);
+              tree scalar_mask = gimple_call_arg (gsi_stmt (*gsi), mask_opno);
               vargs[mask_opno]
                 = prepare_load_store_mask (TREE_TYPE (mask), mask,
-                                           vargs[mask_opno], gsi);
+                                           vargs[mask_opno], gsi,
+                                           scalar_mask, vinfo);
             }
 
           if (cfn == CFN_GOMP_SIMD_LANE)
@@ -8109,7 +8125,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                                               vectype, j);
           if (vec_mask)
             final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-                                                  vec_mask, gsi);
+                                                  vec_mask, gsi, mask,
+                                                  vinfo);
 
           gcall *call;
           if (final_mask)
@@ -8163,7 +8180,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                                                    vectype, vec_num * j + i);
               if (vec_mask)
                 final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-                                                      vec_mask, gsi);
+                                                      vec_mask, gsi, mask,
+                                                      vinfo);
 
               if (memory_access_type == VMAT_GATHER_SCATTER)
                 {
@@ -9304,7 +9322,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                                               vectype, j);
           if (vec_mask)
             final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-                                                  vec_mask, gsi);
+                                                  vec_mask, gsi, mask,
+                                                  vinfo);
 
           gcall *call;
           if (final_mask)
@@ -9355,7 +9374,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                                                    vectype, vec_num * j + i);
               if (vec_mask)
                 final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-                                                      vec_mask, gsi);
+                                                      vec_mask, gsi, mask,
+                                                      vinfo);
 
               if (i > 0)
                 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
@@ -9975,6 +9995,32 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   /* Handle cond expr.  */
   for (j = 0; j < ncopies; j++)
     {
+      tree vec_mask = NULL_TREE;
+
+      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+          && loop_vinfo->vec_masked_cond_map)
+        {
+          vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+          tree loop_mask = vect_get_loop_mask (gsi, masks,
+                                               ncopies, vectype, j);
+
+          vec_masked_cond_key cond (cond_expr, loop_mask);
+          tree *slot = loop_vinfo->vec_masked_cond_map->get (cond);
+          if (slot && *slot)
+            vec_mask = *slot;
+          else
+            {
+              cond.cond_ops.code
+                = invert_tree_comparison (cond.cond_ops.code, true);
+              slot = loop_vinfo->vec_masked_cond_map->get (cond);
+              if (slot && *slot)
+                {
+                  vec_mask = *slot;
+                  std::swap (then_clause, else_clause);
+                }
+            }
+        }
+
       stmt_vec_info new_stmt_info = NULL;
       if (j == 0)
         {
@@ -10054,6 +10100,8 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 
           if (masked)
             vec_compare = vec_cond_lhs;
+          else if (vec_mask)
+            vec_compare = vec_mask;
           else
             {
               vec_cond_rhs = vec_oprnds1[i];
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index dc181524744..721d3295ca6 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -461,7 +461,8 @@ vec_info::vec_info (vec_info::vec_kind kind_in, void *target_cost_data_in,
                     vec_info_shared *shared_)
   : kind (kind_in),
     shared (shared_),
-    target_cost_data (target_cost_data_in)
+    target_cost_data (target_cost_data_in),
+    vec_masked_cond_map (0)
 {
   stmt_vec_infos.create (50);
 }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c..3e0afac840f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -26,6 +26,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
 #include "tree-data-ref.h"
 #include "tree-hash-traits.h"
 #include "target.h"
+#include "hash-map.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -193,6 +194,60 @@ public:
   poly_uint64 min_value;
 };
 
+struct vec_masked_cond_key
+{
+  vec_masked_cond_key (tree t, tree loop_mask_)
+    : cond_ops (t), loop_mask (loop_mask_)
+  {}
+
+  tree_cond_ops cond_ops;
+  tree loop_mask;
+};
+
+template<>
+struct default_hash_traits<vec_masked_cond_key>
+{
+  typedef vec_masked_cond_key value_type;
+  typedef vec_masked_cond_key compare_type;
+
+  static inline hashval_t hash (value_type v)
+  {
+    inchash::hash h;
+    h.add_int (v.cond_ops.code);
+    inchash::add_expr (v.cond_ops.op0, h, 0);
+    inchash::add_expr (v.cond_ops.op1, h, 0);
+    h.add_int (TREE_HASH (v.loop_mask));
+    return h.end ();
+  }
+
+  static inline bool equal (value_type existing, value_type candidate)
+  {
+    return (existing.loop_mask == candidate.loop_mask
+            && existing.cond_ops == candidate.cond_ops);
+  }
+
+  static inline void mark_empty (value_type& v)
+  {
+    v.loop_mask = NULL_TREE;
+  }
+
+  static inline bool is_empty (value_type v)
+  {
+    return v.loop_mask == NULL_TREE;
+  }
+
+  static void mark_deleted (value_type&) {}
+
+  static inline bool is_deleted (value_type)
+  {
+    return false;
+  }
+
+  static inline void remove (value_type &) {}
+};
+
+typedef hash_map<vec_masked_cond_key, tree> vec_masked_cond_map_type;
+
 /* Vectorizer state shared between different analyses like vector sizes
    of the same CFG region.  */
 class vec_info_shared {
@@ -255,6 +310,8 @@ public:
   /* Cost data used by the target cost model.  */
   void *target_cost_data;
 
+  vec_masked_cond_map_type *vec_masked_cond_map;
+
 private:
   stmt_vec_info new_stmt_vec_info (gimple *stmt);
   void set_vinfo_for_stmt (gimple *, stmt_vec_info);
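One last note on the hash/equality pairing in
default_hash_traits<vec_masked_cond_key>: since equal () compares with
operand_equal_p (structural equality), hash () has to be structural as
well, which is why the TREE_HASH calls became inchash::add_expr.  Hashing
pointer identity while comparing structurally can put two equal keys in
different buckets, so lookups silently stop hitting.  A minimal standalone
demonstration of that contract (illustrative C++ only, not GCC code;
"expr" is an invented stand-in for a tree node):

#include <cassert>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_set>

struct expr { std::string text; };      /* stand-in for a tree node */

/* Structural equality, like operand_equal_p.  */
struct structural_eq
{
  bool operator() (const expr *a, const expr *b) const
  { return a->text == b->text; }
};

/* Structural hash, like inchash::add_expr: equal keys => equal hashes.  */
struct structural_hash
{
  size_t operator() (const expr *e) const
  { return std::hash<std::string> () (e->text); }
};

int
main ()
{
  expr a = { "a < b" }, b = { "a < b" };  /* distinct nodes, structurally equal */

  /* Identity hashing (what TREE_HASH amounts to) typically gives these two
     keys different values, breaking the container's hash/equality contract.  */
  std::cout << "identity hashes: " << std::hash<const expr *> () (&a)
            << " vs " << std::hash<const expr *> () (&b) << "\n";

  /* With a structural hash, the structural-equality lookup hits.  */
  std::unordered_set<const expr *, structural_hash, structural_eq> cache;
  cache.insert (&a);
  assert (cache.count (&b) == 1);
  std::cout << "structural lookup hits\n";
}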