On Thu, Aug 5, 2021 at 2:04 PM Richard Sandiford
<[email protected]> wrote:
>
> Richard Biener <[email protected]> writes:
> > On Tue, Aug 3, 2021 at 2:09 PM Richard Sandiford via Gcc-patches
> > <[email protected]> wrote:
> >>
> >> When the vectoriser scalarises a strided store, it counts one
> >> scalar_store for each element plus one vec_to_scalar extraction
> >> for each element. However, extracting element 0 is free on AArch64,
> >> so it should have zero cost.
> >>
> >> I don't have a testcase that requires this for existing -mtune
> >> options, but it becomes more important with a later patch.
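
The case being costed is, roughly, a loop like the sketch below (purely
illustrative, not a testcase from the patch): without a scatter store, each
vector lane has to be extracted with a vec_to_scalar before its scalar_store,
and the lane-0 extract is the one that an FP store can do directly.

  /* Illustrative sketch only: a strided store that gets scalarised.  */
  void
  f (float *restrict dst, const float *restrict src, int stride, int n)
  {
    for (int i = 0; i < n; ++i)
      dst[i * stride] = src[i] + 1.0f;
  }
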
> >>
> >> gcc/
> >> * config/aarch64/aarch64.c (aarch64_is_store_elt_extraction): New
> >> function, split out from...
> >> (aarch64_detect_vector_stmt_subtype): ...here.
> >> (aarch64_add_stmt_cost): Treat extracting element 0 as free.
> >> ---
> >> gcc/config/aarch64/aarch64.c | 22 +++++++++++++++++++---
> >> 1 file changed, 19 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> >> index 36f11808916..084f8caa0da 100644
> >> --- a/gcc/config/aarch64/aarch64.c
> >> +++ b/gcc/config/aarch64/aarch64.c
> >> @@ -14622,6 +14622,18 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> >> }
> >> }
> >>
> >> +/* Return true if an operation of kind KIND for STMT_INFO represents
> >> + the extraction of an element from a vector in preparation for
> >> + storing the element to memory. */
> >> +static bool
> >> +aarch64_is_store_elt_extraction (vect_cost_for_stmt kind,
> >> + stmt_vec_info stmt_info)
> >> +{
> >> + return (kind == vec_to_scalar
> >> + && STMT_VINFO_DATA_REF (stmt_info)
> >> + && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)));
> >> +}
> >
> > It would be nice to put functions like this in tree-vectorizer.h in some
> > section marked with a comment to contain helpers for the target
> > add_stmt_cost.
>
> Yeah, I guess that would avoid pointless cut-&-paste between targets.
> How does this look? Tested on aarch64-linux-gnu and x86_64-linux-gnu.
Looks good besides ...
> Thanks,
> Richard
>
>
> gcc/
> * tree-vectorizer.h (vect_is_store_elt_extraction, vect_is_reduction)
> (vect_reduc_type, vect_embedded_comparison_type, vect_comparison_type)
> (vect_is_extending_load, vect_is_integer_truncation): New functions,
> moved from aarch64.c but given different names.
> * config/aarch64/aarch64.c (aarch64_is_store_elt_extraction)
> (aarch64_is_reduction, aarch64_reduc_type)
> (aarch64_embedded_comparison_type, aarch64_comparison_type)
> (aarch64_extending_load_p, aarch64_integer_truncation_p): Delete
> in favor of the above. Update callers accordingly.
> ---
> gcc/config/aarch64/aarch64.c | 125 ++++-------------------------------
> gcc/tree-vectorizer.h | 104 +++++++++++++++++++++++++++++
> 2 files changed, 118 insertions(+), 111 deletions(-)
>
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index deb22477e28..fd8681747ca 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2192,4 +2192,108 @@ extern vect_pattern_decl_t slp_patterns[];
> /* Number of supported pattern matchers. */
> extern size_t num__slp_patterns;
>
> +/* ----------------------------------------------------------------------
> + Target support routines
> + -----------------------------------------------------------------------
> + The following routines are provided to simplify costing decisions in
> + target code. Please add more as needed. */
> +
> +/* Return true if an operation of kind KIND for STMT_INFO represents
> + the extraction of an element from a vector in preparation for
> + storing the element to memory. */
> +inline bool
> +vect_is_store_elt_extraction (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
> +{
> + return (kind == vec_to_scalar
> + && STMT_VINFO_DATA_REF (stmt_info)
> + && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)));
> +}
> +
> +/* Return true if STMT_INFO represents part of a reduction. */
> +inline bool
> +vect_is_reduction (stmt_vec_info stmt_info)
> +{
> + return (STMT_VINFO_REDUC_DEF (stmt_info)
> + || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
> +}
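
As a purely illustrative aside, the statement updating `s' in a loop like the
one below is the kind of thing vect_is_reduction is meant to match:

  /* Illustrative sketch only.  */
  float
  sum (const float *x, int n)
  {
    float s = 0.0f;
    for (int i = 0; i < n; ++i)
      s += x[i];
    return s;
  }
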
> +
> +/* If STMT_INFO describes a reduction, return the type of reduction
> + it describes, otherwise return -1. */
> +inline int
It's not clear what 'type of reduction' means here - why not return
enum vect_reduction_type?
Because of the -1?  Maybe we can simply add a NOT_REDUCTION member
to the enum?  Or simply adjust the comment to say "return the
vect_reduction_type of the reduction it describes, otherwise return -1"?
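
The enum version would be roughly the sketch below - purely illustrative,
with NOT_REDUCTION being the hypothetical new enumerator (it doesn't exist
today):

  inline vect_reduction_type
  vect_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
  {
    if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
      if (STMT_VINFO_REDUC_DEF (stmt_info))
        {
          stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
          return STMT_VINFO_REDUC_TYPE (reduc_info);
        }
    return NOT_REDUCTION;
  }
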
> +vect_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
> +{
> + if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
> + if (STMT_VINFO_REDUC_DEF (stmt_info))
> + {
> + stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> + return int (STMT_VINFO_REDUC_TYPE (reduc_info));
> + }
> + return -1;
> +}
> +
> +/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the
> + scalar type of the values being compared. Return null otherwise. */
> +inline tree
> +vect_embedded_comparison_type (stmt_vec_info stmt_info)
> +{
> + if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
> + if (gimple_assign_rhs_code (assign) == COND_EXPR)
> + {
> + tree cond = gimple_assign_rhs1 (assign);
> + if (COMPARISON_CLASS_P (cond))
> + return TREE_TYPE (TREE_OPERAND (cond, 0));
> + }
> + return NULL_TREE;
> +}
> +
> +/* If STMT_INFO is a comparison or contains an embedded comparison, return the
> + scalar type of the values being compared. Return null otherwise. */
> +inline tree
> +vect_comparison_type (stmt_vec_info stmt_info)
> +{
> + if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
> + if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison)
> + return TREE_TYPE (gimple_assign_rhs1 (assign));
> + return vect_embedded_comparison_type (stmt_info);
> +}
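
To illustrate the difference between the two helpers, purely as an example
(not from the patch):

  /* Illustrative sketch only.  */
  void
  g (double a, double b, double c, double d, int *flag, double *x)
  {
    *flag = a < b;        /* only vect_comparison_type sees this one */
    *x = a < b ? c : d;   /* both helpers see the comparison embedded
                             in the COND_EXPR */
  }
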
> +
> +/* Return true if STMT_INFO extends the result of a load. */
> +inline bool
> +vect_is_extending_load (class vec_info *vinfo, stmt_vec_info stmt_info)
> +{
> + /* Although this is quite large for an inline function, this part
> + at least should be inline. */
> + gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
> + if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
> + return false;
> +
> + tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
> + tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
> + tree rhs_type = TREE_TYPE (rhs);
> + if (!INTEGRAL_TYPE_P (lhs_type)
> + || !INTEGRAL_TYPE_P (rhs_type)
> + || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
> + return false;
> +
> + stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
> + return (def_stmt_info
> + && STMT_VINFO_DATA_REF (def_stmt_info)
> + && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
> +}
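
For the record, the kind of source this is meant to catch is sketched below
(illustrative only); the expectation is that the extension folds into the
load itself, e.g. an extending load instruction:

  /* Illustrative sketch only.  */
  #include <stdint.h>
  void
  widen (uint32_t *restrict dst, const uint16_t *restrict src, int n)
  {
    for (int i = 0; i < n; ++i)
      dst[i] = src[i];   /* 16-bit load whose result is zero-extended */
  }
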
> +
> +/* Return true if STMT_INFO is an integer truncation. */
> +inline bool
> +vect_is_integer_truncation (stmt_vec_info stmt_info)
> +{
> + gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
> + if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
> + return false;
> +
> + tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
> + tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
> + return (INTEGRAL_TYPE_P (lhs_type)
> + && INTEGRAL_TYPE_P (rhs_type)
> + && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
> +}
> +
> #endif /* GCC_TREE_VECTORIZER_H */
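
And an illustrative example of what vect_is_integer_truncation matches (again
not from the patch); the rationale in the aarch64 code further down is that
such truncations can be free because the unused upper bits of the source can
simply be ignored:

  /* Illustrative sketch only.  */
  #include <stdint.h>
  void
  narrow (uint8_t *restrict dst, const uint32_t *restrict src, int n)
  {
    for (int i = 0; i < n; ++i)
      dst[i] = (uint8_t) src[i];
  }
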
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index e02cbcbcb38..a4456a86764 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -14790,40 +14790,6 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> }
> }
>
> -/* Return true if an operation of kind KIND for STMT_INFO represents
> - the extraction of an element from a vector in preparation for
> - storing the element to memory. */
> -static bool
> -aarch64_is_store_elt_extraction (vect_cost_for_stmt kind,
> - stmt_vec_info stmt_info)
> -{
> - return (kind == vec_to_scalar
> - && STMT_VINFO_DATA_REF (stmt_info)
> - && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)));
> -}
> -
> -/* Return true if STMT_INFO represents part of a reduction. */
> -static bool
> -aarch64_is_reduction (stmt_vec_info stmt_info)
> -{
> - return (STMT_VINFO_REDUC_DEF (stmt_info)
> - || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
> -}
> -
> -/* If STMT_INFO describes a reduction, return the type of reduction
> - it describes, otherwise return -1. */
> -static int
> -aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
> -{
> - if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
> - if (STMT_VINFO_REDUC_DEF (stmt_info))
> - {
> - stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> - return int (STMT_VINFO_REDUC_TYPE (reduc_info));
> - }
> - return -1;
> -}
> -
> /* Return true if an access of kind KIND for STMT_INFO represents one
> vector of an LD[234] or ST[234] operation. Return the total number of
> vectors (2, 3 or 4) if so, otherwise return a value outside that range.  */
> @@ -14844,32 +14810,6 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
> return 0;
> }
>
> -/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the
> - scalar type of the values being compared. Return null otherwise. */
> -static tree
> -aarch64_embedded_comparison_type (stmt_vec_info stmt_info)
> -{
> - if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
> - if (gimple_assign_rhs_code (assign) == COND_EXPR)
> - {
> - tree cond = gimple_assign_rhs1 (assign);
> - if (COMPARISON_CLASS_P (cond))
> - return TREE_TYPE (TREE_OPERAND (cond, 0));
> - }
> - return NULL_TREE;
> -}
> -
> -/* If STMT_INFO is a comparison or contains an embedded comparison, return the
> - scalar type of the values being compared. Return null otherwise. */
> -static tree
> -aarch64_comparison_type (stmt_vec_info stmt_info)
> -{
> - if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
> - if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison)
> - return TREE_TYPE (gimple_assign_rhs1 (assign));
> - return aarch64_embedded_comparison_type (stmt_info);
> -}
> -
> /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
> vectors would produce a series of LDP or STP operations. KIND is the
> kind of statement that STMT_INFO represents. */
> @@ -14896,43 +14836,6 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
> return is_gimple_assign (stmt_info->stmt);
> }
>
> -/* Return true if STMT_INFO extends the result of a load. */
> -static bool
> -aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
> -{
> - gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
> - if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
> - return false;
> -
> - tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
> - tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
> - tree rhs_type = TREE_TYPE (rhs);
> - if (!INTEGRAL_TYPE_P (lhs_type)
> - || !INTEGRAL_TYPE_P (rhs_type)
> - || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
> - return false;
> -
> - stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
> - return (def_stmt_info
> - && STMT_VINFO_DATA_REF (def_stmt_info)
> - && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
> -}
> -
> -/* Return true if STMT_INFO is an integer truncation. */
> -static bool
> -aarch64_integer_truncation_p (stmt_vec_info stmt_info)
> -{
> - gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
> - if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
> - return false;
> -
> - tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
> - tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
> - return (INTEGRAL_TYPE_P (lhs_type)
> - && INTEGRAL_TYPE_P (rhs_type)
> - && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
> -}
> -
> /* Return true if STMT_INFO is the second part of a two-statement multiply-add
> or multiply-subtract sequence that might be suitable for fusing into a
> single instruction. If VEC_FLAGS is zero, analyze the operation as
> @@ -15035,7 +14938,7 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
> tree vectype,
> const sve_vec_cost *sve_costs)
> {
> - switch (aarch64_reduc_type (vinfo, stmt_info))
> + switch (vect_reduc_type (vinfo, stmt_info))
> {
> case EXTRACT_LAST_REDUCTION:
> return sve_costs->clast_cost;
> @@ -15126,7 +15029,7 @@ aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
> {
> /* Detect an extension of a loaded value. In general, we'll be able to fuse
> the extension with the load. */
> - if (kind == scalar_stmt && aarch64_extending_load_p (vinfo, stmt_info))
> + if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
> return 0;
>
> return stmt_cost;
> @@ -15158,7 +15061,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
> /* Detect cases in which vec_to_scalar is describing the extraction of a
> vector element in preparation for a scalar store. The store itself is
> costed separately. */
> - if (aarch64_is_store_elt_extraction (kind, stmt_info))
> + if (vect_is_store_elt_extraction (kind, stmt_info))
> return simd_costs->store_elt_extra_cost;
>
> /* Detect SVE gather loads, which are costed as a single scalar_load
> @@ -15197,7 +15100,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
> instruction like FADDP or MAXV. */
> if (kind == vec_to_scalar
> && where == vect_epilogue
> - && aarch64_is_reduction (stmt_info))
> + && vect_is_reduction (stmt_info))
> switch (GET_MODE_INNER (TYPE_MODE (vectype)))
> {
> case E_QImode:
> @@ -15247,12 +15150,12 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
> on the fly. Optimistically assume that a load followed by an extension
> will fold to this form during combine, and that the extension therefore
> comes for free. */
> - if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
> + if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
> stmt_cost = 0;
>
> /* For similar reasons, vector_stmt integer truncations are a no-op,
> because we can just ignore the unused upper bits of the source. */
> - if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
> + if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
> stmt_cost = 0;
>
> /* Advanced SIMD can load and store pairs of registers using LDP and STP,
> @@ -15327,7 +15230,7 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
> }
>
> if (kind == vector_stmt || kind == vec_to_scalar)
> - if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info))
> + if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
> {
> if (FLOAT_TYPE_P (cmp_type))
> stmt_cost += simd_costs->fp_stmt_cost;
> @@ -15337,7 +15240,7 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
> }
>
> if (kind == scalar_stmt)
> - if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info))
> + if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
> {
> if (FLOAT_TYPE_P (cmp_type))
> stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
> @@ -15387,12 +15290,12 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
> /* Calculate the minimum cycles per iteration imposed by a reduction
> operation. */
> if ((kind == vector_stmt || kind == vec_to_scalar)
> - && aarch64_is_reduction (stmt_info))
> + && vect_is_reduction (stmt_info))
> {
> unsigned int base
> = aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype,
> vec_flags);
> - if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
> + if (vect_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
> {
> if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
> {
> @@ -15491,7 +15394,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
>
> /* Add any embedded comparison operations. */
> if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
> - && aarch64_embedded_comparison_type (stmt_info))
> + && vect_embedded_comparison_type (stmt_info))
> ops->general_ops += num_copies;
>
> /* Detect COND_REDUCTIONs and things that would need to become
> @@ -15500,7 +15403,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
> have only accounted for one. */
> if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar))
> {
> - int reduc_type = aarch64_reduc_type (vinfo, stmt_info);
> + int reduc_type = vect_reduc_type (vinfo, stmt_info);
> if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD))
> || reduc_type == COND_REDUCTION)
> ops->general_ops += num_copies;
> @@ -15508,7 +15411,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
>
> /* Count the predicate operations needed by an SVE comparison. */
> if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
> - if (tree type = aarch64_comparison_type (stmt_info))
> + if (tree type = vect_comparison_type (stmt_info))
> {
> unsigned int base = (FLOAT_TYPE_P (type)
> ? sve_issue->fp_cmp_pred_ops
> @@ -15586,7 +15489,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
> /* If we scalarize a strided store, the vectorizer costs one
> vec_to_scalar for each element. However, we can store the first
> element using an FP store without a separate extract step. */
> - if (aarch64_is_store_elt_extraction (kind, stmt_info))
> + if (vect_is_store_elt_extraction (kind, stmt_info))
> count -= 1;
>
> stmt_cost = aarch64_detect_scalar_stmt_subtype