Hi Hongtao,
On 2023/6/14 16:17, Hongtao Liu wrote:
> On Tue, Jun 13, 2023 at 10:07 AM Kewen Lin via Gcc-patches
> <[email protected]> wrote:
>>
>> This patch adjusts the cost handling for
>> VMAT_CONTIGUOUS_PERMUTE in function vectorizable_load; we no
>> longer call vect_model_load_cost for it.
>>
>> As the affected test case gcc.target/i386/pr70021.c shows,
>> the previous costing could under-cost the total number of
>> generated vector loads: for VMAT_CONTIGUOUS_PERMUTE,
>> vect_model_load_cost doesn't consider the group size, which
>> is used as vec_num during the transformation.
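>>
>> For reference, a grouped access of the kind classified as
>> VMAT_CONTIGUOUS_PERMUTE typically looks like the sketch below
>> (illustrative only, with an assumed group size of 4; this is
>> not the literal pr70021.c loop):
>>
>>   /* Interleaved (grouped) load with group size 4: the vectorizer
>>      emits contiguous vector loads covering b[4*i] .. b[4*i+3] and
>>      then permutes the lanes apart.  */
>>   void
>>   foo (int *restrict out, int *restrict b, int n)
>>   {
>>     for (int i = 0; i < n; i++)
>>       out[i] = b[4 * i] + b[4 * i + 1] + b[4 * i + 2] + b[4 * i + 3];
>>   }
>>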
> The original PR was about a correctness issue, and I'm not sure how
> much of a performance impact the patch will have, but the change
> looks reasonable, so the test change looks OK to me.
> I'll track the performance impact on SPEC2017 to see if the patch
> causes any regression (I'd guess probably not).
Thanks for the feedback and further tracking! Hope this (and
this whole series) doesn't impact SPEC2017 performance on x86. :)
BR,
Kewen
>>
>> This patch makes the count of vector loads in costing
>> consistent with what we generate during the transformation.
>> To be more specific, for the given test case, the memory
>> access b[i_20] was previously costed as 2 vector loads; with
>> this patch it is costed as 8 instead, matching the final
>> count of vector loads generated from base b. This costing
>> change makes the cost model analysis conclude that
>> vectorizing the first loop is not profitable, so this patch
>> adjusts the test case to run without the vect cost model.
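>>
>> As a back-of-the-envelope check (the concrete values here are
>> assumptions inferred from the 2 -> 8 change, not read from a
>> cost-model dump), the counts work out as:
>>
>>   #include <stdio.h>
>>
>>   int
>>   main (void)
>>   {
>>     int ncopies = 2;      /* copies costed by the old code (assumed) */
>>     int group_size = 4;   /* assumed DR_GROUP_SIZE, since 8 / 2 = 4  */
>>     int old_nloads = ncopies;              /* 2: group size ignored  */
>>     int new_nloads = ncopies * group_size; /* 8: matches transform   */
>>     printf ("old %d, new %d\n", old_nloads, new_nloads);
>>     return 0;
>>   }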
>>
>> But note that this test case also exposes something we can
>> improve further: although the numbers of vector permutations
>> we cost and generate are consistent, DCE can later optimize
>> some unused permutations away. It would be good if we could
>> predict that and generate (and cost) only the necessary
>> permutations.
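>>
>> For completeness, the permute count in the costing below follows
>> ceil_log2 (group_size) * group_size; a standalone sketch (with an
>> assumed group_size of 4, and ceil_log2_ as a stand-in for GCC's
>> internal ceil_log2) works out as:
>>
>>   #include <stdio.h>
>>
>>   /* Stand-in for GCC's ceil_log2: smallest l with (1 << l) >= x.  */
>>   static int
>>   ceil_log2_ (unsigned int x)
>>   {
>>     int l = 0;
>>     while ((1u << l) < x)
>>       l++;
>>     return l;
>>   }
>>
>>   int
>>   main (void)
>>   {
>>     int group_size = 4;  /* assumed example value */
>>     int nstmts = ceil_log2_ (group_size) * group_size;
>>     printf ("%d\n", nstmts);  /* 2 * 4 = 8 permutes costed per copy;
>>                                  DCE may later delete unused ones.  */
>>     return 0;
>>   }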
>>
>> gcc/ChangeLog:
>>
>>         * tree-vect-stmts.cc (vect_model_load_cost): Assert this
>>         function only handles memory_access_type VMAT_CONTIGUOUS and
>>         remove some VMAT_CONTIGUOUS_PERMUTE related handling.
>>         (vectorizable_load): Adjust the cost handling on
>>         VMAT_CONTIGUOUS_PERMUTE without calling vect_model_load_cost.
>>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.target/i386/pr70021.c: Adjust with -fno-vect-cost-model.
>> ---
>> gcc/testsuite/gcc.target/i386/pr70021.c | 2 +-
>> gcc/tree-vect-stmts.cc | 88 ++++++++++++++-----------
>> 2 files changed, 51 insertions(+), 39 deletions(-)
>>
>> diff --git a/gcc/testsuite/gcc.target/i386/pr70021.c b/gcc/testsuite/gcc.target/i386/pr70021.c
>> index 6562c0f2bd0..d509583601e 100644
>> --- a/gcc/testsuite/gcc.target/i386/pr70021.c
>> +++ b/gcc/testsuite/gcc.target/i386/pr70021.c
>> @@ -1,7 +1,7 @@
>> /* PR target/70021 */
>> /* { dg-do run } */
>> /* { dg-require-effective-target avx2 } */
>> -/* { dg-options "-O2 -ftree-vectorize -mavx2 -fdump-tree-vect-details -mtune=skylake" } */
>> +/* { dg-options "-O2 -ftree-vectorize -mavx2 -fdump-tree-vect-details -mtune=skylake -fno-vect-cost-model" } */
>>
>> #include "avx2-check.h"
>>
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index 7f8d9db5363..e7a97dbe05d 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -1134,8 +1134,7 @@ vect_model_load_cost (vec_info *vinfo,
>> slp_tree slp_node,
>> stmt_vector_for_cost *cost_vec)
>> {
>> - gcc_assert (memory_access_type == VMAT_CONTIGUOUS
>> - || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>> + gcc_assert (memory_access_type == VMAT_CONTIGUOUS);
>>
>> unsigned int inside_cost = 0, prologue_cost = 0;
>> bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
>> @@ -1174,26 +1173,6 @@ vect_model_load_cost (vec_info *vinfo,
>> once per group anyhow. */
>> bool first_stmt_p = (first_stmt_info == stmt_info);
>>
>> - /* We assume that the cost of a single load-lanes instruction is
>> - equivalent to the cost of DR_GROUP_SIZE separate loads. If a grouped
>> - access is instead being provided by a load-and-permute operation,
>> - include the cost of the permutes. */
>> - if (first_stmt_p
>> - && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
>> - {
>> - /* Uses an even and odd extract operations or shuffle operations
>> - for each needed permute. */
>> - int group_size = DR_GROUP_SIZE (first_stmt_info);
>> - int nstmts = ncopies * ceil_log2 (group_size) * group_size;
>> - inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
>> - stmt_info, 0, vect_body);
>> -
>> - if (dump_enabled_p ())
>> - dump_printf_loc (MSG_NOTE, vect_location,
>> -                         "vect_model_load_cost: strided group_size = %d .\n",
>> -                         group_size);
>> - }
>> -
>> vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
>>                       misalignment, first_stmt_p, &inside_cost, &prologue_cost,
>> cost_vec, cost_vec, true);
>> @@ -10652,11 +10631,22 @@ vectorizable_load (vec_info *vinfo,
>> alignment support schemes. */
>> if (costing_p)
>> {
>> - if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>> +              /* For VMAT_CONTIGUOUS_PERMUTE, if this is a grouped load
>> +                 we only need to take care of the first stmt, whose
>> +                 stmt_info is first_stmt_info; iterating vec_num times on
>> +                 it covers the cost for the remaining stmts, consistent
>> +                 with the transform.  The prologue cost for realign only
>> +                 needs to be counted once for the whole group.  */
>> + bool first_stmt_info_p = first_stmt_info == stmt_info;
>> + bool add_realign_cost = first_stmt_info_p && i == 0;
>> + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE
>> + || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
>> + && (!grouped_load || first_stmt_info_p)))
>> vect_get_load_cost (vinfo, stmt_info, 1,
>>                                   alignment_support_scheme, misalignment,
>> - false, &inside_cost, &prologue_cost,
>> - cost_vec, cost_vec, true);
>> + add_realign_cost, &inside_cost,
>> + &prologue_cost, cost_vec, cost_vec,
>> + true);
>> }
>> else
>> {
>> @@ -10774,8 +10764,7 @@ vectorizable_load (vec_info *vinfo,
>> ??? This is a hack to prevent compile-time issues as seen
>> in PR101120 and friends. */
>> if (costing_p
>> - && memory_access_type != VMAT_CONTIGUOUS
>> - && memory_access_type != VMAT_CONTIGUOUS_PERMUTE)
>> + && memory_access_type != VMAT_CONTIGUOUS)
>> {
>>               vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf,
>> true, &n_perms, nullptr);
>> @@ -10790,20 +10779,44 @@ vectorizable_load (vec_info *vinfo,
>> gcc_assert (ok);
>> }
>> }
>> - else if (!costing_p)
>> + else
>> {
>> if (grouped_load)
>> {
>> if (memory_access_type != VMAT_LOAD_STORE_LANES)
>> - vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
>> - group_size, gsi);
>> - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>> - }
>> - else
>> - {
>> - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> + {
>> + gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>> +                  /* We assume that the cost of a single load-lanes instruction
>> +                     is equivalent to the cost of DR_GROUP_SIZE separate loads.
>> + If a grouped access is instead being provided by a
>> + load-and-permute operation, include the cost of the
>> + permutes. */
>> + if (costing_p && first_stmt_info == stmt_info)
>> + {
>> + /* Uses an even and odd extract operations or shuffle
>> + operations for each needed permute. */
>> + int group_size = DR_GROUP_SIZE (first_stmt_info);
>> + int nstmts = ceil_log2 (group_size) * group_size;
>> + inside_cost
>> + += record_stmt_cost (cost_vec, nstmts, vec_perm,
>> + stmt_info, 0, vect_body);
>> +
>> + if (dump_enabled_p ())
>> +                        dump_printf_loc (
>> +                          MSG_NOTE, vect_location,
>> +                          "vect_model_load_cost: strided group_size = %d .\n",
>> +                          group_size);
>> + }
>> + else if (!costing_p)
>> + vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
>> + group_size, gsi);
>> + }
>> + if (!costing_p)
>> + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>> }
>> - }
>> + else if (!costing_p)
>> + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> + }
>> dr_chain.release ();
>> }
>> if (!slp && !costing_p)
>> @@ -10814,8 +10827,7 @@ vectorizable_load (vec_info *vinfo,
>> gcc_assert (memory_access_type != VMAT_INVARIANT
>> && memory_access_type != VMAT_ELEMENTWISE
>> && memory_access_type != VMAT_STRIDED_SLP);
>> - if (memory_access_type != VMAT_CONTIGUOUS
>> - && memory_access_type != VMAT_CONTIGUOUS_PERMUTE)
>> + if (memory_access_type != VMAT_CONTIGUOUS)
>> {
>> if (dump_enabled_p ())
>> dump_printf_loc (MSG_NOTE, vect_location,
>> --
>> 2.31.1
>>