On 05/17/2016 12:27 AM, Bin.Cheng wrote:
>> As profile-guided optimization can provide very useful information
>> about basic block frequencies within a loop, following patch set leverages
>> that information. It speeds up a single benchmark from upcoming SPECv6
>> suite by 20% (-O2 -profile-generate/-fprofile use) and I think it can
>> also improve others (currently measuring numbers for PGO).
> Hi,
> Is this 20% improvement from this patch, or does it include the
> existing PGO's improvement?
Hello.
It shows that current trunk (compared to GCC 6 branch)
has significantly improved the benchmark with PGO.
Currently, my patch improves PGO by ~5% w/ -O2, but our plan is to
improve static profile that would utilize the patch.
>
> For the patch:
>> +
>> + /* Return true if the frequency has a valid value. */
>> + bool has_frequency ();
>> +
>> /* Return infinite comp_cost. */
>> static comp_cost get_infinite ();
>>
>> @@ -249,6 +272,9 @@ private:
>> complexity field should be larger for more
>> complex expressions and addressing modes). */
>> int m_scratch; /* Scratch used during cost computation. */
>> + sreal m_frequency; /* Frequency of the basic block this comp_cost
>> + belongs to. */
>> + sreal m_cost_scaled; /* Scalled runtime cost. */
> IMHO we shouldn't embed frequency in comp_cost, neither record scaled
> cost in it. I would suggest we compute cost and amortize the cost
> over frequency in get_computation_cost_at before storing it into
> comp_cost. That is, once cost is computed/stored in comp_cost, it is
> already scaled with frequency. One argument is frequency info is only
> valid for use's statement/basic_block, it really doesn't have clear
> meaning in comp_cost structure. Outside of function
> get_computation_cost_at, I found it's hard to understand/remember
> what's the meaning of comp_cost.m_frequency and where it came from.
> There are other reasons embedded in below comments.
>>
>>
>> comp_cost&
>> @@ -257,6 +283,8 @@ comp_cost::operator= (const comp_cost& other)
>> m_cost = other.m_cost;
>> m_complexity = other.m_complexity;
>> m_scratch = other.m_scratch;
>> + m_frequency = other.m_frequency;
>> + m_cost_scaled = other.m_cost_scaled;
>>
>> return *this;
>> }
>> @@ -275,6 +303,7 @@ operator+ (comp_cost cost1, comp_cost cost2)
>>
>> cost1.m_cost += cost2.m_cost;
>> cost1.m_complexity += cost2.m_complexity;
>> + cost1.m_cost_scaled += cost2.m_cost_scaled;
>>
>> return cost1;
>> }
>> @@ -290,6 +319,8 @@ comp_cost
>> comp_cost::operator+= (HOST_WIDE_INT c)
> This and below operators need check for infinite cost first and return
> immediately.
>> {
>> this->m_cost += c;
>> + if (has_frequency ())
>> + this->m_cost_scaled += scale_cost (c);
>>
>> return *this;
>> }
>> @@ -5047,18 +5128,21 @@ get_computation_cost_at (struct ivopts_data *data,
>> (symbol/var1/const parts may be omitted). If we are looking for an
>> address, find the cost of addressing this. */
>> if (address_p)
>> - return cost + get_address_cost (symbol_present, var_present,
>> - offset, ratio, cstepi,
>> - mem_mode,
>> - TYPE_ADDR_SPACE (TREE_TYPE (utype)),
>> - speed, stmt_is_after_inc, can_autoinc);
>> + {
>> + cost += get_address_cost (symbol_present, var_present,
>> + offset, ratio, cstepi,
>> + mem_mode,
>> + TYPE_ADDR_SPACE (TREE_TYPE (utype)),
>> + speed, stmt_is_after_inc, can_autoinc);
>> + goto ret;
>> + }
>>
>> /* Otherwise estimate the costs for computing the expression. */
>> if (!symbol_present && !var_present && !offset)
>> {
>> if (ratio != 1)
>> cost += mult_by_coeff_cost (ratio, TYPE_MODE (ctype), speed);
>> - return cost;
>> + goto ret;
>> }
>>
>> /* Symbol + offset should be compile-time computable so consider that they
>> @@ -5077,7 +5161,8 @@ get_computation_cost_at (struct ivopts_data *data,
>> aratio = ratio > 0 ? ratio : -ratio;
>> if (aratio != 1)
>> cost += mult_by_coeff_cost (aratio, TYPE_MODE (ctype), speed);
>> - return cost;
>> +
>> + goto ret;
>>
>> fallback:
>> if (can_autoinc)
>> @@ -5093,8 +5178,13 @@ fallback:
>> if (address_p)
>> comp = build_simple_mem_ref (comp);
>>
>> - return comp_cost (computation_cost (comp, speed), 0);
>> + cost = comp_cost (computation_cost (comp, speed), 0);
>> }
>> +
>> +ret:
>> + cost.calculate_scaled_cost (at->bb->frequency,
>> + data->current_loop->header->frequency);
> Here cost consists of two parts. One is for loop invariant
> computation, we amortize is against avg_loop_niter and record register
> pressure (either via invriant variables or invariant expressions) for
> it; the other is loop variant part. For the first part, we should
> not scaled it using frequency, since we have already assumed it would
> be hoisted out of loop. No matter where the use is, hoisted loop
> invariant has the same frequency as loop header. This is the second
> reason I want to factor frequency out of comp_cost. It's easier to
> scale with frequency only it's necessary.
>
>> + return cost;
>> }
>>
>> /* Determines the cost of the computation by that USE is expressed
>> @@ -5922,16 +6012,19 @@ determine_group_iv_costs (struct ivopts_data *data)
>> group = data->vgroups[i];
>>
>> fprintf (dump_file, "Group %d:\n", i);
>> - fprintf (dump_file, " cand\tcost\tcompl.\tinv.ex.\tdepends on\n");
>> + fprintf (dump_file, " cand\tcost\tscaled\tfreq.\tcompl.\tinv.ex."
>> + "\tdepends on\n");
>> for (j = 0; j < group->n_map_members; j++)
>> {
>> if (!group->cost_map[j].cand
>> || group->cost_map[j].cost.infinite_cost_p ())
>> continue;
>>
>> - fprintf (dump_file, " %d\t%d\t%d\t",
>> + fprintf (dump_file, " %d\t%d\t%2.2f\t%2.2f\t%d\t",
>> group->cost_map[j].cand->id,
>> group->cost_map[j].cost.get_cost (),
>> + group->cost_map[j].cost.get_cost_scaled (),
>> + group->cost_map[j].cost.get_frequency (),
>> group->cost_map[j].cost.get_complexity ());
>> if (group->cost_map[j].inv_expr != NULL)
>> fprintf (dump_file, "%d\t",
>> @@ -5948,6 +6041,19 @@ determine_group_iv_costs (struct ivopts_data *data)
>> }
>> fprintf (dump_file, "\n");
>> }
>> +
>> + for (i = 0; i < data->vgroups.length (); i++)
>> + {
>> + group = data->vgroups[i];
>> + for (j = 0; j < group->n_map_members; j++)
>> + {
>> + if (!group->cost_map[j].cand
>> + || group->cost_map[j].cost.infinite_cost_p ())
>> + continue;
>> +
>> + group->cost_map[j].cost.propagate_scaled_cost ();
>> + }
>> + }
> This is wrong. m_frequency and m_cost_scaled are initialized to
> sreal(0) by default, and are never changed later for conditional
> iv_use. As a matter of factor, costs computed for all conditional
> iv_uses are wrong (value is 0). This makes the observed improvement
> not that promising. Considering code generation is very sensitive to
> cost computation, it maybe just hit some special cases. Eitherway we
> need more work/investigation on the impact of this patch.
>
> Again, I would suggest we factor out frequency out of comp_cost and
> only scale the cost in place when we compute cost for each use. Then
> we can measure what's the impact on code generation.
>
> Thanks,
> bin
>
All remarks were applied in third version of the patch. Together with the
previous
patch, it survives bootstrap and regression tests on x86_64-linux-gnu.
I'm going to re-test the patch on SPEC benchmarks.
Martin
>From 24e5d3f6747c77d1437feab11ff1e3888779b4d4 Mon Sep 17 00:00:00 2001
From: marxin <[email protected]>
Date: Tue, 17 May 2016 15:22:43 +0200
Subject: [PATCH 2/4] Add profiling support for IVOPTS
gcc/ChangeLog:
2016-05-17 Martin Liska <[email protected]>
* tree-ssa-loop-ivopts.c (get_computation_cost_at): Scale
computed costs by frequency of BB they belong to.
---
gcc/tree-ssa-loop-ivopts.c | 42 ++++++++++++++++++++++++++++++++++--------
1 file changed, 34 insertions(+), 8 deletions(-)
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index f48b2f6..8a82831 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -4995,18 +4995,21 @@ get_computation_cost_at (struct ivopts_data *data,
(symbol/var1/const parts may be omitted). If we are looking for an
address, find the cost of addressing this. */
if (address_p)
- return cost + get_address_cost (symbol_present, var_present,
- offset, ratio, cstepi,
- mem_mode,
- TYPE_ADDR_SPACE (TREE_TYPE (utype)),
- speed, stmt_is_after_inc, can_autoinc);
+ {
+ cost += get_address_cost (symbol_present, var_present,
+ offset, ratio, cstepi,
+ mem_mode,
+ TYPE_ADDR_SPACE (TREE_TYPE (utype)),
+ speed, stmt_is_after_inc, can_autoinc);
+ goto ret;
+ }
/* Otherwise estimate the costs for computing the expression. */
if (!symbol_present && !var_present && !offset)
{
if (ratio != 1)
cost += mult_by_coeff_cost (ratio, TYPE_MODE (ctype), speed);
- return cost;
+ goto ret;
}
/* Symbol + offset should be compile-time computable so consider that they
@@ -5025,7 +5028,7 @@ get_computation_cost_at (struct ivopts_data *data,
aratio = ratio > 0 ? ratio : -ratio;
if (aratio != 1)
cost += mult_by_coeff_cost (aratio, TYPE_MODE (ctype), speed);
- return cost;
+ goto ret;
fallback:
if (can_autoinc)
@@ -5041,8 +5044,31 @@ fallback:
if (address_p)
comp = build_simple_mem_ref (comp);
- return comp_cost (computation_cost (comp, speed), 0);
+ cost = comp_cost (computation_cost (comp, speed), 0);
}
+
+ret:
+ /* Scale (multiply) the computed cost (except scratch part that should be
+ hoisted out a loop) by header->frequency / at->frequency,
+ which makes expected cost more accurate. */
+ int loop_freq = data->current_loop->header->frequency;
+ int bb_freq = at->bb->frequency;
+ if (loop_freq != 0)
+ {
+ gcc_assert (cost.scratch <= cost.cost);
+ int scaled_cost
+ = cost.scratch + (cost.cost - cost.scratch) * bb_freq / loop_freq;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Scaling iv_use based on cand %d "
+ "by %2.2f: %d (scratch: %d) -> %d (%d/%d)\n",
+ cand->id, 1.0f * bb_freq / loop_freq, cost.cost,
+ cost.scratch, scaled_cost, bb_freq, loop_freq);
+
+ cost.cost = scaled_cost;
+ }
+
+ return cost;
}
/* Determines the cost of the computation by that USE is expressed
--
2.8.2