On 16/11/18 18:19, Pat Haugen wrote:
> On 11/8/18 6:10 AM, Kyrill Tkachov wrote:
>> The attached patch avoids that by making the alap calculation only
>> look at true dependencies.  This shouldn't be too bad, since we use
>> INSN_PRIORITY as the final tie-breaker, and that does take
>> anti-dependencies into account.
>>
>> This reduces the number of spills in the hot function from 436.cactusADM
>> by 14% on aarch64 at -O3 (and the number of instructions in general).
>> SPEC2017 shows a minor improvement on Cortex-A72 (about 0.1% overall).
>> Thanks to Wilco for the benchmarking.
>
> I tried the patch on PowerPC since it also uses the SCHED_PRESSURE_MODEL
> algorithm.  For CPU2006 only cactusADM had a noticeable difference, but
> I'm seeing a 5% degradation.  Looking at the generated asm for function
> bench_staggeredleapfrog2_(), I see about a 1% increase in the number of
> loads and stores generated and an extra 100 bytes allocated on the stack.
>
> -Pat
This is a follow-up from
https://gcc.gnu.org/ml/gcc-patches/2018-11/msg01525.html
This version introduces an "artificial" property on the dependencies produced
in sched-deps.c.  It is recorded when a dependency is created only to honour
the MAX_PENDING_LIST_LENGTH limit, and such dependencies are then ignored in
the model_analyze_insns ALAP calculation.
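
To make the intent concrete, here is a self-contained toy sketch (hypothetical
insns and edges, not GCC code) of the kind of ALAP propagation this gives us:
an artificial dependency no longer pushes its producer's ALAP above its
consumer's.

/* Toy sketch (hypothetical insns/edges, not GCC code): ALAP levels over
   a small dependency graph.  Edges flagged as artificial do not force
   the producer to a higher ALAP (an earlier slot) than its consumer.  */
#include <cstdio>

struct dep_edge { int pro, con; bool artificial; };

int
main ()
{
  /* Insn 0 truly feeds insn 1; its edge to insn 2 only exists to keep
     the pending lists short, so it is marked artificial.  */
  const dep_edge edges[] = { { 0, 1, false }, { 0, 2, true } };
  unsigned alap[3] = { 0, 0, 0 };

  /* Visit consumers before producers (reverse topological order) and
     propagate ALAP, adding 1 only for non-artificial dependencies.  */
  for (int i = 2; i >= 0; i--)
    for (const dep_edge &e : edges)
      if (e.pro == i)
        {
          unsigned min_alap = alap[e.con] + !e.artificial;
          if (alap[i] < min_alap)
            alap[i] = min_alap;
        }

  /* Prints alap 1 for insn 0 (from the true dep on insn 1) and alap 0
     for insns 1 and 2; the artificial edge to insn 2 contributes
     nothing.  */
  for (int i = 0; i < 3; i++)
    std::printf ("insn %d: alap %u\n", i, alap[i]);
  return 0;
}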
This approach gives most of the benefits of the original patch [1] on aarch64.
I tried it on the cactusADM hot function (bench_staggeredleapfrog2_) on
powerpc64le-unknown-linux-gnu with -O3 and found that the initial version
proposed did indeed increase the instruction count and stack usage.  This
version gives a small improvement on powerpc in terms of instruction count
(the number of st* instructions stays the same), so I'm hoping it addresses
Pat's concerns.
Pat, could you please try this version out if you've got the chance?
In terms of implementation, it required extending the various add_dependence
functions/helpers to take an extra argument marking the dependency as
"artificial", which is what most of the patch diff does.  It is otherwise a
fairly simple patch.
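
As a rough illustration of why the default arguments keep the diff small
(hypothetical helper name, not the real prototypes): existing call sites
compile unchanged, and only the MAX_PENDING_LIST_LENGTH flush paths pass the
new flag explicitly.

#include <cstdio>

/* Hypothetical reduced helper: a trailing bool with a default value means
   pre-existing callers need no changes.  */
static void
add_dep_example (int con, int pro, bool artificial = false)
{
  std::printf ("dep %d -> %d%s\n", pro, con,
               artificial ? " (artificial)" : "");
}

int
main ()
{
  add_dep_example (1, 0);        /* Existing caller, unchanged.  */
  add_dep_example (2, 0, true);  /* Flush path marks the dep artificial.  */
  return 0;
}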
Bootstrapped and tested on aarch64-none-linux-gnu.
Is this ok for trunk if the PPC performance results look ok?
[1] https://gcc.gnu.org/ml/gcc-patches/2018-11/msg01514.html
2018-11-19 Richard Sandiford <richard.sandif...@arm.com>
Kyrylo Tkachov <kyrylo.tkac...@arm.com>
* haifa-sched.c (model_analyze_insns): Avoid counting artificial
anti-dependencies.
* sched-int.h (struct _dep): Add artificial field.
(DEP_ARTIFICIAL): Define accessor.
(DEP_ANTI_ARTIFICIAL): Define.
(DEP_POSTPONED): Adjust definition.
(add_dependence): Add default bool argument to prototype.
* sched-deps.c (init_dep_1): Initialize artificial field.
(add_dependence_1): Add default bool parameter. Handle in definition.
(add_dependence_list): Likewise.
(add_dependence_list_and_free): Likewise.
(flush_pending_lists): Likewise.
(haifa_note_dep): Handle DEP_ANTI_ARTIFICIAL in ds.
(sched_analyze_1): Mark new dependencies created as part of handling
the MAX_PENDING_LIST_LENGTH limit as artificial.
(sched_analyze_2): Likewise.
(sched_analyze_insn): Likewise.
(deps_analyze_insn): Likewise.
(dump_ds): Handle DEP_ANTI_ARTIFICIAL.
diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c
index 2c84ce3814357b30e1aaed57f1de3034d99afd57..c1787d01c5f4765a63986efe04c59748792e4932 100644
--- a/gcc/haifa-sched.c
+++ b/gcc/haifa-sched.c
@@ -3504,8 +3504,13 @@ model_analyze_insns (void)
FOR_EACH_DEP (iter, SD_LIST_FORW, sd_it, dep)
{
con = MODEL_INSN_INFO (DEP_CON (dep));
- if (con->insn && insn->alap < con->alap + 1)
- insn->alap = con->alap + 1;
+ /* Consider all dependencies except those introduced artificially
+ by the scheduler as part of adhering to the
+ MAX_PENDING_LIST_LENGTH limit. */
+ unsigned int min_alap
+ = con->alap + !DEP_ARTIFICIAL (dep);
+ if (con->insn && insn->alap < min_alap)
+ insn->alap = min_alap;
}
insn->old_queue = QUEUE_INDEX (iter);
diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c
index f89f28269fd5ecf96688ed255d07b6976d2180c4..a83b779086099e9a1a0690b480230f4d7ad3e0ba 100644
--- a/gcc/sched-deps.c
+++ b/gcc/sched-deps.c
@@ -100,6 +100,7 @@ init_dep_1 (dep_t dep, rtx_insn *pro, rtx_insn *con, enum reg_note type, ds_t ds
DEP_COST (dep) = UNKNOWN_DEP_COST;
DEP_NONREG (dep) = 0;
DEP_MULTIPLE (dep) = 0;
+ DEP_ARTIFICIAL (dep) = 0;
DEP_REPLACE (dep) = NULL;
}
@@ -472,16 +473,18 @@ static int cache_size;
static bool mark_as_hard;
static int deps_may_trap_p (const_rtx);
-static void add_dependence_1 (rtx_insn *, rtx_insn *, enum reg_note);
+static void add_dependence_1 (rtx_insn *, rtx_insn *, enum reg_note,
+ bool = false);
static void add_dependence_list (rtx_insn *, rtx_insn_list *, int,
- enum reg_note, bool);
+ enum reg_note, bool, bool = false);
static void add_dependence_list_and_free (struct deps_desc *, rtx_insn *,
rtx_insn_list **, int, enum reg_note,
- bool);
+ bool, bool = false);
static void delete_all_dependences (rtx_insn *);
static void chain_to_prev_insn (rtx_insn *);
-static void flush_pending_lists (struct deps_desc *, rtx_insn *, int, int);
+static void flush_pending_lists (struct deps_desc *, rtx_insn *, int, int,
+ bool = false);
static void sched_analyze_1 (struct deps_desc *, rtx, rtx_insn *);
static void sched_analyze_2 (struct deps_desc *, rtx, rtx_insn *);
static void sched_analyze_insn (struct deps_desc *, rtx, rtx_insn *);
@@ -1507,9 +1510,12 @@ sd_debug_lists (rtx insn, sd_list_types_def types)
for REG_DEP_CONTROL dependencies. For these, we optionally promote
the type to REG_DEP_ANTI if we can determine that predication is
impossible; otherwise we add additional true dependencies on the
- INSN_COND_DEPS list of the jump (which PRO must be). */
+ INSN_COND_DEPS list of the jump (which PRO must be).
+ ARTIFICIAL is true if this dependency is introduced to keep the
+ list length down as part of the MAX_PENDING_LIST_LENGTH limit. */
void
-add_dependence (rtx_insn *con, rtx_insn *pro, enum reg_note dep_type)
+add_dependence (rtx_insn *con, rtx_insn *pro, enum reg_note dep_type,
+ bool artificial)
{
if (dep_type == REG_DEP_CONTROL
&& !(current_sched_info->flags & DO_PREDICATION))
@@ -1550,35 +1556,40 @@ add_dependence (rtx_insn *con, rtx_insn *pro, enum reg_note dep_type)
}
}
- add_dependence_1 (con, pro, dep_type);
+ add_dependence_1 (con, pro, dep_type, artificial);
}
/* A convenience wrapper to operate on an entire list. HARD should be
- true if DEP_NONREG should be set on newly created dependencies. */
+ true if DEP_NONREG should be set on newly created dependencies.
+ ARTIFICIAL is true if this dependency is introduced to keep the
+ list length down as part of the MAX_PENDING_LIST_LENGTH limit. */
static void
add_dependence_list (rtx_insn *insn, rtx_insn_list *list, int uncond,
- enum reg_note dep_type, bool hard)
+ enum reg_note dep_type, bool hard, bool artificial)
{
mark_as_hard = hard;
for (; list; list = list->next ())
{
if (uncond || ! sched_insns_conditions_mutex_p (insn, list->insn ()))
- add_dependence (insn, list->insn (), dep_type);
+ add_dependence (insn, list->insn (), dep_type, artificial);
}
mark_as_hard = false;
}
/* Similar, but free *LISTP at the same time, when the context
is not readonly. HARD should be true if DEP_NONREG should be set on
- newly created dependencies. */
+ newly created dependencies. ARTIFICIAL is true if this dependency
+ is introduced to keep the list length down as part of the
+ MAX_PENDING_LIST_LENGTH limit. */
static void
add_dependence_list_and_free (struct deps_desc *deps, rtx_insn *insn,
rtx_insn_list **listp,
- int uncond, enum reg_note dep_type, bool hard)
+ int uncond, enum reg_note dep_type, bool hard,
+ bool artificial)
{
- add_dependence_list (insn, *listp, uncond, dep_type, hard);
+ add_dependence_list (insn, *listp, uncond, dep_type, hard, artificial);
/* We don't want to short-circuit dependencies involving debug
insns, because they may cause actual dependencies to be
@@ -1746,16 +1757,18 @@ add_insn_mem_dependence (struct deps_desc *deps, bool read_p,
/* Make a dependency between every memory reference on the pending lists
and INSN, thus flushing the pending lists. FOR_READ is true if emitting
- dependencies for a read operation, similarly with FOR_WRITE. */
+ dependencies for a read operation, similarly with FOR_WRITE.
+ ARTIFICIAL is true if this dependency is introduced to keep the
+ list length down as part of the MAX_PENDING_LIST_LENGTH limit. */
static void
flush_pending_lists (struct deps_desc *deps, rtx_insn *insn, int for_read,
- int for_write)
+ int for_write, bool artificial)
{
if (for_write)
{
add_dependence_list_and_free (deps, insn, &deps->pending_read_insns,
- 1, REG_DEP_ANTI, true);
+ 1, REG_DEP_ANTI, true, true);
if (!deps->readonly)
{
free_EXPR_LIST_list (&deps->pending_read_mems);
@@ -1765,15 +1778,15 @@ flush_pending_lists (struct deps_desc *deps, rtx_insn *insn, int for_read,
add_dependence_list_and_free (deps, insn, &deps->pending_write_insns, 1,
for_read ? REG_DEP_ANTI : REG_DEP_OUTPUT,
- true);
+ true, artificial && for_read);
add_dependence_list_and_free (deps, insn,
&deps->last_pending_memory_flush, 1,
for_read ? REG_DEP_ANTI : REG_DEP_OUTPUT,
- true);
+ true, artificial && for_read);
add_dependence_list_and_free (deps, insn, &deps->pending_jump_insns, 1,
- REG_DEP_ANTI, true);
+ REG_DEP_ANTI, true, artificial);
if (DEBUG_INSN_P (insn))
{
@@ -1863,6 +1876,8 @@ haifa_note_dep (rtx_insn *elem, ds_t ds)
init_dep (dep, elem, cur_insn, ds_to_dt (ds));
if (mark_as_hard)
DEP_NONREG (dep) = 1;
+ if (ds & DEP_ANTI_ARTIFICIAL)
+ DEP_ARTIFICIAL (dep) = 1;
maybe_add_or_update_dep_1 (dep, false, NULL_RTX, NULL_RTX);
}
@@ -2495,7 +2510,7 @@ sched_analyze_1 (struct deps_desc *deps, rtx x, rtx_insn *insn)
these lists get long. When compiling GCC with itself,
this flush occurs 8 times for sparc, and 10 times for m88k using
the default value of 32. */
- flush_pending_lists (deps, insn, false, true);
+ flush_pending_lists (deps, insn, false, true, true);
}
else
{
@@ -2707,7 +2722,7 @@ sched_analyze_2 (struct deps_desc *deps, rtx x, rtx_insn *insn)
+ deps->pending_write_list_length)
>= MAX_PENDING_LIST_LENGTH
&& !DEBUG_INSN_P (insn))
- flush_pending_lists (deps, insn, true, true);
+ flush_pending_lists (deps, insn, true, true, true);
add_insn_mem_dependence (deps, true, insn, x);
}
@@ -3234,12 +3249,12 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn)
REG_DEP_OUTPUT, false);
add_dependence_list_and_free (deps, insn,
&reg_last->implicit_sets, 0,
- REG_DEP_ANTI, false);
+ REG_DEP_ANTI, false, true);
add_dependence_list_and_free (deps, insn, &reg_last->uses, 0,
- REG_DEP_ANTI, false);
+ REG_DEP_ANTI, false, true);
add_dependence_list_and_free (deps, insn,
&reg_last->control_uses, 0,
- REG_DEP_ANTI, false);
+ REG_DEP_ANTI, false, true);
add_dependence_list_and_free (deps, insn,
&reg_last->clobbers, 0,
REG_DEP_OUTPUT, false);
@@ -3686,7 +3701,7 @@ deps_analyze_insn (struct deps_desc *deps, rtx_insn *insn)
{
/* Keep the list a reasonable size. */
if (deps->pending_flush_length++ >= MAX_PENDING_LIST_LENGTH)
- flush_pending_lists (deps, insn, true, true);
+ flush_pending_lists (deps, insn, true, true, true);
else
deps->pending_jump_insns
= alloc_INSN_LIST (insn, deps->pending_jump_insns);
@@ -4273,10 +4288,13 @@ estimate_dep_weak (rtx mem1, rtx mem2)
}
/* Add or update backward dependence between INSN and ELEM with type DEP_TYPE.
- This function can handle same INSN and ELEM (INSN == ELEM).
+ This function can handle same INSN and ELEM (INSN == ELEM). ARTIFICIAL is
+ true if this dependency is introduced to keep the list length down as part
+ of the MAX_PENDING_LIST_LENGTH limit.
It is a convenience wrapper. */
static void
-add_dependence_1 (rtx_insn *insn, rtx_insn *elem, enum reg_note dep_type)
+add_dependence_1 (rtx_insn *insn, rtx_insn *elem, enum reg_note dep_type,
+ bool artificial)
{
ds_t ds;
bool internal;
@@ -4301,6 +4319,9 @@ add_dependence_1 (rtx_insn *insn, rtx_insn *elem, enum reg_note dep_type)
else
cur_insn = insn;
+ if (artificial)
+ ds |= DEP_ANTI_ARTIFICIAL;
+
note_dep (elem, ds);
if (!internal)
cur_insn = NULL;
@@ -4559,6 +4580,8 @@ dump_ds (FILE *f, ds_t s)
fprintf (f, "DEP_OUTPUT; ");
if (s & DEP_ANTI)
fprintf (f, "DEP_ANTI; ");
+ if (s & DEP_ANTI_ARTIFICIAL)
+ fprintf (f, "DEP_ANTI_ARTIFICIAL; ");
if (s & DEP_CONTROL)
fprintf (f, "DEP_CONTROL; ");
diff --git a/gcc/sched-int.h b/gcc/sched-int.h
index d46d8999346ef33da274792138c84ade669ae569..5d3ce34c94deedb9f4d2504b42ac03d5196b365b 100644
--- a/gcc/sched-int.h
+++ b/gcc/sched-int.h
@@ -235,6 +235,10 @@ struct _dep
unsigned nonreg:1;
unsigned multiple:1;
+ /* This dependency is introduced to keep compilation time down due
+ to MAX_PENDING_LIST_LENGTH. */
+ unsigned artificial:1;
+
/* Cached cost of the dependency. Make sure to update UNKNOWN_DEP_COST
when changing the size of this field. */
int cost:20;
@@ -252,6 +256,7 @@ typedef dep_def *dep_t;
#define DEP_COST(D) ((D)->cost)
#define DEP_NONREG(D) ((D)->nonreg)
#define DEP_MULTIPLE(D) ((D)->multiple)
+#define DEP_ARTIFICIAL(D) ((D)->artificial)
#define DEP_REPLACE(D) ((D)->replace)
/* Functions to work with dep. */
@@ -1141,6 +1146,10 @@ enum SPEC_TYPES_OFFSETS {
Therefore, it can appear only in the TODO_SPEC field of an instruction. */
#define HARD_DEP (DEP_CONTROL << 1)
+/* Dependency introduced to limit compilation time due to
+ MAX_PENDING_LIST_LENGTH. */
+#define DEP_ANTI_ARTIFICIAL (HARD_DEP << 1)
+
/* Like HARD_DEP, but dependencies can perhaps be broken by modifying
the instructions. This is used for example to change:
@@ -1150,7 +1159,7 @@ enum SPEC_TYPES_OFFSETS {
For instructions that have this bit set, one of the dependencies of
the instructions will have a non-NULL REPLACE field in its DEP_T.
Just like HARD_DEP, this bit is only ever set in TODO_SPEC. */
-#define DEP_POSTPONED (HARD_DEP << 1)
+#define DEP_POSTPONED (DEP_ANTI_ARTIFICIAL << 1)
/* Set if a dependency is cancelled via speculation. */
#define DEP_CANCELLED (DEP_POSTPONED << 1)
@@ -1345,7 +1354,8 @@ extern rtx sched_get_reverse_condition_uncached (const rtx_insn *);
extern bool sched_insns_conditions_mutex_p (const rtx_insn *,
const rtx_insn *);
extern bool sched_insn_is_legitimate_for_speculation_p (const rtx_insn *, ds_t);
-extern void add_dependence (rtx_insn *, rtx_insn *, enum reg_note);
+extern void add_dependence (rtx_insn *, rtx_insn *, enum reg_note,
+ bool = false);
extern void sched_analyze (struct deps_desc *, rtx_insn *, rtx_insn *);
extern void init_deps (struct deps_desc *, bool);
extern void init_deps_reg_last (struct deps_desc *);