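This patch makes the s390 scheduler balance long-running instructions across the two sides of the pipeline, lowering the scheduling score of such an instruction when the current side is already busy with one, so that they do not clog the pipeline.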
gcc/ChangeLog: 2017-10-11 Robin Dapp <rd...@linux.vnet.ibm.com> * config/s390/s390.c (NUM_SIDES): New constant. (LONGRUNNING_THRESHOLD): New constant. (LATENCY_FACTOR): New constant. (s390_sched_score): Lower score for long-running instructions on same side. (s390_sched_variable_issue): Bookkeeping for long-running instructions. --- gcc/config/s390/s390.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 36bc67d..2430933 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -355,6 +355,18 @@ static rtx_insn *last_scheduled_insn; #define MAX_SCHED_UNITS 3 static int last_scheduled_unit_distance[MAX_SCHED_UNITS]; +#define NUM_SIDES 2 +static int current_side = 1; +#define LONGRUNNING_THRESHOLD 5 + +/* Estimate of number of cycles a long-running insn occupies an + execution unit. */ +static unsigned fxu_longrunning[NUM_SIDES]; +static unsigned vfu_longrunning[NUM_SIDES]; + +/* Factor to scale latencies by, determined by measurements. */ +#define LATENCY_FACTOR 4 + /* The maximum score added for an instruction whose unit hasn't been in use for MAX_SCHED_MIX_DISTANCE steps. Increase this value to give instruction mix scheduling more priority over instruction @@ -14483,7 +14495,24 @@ s390_sched_score (rtx_insn *insn) if (m & unit_mask) score += (last_scheduled_unit_distance[i] * MAX_SCHED_MIX_SCORE / MAX_SCHED_MIX_DISTANCE); + + unsigned latency = insn_default_latency (insn); + + int other_side = 1 - current_side; + + /* Try to delay long-running insns when side is busy. 
*/ + if (latency > LONGRUNNING_THRESHOLD) + { + if (get_attr_z13_unit_fxu (insn) && fxu_longrunning[current_side] + && fxu_longrunning[other_side] <= fxu_longrunning[current_side]) + score = MAX (0, score - 10); + + if (get_attr_z13_unit_vfu (insn) && vfu_longrunning[current_side] + && vfu_longrunning[other_side] <= vfu_longrunning[current_side]) + score = MAX (0, score - 10); + } } + return score; } @@ -14602,6 +14631,8 @@ s390_sched_variable_issue (FILE *file, int verbose, rtx_insn *insn, int more) { last_scheduled_insn = insn; + bool starts_group = false; + if (s390_tune >= PROCESSOR_2827_ZEC12 && reload_completed && recog_memoized (insn) >= 0) @@ -14609,6 +14640,11 @@ s390_sched_variable_issue (FILE *file, int verbose, rtx_insn *insn, int more) unsigned int mask = s390_get_sched_attrmask (insn); if ((mask & S390_SCHED_ATTR_MASK_CRACKED) != 0 + || (mask & S390_SCHED_ATTR_MASK_EXPANDED) != 0 + || (mask & S390_SCHED_ATTR_MASK_GROUPALONE) != 0) + starts_group = true; + + if ((mask & S390_SCHED_ATTR_MASK_CRACKED) != 0 || (mask & S390_SCHED_ATTR_MASK_EXPANDED) != 0) s390_sched_state = S390_SCHED_STATE_CRACKED; else if ((mask & S390_SCHED_ATTR_MASK_ENDGROUP) != 0 @@ -14623,8 +14659,13 @@ s390_sched_variable_issue (FILE *file, int verbose, rtx_insn *insn, int more) case 1: case 2: case S390_SCHED_STATE_NORMAL: + if (s390_sched_state == 0) + starts_group = true; if (s390_sched_state == S390_SCHED_STATE_NORMAL) - s390_sched_state = 1; + { + starts_group = true; + s390_sched_state = 1; + } else s390_sched_state++; @@ -14650,6 +14691,27 @@ s390_sched_variable_issue (FILE *file, int verbose, rtx_insn *insn, int more) last_scheduled_unit_distance[i]++; } + /* If this insn started a new group, the side flipped. */ + if (starts_group) + current_side = current_side ? 
0 : 1; + + for (int i = 0; i < 2; i++) + { + if (fxu_longrunning[i] >= 1) + fxu_longrunning[i] -= 1; + if (vfu_longrunning[i] >= 1) + vfu_longrunning[i] -= 1; + } + + unsigned latency = insn_default_latency (insn); + if (latency > LONGRUNNING_THRESHOLD) + { + if (get_attr_z13_unit_fxu (insn)) + fxu_longrunning[current_side] = latency * LATENCY_FACTOR; + else + vfu_longrunning[current_side] = latency * LATENCY_FACTOR; + } + if (verbose > 5) { unsigned int sched_mask; -- 2.9.4