I've a question about the way IRA accumulates information from nested regions into parent regions. ira-int.h says:
/* Accumulated usage references of the allocno. Here and below, word 'accumulated' means info for given region and all nested subregions. In this case, 'accumulated' means sum of references of the corresponding pseudo-register in this region and in all nested subregions recursively. */ int nrefs; and things like ALLOCNO_HARD_REG_COSTS are also accumulated in this way: /* Array of usage costs (accumulated and the one updated during coloring) for each hard register of the allocno cover class. The member value can be NULL if all costs are the same and equal to COVER_CLASS_COST. For example, the costs of two different hard registers can be different if one hard register is callee-saved and another one is callee-used and the allocno lives through calls. Another example can be case when for some insn the corresponding pseudo-register value should be put in specific register class (e.g. AREG for x86) which is a strict subset of the allocno cover class (GENERAL_REGS for x86). We have updated costs to reflect the situation when the usage cost of a hard register is decreased because the allocno is connected to another allocno by a copy and the another allocno has been assigned to the hard register. */ int *hard_reg_costs, *updated_hard_reg_costs; If PARENT_A is a parent of allocno A, propagate_allocno_info does the equivalent of: ALLOCNO_HARD_REG_COSTS(parent_a)[i] += ALLOCNO_HARD_REG_COSTS(a)[i] which is what you'd expect from the comments. But after that, should each update ALLOCNO_HARD_REG_COSTS(a)[i] be reflected in ALLOCNO_HARD_REG_COSTS(parent_a)[i]? It isn't entirely clear from the code. E.g. process_regs_for_copy (which I believe is called after propagate_allocno_info) updates ALLOCNO_HARD_REG_COSTS: ALLOCNO_HARD_REG_COSTS (a)[index] -= cost; and I can't see how this update would ever propagate to parent allocnos. Also, color_pass has a loop that updates the costs of child allocnos after a parent has been allocated: /* Update costs of the corresponding allocnos (not caps) in the subloops. */ for (subloop_node = loop_tree_node->subloops; subloop_node != NULL; subloop_node = subloop_node->subloop_next) { ira_assert (subloop_node->bb == NULL); EXECUTE_IF_SET_IN_BITMAP (consideration_allocno_bitmap, 0, j, bi) { a = ira_allocnos[j]; ira_assert (ALLOCNO_CAP_MEMBER (a) == NULL); mode = ALLOCNO_MODE (a); rclass = ALLOCNO_COVER_CLASS (a); hard_regno = ALLOCNO_HARD_REGNO (a); if (hard_regno >= 0) { index = ira_class_hard_reg_index[rclass][hard_regno]; ira_assert (index >= 0); } regno = ALLOCNO_REGNO (a); /* ??? conflict costs */ subloop_allocno = subloop_node->regno_allocno_map[regno]; if (subloop_allocno == NULL || ALLOCNO_CAP (subloop_allocno) != NULL) continue; ira_assert (bitmap_bit_p (subloop_node->all_allocnos, ALLOCNO_NUM (subloop_allocno))); if (flag_ira_algorithm == IRA_ALGORITHM_MIXED && (loop_tree_node->reg_pressure[rclass] <= ira_available_class_regs[rclass])) { if (! ALLOCNO_ASSIGNED_P (subloop_allocno)) { ALLOCNO_HARD_REGNO (subloop_allocno) = hard_regno; ALLOCNO_ASSIGNED_P (subloop_allocno) = true; if (hard_regno >= 0) update_copy_costs (subloop_allocno, true); /* We don't need updated costs anymore: */ ira_free_allocno_updated_costs (subloop_allocno); } continue; } exit_freq = ira_loop_edge_freq (subloop_node, regno, true); enter_freq = ira_loop_edge_freq (subloop_node, regno, false); ira_assert (regno < ira_reg_equiv_len); if (ira_reg_equiv_invariant_p[regno] || ira_reg_equiv_const[regno] != NULL_RTX) { if (! ALLOCNO_ASSIGNED_P (subloop_allocno)) { ALLOCNO_HARD_REGNO (subloop_allocno) = hard_regno; ALLOCNO_ASSIGNED_P (subloop_allocno) = true; if (hard_regno >= 0) update_copy_costs (subloop_allocno, true); /* We don't need updated costs anymore: */ ira_free_allocno_updated_costs (subloop_allocno); } } else if (hard_regno < 0) { ALLOCNO_UPDATED_MEMORY_COST (subloop_allocno) -= ((ira_memory_move_cost[mode][rclass][1] * enter_freq) + (ira_memory_move_cost[mode][rclass][0] * exit_freq)); } else { cover_class = ALLOCNO_COVER_CLASS (subloop_allocno); ira_allocate_and_set_costs (&ALLOCNO_HARD_REG_COSTS (subloop_allocno), cover_class, ALLOCNO_COVER_CLASS_COST (subloop_allocno)); ira_allocate_and_set_costs (&ALLOCNO_CONFLICT_HARD_REG_COSTS (subloop_allocno), cover_class, 0); cost = (ira_register_move_cost[mode][rclass][rclass] * (exit_freq + enter_freq)); ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index] -= cost; ALLOCNO_CONFLICT_HARD_REG_COSTS (subloop_allocno)[index] -= cost; ALLOCNO_UPDATED_MEMORY_COST (subloop_allocno) += (ira_memory_move_cost[mode][rclass][0] * enter_freq + ira_memory_move_cost[mode][rclass][1] * exit_freq); if (ALLOCNO_COVER_CLASS_COST (subloop_allocno) > ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index]) ALLOCNO_COVER_CLASS_COST (subloop_allocno) = ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index]; } } } A lot of the fields updated here are described as "accumulated", but the updates to SUBLOOP_ALLOCNO aren't reflected in A (or A's parents). You might be wondering why this matters. Well, ALLOCNO_HARD_REG_COSTS is sometimes accessed after coloring is complete. move_spill_restore is one such function, and has: FOR_EACH_ALLOCNO (a, ai) { regno = ALLOCNO_REGNO (a); loop_node = ALLOCNO_LOOP_TREE_NODE (a); if (ALLOCNO_CAP_MEMBER (a) != NULL || ALLOCNO_CAP (a) != NULL || (hard_regno = ALLOCNO_HARD_REGNO (a)) < 0 || loop_node->children == NULL /* don't do the optimization because it can create copies and the reload pass can spill the allocno set by copy although the allocno will not get memory slot. */ || ira_reg_equiv_invariant_p[regno] || ira_reg_equiv_const[regno] != NULL_RTX || !bitmap_bit_p (loop_node->border_allocnos, ALLOCNO_NUM (a))) continue; mode = ALLOCNO_MODE (a); rclass = ALLOCNO_COVER_CLASS (a); index = ira_class_hard_reg_index[rclass][hard_regno]; ira_assert (index >= 0); ---> cost = (ALLOCNO_MEMORY_COST (a) ---> - (ALLOCNO_HARD_REG_COSTS (a) == NULL ---> ? ALLOCNO_COVER_CLASS_COST (a) ---> : ALLOCNO_HARD_REG_COSTS (a)[index])); for (subloop_node = loop_node->subloops; subloop_node != NULL; subloop_node = subloop_node->subloop_next) { ira_assert (subloop_node->bb == NULL); subloop_allocno = subloop_node->regno_allocno_map[regno]; if (subloop_allocno == NULL) continue; /* We have accumulated cost. To get the real cost of allocno usage in the loop we should subtract costs of the subloop allocnos. */ ---> cost -= (ALLOCNO_MEMORY_COST (subloop_allocno) ---> - (ALLOCNO_HARD_REG_COSTS (subloop_allocno) == NULL ---> ? ALLOCNO_COVER_CLASS_COST (subloop_allocno) ---> : ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index])); exit_freq = ira_loop_edge_freq (subloop_node, regno, true); enter_freq = ira_loop_edge_freq (subloop_node, regno, false); if ((hard_regno2 = ALLOCNO_HARD_REGNO (subloop_allocno)) < 0) cost -= (ira_memory_move_cost[mode][rclass][0] * exit_freq + ira_memory_move_cost[mode][rclass][1] * enter_freq); else { cost += (ira_memory_move_cost[mode][rclass][0] * exit_freq + ira_memory_move_cost[mode][rclass][1] * enter_freq); if (hard_regno2 != hard_regno) cost -= (ira_register_move_cost[mode][rclass][rclass] * (exit_freq + enter_freq)); } } if ((parent = loop_node->parent) != NULL && (parent_allocno = parent->regno_allocno_map[regno]) != NULL) { exit_freq = ira_loop_edge_freq (loop_node, regno, true); enter_freq = ira_loop_edge_freq (loop_node, regno, false); if ((hard_regno2 = ALLOCNO_HARD_REGNO (parent_allocno)) < 0) cost -= (ira_memory_move_cost[mode][rclass][0] * exit_freq + ira_memory_move_cost[mode][rclass][1] * enter_freq); else { cost += (ira_memory_move_cost[mode][rclass][1] * exit_freq + ira_memory_move_cost[mode][rclass][0] * enter_freq); if (hard_regno2 != hard_regno) cost -= (ira_register_move_cost[mode][rclass][rclass] * (exit_freq + enter_freq)); } } if (cost < 0) { ALLOCNO_HARD_REGNO (a) = -1; if (internal_flag_ira_verbose > 3 && ira_dump_file != NULL) { fprintf (ira_dump_file, " Moving spill/restore for a%dr%d up from loop %d", ALLOCNO_NUM (a), regno, loop_node->loop->num); fprintf (ira_dump_file, " - profit %d\n", -cost); } changed_p = true; } } As I understand it, the code marked "--->" is trying to calculate the cost to A _in isolation_ of spilling to memory. Then the other (unmarked) cost adjustments calculate the effect this has on the child allocnos. If that's right, we're assuming here that ALLOCNO_MEMORY_COST, ALLOCNO_COVER_CLASS_COST and ALLOCNO_HARD_REG_COST are still accumulated. The comment above the second "--->" block seems to bear out this intention. The problem is that the costs aren't fully accumulated. After the color_pass code quoted above, it is no longer true to say that: ALLOCNO_HARD_REG_COSTS (a)[i] - \Sigma ALLOCNO_HARD_REG_COSTS (subloop_allocno)[i]) is the cost of using I for A itself. Instead, each execution of: cost = (ira_register_move_cost[mode][rclass][rclass] * (exit_freq + enter_freq)); ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index] -= cost; will also subtract COST from the cost of spilling A (the parent allocno) in move_spill_restore. Is this intentional? Although I suspect it isn't intentional, I can imagine it doesn't show up much on targets whose memory move costs are significantly higher than their register move costs. Which brings us to MIPS. The big question there is: what do we do with the accumulator registers? Do we put them in the same cover class as GPRs? Or perhaps that's jumping the gun. Perhaps the first question is: should we mark the accumulator registers as fixed, or at least hide them from the register allocator? I'm planning to do the latter for MIPS16, but I don't think it's a good idea for normal MIPS for two reasons: - The DSP ASE provides 4 accumulators. We want to apply normal register allocation to them. - Some targets have multiply-accumulate instructions that operate on LO and HI. But it isn't always a win to use them. If a target has both multiply-accumulate _and_ pipelined three-operand multiplication instructions, it is often better to use the latter for parallel multiply-accumulate chains. We've traditionally treated the choice as a register-allocation problem, which seems to have worked reasonably well. Also, the macc instruction on some targets can copy the LO result to a GPR too. The register allocator can currently take advantage of that, allocating a GPR when it's useful and not wasting one otherwise. (From what I've seen in the past, JPEG FFT loops tend to be on the borderline as far as register pressure on MIPS goes, so this can be an important win.) But there are only a limited number of accumulator registers (1 or 4, depending on the target). There's quite a high likelihood that any given value will need to be spilled from the accumulators at some point. When that happens, it's better to spill to a GPR than it is to spill to memory, since any load or store has to go through a GPR anyway. It therefore seems better to put GPRs and accumulator registers in the same cover class. We currently give moves between GPRs and accumulators a higher cost than GPR loads and stores.[*] On the targets for which this cost is accurate, we _don't_ want to use LO and HI as spill space. We also don't want to move between one accumulator and another if we can help it. And IRA generally seems happy with this. [*] Which isn't an accurate reflection of all targets, but that's another story. We ought eventually to put this in the CPU cost table. The hitch is that the cost of storing an accumulator to memory is the cost of a GPR<->accumulator move plus the cost of a load or store. The cost of moving between one accumulator and another is the cost of two GPR<->accumulator moves. Both of these aggregate costs are accurate, to the extent that the constituent costs are accurate (see [*] above). So we have a situation in which the worst-case register<->register cost (acc<->acc) outweighs the worst-cost register<->memory cost (acc<->mem). And that goes against the cover class documentation. For the most part things Just Work. But in the code quoted above, this cost: cost = (ira_register_move_cost[mode][rclass][rclass] * (exit_freq + enter_freq)); ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index] -= cost; outweights this cost: cost += (ira_memory_move_cost[mode][rclass][1] * exit_freq + ira_memory_move_cost[mode][rclass][0] * enter_freq); and so move_spill_restore can actually spill an allocno in cases where ira-emit.c wouldn't need to do anything. In particular, I saw this in zlib's infblock.c (part of CSiBE). We had a pseudo register P with several allocnos, and every allocno was allocated the same register ($21, as it happens). $21 wasn't used by any other pseudo register, so no spill code was needed for P. But move_spill_restore nevertheless spilled some of the allocnos, introducing lots of redundant loads and stores. If you disable move_spill_restore, ira-emit.c does the right thing. So the move_spill_restore problem affects brok^Wunsual targets like MIPS more than most, and I realise that, in the scenario above, MIPS is going against design. On the other hand, the code doesn't look quite as I'd expect. In summary: - Vlad, could you clarify when costs are and aren't accumulated? - I'd appreciate suggestions about how to handle the MIPS accumulators. There's an auxillary question about whether we should use REGNO_REG_CLASS more often in cases where a hard register has already been chosen, since this can make a big difference on some targets. For the record, the current CSiBE results for MIPS are: -O0 -EB -mips16 -mabi=32 -mhard-float 4499177 3998513 : 88.87% -O0 -EB -mips16 -mabi=32 -msoft-float 4481089 3980969 : 88.84% -O0 -EB -mips16 -mabi=o64 -mhard-float 4316945 3968473 : 91.93% -O0 -EB -mips16 -mabi=o64 -msoft-float 4299545 3950849 : 91.89% -O0 -EB -mno-mips16 -mabi=32 -mhard-float 5948865 5805185 : 97.58% -O0 -EB -mno-mips16 -mabi=32 -msoft-float 6510657 6361441 : 97.71% -O0 -EB -mno-mips16 -mabi=o64 -mhard-float 5939997 5800337 : 97.65% -O0 -EB -mno-mips16 -mabi=o64 -msoft-float 6260013 6125921 : 97.86% -O0 -EL -mips16 -mabi=32 -mhard-float 4498873 3998465 : 88.88% -O0 -EL -mips16 -mabi=32 -msoft-float 4480817 3980937 : 88.84% -O0 -EL -mips16 -mabi=o64 -mhard-float 4315985 3967389 : 91.92% -O0 -EL -mips16 -mabi=o64 -msoft-float 4298537 3949781 : 91.89% -O0 -EL -mno-mips16 -mabi=32 -mhard-float 5948029 5804461 : 97.59% -O0 -EL -mno-mips16 -mabi=32 -msoft-float 6509789 6360717 : 97.71% -O0 -EL -mno-mips16 -mabi=o64 -mhard-float 5938965 5799193 : 97.65% -O0 -EL -mno-mips16 -mabi=o64 -msoft-float 6258965 6124793 : 97.86% -Os -EB -mips16 -mabi=32 -mhard-float 2839673 2840465 : 100.03% -Os -EB -mips16 -mabi=32 -msoft-float 2822009 2822893 : 100.03% -Os -EB -mips16 -mabi=o64 -mhard-float 2837857 2855169 : 100.61% -Os -EB -mips16 -mabi=o64 -msoft-float 2819405 2836701 : 100.61% -Os -EB -mno-mips16 -mabi=32 -mhard-float 3498309 3502713 : 100.13% -Os -EB -mno-mips16 -mabi=32 -msoft-float 3966753 3974741 : 100.20% -Os -EB -mno-mips16 -mabi=o64 -mhard-float 3497989 3503253 : 100.15% -Os -EB -mno-mips16 -mabi=o64 -msoft-float 3756725 3766389 : 100.26% -Os -EL -mips16 -mabi=32 -mhard-float 2838993 2839973 : 100.03% -Os -EL -mips16 -mabi=32 -msoft-float 2821185 2822017 : 100.03% -Os -EL -mips16 -mabi=o64 -mhard-float 2837253 2855009 : 100.63% -Os -EL -mips16 -mabi=o64 -msoft-float 2818769 2836541 : 100.63% -Os -EL -mno-mips16 -mabi=32 -mhard-float 3497845 3502121 : 100.12% -Os -EL -mno-mips16 -mabi=32 -msoft-float 3965313 3971237 : 100.15% -Os -EL -mno-mips16 -mabi=o64 -mhard-float 3497269 3502565 : 100.15% -Os -EL -mno-mips16 -mabi=o64 -msoft-float 3756037 3765701 : 100.26% and from what I've seen, the increases seem to be from this kind of needless spilling. (A lot of the individual -Os tests are better with IRA, but the ones for which this problem triggers are much worse). Richard