[PATCH][MIPS] Scheduler fix for the 74k & 24k.
Hello, This patch fixes a bug with the 74k & 24k schedulers. Back in 2006 (2ca4dfa486bd358c6e466328839977250d160393) a mips_store_data_bypass_p was added to the mips backend. Unfortunately it was defined in terms of !store_data_bypass_p, though it was correctly used for the sb1 processor pipeline descriptor at that time. Later during a code-cleanup in 2012 (e053750d33e14ca245e14e1c467709a9bf6c6282) the 24k & 74k bypasses were changed from the correct !store_data_bypass_p to !mips_store_data_bypass_p. This led to those bypasses having inverted guard conditions. This patch brings mips_store_data_bypass_p into line with its comments and the comments of store_data_bypass_p. It also corrects the sb1's pipeline description. Thanks, Simon gcc/ * config/mips/mips.c (mips_store_data_bypass_p): Bring code into line with comments. * config/mips/sb1.md: Update usage of mips_store_data_bypass_p. diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 2fe143c..23f12d1 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -13709,7 +13709,7 @@ mips_store_data_bypass_p (rtx out_insn, rtx in_insn) if (GET_CODE (PATTERN (in_insn)) == UNSPEC_VOLATILE) return false; - return !store_data_bypass_p (out_insn, in_insn); + return store_data_bypass_p (out_insn, in_insn); } diff --git a/gcc/config/mips/sb1.md b/gcc/config/mips/sb1.md index 311300e..c12fc91 100644 --- a/gcc/config/mips/sb1.md +++ b/gcc/config/mips/sb1.md @@ -216,7 +216,7 @@ "ir_sb1_load,ir_sb1a_load,ir_sb1_fpload,ir_sb1_fpload_32bitfp, ir_sb1_fpidxload,ir_sb1_fpidxload_32bitfp" "ir_sb1_store,ir_sb1_fpstore,ir_sb1_fpidxstore" - "mips_store_data_bypass_p") + "!mips_store_data_bypass_p") ;; On SB-1, simple alu instructions can execute on the LS1 unit. @@ -289,7 +289,7 @@ (define_bypass 5 "ir_sb1a_simple_alu,ir_sb1_alu,ir_sb1_alu_0,ir_sb1_mfhi,ir_sb1_mflo" "ir_sb1_store,ir_sb1_fpstore,ir_sb1_fpidxstore" - "mips_store_data_bypass_p") + "!mips_store_data_bypass_p") ;; mf{hi,lo} is 1 cycle. 
@@ -351,7 +351,7 @@ (define_bypass 7 "ir_sb1_mulsi,ir_sb1_muldi" "ir_sb1_store,ir_sb1_fpstore,ir_sb1_fpidxstore" - "mips_store_data_bypass_p") + "!mips_store_data_bypass_p") ;; The divide unit is not pipelined. Divide busy is asserted in the 4th ;; cycle, and then deasserted on the latency cycle. So only one divide at -- 2.1.0
[PATCH, MIPS, Ping] Inline memcpy for MipsR6
Hello, > This patch enables inline memcpy for R6 which was previously disabled and > adds support for expansion when source and destination are at least half-word > aligned. https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00749.html Thanks, Simon
RE: [PATCH, MIPS, Ping] Inline memcpy for MipsR6
Catherine, Inline-memcpy-2.c updated to not run with -Os. Patch rebased off current gcc sources. Thanks, Simon diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 1733457..627e078 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -7520,12 +7520,22 @@ mips_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length) half-word alignment, it is usually better to move in half words. For instance, lh/lh/sh/sh is usually better than lwl/lwr/swl/swr and lw/lw/sw/sw is usually better than ldl/ldr/sdl/sdr. - Otherwise move word-sized chunks. */ - if (MEM_ALIGN (src) == BITS_PER_WORD / 2 - && MEM_ALIGN (dest) == BITS_PER_WORD / 2) -bits = BITS_PER_WORD / 2; + Otherwise move word-sized chunks. + + For ISA_HAS_LWL_LWR we rely on the lwl/lwr & swl/swr load. Otherwise + picking the minimum of alignment or BITS_PER_WORD gets us the + desired size for bits. */ + + if (!ISA_HAS_LWL_LWR) +bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))); else -bits = BITS_PER_WORD; +{ + if (MEM_ALIGN (src) == BITS_PER_WORD / 2 + && MEM_ALIGN (dest) == BITS_PER_WORD / 2) + bits = BITS_PER_WORD / 2; + else + bits = BITS_PER_WORD; +} mode = mode_for_size (bits, MODE_INT, 0); delta = bits / BITS_PER_UNIT; @@ -7644,8 +7654,9 @@ mips_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, bool mips_expand_block_move (rtx dest, rtx src, rtx length) { - /* Disable entirely for R6 initially. */ - if (!ISA_HAS_LWL_LWR) + if (!ISA_HAS_LWL_LWR + && (MEM_ALIGN (src) < MIPS_MIN_MOVE_MEM_ALIGN + || MEM_ALIGN (dest) < MIPS_MIN_MOVE_MEM_ALIGN)) return false; if (CONST_INT_P (length)) diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index ec69ed5..4b1787d 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -2969,6 +2969,9 @@ while (0) #undef PTRDIFF_TYPE #define PTRDIFF_TYPE (POINTER_SIZE == 64 ? "long int" : "int") +/* The minimum alignment of any expanded block move. 
*/ +#define MIPS_MIN_MOVE_MEM_ALIGN 16 + /* The maximum number of bytes that can be copied by one iteration of a movmemsi loop; see mips_block_move_loop. */ #define MIPS_MAX_MOVE_BYTES_PER_LOOP_ITER \ diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c b/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c new file mode 100644 index 000..5a254b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c @@ -0,0 +1,16 @@ +/* { dg-options "-fno-common isa_rev>=6" } */ +/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os" } { "" } } */ +/* { dg-final { scan-assembler-not "\tmemcpy" } } */ + +/* Test that memcpy is inline for target hardware + without swl, swr. */ + +#include + +char c[40] __attribute__ ((aligned(8))); + +void +f1 () +{ + memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32); +} diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c b/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c new file mode 100644 index 000..e144e61 --- /dev/null +++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c @@ -0,0 +1,17 @@ +/* { dg-options "-fno-common isa_rev>=6" } */ +/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os"} { "" } } */ +/* { dg-final { scan-assembler-not "\tmemcpy" } } */ +/* { dg-final { scan-assembler-times "\tsh\t" 16 } } */ + +/* Test that inline memcpy is expanded for target hardware without + swl, swr when alignment is halfword and sufficent shs are produced. 
*/ + +#include + +char c[40] __attribute__ ((aligned(2))); + +void +f1 () +{ + memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32); +} diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c b/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c new file mode 100644 index 000..96a0387 --- /dev/null +++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c @@ -0,0 +1,18 @@ +/* { dg-options "-fno-common isa_rev<=5" } */ +/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os"} { "" } } */ +/* { dg-final { scan-assembler-not "\tmemcpy" } } */ +/* { dg-final { scan-assembler-times "swl" 8 } } */ +/* { dg-final { scan-assembler-times "swr" 8 } } */ + +/* Test that inline memcpy for hardware with swl, swr handles subword + alignment and produces enough swl/swrs for mips32. */ + +#include + +char c[40] __attribute__ ((aligned(2))); + +void +f1 () +{ + memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32); +} diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-4.c b/gcc/testsuite/gcc.target/mips/inline-memcpy-4.c new file mode 100644 index 000..0e7a22e --- /dev/null +++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-4.c @@ -0,0 +1,18 @@ +/* { dg-options "-fno-common isa_rev<=5 -mabi=64" } */ +/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os"} { "" } } */ +/* { dg-final { scan-assembler-not "\tmemcpy" } } */ +/* { dg-final { scan-assembler-times "sdl" 4 } } */ +/* { dg-final { scan-assembler-times "sdr" 4 } } *
RE: [PATCH] Target hook for disabling the delay slot filler.
The profitability of using an ordinary branch over a delay slot branch depends on how the delay slot is filled. If a delay slot can be filled from an instruction preceding the branch, or from instructions following it that must be executed on both sides, then it is profitable to use a delay slot branch. For cases when instructions are chosen from one side of the branch, the proposed optimization strategy is to not speculatively execute instructions when ordinary branches could be used. Performance-wise this avoids executing instructions which the eager delay filler picked wrongly. Since most branches have a compact form, disabling the eager delay filler should be no worse than altering it not to fill delay slots in this case. Thanks, Simon -Original Message- From: Jeff Law [mailto:l...@redhat.com] Sent: 15 September 2015 16:02 To: Bernd Schmidt; Simon Dardis; gcc-patches@gcc.gnu.org Subject: Re: [PATCH] Target hook for disabling the delay slot filler. On 09/15/2015 08:27 AM, Bernd Schmidt wrote: > On 09/15/2015 04:19 PM, Simon Dardis wrote: >> This patch adds a target hook for disabling the eager delay slot >> filler which when disabled can give better code. No new regressions. >> Ok to commit? > > Hmm. Whether a branch was filled by the simple or eager filler is an > implementation detail - is there some better way to describe which > kind of branch is profitable? And more importantly, it's far better to be able to describe when it is not profitable to use eager filling rather than just disabling it completely. Jeff
FW: [PATCH] Target hook for disabling the delay slot filler.
> Are you trying to say that you have the option as to what kind of > branch to use? ie, "ordinary", presumably without a delay slot or one > with a delay slot? > Is the "ordinary" actually just a nullified delay slot or some form of > likely/not likely static hint? Specifically for MIPSR6: the ISA possesses traditional delay slot branches and a normal branch (no delay slots, not annulling, no hints, subtle static hazard), aka "compact branch" in MIPS terminology. They could be described as nullify on taken delay slot branch but we saw little to no value in that. Matthew Fortune provided a writeup with their handling in GCC: https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01892.html > But what is the compact form at the micro-architectural level? My > mips-fu has diminished greatly, but my recollection is the bubble is > always there. Is that not the case? The pipeline bubble will exist but the performance impact varies across R6 cores. High-end OoO cores won't be impacted as much, but lower end cores will. microMIPSR6 removes delay slot branches altogether which pushes the simplest micro-architectures to optimize away the cost of a pipeline bubble. For non-microMIPSR6 this is why we have different branch policies implemented in the MIPS backend to allow branch usage to be tuned. By default, if a delay slot can be filled then we use a delay slot branch otherwise we use a compact branch as the only thing in the DS would be a NOP anyway. Compact branches do have a strange restriction in that they cannot be followed by a CTI. This is to simplify branch predictors apparently but this may be lifted in future ISA releases. > If it is able to find insns from the commonly executed path that don't > have a long latency, then the fill is usually profitable (since the > pipeline bubble always exists). However, pulling a long latency > instruction (say anything that might cache miss or an fdiv/fsqrt) off > the slow path and conditionally nullifying it can be *awful*. 
> Everything else is in-between. I agree. The variability in profit/loss is a concern and I see two ways to deal with it: A) modify the delay slot filler so that it chooses speculative instructions of less than some $cost and avoids instruction duplication when the eager filler picks an instruction from a block with multiple predecessors. Making such changes would be invasive and require more target specific hooks. B) Use compact branches instead of speculative delay slot execution and forsake variable performance for a consistent pipeline bubble by not using the speculative delay filler altogether. Between these two choices, B seems to be the better option due to its sheer simplicity. Choosing neither gives speculative instruction execution when there could be a small consistent penalty instead. Thanks, Simon ________ From: Jeff Law [l...@redhat.com] Sent: 17 September 2015 17:55 To: Simon Dardis; Bernd Schmidt Cc: gcc-patches@gcc.gnu.org Subject: Re: [PATCH] Target hook for disabling the delay slot filler. On 09/17/2015 03:52 AM, Simon Dardis wrote: > The profitability of using an ordinary branch over a delay slot branch > depends on how the delay slot is filled. If a delay slot can be filled > from an instruction preceding the branch or instructions proceeding > that must be executed on both sides then it is profitable to use a delay slot > branch. Agreed. It's an over-simplification, but for the purposes of this discussion it's close enough. > > For cases when instructions are chosen from one side of the branch, > the proposed optimization strategy is to not speculatively execute > instructions when ordinary branches could be used. Performance-wise > this avoids executing instructions which the eager delay filler picked > wrongly. Are you trying to say that you have the option as to what kind of branch to use? ie, "ordinary", presumably without a delay slot or one with a delay slot? 
Is the "ordinary" actually just a nullified delay slot or some form of likely/not likely static hint? > > Since most branches have a compact form disabling the eager delay > filler should be no worse than altering it not to fill delay slots in this > case. But what is the compact form at the micro-architectural level? My mips-fu has diminished greatly, but my recollection is the bubble is always there. Is that not the case? fill_eager_delay_slots is most definitely speculative and its profitability is largely dependent on the cost of what insns it finds to fill those delay slots and whether they're from the common or uncommon path. If it is able to find insns from the commonly executed path that don't have a long latency, then the fill is usually profitable (since the pipeline bubble always exists). However, pulling a long late
[PATCH, Mips] Compact branch/delay slot optimization.
Hello, The following patch adds three small optimizations related to compact branches for MIPSR6: When the result of a load is used by a delay slot branch immediately afterwards, undo the delay slot branch scheduling to hide the pipeline bubble if safe and use a compact branch instead. Undo delay slot scheduling if an orphaned high-part relocation is in a delay slot and use a compact branch is used instead. Undo delay slot scheduling in the case where a forbidden slot hazard is immediately followed by a delay slot branch. This would cause a nop to be inserted otherwise. No regressions. OK to apply? Thanks, Simon gcc/ * config/mips/mips.c: (mips_break_sequence): New function. (mips_reorg_process_insns) Use it. Use compact branches in selected situations. gcc/testsuite/ * gcc.target/mips/split-ds-sequence.c: Test for the above. Index: config/mips/mips.c === --- config/mips/mips.c (revision 227676) +++ config/mips/mips.c (working copy) @@ -16973,6 +16973,23 @@ } } +/* Remove a SEQUENCE and replace it with the delay slot instruction + followed by the branch and return the instruction in the delay slot. + Return the first of the two new instructions. + Subroutine of mips_reorg_process_insns. */ + +static rtx_insn * +mips_break_sequence (rtx_insn * insn) +{ + rtx_insn * before = PREV_INSN (insn); + rtx_insn * branch = SEQ_BEGIN (insn); + rtx_insn * ds = SEQ_END (insn); + remove_insn (insn); + add_insn_after (ds, before, NULL); + add_insn_after (branch, ds, NULL); + return ds; +} + /* Go through the instruction stream and insert nops where necessary. Also delete any high-part relocations whose partnering low parts are now all dead. See if the whole function can then be put into @@ -17065,6 +17082,66 @@ { if (GET_CODE (PATTERN (insn)) == SEQUENCE) { + rtx_insn * next_active = next_active_insn (insn); + /* Undo delay slots to avoid bubbles if the next instruction can +be placed in a forbidden slot or the cost of adding an +explicit NOP in a forbidden slot is OK. 
*/ + if (TARGET_CB_MAYBE + && INSN_P (SEQ_BEGIN (insn)) + && INSN_P (SEQ_END (insn)) + && ((next_active + && INSN_P (next_active) + && GET_CODE (PATTERN (next_active)) != SEQUENCE + && get_attr_can_delay (next_active) == CAN_DELAY_YES) + || !optimize_size)) + { + /* To hide a potential pipeline bubble, if we scan backwards +from the current SEQUENCE and find that there is a load +of a value that is used in the CTI and there are no +dependencies between the CTI and instruction in the delay +slot, break the sequence so the load delay is hidden. */ + HARD_REG_SET uses; + CLEAR_HARD_REG_SET (uses); + note_uses (&PATTERN (SEQ_BEGIN (insn)), record_hard_reg_uses, +&uses); + HARD_REG_SET delay_sets; + CLEAR_HARD_REG_SET (delay_sets); + note_stores (PATTERN (SEQ_END (insn)), record_hard_reg_sets, + &delay_sets); + + rtx prev = prev_active_insn (insn); + if (prev + && GET_CODE (PATTERN (prev)) == SET + && MEM_P (SET_SRC (PATTERN (prev + { + HARD_REG_SET sets; + CLEAR_HARD_REG_SET (sets); + note_stores (PATTERN (prev), record_hard_reg_sets, + &sets); + + /* Re-order if safe. */ + if (!hard_reg_set_intersect_p (delay_sets, uses) + && hard_reg_set_intersect_p (uses, sets)) + { + next_insn = mips_break_sequence (insn); + /* Need to process the hazards of the newly +introduced instructions. */ + continue; + } + } + + /* If we find an orphaned high-part relocation in a delay +slot then we can convert to a compact branch and get +the orphaned high part deleted. */ + if (mips_orphaned_high_part_p (&htab, SEQ_END (insn))) + { + next_insn = mips_break_sequence (insn); + /* Need to process the hazards of the newly +introduced instructions. */ + continue; + } + } + /* If we find an orphaned high-part relocation in a
[PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs.
Hello, This patch migrates the MIPS backend to the new vector reduction optabs. No new regressions, ok to apply? Thanks, Simon gcc/ChangeLog: * config/mips/loongson.md (vec_loongson_extract_lo_): New, extract low part to scalar. (reduc_uplus_): Remove. (reduc_plus_scal_): Rename from reduc_splus_, Use vec loongson_extract_lo_. (reduc_smax_scal_, reduc_smin_scal_): Rename from reduc_smax_, reduc_smax_, fix constraints, use vec loongson_extract_lo_. (reduc_umax_scal_, reduc_umin_scal_): Rename, change constraints. Index: config/mips/loongson.md === --- config/mips/loongson.md (revision 228282) +++ config/mips/loongson.md (working copy) @@ -852,58 +852,66 @@ "dsrl\t%0,%1,%2" [(set_attr "type" "fcvt")]) -(define_expand "reduc_uplus_" - [(match_operand:VWH 0 "register_operand" "") - (match_operand:VWH 1 "register_operand" "")] +(define_insn "vec_loongson_extract_lo_" + [(set (match_operand: 0 "register_operand" "=r") +(vec_select: + (match_operand:VWHB 1 "register_operand" "f") + (parallel [(const_int 0)])))] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" -{ - mips_expand_vec_reduc (operands[0], operands[1], gen_add3); - DONE; -}) + "mfc1\t%0,%1" + [(set_attr "type" "mfc")]) -; ??? Given that we're not describing a widening reduction, we should -; not have separate optabs for signed and unsigned. 
-(define_expand "reduc_splus_" - [(match_operand:VWHB 0 "register_operand" "") +(define_expand "reduc_plus_scal_" + [(match_operand: 0 "register_operand" "") (match_operand:VWHB 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - emit_insn (gen_reduc_uplus_(operands[0], operands[1])); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_add3); + emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; }) -(define_expand "reduc_smax_" - [(match_operand:VWHB 0 "register_operand" "") - (match_operand:VWHB 1 "register_operand" "")] +(define_expand "reduc_smax_scal_" + [(match_operand:HI 0 "register_operand" "") + (match_operand:VH 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - mips_expand_vec_reduc (operands[0], operands[1], gen_smax3); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_smax3); + emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; }) -(define_expand "reduc_smin_" - [(match_operand:VWHB 0 "register_operand" "") - (match_operand:VWHB 1 "register_operand" "")] +(define_expand "reduc_smin_scal_" + [(match_operand:HI 0 "register_operand" "") + (match_operand:VH 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - mips_expand_vec_reduc (operands[0], operands[1], gen_smin3); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_smin3); + emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; }) -(define_expand "reduc_umax_" - [(match_operand:VB 0 "register_operand" "") +(define_expand "reduc_umax_scal_" + [(match_operand:QI 0 "register_operand" "") (match_operand:VB 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - mips_expand_vec_reduc (operands[0], operands[1], gen_umax3); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_umax3); + 
emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; }) -(define_expand "reduc_umin_" - [(match_operand:VB 0 "register_operand" "") +(define_expand "reduc_umin_scal_" + [(match_operand:QI 0 "register_operand" "") (match_operand:VB 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - mips_expand_vec_reduc (operands[0], operands[1], gen_umin3); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_umin3); + emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; })
RE: [PATCH, Mips] Compact branch/delay slot optimization.
structions. */ + continue; + } + } + /* If we find an orphaned high-part relocation in a delay slot, it's easier to turn that instruction into a NOP than to delete it. The delay slot will be a NOP either way. */ @@ -17099,6 +17189,33 @@ { mips_avoid_hazard (last_insn, insn, &hilo_delay, &delayed_reg, lo_reg, &fs_delay); + /* When a compact branch introduces a forbidden slot hazard +and the next useful instruction is a SEQUENCE of a jump +and a non-nop instruction in the delay slot, remove the +sequence and replace it with the delay slot instruction +then the jump to clear the forbidden slot hazard. */ + + if (fs_delay) + { + /* Search onwards from the current position looking for +a SEQUENCE. We are looking for pipeline hazards here +and do not need to worry about labels or barriers as +the optimization only undoes delay slot filling which +only affects the order of the branch and its delay +slot. */ + rtx_insn * next = next_active_insn (insn); + if (next + && USEFUL_INSN_P (next) + && GET_CODE (PATTERN (next)) == SEQUENCE + && mips_breakable_sequence_p (next)) + { + last_insn = insn; + next_insn = mips_break_sequence (next); + /* Need to process the hazards of the newly +introduced instructions. */ + continue; + } + } last_insn = insn; } } Index: testsuite/gcc.target/mips/split-ds-sequence.c === --- testsuite/gcc.target/mips/split-ds-sequence.c (revision 0) +++ testsuite/gcc.target/mips/split-ds-sequence.c (working copy) @@ -0,0 +1,19 @@ +/* { dg-options "isa_rev>=6" } */ +/* { dg-skip-if "code quality test" { *-*-* } { "-mcompact-branches=never" } { "" } } */ +/* { dg-final { scan-assembler-not "nop" } } */ + +int +testg2 (int a, int c) +{ + + int j = 0; + do +{ + j += a; +} + while (j < 56); + + j += c; + return j; + +} -Original Message- From: Simon Dardis Sent: 25 September 2015 15:56 To: Moore, Catherine Cc: gcc-patches@gcc.gnu.org Subject: [PATCH, Mips] Compact branch/delay slot optimization. 
Hello, The following patch adds three small optimizations related to compact branches for MIPSR6: When the result of a load is used by a delay slot branch immediately afterwards, undo the delay slot branch scheduling to hide the pipeline bubble if safe and use a compact branch instead. Undo delay slot scheduling if an orphaned high-part relocation is in a delay slot and use a compact branch instead. Undo delay slot scheduling in the case where a forbidden slot hazard is immediately followed by a delay slot branch. This would cause a nop to be inserted otherwise. No regressions. OK to apply? Thanks, Simon gcc/ * config/mips/mips.c (mips_break_sequence): New function. (mips_reorg_process_insns): Use it. Use compact branches in selected situations. gcc/testsuite/ * gcc.target/mips/split-ds-sequence.c: Test for the above.
RE: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs.
On the change from smin/smax it was a deliberate change as I managed to confuse myself of the mode patterns, correct version follows. Reverted back to VWHB for smax/smin. Stylistic point addressed. No new regression, ok for commit? Thanks, Simon Index: config/mips/loongson.md === --- config/mips/loongson.md (revision 228282) +++ config/mips/loongson.md (working copy) @@ -852,58 +852,66 @@ "dsrl\t%0,%1,%2" [(set_attr "type" "fcvt")]) -(define_expand "reduc_uplus_" - [(match_operand:VWH 0 "register_operand" "") - (match_operand:VWH 1 "register_operand" "")] +(define_insn "vec_loongson_extract_lo_" + [(set (match_operand: 0 "register_operand" "=r") +(vec_select: + (match_operand:VWHB 1 "register_operand" "f") + (parallel [(const_int 0)])))] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" -{ - mips_expand_vec_reduc (operands[0], operands[1], gen_add3); - DONE; -}) + "mfc1\t%0,%1" + [(set_attr "type" "mfc")]) -; ??? Given that we're not describing a widening reduction, we should -; not have separate optabs for signed and unsigned. 
-(define_expand "reduc_splus_" - [(match_operand:VWHB 0 "register_operand" "") +(define_expand "reduc_plus_scal_" + [(match_operand: 0 "register_operand" "") (match_operand:VWHB 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - emit_insn (gen_reduc_uplus_(operands[0], operands[1])); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_add3); + emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; }) -(define_expand "reduc_smax_" - [(match_operand:VWHB 0 "register_operand" "") +(define_expand "reduc_smax_scal_" + [(match_operand: 0 "register_operand" "") (match_operand:VWHB 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - mips_expand_vec_reduc (operands[0], operands[1], gen_smax3); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_smax3); + emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; }) -(define_expand "reduc_smin_" - [(match_operand:VWHB 0 "register_operand" "") +(define_expand "reduc_smin_scal_" + [(match_operand: 0 "register_operand" "") (match_operand:VWHB 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - mips_expand_vec_reduc (operands[0], operands[1], gen_smin3); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_smin3); + emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; }) -(define_expand "reduc_umax_" - [(match_operand:VB 0 "register_operand" "") +(define_expand "reduc_umax_scal_" + [(match_operand: 0 "register_operand" "") (match_operand:VB 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - mips_expand_vec_reduc (operands[0], operands[1], gen_umax3); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_umax3); + emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; }) -(define_expand "reduc_umin_" - 
[(match_operand:VB 0 "register_operand" "") +(define_expand "reduc_umin_scal_" + [(match_operand: 0 "register_operand" "") (match_operand:VB 1 "register_operand" "")] "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" { - mips_expand_vec_reduc (operands[0], operands[1], gen_umin3); + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); + mips_expand_vec_reduc (tmp, operands[1], gen_umin3); + emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp)); DONE; }) -Original Message- From: Alan Lawrence [mailto:alan.lawre...@arm.com] Sent: 06 October 2015 11:12 To: Simon Dardis; Matthew Fortune; Moore, Catherine Cc: gcc-patches@gcc.gnu.org Subject: Re: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs. Thanks for working on this, Simon! On 01/10/15 15:43, Simon Dardis wrote: > -(define_expand "reduc_smax_" > - [(match_operand:VWHB 0 "register_operand" "") > - (match_operand:VWHB 1 "register_operand" "")] > +(define_expand "reduc_smax_scal_" > +
RE: FW: [PATCH] Target hook for disabling the delay slot filler.
> -Original Message- > From: Jeff Law [mailto:l...@redhat.com] > Sent: 08 October 2015 20:44 > To: Simon Dardis; Bernd Schmidt > Cc: gcc-patches@gcc.gnu.org > Subject: Re: FW: [PATCH] Target hook for disabling the delay slot filler. > > On 09/18/2015 05:10 AM, Simon Dardis wrote: > >> Are you trying to say that you have the option as to what kind of > >> branch to use? ie, "ordinary", presumably without a delay slot or > >> one with a delay slot? > > > >> Is the "ordinary" actually just a nullified delay slot or some form > >> of likely/not likely static hint? > > > > Specifically for MIPSR6: the ISA possesses traditional delay slot > > branches and a normal branch (no delay slots, not annulling, no hints, > > subtle static hazard), aka "compact branch" in MIPS terminology. They > > could be described as nullify on taken delay slot branch but we saw little > > to > no value in that. > > > > Matthew Fortune provided a writeup with their handling in GCC: > > > > https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01892.html > Thanks. I never looked at that message, almost certainly because it was MIPS > specific. I'm trying hard to stay out of backends that have good active > maintainers, and MIPS certainly qualifies on that point. > > > > > >> But what is the compact form at the micro-architectural level? My > >> mips-fu has diminished greatly, but my recollection is the bubble is > >> always there. Is that not the case? > > > > The pipeline bubble will exist but the performance impact varies > > across > > R6 cores. High-end OoO cores won't be impacted as much, but lower end > > cores will. microMIPSR6 removes delay slot branches altogether which > > pushes the simplest micro-architectures to optimize away the cost of a > > pipeline bubble. > [ ... snip more micro-archticture stuff ... ] Thanks. That helps a lot. I > didn't > realize the bubble was being squashed to varying degrees. 
And FWIW, I > wouldn't be surprised if you reach a point on the OoO cores where you'll just > want to move away from delay slots totally and rely on your compact > branches as much as possible. It may give your hardware guys a degree of > freedom that helps them in the common case (compact branches) at the > expense of slowing down code with old fashioned delay slots. > > > Compact branches do a strange restriction in that they cannot be > > followed by a CTI. This is to simplify branch predictors apparently > > but this may be lifted in future ISA releases. > Come on! :-) There's some really neat things you can do when you allow > branches in delay slots. The PA was particularly fun in that regard. > My recollection is HP had some hand written assembly code in their libraries > which exploited the out-of-line execution you could get in this case. We > never tried to exploit in GCC simply because the opportunities didn't see all > that common or profitable. > > > > > > >> If it is able to find insns from the commonly executed path that > >> don't have a long latency, then the fill is usually profitable (since > >> the pipeline bubble always exists). However, pulling a long latency > >> instruction (say anything that might cache miss or an fdiv/fsqrt) off > >> the slow path and conditionally nullifying it can be *awful*. > >> Everything else is in-between. > > > > I agree. The variability in profit/loss in a concern and I see two > > ways to deal with it: > > > > A) modify the delay slot filler so that it choses speculative > > instructions of less than some $cost and avoid instruction duplication > > when the eager filler picks an instruction from a block with multiple > > predecessors. Making such changes would be invasive and require more > target specific hooks. > The cost side here should be handled by existing mechanisms. You just > never allow anything other than simple arith, logicals & copies. > > You'd need a hook to avoid this when copying was needed. 
> > You'd probably also need some kind of target hook to indicate the level of > prediction where this is profitable since the cost varies across your micro- > architectures. > > And you'd also have to worry about the special code which triggers when > there's a well predicted branch, but a resource conflict. In that case reorg > can > fill the slot from the predicted path and insert compensation code on the > non-predicted path. > > > > > > > B) Use compact branches instead
RE: FW: [PATCH] Target hook for disabling the delay slot filler.
> On 10/23/2015 11:31 AM, Bernd Schmidt wrote: > > On 10/23/2015 04:57 PM, Simon Dardis wrote: > > > >> Patch below. Target hook renamed to > >> TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P. > >> > >> Tested on mips-img-elf, no new regressions. > > > > As far as I'm concerned this is ok, and IIUC Jeff was on board too. > > This is assuming the test included a bootstrap, otherwise please do > > that. You should also include a ChangeLog in future submissions. > Just to be explicit, I'm on board. > > Jeff I've done bootstrap and regression. No new failures. gcc/ * target.def (TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P): New hook. * doc/tm.texi.in (TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P): Document. * doc/tm.texi: Regenerated. * reorg.c (dbr_schedule): Use new hook. * config/mips/mips.c (mips_no_speculation_in_delay_slots_p): New. testsuite/ * gcc.target/mips/ds-schedule-1.c: New. * gcc.target/mips/ds-schedule-2.c: New. Committed as r229383. Thanks, Simon
RE: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs.
Committed r229844. Thanks, Simon > -Original Message- > From: Moore, Catherine [mailto:catherine_mo...@mentor.com] > Sent: 03 November 2015 14:09 > To: Simon Dardis; Alan Lawrence; Matthew Fortune > Cc: gcc-patches@gcc.gnu.org > Subject: RE: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs. > > > > > -Original Message- > > From: Simon Dardis [mailto:simon.dar...@imgtec.com] > > Sent: Wednesday, October 07, 2015 6:51 AM > > To: Alan Lawrence; Matthew Fortune; Moore, Catherine > > Cc: gcc-patches@gcc.gnu.org > > Subject: RE: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs. > > > > On the change from smin/smax it was a deliberate change as I managed > > to confuse myself of the mode patterns, correct version follows. > > Reverted back to VWHB for smax/smin. Stylistic point addressed. > > > > No new regression, ok for commit? > > > > Yes, OK to commit. Sorry for the delay in review. > Catherine > > > > > Index: config/mips/loongson.md > > > == > > = > > --- config/mips/loongson.md (revision 228282) > > +++ config/mips/loongson.md (working copy) > > @@ -852,58 +852,66 @@ > >"dsrl\t%0,%1,%2" > >[(set_attr "type" "fcvt")]) > > > > -(define_expand "reduc_uplus_" > > - [(match_operand:VWH 0 "register_operand" "") > > - (match_operand:VWH 1 "register_operand" "")] > > +(define_insn "vec_loongson_extract_lo_" > > + [(set (match_operand: 0 "register_operand" "=r") > > +(vec_select: > > + (match_operand:VWHB 1 "register_operand" "f") > > + (parallel [(const_int 0)])))] > >"TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" > > -{ > > - mips_expand_vec_reduc (operands[0], operands[1], > gen_add3); > > - DONE; > > -}) > > + "mfc1\t%0,%1" > > + [(set_attr "type" "mfc")]) > > > > -; ??? Given that we're not describing a widening reduction, we should > > -; not have separate optabs for signed and unsigned. 
> > -(define_expand "reduc_splus_" > > - [(match_operand:VWHB 0 "register_operand" "") > > +(define_expand "reduc_plus_scal_" > > + [(match_operand: 0 "register_operand" "") > > (match_operand:VWHB 1 "register_operand" "")] > >"TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" > > { > > - emit_insn (gen_reduc_uplus_(operands[0], operands[1])); > > + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); > > + mips_expand_vec_reduc (tmp, operands[1], gen_add3); > emit_insn > > + (gen_vec_loongson_extract_lo_ (operands[0], tmp)); > >DONE; > > }) > > > > -(define_expand "reduc_smax_" > > - [(match_operand:VWHB 0 "register_operand" "") > > +(define_expand "reduc_smax_scal_" > > + [(match_operand: 0 "register_operand" "") > > (match_operand:VWHB 1 "register_operand" "")] > >"TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" > > { > > - mips_expand_vec_reduc (operands[0], operands[1], > gen_smax3); > > + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); > > + mips_expand_vec_reduc (tmp, operands[1], gen_smax3); > > + emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp)); > >DONE; > > }) > > > > -(define_expand "reduc_smin_" > > - [(match_operand:VWHB 0 "register_operand" "") > > +(define_expand "reduc_smin_scal_" > > + [(match_operand: 0 "register_operand" "") > > (match_operand:VWHB 1 "register_operand" "")] > >"TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS" > > { > > - mips_expand_vec_reduc (operands[0], operands[1], > gen_smin3); > > + rtx tmp = gen_reg_rtx (GET_MODE (operands[1])); > > + mips_expand_vec_reduc (tmp, operands[1], gen_smin3); > > + emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp)); > >DONE; > > }) > > > > -(define_expand "reduc_umax_" > > - [(match_operand:VB 0 "register_operand" "") > > +(define_expand "reduc_umax_scal_" > > + [(match_operand: 0 "register_operand" "") > > (match_operand:VB 1 "registe
RE: [PATCH, Mips] Compact branch/delay slot optimization.
Committed as r230160. Thanks, Simon > -Original Message- > From: Moore, Catherine [mailto:catherine_mo...@mentor.com] > Sent: 28 October 2015 14:00 > To: Simon Dardis; Matthew Fortune > Cc: gcc-patches@gcc.gnu.org > Subject: RE: [PATCH, Mips] Compact branch/delay slot optimization. > > > > > -Original Message- > > From: Simon Dardis [mailto:simon.dar...@imgtec.com] > > Sent: Tuesday, October 06, 2015 10:00 AM > > To: Moore, Catherine; Matthew Fortune > > Cc: gcc-patches@gcc.gnu.org > > Subject: RE: [PATCH, Mips] Compact branch/delay slot optimization. > > > > Hello, > > > > I'd like to resubmit the previous patch as it failed to check if the > > branch inside the sequence had a compact form. > > > > Thanks, > > Simon > > > > gcc/ > > * config/mips/mips.c: (mips_breakable_sequence_p): New function. > > (mips_break_sequence): New function. > > (mips_reorg_process_insns) Use them. Use compact branches in > > selected > > situations. > > > > gcc/testsuite/ > > * gcc.target/mips/split-ds-sequence.c: Test for the above. > > Hi Simon, > This patch looks okay with the exception of one stylistic change. > Please change all instances of : > +mips_breakable_sequence_p (rtx_insn * insn) > To: > +mips_breakable_sequence_p (rtx_insn *insn) > Okay, with those changes. > Thanks, > Catherine > > > > > > Index: config/mips/mips.c > > > == > > = > > --- config/mips/mips.c (revision 228282) > > +++ config/mips/mips.c (working copy) > > @@ -16973,6 +16973,34 @@ > >} > > } > > > > +/* A SEQUENCE is breakable iff the branch inside it has a compact form > > + and the target has compact branches. */ > > + > > +static bool > > +mips_breakable_sequence_p (rtx_insn * insn) { > > + return (insn && GET_CODE (PATTERN (insn)) == SEQUENCE > > + && TARGET_CB_MAYBE > > + && get_attr_compact_form (SEQ_BEGIN (insn)) != > > COMPACT_FORM_NEVER); > > +} > > + > > +/* Remove a SEQUENCE and replace it with the delay slot instruction > > + followed by the branch and return the instruction in the delay slot. 
> > + Return the first of the two new instructions. > > + Subroutine of mips_reorg_process_insns. */ > > + > > +static rtx_insn * > > +mips_break_sequence (rtx_insn * insn) { > > + rtx_insn * before = PREV_INSN (insn); > > + rtx_insn * branch = SEQ_BEGIN (insn); > > + rtx_insn * ds = SEQ_END (insn); > > + remove_insn (insn); > > + add_insn_after (ds, before, NULL); > > + add_insn_after (branch, ds, NULL); > > + return ds; > > +} > > + > > /* Go through the instruction stream and insert nops where necessary. > > Also delete any high-part relocations whose partnering low parts > > are now all dead. See if the whole function can then be put into > > @@ -17065,6 +17093,68 @@ > > { > > if (GET_CODE (PATTERN (insn)) == SEQUENCE) > > { > > + rtx_insn * next_active = next_active_insn (insn); > > + /* Undo delay slots to avoid bubbles if the next instruction can > > +be placed in a forbidden slot or the cost of adding an > > +explicit NOP in a forbidden slot is OK and if the SEQUENCE is > > +safely breakable. */ > > + if (TARGET_CB_MAYBE > > + && mips_breakable_sequence_p (insn) > > + && INSN_P (SEQ_BEGIN (insn)) > > + && INSN_P (SEQ_END (insn)) > > + && ((next_active > > + && INSN_P (next_active) > > + && GET_CODE (PATTERN (next_active)) != SEQUENCE > > + && get_attr_can_delay (next_active) == > > CAN_DELAY_YES) > > + || !optimize_size)) > > + { > > + /* To hide a potential pipeline bubble, if we scan backwards > > +from the current SEQUENCE and find that there is a load > > +of a value that is used in the CTI and there are no > > +dependencies between the CTI and instruction in the > > delay > > +slot, break the sequence so the load delay is hidden. */ > > + HARD_REG_SET uses; > > + CLEAR_HARD_REG_SET (uses); > > + note_uses (&PATTERN (SEQ_BEGIN (insn)), &
[PATCH] Mips: Inline memcpy for R6
Hello, This patch enables inline memcpy for R6 which was previously disabled and adds support for expansion when source and destination are at least half-word aligned. gcc/ * config/mips/mips.c (mips_expand_block_move): Enable inline memcpy expansion when !ISA_HAS_LWL_LWR. (mips_block_move_straight): Update the size of elements copied to account for alignment when !ISA_HAS_LWL_LWR. * config/mips/mips.h (MIPS_MIN_MOVE_MEM_ALIGN): New macro. gcc/testsuite/ * inline-memcpy-1.c: Test for inline expansion of memcpy. * inline-memcpy-2.c: Ditto. * inline-memcpy-3.c: Ditto. * inline-memcpy-4.c: Ditto. * inline-memcpy-5.c: Ditto. Thanks, Simon diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 6f5421a..1f7c105 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -8187,12 +8187,22 @@ mips_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length) half-word alignment, it is usually better to move in half words. For instance, lh/lh/sh/sh is usually better than lwl/lwr/swl/swr and lw/lw/sw/sw is usually better than ldl/ldr/sdl/sdr. - Otherwise move word-sized chunks. */ - if (MEM_ALIGN (src) == BITS_PER_WORD / 2 - && MEM_ALIGN (dest) == BITS_PER_WORD / 2) -bits = BITS_PER_WORD / 2; + Otherwise move word-sized chunks. + + For ISA_HAS_LWL_LWR we rely on the lwl/lwr & swl/swr load. Otherwise + picking the minimum of alignment or BITS_PER_WORD gets us the + desired size for bits. 
*/ + + if (!ISA_HAS_LWL_LWR) +bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))); else -bits = BITS_PER_WORD; +{ + if (MEM_ALIGN (src) == BITS_PER_WORD / 2 + && MEM_ALIGN (dest) == BITS_PER_WORD / 2) + bits = BITS_PER_WORD / 2; + else + bits = BITS_PER_WORD; +} mode = mode_for_size (bits, MODE_INT, 0); delta = bits / BITS_PER_UNIT; @@ -8311,8 +8321,8 @@ bool mips_expand_block_move (rtx dest, rtx src, rtx length) { if (!ISA_HAS_LWL_LWR - && (MEM_ALIGN (src) < BITS_PER_WORD - || MEM_ALIGN (dest) < BITS_PER_WORD)) + && (MEM_ALIGN (src) < MIPS_MIN_MOVE_MEM_ALIGN + || MEM_ALIGN (dest) < MIPS_MIN_MOVE_MEM_ALIGN)) return false; if (CONST_INT_P (length)) diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index a2380e5..6578ae5 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -3041,6 +3041,9 @@ while (0) #undef PTRDIFF_TYPE #define PTRDIFF_TYPE (POINTER_SIZE == 64 ? "long int" : "int") +/* The minimum alignment of any expanded block move. */ +#define MIPS_MIN_MOVE_MEM_ALIGN 16 + /* The maximum number of bytes that can be copied by one iteration of a movmemsi loop; see mips_block_move_loop. */ #define MIPS_MAX_MOVE_BYTES_PER_LOOP_ITER \ diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c b/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c new file mode 100644 index 000..5a254b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c @@ -0,0 +1,16 @@ +/* { dg-options "-fno-common isa_rev>=6" } */ +/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os" } { "" } } */ +/* { dg-final { scan-assembler-not "\tmemcpy" } } */ + +/* Test that memcpy is inline for target hardware + without swl, swr. 
*/ + +#include + +char c[40] __attribute__ ((aligned(8))); + +void +f1 () +{ + memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32); +} diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c b/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c new file mode 100644 index 000..c06be15 --- /dev/null +++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c @@ -0,0 +1,17 @@ +/* { dg-options "-fno-common isa_rev>=6" } */ +/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */ +/* { dg-final { scan-assembler-not "\tmemcpy" } } */ +/* { dg-final { scan-assembler-times "\tsh\t" 16 } } */ + +/* Test that inline memcpy is expanded for target hardware without + swl, swr when alignment is halfword and sufficent shs are produced. */ + +#include + +char c[40] __attribute__ ((aligned(2))); + +void +f1 () +{ + memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32); +} diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c b/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c new file mode 100644 index 000..96a0387 --- /dev/null +++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c @@ -0,0 +1,18 @@ +/* { dg-options "-fno-common isa_rev<=5" } */ +/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os"} { "" } } */ +/* { dg-final { scan-assembler-not "\tmemcpy" } } */ +/* { dg-final { scan-assembler-times "swl" 8 } } */ +/* { dg-final { scan-assembler-times "swr" 8 } } */ + +/* Test that inline memcpy for hardware with swl, swr handles subword + alignment and produces enough swl/swrs for mips32. */ + +#include + +char c[40] __attribute__ ((aligned(2))); + +void +f1 () +{ + memcpy (c, "1234567
RE: [PATCH, MIPS, Ping] Inline memcpy for MipsR6
Checked in as revision 227026. Thanks, Simon -Original Message- From: Moore, Catherine [mailto:catherine_mo...@mentor.com] Sent: 01 August 2015 20:18 To: Simon Dardis; gcc-patches@gcc.gnu.org Cc: Moore, Catherine Subject: RE: [PATCH, MIPS, Ping] Inline memcpy for MipsR6 > -Original Message- > From: Simon Dardis [mailto:simon.dar...@imgtec.com] > Sent: Wednesday, July 29, 2015 4:29 AM > To: gcc-patches@gcc.gnu.org > Cc: Moore, Catherine > Subject: [PATCH, MIPS, Ping] Inline memcpy for MipsR6 > > > This patch enables inline memcpy for R6 which was previously > > disabled and > adds support for expansion when source and destination are at least > half- word aligned. > > https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00749.html > Hi Simon, Two things need to be fixed up with this patch before committing. 1. The new test inline-memcpy-2.c should not be run with -Os (like the other new tests that you submitted). 2. Your patch is against older source than what is currently in the repository, causing this hunk not to apply cleanly: @@ -8311,8 +8321,8 @@ bool mips_expand_block_move (rtx dest, rtx src, rtx length) { if (!ISA_HAS_LWL_LWR - && (MEM_ALIGN (src) < BITS_PER_WORD - || MEM_ALIGN (dest) < BITS_PER_WORD)) + && (MEM_ALIGN (src) < MIPS_MIN_MOVE_MEM_ALIGN + || MEM_ALIGN (dest) < MIPS_MIN_MOVE_MEM_ALIGN)) return false; if (CONST_INT_P (length)) The correct patch should look like this: @@ -7780,8 +7790,9 @@ bool mips_expand_block_move (rtx dest, rtx src, rtx length) { - /* Disable entirely for R6 initially. */ - if (!ISA_HAS_LWL_LWR) + if (!ISA_HAS_LWL_LWR + && (MEM_ALIGN (src) < MIPS_MIN_MOVE_MEM_ALIGN + || MEM_ALIGN (dest) < MIPS_MIN_MOVE_MEM_ALIGN)) return false; if (CONST_INT_P (length)) Okay with those changes. Thanks, Catherine
[PATCH] Target hook for disabling the delay slot filler.
Hello all, This patch adds a target hook for disabling the eager delay slot filler which when disabled can give better code. No new regressions. Ok to commit? Thanks, Simon gcc/ * target.def (use_eager_delay_filler_p): New hook for selectively disabling eager delay slot filler. * reorg.c (dbr_schedule): Use the new hook. * config/mips/mips.c (mips_use_eager_delay_filler_p): New static function. (TARGET_USE_EAGER_DELAY_FILLER_P): Define. * doc/tm.texi.in: Add placeholder for new hook. * doc/tm.texi: Regenerate. gcc/testsuite/ * gcc.target/mips/ds-schedule-1.c: New file. * gcc.target/mips/ds-schedule-2.c: Likewise. Index: gcc/config/mips/mips.c === --- gcc/config/mips/mips.c (revision 227676) +++ gcc/config/mips/mips.c (working copy) @@ -14425,6 +14425,14 @@ return cached_can_issue_more; } +/* Implement USE_EAGER_DELAY_FILLER. */ + +static bool +mips_use_eager_delay_filler_p () +{ + return TARGET_CB_NEVER; +} + /* Update round-robin counters for ALU1/2 and FALU1/2. */ static void @@ -19982,6 +19990,9 @@ #undef TARGET_IN_SMALL_DATA_P #define TARGET_IN_SMALL_DATA_P mips_in_small_data_p +#undef TARGET_USE_EAGER_DELAY_FILLER_P +#define TARGET_USE_EAGER_DELAY_FILLER_P mips_use_eager_delay_filler_p + #undef TARGET_MACHINE_DEPENDENT_REORG #define TARGET_MACHINE_DEPENDENT_REORG mips_reorg Index: gcc/doc/tm.texi === --- gcc/doc/tm.texi (revision 227676) +++ gcc/doc/tm.texi (working copy) @@ -10949,6 +10949,15 @@ definition is null. @end deftypefn +@deftypefn {Target Hook} bool TARGET_USE_EAGER_DELAY_FILLER_P (void) +This predicate controls the use of the eager delay slot filler. Targets +such as certain MIPS architectures possess both branches with and without +delay slots. As the eager delay slot filler can increase code size, +disabling it is beneficial when ordinary branches are available. Use of +delay slot branches filled using the basic filler is often still desirable +as the delay slot can hide a pipeline bubble. 
+@end deftypefn + @deftypefn {Target Hook} void TARGET_INIT_BUILTINS (void) Define this hook if you have any machine-specific built-in functions that need to be defined. It should be a function that performs the Index: gcc/doc/tm.texi.in === --- gcc/doc/tm.texi.in (revision 227676) +++ gcc/doc/tm.texi.in (working copy) @@ -7985,6 +7985,8 @@ @hook TARGET_MACHINE_DEPENDENT_REORG +@hook TARGET_USE_EAGER_DELAY_FILLER_P + @hook TARGET_INIT_BUILTINS @hook TARGET_BUILTIN_DECL Index: gcc/reorg.c === --- gcc/reorg.c (revision 227676) +++ gcc/reorg.c (working copy) @@ -3793,7 +3793,8 @@ { fill_simple_delay_slots (1); fill_simple_delay_slots (0); - fill_eager_delay_slots (); + if (targetm.use_eager_delay_filler_p ()) + fill_eager_delay_slots (); relax_delay_slots (first); } Index: gcc/target.def === --- gcc/target.def (revision 227676) +++ gcc/target.def (working copy) @@ -3618,6 +3618,17 @@ definition is null.", void, (void), NULL) +/* Control of eager delay slot filling in delayed-branch scheduling. */ +DEFHOOK +(use_eager_delay_filler_p, + "This predicate controls the use of the eager delay slot filler. Targets\n\ +such as certain MIPS architectures possess both branches with and without\n\ +delay slots. As the eager delay slot filler can increase code size,\n\ +disabling it is beneficial when ordinary branches are available. Use of\n\ +delay slot branches filled using the basic filler is often still desirable\n\ +as the delay slot can hide a pipeline bubble.", bool, (void), + hook_bool_void_true) + /* Create the __builtin_va_list type. 
*/ DEFHOOK (build_builtin_va_list, Index: gcc/testsuite/gcc.target/mips/ds-schedule-1.c === --- gcc/testsuite/gcc.target/mips/ds-schedule-1.c (revision 0) +++ gcc/testsuite/gcc.target/mips/ds-schedule-1.c (working copy) @@ -0,0 +1,29 @@ +/* { dg-options "isa_rev>=6 -mcompact-branches=optimal -mno-abicalls -G4" } */ +/* { dg-final { scan-assembler-not "bne\t" } } */ +/* { dg-final { scan-assembler-not "beq\t" } } */ +/* { dg-final { scan-assembler-times "\\(foo\\)" 1 } } */ + +/* Test that when compact branches are used, that a compact branch is + produced in the case where code expansion would have occurred if a + delay slot branch would have be used. 'foo' should only be + referenced once in the program text. */ + +struct list +{ + struct list *next; + int element; +}; + +struct list *gr; + +int foo; + +extern void t (int, int, int*); + +void +f (struct list **