[PATCH][MIPS] Scheduler fix for the 74k & 24k.

2015-07-21 Thread Simon Dardis
Hello,

This patch fixes a bug with the 74k & 24k schedulers. 

Back in 2006  (2ca4dfa486bd358c6e466328839977250d160393) a 
mips_store_data_bypass_p was added to the mips backend. Unfortunately it was 
defined in terms of !store_data_bypass_p, though it was correctly used for the 
sb1 processor pipeline descriptor at that time. Later during a code-cleanup in 
2012 (e053750d33e14ca245e14e1c467709a9bf6c6282) the 24k & 74k bypasses were 
changed from the correct !store_data_bypass_p to !mips_store_data_bypass_p. 
This led to those bypasses having inverted guard conditions.

This patch brings mips_store_data_bypass_p into line with its comments and the 
comments of store_data_bypass_p. It also corrects the sb1's pipeline 
description.
 
Thanks,
Simon

gcc/
* config/mips/mips.c (mips_store_data_bypass_p): Bring code into
line with comments.
* config/mips/sb1.md: Update usage of mips_store_data_bypass_p.

diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 2fe143c..23f12d1 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -13709,7 +13709,7 @@ mips_store_data_bypass_p (rtx out_insn, rtx in_insn)
   if (GET_CODE (PATTERN (in_insn)) == UNSPEC_VOLATILE)
 return false;
 
-  return !store_data_bypass_p (out_insn, in_insn);
+  return store_data_bypass_p (out_insn, in_insn);
 }
 

 
diff --git a/gcc/config/mips/sb1.md b/gcc/config/mips/sb1.md
index 311300e..c12fc91 100644
--- a/gcc/config/mips/sb1.md
+++ b/gcc/config/mips/sb1.md
@@ -216,7 +216,7 @@
   "ir_sb1_load,ir_sb1a_load,ir_sb1_fpload,ir_sb1_fpload_32bitfp,
ir_sb1_fpidxload,ir_sb1_fpidxload_32bitfp"
   "ir_sb1_store,ir_sb1_fpstore,ir_sb1_fpidxstore"
-  "mips_store_data_bypass_p")
+  "!mips_store_data_bypass_p")
 
 ;; On SB-1, simple alu instructions can execute on the LS1 unit.
 
@@ -289,7 +289,7 @@
 (define_bypass 5
   "ir_sb1a_simple_alu,ir_sb1_alu,ir_sb1_alu_0,ir_sb1_mfhi,ir_sb1_mflo"
   "ir_sb1_store,ir_sb1_fpstore,ir_sb1_fpidxstore"
-  "mips_store_data_bypass_p")
+  "!mips_store_data_bypass_p")
 
 ;; mf{hi,lo} is 1 cycle.  
 
@@ -351,7 +351,7 @@
 (define_bypass 7
   "ir_sb1_mulsi,ir_sb1_muldi"
   "ir_sb1_store,ir_sb1_fpstore,ir_sb1_fpidxstore"
-  "mips_store_data_bypass_p")
+  "!mips_store_data_bypass_p")
 
 ;; The divide unit is not pipelined.  Divide busy is asserted in the 4th
 ;; cycle, and then deasserted on the latency cycle.  So only one divide at
-- 
2.1.0



[PATCH, MIPS, Ping] Inline memcpy for MipsR6

2015-07-29 Thread Simon Dardis
Hello,

> This patch enables inline memcpy for R6 which was previously disabled and 
> adds support for expansion when source and destination are at least half-word 
> aligned.

https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00749.html

Thanks,
Simon


RE: [PATCH, MIPS, Ping] Inline memcpy for MipsR6

2015-08-03 Thread Simon Dardis
Catherine,

Inline-memcpy-2.c updated to not run with -Os.

Patch rebased off current gcc sources.

Thanks,
Simon


diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 1733457..627e078 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -7520,12 +7520,22 @@ mips_block_move_straight (rtx dest, rtx src, 
HOST_WIDE_INT length)
  half-word alignment, it is usually better to move in half words.
  For instance, lh/lh/sh/sh is usually better than lwl/lwr/swl/swr
  and lw/lw/sw/sw is usually better than ldl/ldr/sdl/sdr.
- Otherwise move word-sized chunks.  */
-  if (MEM_ALIGN (src) == BITS_PER_WORD / 2
-  && MEM_ALIGN (dest) == BITS_PER_WORD / 2)
-bits = BITS_PER_WORD / 2;
+ Otherwise move word-sized chunks.
+
+ For ISA_HAS_LWL_LWR we rely on the lwl/lwr & swl/swr load. Otherwise
+ picking the minimum of alignment or BITS_PER_WORD gets us the
+ desired size for bits.  */
+
+  if (!ISA_HAS_LWL_LWR)
+bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest)));
   else
-bits = BITS_PER_WORD;
+{
+  if (MEM_ALIGN (src) == BITS_PER_WORD / 2
+ && MEM_ALIGN (dest) == BITS_PER_WORD / 2)
+   bits = BITS_PER_WORD / 2;
+  else
+   bits = BITS_PER_WORD;
+}
 
   mode = mode_for_size (bits, MODE_INT, 0);
   delta = bits / BITS_PER_UNIT;
@@ -7644,8 +7654,9 @@ mips_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT 
length,
 bool
 mips_expand_block_move (rtx dest, rtx src, rtx length)
 {
-  /* Disable entirely for R6 initially.  */
-  if (!ISA_HAS_LWL_LWR)
+  if (!ISA_HAS_LWL_LWR
+  && (MEM_ALIGN (src) < MIPS_MIN_MOVE_MEM_ALIGN
+ || MEM_ALIGN (dest) < MIPS_MIN_MOVE_MEM_ALIGN))
 return false;
 
   if (CONST_INT_P (length))
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index ec69ed5..4b1787d 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -2969,6 +2969,9 @@ while (0)
 #undef PTRDIFF_TYPE
 #define PTRDIFF_TYPE (POINTER_SIZE == 64 ? "long int" : "int")
 
+/* The minimum alignment of any expanded block move.  */
+#define MIPS_MIN_MOVE_MEM_ALIGN 16
+
 /* The maximum number of bytes that can be copied by one iteration of
a movmemsi loop; see mips_block_move_loop.  */
 #define MIPS_MAX_MOVE_BYTES_PER_LOOP_ITER \
diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c 
b/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c
new file mode 100644
index 000..5a254b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c
@@ -0,0 +1,16 @@
+/* { dg-options "-fno-common isa_rev>=6" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os" } { "" } } */
+/* { dg-final { scan-assembler-not "\tmemcpy" } } */
+
+/* Test that memcpy is inline for target hardware
+   without swl, swr.  */
+
+#include 
+
+char c[40] __attribute__ ((aligned(8)));
+
+void
+f1 ()
+{
+  memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32);
+}
diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c 
b/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c
new file mode 100644
index 000..e144e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c
@@ -0,0 +1,17 @@
+/* { dg-options "-fno-common isa_rev>=6" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os"} { "" } } */
+/* { dg-final { scan-assembler-not "\tmemcpy" } } */
+/* { dg-final { scan-assembler-times "\tsh\t" 16 } } */
+
+/* Test that inline memcpy is expanded for target hardware without
+   swl, swr when alignment is halfword and sufficent shs are produced.  */
+
+#include 
+
+char c[40] __attribute__ ((aligned(2)));
+
+void
+f1 ()
+{
+  memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32);
+}
diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c 
b/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c
new file mode 100644
index 000..96a0387
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c
@@ -0,0 +1,18 @@
+/* { dg-options "-fno-common isa_rev<=5" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os"} { "" } } */
+/* { dg-final { scan-assembler-not "\tmemcpy" } } */
+/* { dg-final { scan-assembler-times "swl" 8 } } */
+/* { dg-final { scan-assembler-times "swr" 8 } } */
+
+/* Test that inline memcpy for hardware with swl, swr handles subword
+   alignment and produces enough swl/swrs for mips32.  */
+
+#include 
+
+char c[40] __attribute__ ((aligned(2)));
+
+void
+f1 ()
+{
+  memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32);
+}
diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-4.c 
b/gcc/testsuite/gcc.target/mips/inline-memcpy-4.c
new file mode 100644
index 000..0e7a22e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-4.c
@@ -0,0 +1,18 @@
+/* { dg-options "-fno-common isa_rev<=5 -mabi=64" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os"} { "" } } */
+/* { dg-final { scan-assembler-not "\tmemcpy" } } */
+/* { dg-final { scan-assembler-times "sdl" 4 } } */
+/* { dg-final { scan-assembler-times "sdr" 4 } } *

RE: [PATCH] Target hook for disabling the delay slot filler.

2015-09-17 Thread Simon Dardis
The profitability of using an ordinary branch over a delay slot branch
depends on how the delay slot is filled. If a delay slot can be filled from
an instruction preceding the branch, or from instructions following it that must
be executed on both sides, then it is profitable to use a delay slot branch.

For cases when instructions are chosen from one side of the branch, 
the proposed optimization strategy is to not speculatively execute 
instructions when ordinary branches could be used. Performance-wise
this avoids executing instructions which the eager delay filler picked
wrongly.

Since most branches have a compact form disabling the eager delay filler
should be no worse than altering it not to fill delay slots in this case.

Thanks,
Simon

-Original Message-
From: Jeff Law [mailto:l...@redhat.com] 
Sent: 15 September 2015 16:02
To: Bernd Schmidt; Simon Dardis; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] Target hook for disabling the delay slot filler.

On 09/15/2015 08:27 AM, Bernd Schmidt wrote:
> On 09/15/2015 04:19 PM, Simon Dardis wrote:
>> This patch adds a target hook for disabling the eager delay slot 
>> filler which when disabled can give better code. No new regressions.
>> Ok to commit?
>
> Hmm. Whether a branch was filled by the simple or eager filler is an 
> implementation detail - is there some better way to describe which 
> kind of branch is profitable?
And more importantly, it's far better to be able to describe when it is not 
profitable to use eager filling rather than just disabling it completely.

Jeff


FW: [PATCH] Target hook for disabling the delay slot filler.

2015-09-18 Thread Simon Dardis
> Are you trying to say that you have the option as to what kind of 
> branch to use?  ie, "ordinary", presumably without a delay slot or one 
> with a delay slot?

> Is the "ordinary" actually just a nullified delay slot or some form of 
> likely/not likely static hint?

Specifically for MIPSR6: the ISA possesses traditional delay slot branches and
a normal branch (no delay slots, not annulling, no hints, subtle static hazard),
aka "compact branch" in MIPS terminology. They could be described as nullify
on taken delay slot branch but we saw little to no value in that.

Matthew Fortune provided a writeup with their handling in GCC: 

https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01892.html

> But what is the compact form at the micro-architectural level?  My
> mips-fu has diminished greatly, but my recollection is the bubble is
> always there.   Is that not the case?

The pipeline bubble will exist but the performance impact varies across
R6 cores. High-end OoO cores won't be impacted as much, but lower
end cores will. microMIPSR6 removes delay slot branches altogether which
pushes the simplest micro-architectures to optimize away the cost of a
pipeline bubble.

For non-microMIPSR6 this is why we have different branch policies implemented
in the MIPS backend to allow branch usage to be tuned. By default, if a delay
slot can be filled then we use a delay slot branch otherwise we use a compact
branch as the only thing in the DS would be a NOP anyway.

Compact branches do have a strange restriction in that they cannot be followed by a 
CTI. This is apparently to simplify branch predictors, but it may be lifted in
future ISA releases.

> If it is able to find insns from the commonly executed path that don't 
> have a long latency, then the fill is usually profitable (since the 
> pipeline bubble always exists).  However, pulling a long latency 
> instruction (say anything that might cache miss or an fdiv/fsqrt) off 
> the slow path and conditionally nullifying it can be *awful*.
> Everything else is in-between.

I agree. The variability in profit/loss is a concern and I see two ways to deal
with it:

A) modify the delay slot filler so that it chooses speculative instructions of 
less than some $cost and avoids instruction duplication when the eager filler
picks an instruction from a block with multiple predecessors. Making such
changes would be invasive and require more target specific hooks.

B) Use compact branches instead of speculative delay slot execution and forsake
variable performance for a consistent pipeline bubble by not using the
speculative delay filler altogether.

Between these two choices, B seems to be the better option due to its sheer simplicity.
Choosing neither gives speculative instruction execution when there could be a
small consistent penalty instead.

Thanks,
Simon
________
From: Jeff Law [l...@redhat.com]
Sent: 17 September 2015 17:55
To: Simon Dardis; Bernd Schmidt
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] Target hook for disabling the delay slot filler.

On 09/17/2015 03:52 AM, Simon Dardis wrote:
> The profitability of using an ordinary branch over a delay slot branch 
> depends on how the delay slot is filled. If a delay slot can be filled 
> from an instruction preceding the branch or instructions proceeding 
> that must be executed on both sides then it is profitable to use a delay slot 
> branch.
Agreed.  It's an over-simplification, but for the purposes of this discussion 
it's close enough.


>
> For cases when instructions are chosen from one side of the branch, 
> the proposed optimization strategy is to not speculatively execute 
> instructions when ordinary branches could be used. Performance-wise 
> this avoids executing instructions which the eager delay filler picked 
> wrongly.
Are you trying to say that you have the option as to what kind of branch to 
use?  ie, "ordinary", presumably without a delay slot or one with a delay slot?

Is the "ordinary" actually just a nullified delay slot or some form of 
likely/not likely static hint?



>
> Since most branches have a compact form disabling the eager delay 
> filler should be no worse than altering it not to fill delay slots in this 
> case.
But what is the compact form at the micro-architectural level?  My mips-fu has 
diminished greatly, but my recollection is the bubble is
always there.   Is that not the case?

fill_eager_delay_slots is most definitely speculative and its profitability is 
largely dependent on the cost of what insns it finds to fill those delay slots 
and whether they're from the common or uncommon path.

If it is able to find insns from the commonly executed path that don't have a 
long latency, then the fill is usually profitable (since the pipeline bubble 
always exists).  However, pulling a long late

[PATCH, Mips] Compact branch/delay slot optimization.

2015-09-25 Thread Simon Dardis
Hello,

The following patch adds three small optimizations related to compact branches 
for MIPSR6: 

When the result of a load is used by a delay slot branch immediately 
afterwards, undo the
delay slot branch scheduling to hide the pipeline bubble if safe and use a 
compact branch
instead.

Undo delay slot scheduling if an orphaned high-part relocation is in a delay 
slot and use a
compact branch instead.

Undo delay slot scheduling in the case where a forbidden slot hazard is 
immediately followed
by a delay slot branch. This would cause a nop to be inserted otherwise.

No regressions. OK to apply?

Thanks,
Simon

gcc/
* config/mips/mips.c: (mips_break_sequence): New function. 
  (mips_reorg_process_insns) Use it. Use compact branches in selected
  situations.

gcc/testsuite/
* gcc.target/mips/split-ds-sequence.c: Test for the above.

Index: config/mips/mips.c
===
--- config/mips/mips.c  (revision 227676)
+++ config/mips/mips.c  (working copy)
@@ -16973,6 +16973,23 @@
   }
 }
 
+/* Remove a SEQUENCE and replace it with the delay slot instruction
+   followed by the branch and return the instruction in the delay slot.
+   Return the first of the two new instructions.
+   Subroutine of mips_reorg_process_insns.  */
+
+static rtx_insn *
+mips_break_sequence (rtx_insn * insn)
+{
+  rtx_insn * before = PREV_INSN (insn);
+  rtx_insn * branch = SEQ_BEGIN (insn);
+  rtx_insn * ds = SEQ_END (insn);
+  remove_insn (insn);
+  add_insn_after (ds, before, NULL);
+  add_insn_after (branch, ds, NULL);
+  return ds;
+}
+
 /* Go through the instruction stream and insert nops where necessary.
Also delete any high-part relocations whose partnering low parts
are now all dead.  See if the whole function can then be put into
@@ -17065,6 +17082,66 @@
{
  if (GET_CODE (PATTERN (insn)) == SEQUENCE)
{
+ rtx_insn * next_active = next_active_insn (insn);
+ /* Undo delay slots to avoid bubbles if the next instruction can
+be placed in a forbidden slot or the cost of adding an
+explicit NOP in a forbidden slot is OK.  */
+ if (TARGET_CB_MAYBE
+ && INSN_P (SEQ_BEGIN (insn))
+ && INSN_P (SEQ_END (insn))
+ && ((next_active
+  && INSN_P (next_active)
+  && GET_CODE (PATTERN (next_active)) != SEQUENCE
+  && get_attr_can_delay (next_active) == CAN_DELAY_YES)
+ || !optimize_size))
+   {
+ /* To hide a potential pipeline bubble, if we scan backwards
+from the current SEQUENCE and find that there is a load
+of a value that is used in the CTI and there are no
+dependencies between the CTI and instruction in the delay
+slot, break the sequence so the load delay is hidden.  */
+ HARD_REG_SET uses;
+ CLEAR_HARD_REG_SET (uses);
+ note_uses (&PATTERN (SEQ_BEGIN (insn)), record_hard_reg_uses,
+&uses);
+ HARD_REG_SET delay_sets;
+ CLEAR_HARD_REG_SET (delay_sets);
+ note_stores (PATTERN (SEQ_END (insn)), record_hard_reg_sets,
+  &delay_sets);
+
+ rtx prev = prev_active_insn (insn);
+ if (prev
+ && GET_CODE (PATTERN (prev)) == SET
+ && MEM_P (SET_SRC (PATTERN (prev
+   {
+ HARD_REG_SET sets;
+ CLEAR_HARD_REG_SET (sets);
+ note_stores (PATTERN (prev), record_hard_reg_sets,
+  &sets);
+
+ /* Re-order if safe.  */
+ if (!hard_reg_set_intersect_p (delay_sets, uses)
+ && hard_reg_set_intersect_p (uses, sets))
+   {
+ next_insn = mips_break_sequence (insn);
+ /* Need to process the hazards of the newly
+introduced instructions.  */
+ continue;
+   }
+   }
+
+ /* If we find an orphaned high-part relocation in a delay
+slot then we can convert to a compact branch and get
+the orphaned high part deleted.  */
+ if (mips_orphaned_high_part_p (&htab, SEQ_END (insn)))
+   {
+ next_insn = mips_break_sequence (insn);
+ /* Need to process the hazards of the newly
+introduced instructions.  */
+ continue;
+   }
+   }
+
  /* If we find an orphaned high-part relocation in a

[PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs.

2015-10-01 Thread Simon Dardis
Hello,

This patch migrates the MIPS backend to the new vector reduction optabs. 


No new regressions, ok to apply?

Thanks,
Simon

gcc/ChangeLog:

* config/mips/loongson.md   (vec_loongson_extract_lo_): New, 
extract low part to scalar.
(reduc_uplus_): Remove.
(reduc_plus_scal_): Rename from reduc_splus_, Use 
vec_loongson_extract_lo_.
(reduc_smax_scal_, reduc_smin_scal_): Rename from 
reduc_smax_, 
reduc_smin_, fix constraints, use vec_loongson_extract_lo_.
(reduc_umax_scal_, reduc_umin_scal_): Rename, change 
constraints.

Index: config/mips/loongson.md
===
--- config/mips/loongson.md (revision 228282)
+++ config/mips/loongson.md (working copy)
@@ -852,58 +852,66 @@
   "dsrl\t%0,%1,%2"
   [(set_attr "type" "fcvt")])
 
-(define_expand "reduc_uplus_"
-  [(match_operand:VWH 0 "register_operand" "")
-   (match_operand:VWH 1 "register_operand" "")]
+(define_insn "vec_loongson_extract_lo_"
+  [(set (match_operand: 0 "register_operand" "=r")
+(vec_select:
+  (match_operand:VWHB 1 "register_operand" "f")
+  (parallel [(const_int 0)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
-  mips_expand_vec_reduc (operands[0], operands[1], gen_add3);
-  DONE;
-})
+  "mfc1\t%0,%1"
+  [(set_attr "type" "mfc")])
 
-; ??? Given that we're not describing a widening reduction, we should
-; not have separate optabs for signed and unsigned.
-(define_expand "reduc_splus_"
-  [(match_operand:VWHB 0 "register_operand" "")
+(define_expand "reduc_plus_scal_"
+  [(match_operand: 0 "register_operand" "")
(match_operand:VWHB 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  emit_insn (gen_reduc_uplus_(operands[0], operands[1]));
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_add3);
+  emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })
 
-(define_expand "reduc_smax_"
-  [(match_operand:VWHB 0 "register_operand" "")
-   (match_operand:VWHB 1 "register_operand" "")]
+(define_expand "reduc_smax_scal_"
+  [(match_operand:HI 0 "register_operand" "")
+   (match_operand:VH 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  mips_expand_vec_reduc (operands[0], operands[1], gen_smax3);
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_smax3);
+  emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })
 
-(define_expand "reduc_smin_"
-  [(match_operand:VWHB 0 "register_operand" "")
-   (match_operand:VWHB 1 "register_operand" "")]
+(define_expand "reduc_smin_scal_"
+  [(match_operand:HI 0 "register_operand" "")
+   (match_operand:VH 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  mips_expand_vec_reduc (operands[0], operands[1], gen_smin3);
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_smin3);
+  emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })
 
-(define_expand "reduc_umax_"
-  [(match_operand:VB 0 "register_operand" "")
+(define_expand "reduc_umax_scal_"
+  [(match_operand:QI 0 "register_operand" "")
(match_operand:VB 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  mips_expand_vec_reduc (operands[0], operands[1], gen_umax3);
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_umax3);
+  emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })
 
-(define_expand "reduc_umin_"
-  [(match_operand:VB 0 "register_operand" "")
+(define_expand "reduc_umin_scal_"
+  [(match_operand:QI 0 "register_operand" "")
(match_operand:VB 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  mips_expand_vec_reduc (operands[0], operands[1], gen_umin3);
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_umin3);
+  emit_insn ( gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })




RE: [PATCH, Mips] Compact branch/delay slot optimization.

2015-10-06 Thread Simon Dardis
structions.  */
+ continue;
+   }
+   }
+
  /* If we find an orphaned high-part relocation in a delay
 slot, it's easier to turn that instruction into a NOP than
 to delete it.  The delay slot will be a NOP either way.  */
@@ -17099,6 +17189,33 @@
{
  mips_avoid_hazard (last_insn, insn, &hilo_delay,
 &delayed_reg, lo_reg, &fs_delay);
+ /* When a compact branch introduces a forbidden slot hazard
+and the next useful instruction is a SEQUENCE of a jump
+and a non-nop instruction in the delay slot, remove the
+sequence and replace it with the delay slot instruction
+then the jump to clear the forbidden slot hazard.  */
+
+ if (fs_delay)
+   {
+ /* Search onwards from the current position looking for
+a SEQUENCE.  We are looking for pipeline hazards here
+and do not need to worry about labels or barriers as
+the optimization only undoes delay slot filling which
+only affects the order of the branch and its delay
+slot.  */
+ rtx_insn * next = next_active_insn (insn);
+ if (next
+ && USEFUL_INSN_P (next)
+ && GET_CODE (PATTERN (next)) == SEQUENCE
+ && mips_breakable_sequence_p (next))
+   {
+ last_insn = insn;
+ next_insn = mips_break_sequence (next);
+ /* Need to process the hazards of the newly
+introduced instructions.  */
+ continue;
+   }
+   }
  last_insn = insn;
}
}
Index: testsuite/gcc.target/mips/split-ds-sequence.c
===
--- testsuite/gcc.target/mips/split-ds-sequence.c   (revision 0)
+++ testsuite/gcc.target/mips/split-ds-sequence.c   (working copy)
@@ -0,0 +1,19 @@
+/* { dg-options "isa_rev>=6" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-mcompact-branches=never" } { 
"" } } */
+/* { dg-final { scan-assembler-not "nop" } } */
+
+int
+testg2 (int a, int c)
+{
+
+  int j = 0;
+  do
+{
+  j += a;
+}
+  while (j < 56);
+
+  j += c;
+  return j;
+
+}

-Original Message-
From: Simon Dardis 
Sent: 25 September 2015 15:56
To: Moore, Catherine
Cc: gcc-patches@gcc.gnu.org
Subject: [PATCH, Mips] Compact branch/delay slot optimization.

Hello,

The following patch adds three small optimizations related to compact branches 
for MIPSR6: 

When the result of a load is used by a delay slot branch immediately 
afterwards, undo the delay slot branch scheduling to hide the pipeline bubble 
if safe and use a compact branch instead.

Undo delay slot scheduling if an orphaned high-part relocation is in a delay 
slot and use a compact branch instead.

Undo delay slot scheduling in the case where a forbidden slot hazard is 
immediately followed by a delay slot branch. This would cause a nop to be 
inserted otherwise.

No regressions. OK to apply?

Thanks,
Simon

gcc/
* config/mips/mips.c: (mips_break_sequence): New function. 
  (mips_reorg_process_insns) Use it. Use compact branches in selected
  situations.

gcc/testsuite/
* gcc.target/mips/split-ds-sequence.c: Test for the above.



RE: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs.

2015-10-07 Thread Simon Dardis
On the change from smin/smax it was a deliberate change as I managed to confuse 
myself about the mode patterns, correct version follows. Reverted back to VWHB for 
smax/smin. Stylistic point addressed.

No new regressions, ok for commit?

Thanks,
Simon

Index: config/mips/loongson.md
===
--- config/mips/loongson.md (revision 228282)
+++ config/mips/loongson.md (working copy)
@@ -852,58 +852,66 @@
   "dsrl\t%0,%1,%2"
   [(set_attr "type" "fcvt")])
 
-(define_expand "reduc_uplus_"
-  [(match_operand:VWH 0 "register_operand" "")
-   (match_operand:VWH 1 "register_operand" "")]
+(define_insn "vec_loongson_extract_lo_"
+  [(set (match_operand: 0 "register_operand" "=r")
+(vec_select:
+  (match_operand:VWHB 1 "register_operand" "f")
+  (parallel [(const_int 0)])))]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
-{
-  mips_expand_vec_reduc (operands[0], operands[1], gen_add3);
-  DONE;
-})
+  "mfc1\t%0,%1"
+  [(set_attr "type" "mfc")])
 
-; ??? Given that we're not describing a widening reduction, we should
-; not have separate optabs for signed and unsigned.
-(define_expand "reduc_splus_"
-  [(match_operand:VWHB 0 "register_operand" "")
+(define_expand "reduc_plus_scal_"
+  [(match_operand: 0 "register_operand" "")
(match_operand:VWHB 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  emit_insn (gen_reduc_uplus_(operands[0], operands[1]));
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_add3);
+  emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })
 
-(define_expand "reduc_smax_"
-  [(match_operand:VWHB 0 "register_operand" "")
+(define_expand "reduc_smax_scal_"
+  [(match_operand: 0 "register_operand" "")
(match_operand:VWHB 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  mips_expand_vec_reduc (operands[0], operands[1], gen_smax3);
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_smax3);
+  emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })
 
-(define_expand "reduc_smin_"
-  [(match_operand:VWHB 0 "register_operand" "")
+(define_expand "reduc_smin_scal_"
+  [(match_operand: 0 "register_operand" "")
(match_operand:VWHB 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  mips_expand_vec_reduc (operands[0], operands[1], gen_smin3);
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_smin3);
+  emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })
 
-(define_expand "reduc_umax_"
-  [(match_operand:VB 0 "register_operand" "")
+(define_expand "reduc_umax_scal_"
+  [(match_operand: 0 "register_operand" "")
(match_operand:VB 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  mips_expand_vec_reduc (operands[0], operands[1], gen_umax3);
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_umax3);
+  emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })
 
-(define_expand "reduc_umin_"
-  [(match_operand:VB 0 "register_operand" "")
+(define_expand "reduc_umin_scal_"
+  [(match_operand: 0 "register_operand" "")
(match_operand:VB 1 "register_operand" "")]
   "TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
 {
-  mips_expand_vec_reduc (operands[0], operands[1], gen_umin3);
+  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
+  mips_expand_vec_reduc (tmp, operands[1], gen_umin3);
+  emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp));
   DONE;
 })


-Original Message-
From: Alan Lawrence [mailto:alan.lawre...@arm.com] 
Sent: 06 October 2015 11:12
To: Simon Dardis; Matthew Fortune; Moore, Catherine
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs.

Thanks for working on this, Simon!

On 01/10/15 15:43, Simon Dardis wrote:
> -(define_expand "reduc_smax_"
> -  [(match_operand:VWHB 0 "register_operand" "")
> -   (match_operand:VWHB 1 "register_operand" "")]
> +(define_expand "reduc_smax_scal_"
> +  

RE: FW: [PATCH] Target hook for disabling the delay slot filler.

2015-10-23 Thread Simon Dardis


> -Original Message-
> From: Jeff Law [mailto:l...@redhat.com]
> Sent: 08 October 2015 20:44
> To: Simon Dardis; Bernd Schmidt
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: FW: [PATCH] Target hook for disabling the delay slot filler.
> 
> On 09/18/2015 05:10 AM, Simon Dardis wrote:
> >> Are you trying to say that you have the option as to what kind of
> >> branch to use?  ie, "ordinary", presumably without a delay slot or
> >> one with a delay slot?
> >
> >> Is the "ordinary" actually just a nullified delay slot or some form
> >> of likely/not likely static hint?
> >
> > Specifically for MIPSR6: the ISA possesses traditional delay slot
> > branches and a normal branch (no delay slots, not annulling, no hints,
> > subtle static hazard), aka "compact branch" in MIPS terminology. They
> > could be described as nullify on taken delay slot branch but we saw little
> > to no value in that.
> >
> > Matthew Fortune provided a writeup with their handling in GCC:
> >
> > https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01892.html
> Thanks. I never looked at that message, almost certainly because it was MIPS
> specific.  I'm trying hard to stay out of backends that have good active
> maintainers, and MIPS certainly qualifies on that point.
> 
> 
> >
> >> But what is the compact form at the micro-architectural level?  My
> >> mips-fu has diminished greatly, but my recollection is the bubble is
> >> always there.   Is that not the case?
> >
> > The pipeline bubble will exist but the performance impact varies
> > across
> > R6 cores. High-end OoO cores won't be impacted as much, but lower end
> > cores will. microMIPSR6 removes delay slot branches altogether which
> > pushes the simplest micro-architectures to optimize away the cost of a
> > pipeline bubble.
> [ ... snip more micro-architecture stuff ... ] Thanks.  That helps a lot.  I
> didn't realize the bubble was being squashed to varying degrees.  And FWIW, I
> wouldn't be surprised if you reach a point on the OoO cores where you'll just
> want to move away from delay slots totally and rely on your compact
> branches as much as possible.  It may give your hardware guys a degree of
> freedom that helps them in the common case (compact branches) at the
> expense of slowing down code with old fashioned delay slots.
> 
> > Compact branches do have a strange restriction in that they cannot be
> > followed by a CTI. This is to simplify branch predictors apparently
> > but this may be lifted in future ISA releases.
> Come on! :-)  There's some really neat things you can do when you allow
> branches in delay slots.  The PA was particularly fun in that regard.
> My recollection is HP had some hand written assembly code in their libraries
> which exploited the out-of-line execution you could get in this case.  We
> never tried to exploit in GCC simply because the opportunities didn't seem all
> that common or profitable.
> 
> 
> 
> >
> >> If it is able to find insns from the commonly executed path that
> >> don't have a long latency, then the fill is usually profitable (since
> >> the pipeline bubble always exists).  However, pulling a long latency
> >> instruction (say anything that might cache miss or an fdiv/fsqrt) off
> >> the slow path and conditionally nullifying it can be *awful*.
> >> Everything else is in-between.
> >
> > I agree. The variability in profit/loss is a concern and I see two
> > ways to deal with it:
> >
> > A) modify the delay slot filler so that it chooses speculative
> > instructions of less than some $cost and avoids instruction duplication
> > when the eager filler picks an instruction from a block with multiple
> > predecessors. Making such changes would be invasive and require more
> > target specific hooks.
> The cost side here should be handled by existing mechanisms.  You just
> never allow anything other than simple arith, logicals & copies.
> 
> You'd need a hook to avoid this when copying was needed.
> 
> You'd probably also need some kind of target hook to indicate the level of
> prediction where this is profitable since the cost varies across your micro-
> architectures.
> 
> And you'd also have to worry about the special code which triggers when
> there's a well predicted branch, but a resource conflict.  In that case reorg
> can fill the slot from the predicted path and insert compensation code on the
> non-predicted path.
> 
> 
> 
> >
> > B) Use compact branches instead 

RE: FW: [PATCH] Target hook for disabling the delay slot filler.

2015-10-26 Thread Simon Dardis
> On 10/23/2015 11:31 AM, Bernd Schmidt wrote:
> > On 10/23/2015 04:57 PM, Simon Dardis wrote:
> >
> >> Patch below. Target hook renamed to
> >> TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P.
> >>
> >> Tested on mips-img-elf, no new regressions.
> >
> > As far as I'm concerned this is ok, and IIUC Jeff was on board too.
> > This is assuming the test included a bootstrap, otherwise please do
> > that. You should also include a ChangeLog in future submissions.
> Just to be explicit, I'm on board.
> 
> Jeff

I've done bootstrap and regression. No new failures.

gcc/
* target.def (TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P): New hook.
* doc/tm.texi.in (TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P): Document.
* doc/tm.texi: Regenerated.
* reorg.c (dbr_schedule): Use new hook.
* config/mips/mips.c (mips_no_speculation_in_delay_slots_p): New.

testsuite/
* gcc.target/mips/ds-schedule-1.c: New.
* gcc.target/mips/ds-schedule-2.c: New.

Committed as r229383.

Thanks,
Simon


RE: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs.

2015-11-06 Thread Simon Dardis
Committed r229844.

Thanks,
Simon

> -Original Message-
> From: Moore, Catherine [mailto:catherine_mo...@mentor.com]
> Sent: 03 November 2015 14:09
> To: Simon Dardis; Alan Lawrence; Matthew Fortune
> Cc: gcc-patches@gcc.gnu.org
> Subject: RE: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs.
> 
> 
> 
> > -Original Message-
> > From: Simon Dardis [mailto:simon.dar...@imgtec.com]
> > Sent: Wednesday, October 07, 2015 6:51 AM
> > To: Alan Lawrence; Matthew Fortune; Moore, Catherine
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: RE: [PATCH, MIPS, PR/61114] Migrate to reduc_..._scal optabs.
> >
> > On the change from smin/smax it was a deliberate change as I managed
> > to confuse myself of the mode patterns, correct version follows.
> > Reverted back to VWHB for smax/smin. Stylistic point addressed.
> >
> > No new regression, ok for commit?
> >
> 
> Yes, OK to commit.  Sorry for the delay in review.
> Catherine
> 
> >
> > Index: config/mips/loongson.md
> >
> ==
> > =
> > --- config/mips/loongson.md (revision 228282)
> > +++ config/mips/loongson.md (working copy)
> > @@ -852,58 +852,66 @@
> >"dsrl\t%0,%1,%2"
> >[(set_attr "type" "fcvt")])
> >
> > -(define_expand "reduc_uplus_"
> > -  [(match_operand:VWH 0 "register_operand" "")
> > -   (match_operand:VWH 1 "register_operand" "")]
> > +(define_insn "vec_loongson_extract_lo_"
> > +  [(set (match_operand: 0 "register_operand" "=r")
> > +(vec_select:
> > +  (match_operand:VWHB 1 "register_operand" "f")
> > +  (parallel [(const_int 0)])))]
> >"TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
> > -{
> > -  mips_expand_vec_reduc (operands[0], operands[1],
> gen_add3);
> > -  DONE;
> > -})
> > +  "mfc1\t%0,%1"
> > +  [(set_attr "type" "mfc")])
> >
> > -; ??? Given that we're not describing a widening reduction, we should
> > -; not have separate optabs for signed and unsigned.
> > -(define_expand "reduc_splus_"
> > -  [(match_operand:VWHB 0 "register_operand" "")
> > +(define_expand "reduc_plus_scal_"
> > +  [(match_operand: 0 "register_operand" "")
> > (match_operand:VWHB 1 "register_operand" "")]
> >"TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
> >  {
> > -  emit_insn (gen_reduc_uplus_(operands[0], operands[1]));
> > +  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
> > + mips_expand_vec_reduc (tmp, operands[1], gen_add3);
> emit_insn
> > + (gen_vec_loongson_extract_lo_ (operands[0], tmp));
> >DONE;
> >  })
> >
> > -(define_expand "reduc_smax_"
> > -  [(match_operand:VWHB 0 "register_operand" "")
> > +(define_expand "reduc_smax_scal_"
> > +  [(match_operand: 0 "register_operand" "")
> > (match_operand:VWHB 1 "register_operand" "")]
> >"TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
> >  {
> > -  mips_expand_vec_reduc (operands[0], operands[1],
> gen_smax3);
> > +  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
> > + mips_expand_vec_reduc (tmp, operands[1], gen_smax3);
> > + emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp));
> >DONE;
> >  })
> >
> > -(define_expand "reduc_smin_"
> > -  [(match_operand:VWHB 0 "register_operand" "")
> > +(define_expand "reduc_smin_scal_"
> > +  [(match_operand: 0 "register_operand" "")
> > (match_operand:VWHB 1 "register_operand" "")]
> >"TARGET_HARD_FLOAT && TARGET_LOONGSON_VECTORS"
> >  {
> > -  mips_expand_vec_reduc (operands[0], operands[1],
> gen_smin3);
> > +  rtx tmp = gen_reg_rtx (GET_MODE (operands[1]));
> > + mips_expand_vec_reduc (tmp, operands[1], gen_smin3);
> > + emit_insn (gen_vec_loongson_extract_lo_ (operands[0], tmp));
> >DONE;
> >  })
> >
> > -(define_expand "reduc_umax_"
> > -  [(match_operand:VB 0 "register_operand" "")
> > +(define_expand "reduc_umax_scal_"
> > +  [(match_operand: 0 "register_operand" "")
> > (match_operand:VB 1 "registe

RE: [PATCH, Mips] Compact branch/delay slot optimization.

2015-11-11 Thread Simon Dardis
Committed as r230160.

Thanks,
Simon

> -Original Message-
> From: Moore, Catherine [mailto:catherine_mo...@mentor.com]
> Sent: 28 October 2015 14:00
> To: Simon Dardis; Matthew Fortune
> Cc: gcc-patches@gcc.gnu.org
> Subject: RE: [PATCH, Mips] Compact branch/delay slot optimization.
> 
> 
> 
> > -Original Message-
> > From: Simon Dardis [mailto:simon.dar...@imgtec.com]
> > Sent: Tuesday, October 06, 2015 10:00 AM
> > To: Moore, Catherine; Matthew Fortune
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: RE: [PATCH, Mips] Compact branch/delay slot optimization.
> >
> > Hello,
> >
> > I'd like to resubmit the previous patch as it failed to check if the
> > branch inside the sequence had a compact form.
> >
> > Thanks,
> > Simon
> >
> > gcc/
> > * config/mips/mips.c: (mips_breakable_sequence_p): New function.
> >   (mips_break_sequence): New function.
> >   (mips_reorg_process_insns) Use them. Use compact branches in
> > selected
> >   situations.
> >
> > gcc/testsuite/
> > * gcc.target/mips/split-ds-sequence.c: Test for the above.
> 
> Hi Simon,
> This patch looks okay with the exception of one stylistic change.
> Please change all instances of :
> +mips_breakable_sequence_p (rtx_insn * insn)
> To:
> +mips_breakable_sequence_p (rtx_insn *insn)
> Okay, with those changes.
> Thanks,
> Catherine
> 
> 
> >
> > Index: config/mips/mips.c
> >
> ==
> > =
> > --- config/mips/mips.c  (revision 228282)
> > +++ config/mips/mips.c  (working copy)
> > @@ -16973,6 +16973,34 @@
> >}
> >  }
> >
> > +/* A SEQUENCE is breakable iff the branch inside it has a compact form
> > +   and the target has compact branches.  */
> > +
> > +static bool
> > +mips_breakable_sequence_p (rtx_insn * insn) {
> > +  return (insn && GET_CODE (PATTERN (insn)) == SEQUENCE
> > + && TARGET_CB_MAYBE
> > + && get_attr_compact_form (SEQ_BEGIN (insn)) !=
> > COMPACT_FORM_NEVER);
> > +}
> > +
> > +/* Remove a SEQUENCE and replace it with the delay slot instruction
> > +   followed by the branch and return the instruction in the delay slot.
> > +   Return the first of the two new instructions.
> > +   Subroutine of mips_reorg_process_insns.  */
> > +
> > +static rtx_insn *
> > +mips_break_sequence (rtx_insn * insn) {
> > +  rtx_insn * before = PREV_INSN (insn);
> > +  rtx_insn * branch = SEQ_BEGIN (insn);
> > +  rtx_insn * ds = SEQ_END (insn);
> > +  remove_insn (insn);
> > +  add_insn_after (ds, before, NULL);
> > +  add_insn_after (branch, ds, NULL);
> > +  return ds;
> > +}
> > +
> >  /* Go through the instruction stream and insert nops where necessary.
> > Also delete any high-part relocations whose partnering low parts
> > are now all dead.  See if the whole function can then be put into
> > @@ -17065,6 +17093,68 @@
> > {
> >   if (GET_CODE (PATTERN (insn)) == SEQUENCE)
> > {
> > + rtx_insn * next_active = next_active_insn (insn);
> > + /* Undo delay slots to avoid bubbles if the next instruction can
> > +be placed in a forbidden slot or the cost of adding an
> > +explicit NOP in a forbidden slot is OK and if the SEQUENCE is
> > +safely breakable.  */
> > + if (TARGET_CB_MAYBE
> > + && mips_breakable_sequence_p (insn)
> > + && INSN_P (SEQ_BEGIN (insn))
> > + && INSN_P (SEQ_END (insn))
> > + && ((next_active
> > +  && INSN_P (next_active)
> > +  && GET_CODE (PATTERN (next_active)) != SEQUENCE
> > +  && get_attr_can_delay (next_active) ==
> > CAN_DELAY_YES)
> > + || !optimize_size))
> > +   {
> > + /* To hide a potential pipeline bubble, if we scan backwards
> > +from the current SEQUENCE and find that there is a load
> > +of a value that is used in the CTI and there are no
> > +dependencies between the CTI and instruction in the
> > delay
> > +slot, break the sequence so the load delay is hidden.  */
> > + HARD_REG_SET uses;
> > + CLEAR_HARD_REG_SET (uses);
> > + note_uses (&PATTERN (SEQ_BEGIN (insn)),
&

[PATCH] Mips: Inline memcpy for R6

2015-07-09 Thread Simon Dardis
Hello,

This patch enables inline memcpy for R6 which was previously 
disabled and adds support for expansion when source and 
destination are at least half-word aligned.

gcc/

* config/mips/mips.c (mips_expand_block_move): Enable inline memcpy
expansion when !ISA_HAS_LWL_LWR.
(mips_block_move_straight): Update the size of elements copied to
account for alignment when !ISA_HAS_LWL_LWR.
* config/mips/mips.h (MIPS_MIN_MOVE_MEM_ALIGN): New macro.

gcc/testsuite/

* inline-memcpy-1.c: Test for inline expansion of memcpy.
* inline-memcpy-2.c: Ditto.
* inline-memcpy-3.c: Ditto.
* inline-memcpy-4.c: Ditto.
* inline-memcpy-5.c: Ditto.

Thanks,
Simon

diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 6f5421a..1f7c105 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -8187,12 +8187,22 @@ mips_block_move_straight (rtx dest, rtx src, 
HOST_WIDE_INT length)
  half-word alignment, it is usually better to move in half words.
  For instance, lh/lh/sh/sh is usually better than lwl/lwr/swl/swr
  and lw/lw/sw/sw is usually better than ldl/ldr/sdl/sdr.
- Otherwise move word-sized chunks.  */
-  if (MEM_ALIGN (src) == BITS_PER_WORD / 2
-  && MEM_ALIGN (dest) == BITS_PER_WORD / 2)
-bits = BITS_PER_WORD / 2;
+ Otherwise move word-sized chunks.
+
+ For ISA_HAS_LWL_LWR we rely on the lwl/lwr & swl/swr load. Otherwise
+ picking the minimum of alignment or BITS_PER_WORD gets us the
+ desired size for bits.  */
+
+  if (!ISA_HAS_LWL_LWR)
+bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest)));
   else
-bits = BITS_PER_WORD;
+{
+  if (MEM_ALIGN (src) == BITS_PER_WORD / 2
+ && MEM_ALIGN (dest) == BITS_PER_WORD / 2)
+   bits = BITS_PER_WORD / 2;
+  else
+   bits = BITS_PER_WORD;
+}
 
   mode = mode_for_size (bits, MODE_INT, 0);
   delta = bits / BITS_PER_UNIT;
@@ -8311,8 +8321,8 @@ bool
 mips_expand_block_move (rtx dest, rtx src, rtx length)
 {
   if (!ISA_HAS_LWL_LWR
-   && (MEM_ALIGN (src) < BITS_PER_WORD
-  || MEM_ALIGN (dest) < BITS_PER_WORD))
+  && (MEM_ALIGN (src) < MIPS_MIN_MOVE_MEM_ALIGN
+ || MEM_ALIGN (dest) < MIPS_MIN_MOVE_MEM_ALIGN))
 return false;
 
   if (CONST_INT_P (length))
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index a2380e5..6578ae5 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -3041,6 +3041,9 @@ while (0)
 #undef PTRDIFF_TYPE
 #define PTRDIFF_TYPE (POINTER_SIZE == 64 ? "long int" : "int")
 
+/* The minimum alignment of any expanded block move.  */
+#define MIPS_MIN_MOVE_MEM_ALIGN 16
+
 /* The maximum number of bytes that can be copied by one iteration of
a movmemsi loop; see mips_block_move_loop.  */
 #define MIPS_MAX_MOVE_BYTES_PER_LOOP_ITER \
diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c 
b/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c
new file mode 100644
index 000..5a254b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-1.c
@@ -0,0 +1,16 @@
+/* { dg-options "-fno-common isa_rev>=6" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os" } { "" } } */
+/* { dg-final { scan-assembler-not "\tmemcpy" } } */
+
+/* Test that memcpy is inline for target hardware
+   without swl, swr.  */
+
+#include 
+
+char c[40] __attribute__ ((aligned(8)));
+
+void
+f1 ()
+{
+  memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32);
+}
diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c 
b/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c
new file mode 100644
index 000..c06be15
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-2.c
@@ -0,0 +1,17 @@
+/* { dg-options "-fno-common isa_rev>=6" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-final { scan-assembler-not "\tmemcpy" } } */
+/* { dg-final { scan-assembler-times "\tsh\t" 16 } } */
+
+/* Test that inline memcpy is expanded for target hardware without
+   swl, swr when alignment is halfword and sufficent shs are produced.  */
+
+#include 
+
+char c[40] __attribute__ ((aligned(2)));
+
+void
+f1 ()
+{
+  memcpy (c, "1234567890QWERTYUIOPASDFGHJKLZXCVBNM", 32);
+}
diff --git a/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c 
b/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c
new file mode 100644
index 000..96a0387
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/inline-memcpy-3.c
@@ -0,0 +1,18 @@
+/* { dg-options "-fno-common isa_rev<=5" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" "-Os"} { "" } } */
+/* { dg-final { scan-assembler-not "\tmemcpy" } } */
+/* { dg-final { scan-assembler-times "swl" 8 } } */
+/* { dg-final { scan-assembler-times "swr" 8 } } */
+
+/* Test that inline memcpy for hardware with swl, swr handles subword
+   alignment and produces enough swl/swrs for mips32.  */
+
+#include 
+
+char c[40] __attribute__ ((aligned(2)));
+
+void
+f1 ()
+{
+  memcpy (c, "1234567

RE: [PATCH, MIPS, Ping] Inline memcpy for MipsR6

2015-08-20 Thread Simon Dardis
Checked in  as revision 227026.

Thanks,
Simon

-Original Message-
From: Moore, Catherine [mailto:catherine_mo...@mentor.com] 
Sent: 01 August 2015 20:18
To: Simon Dardis; gcc-patches@gcc.gnu.org
Cc: Moore, Catherine
Subject: RE: [PATCH, MIPS, Ping] Inline memcpy for MipsR6



> -Original Message-
> From: Simon Dardis [mailto:simon.dar...@imgtec.com]
> Sent: Wednesday, July 29, 2015 4:29 AM
> To: gcc-patches@gcc.gnu.org
> Cc: Moore, Catherine
> Subject: [PATCH, MIPS, Ping] Inline memcpy for MipsR6
> 
> > This patch enables inline memcpy for R6 which was previously 
> > disabled and
> adds support for expansion when source and destination are at least 
> half- word aligned.
> 
> https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00749.html
> 

Hi Simon,

Two things need to be fixed up with this patch before committing.

1.  The new test inline-memcpy-2.c should not be run with -Os (like the other 
new tests that you submitted).

2.  Your patch is against older source than what is currently in the 
repository, causing this hunk not to apply cleanly:

@@ -8311,8 +8321,8 @@ bool
 mips_expand_block_move (rtx dest, rtx src, rtx length)  {
   if (!ISA_HAS_LWL_LWR
-   && (MEM_ALIGN (src) < BITS_PER_WORD
-  || MEM_ALIGN (dest) < BITS_PER_WORD))
+  && (MEM_ALIGN (src) < MIPS_MIN_MOVE_MEM_ALIGN
+ || MEM_ALIGN (dest) < MIPS_MIN_MOVE_MEM_ALIGN))
 return false;

   if (CONST_INT_P (length))


The correct patch should look like this:

@@ -7780,8 +7790,9 @@
 bool
 mips_expand_block_move (rtx dest, rtx src, rtx length)  {
-  /* Disable entirely for R6 initially.  */
-  if (!ISA_HAS_LWL_LWR)
+  if (!ISA_HAS_LWL_LWR
+  && (MEM_ALIGN (src) < MIPS_MIN_MOVE_MEM_ALIGN
+ || MEM_ALIGN (dest) < MIPS_MIN_MOVE_MEM_ALIGN))
 return false;

   if (CONST_INT_P (length))

Okay with those changes.
Thanks,
Catherine


[PATCH] Target hook for disabling the delay slot filler.

2015-09-15 Thread Simon Dardis
Hello all,

This patch adds a target hook for disabling the eager delay slot filler, which, 
when disabled, can give better code. No new regressions. OK to commit?

Thanks,
Simon

gcc/
* target.def (use_eager_delay_filler_p): New hook for selectively
disabling eager delay slot filler.
* reorg.c (dbr_schedule): Use the new hook.
* config/mips/mips.c (mips_use_eager_delay_filler_p): New static
function.
(TARGET_USE_EAGER_DELAY_FILLER_P): Define.
* doc/tm.texi.in: Add placeholder for new hook.
* doc/tm.texi: Regenerate.

gcc/testsuite/

* gcc.target/mips/ds-schedule-1.c: New file.
* gcc.target/mips/ds-schedule-2.c: Likewise.

Index: gcc/config/mips/mips.c
===
--- gcc/config/mips/mips.c  (revision 227676)
+++ gcc/config/mips/mips.c  (working copy)
@@ -14425,6 +14425,14 @@
   return cached_can_issue_more;
 }
 
+/* Implement USE_EAGER_DELAY_FILLER.  */
+
+static bool
+mips_use_eager_delay_filler_p ()
+{
+  return TARGET_CB_NEVER;
+}
+
 /* Update round-robin counters for ALU1/2 and FALU1/2.  */
 
 static void
@@ -19982,6 +19990,9 @@
 #undef TARGET_IN_SMALL_DATA_P
 #define TARGET_IN_SMALL_DATA_P mips_in_small_data_p
 
+#undef TARGET_USE_EAGER_DELAY_FILLER_P
+#define TARGET_USE_EAGER_DELAY_FILLER_P mips_use_eager_delay_filler_p
+
 #undef TARGET_MACHINE_DEPENDENT_REORG
 #define TARGET_MACHINE_DEPENDENT_REORG mips_reorg
 
Index: gcc/doc/tm.texi
===
--- gcc/doc/tm.texi (revision 227676)
+++ gcc/doc/tm.texi (working copy)
@@ -10949,6 +10949,15 @@
 definition is null.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_USE_EAGER_DELAY_FILLER_P (void)
+This predicate controls the use of the eager delay slot filler.  Targets
+such as certain MIPS architectures possess both branches with and without
+delay slots.  As the eager delay slot filler can increase code size,
+disabling it is beneficial when ordinary branches are available.  Use of
+delay slot branches filled using the basic filler is often still desirable
+as the delay slot can hide a pipeline bubble.
+@end deftypefn
+
 @deftypefn {Target Hook} void TARGET_INIT_BUILTINS (void)
 Define this hook if you have any machine-specific built-in functions
 that need to be defined.  It should be a function that performs the
Index: gcc/doc/tm.texi.in
===
--- gcc/doc/tm.texi.in  (revision 227676)
+++ gcc/doc/tm.texi.in  (working copy)
@@ -7985,6 +7985,8 @@
 
 @hook TARGET_MACHINE_DEPENDENT_REORG
 
+@hook TARGET_USE_EAGER_DELAY_FILLER_P
+
 @hook TARGET_INIT_BUILTINS
 
 @hook TARGET_BUILTIN_DECL
Index: gcc/reorg.c
===
--- gcc/reorg.c (revision 227676)
+++ gcc/reorg.c (working copy)
@@ -3793,7 +3793,8 @@
 {
   fill_simple_delay_slots (1);
   fill_simple_delay_slots (0);
-  fill_eager_delay_slots ();
+  if (targetm.use_eager_delay_filler_p ())
+   fill_eager_delay_slots ();
   relax_delay_slots (first);
 }
 
Index: gcc/target.def
===
--- gcc/target.def  (revision 227676)
+++ gcc/target.def  (working copy)
@@ -3618,6 +3618,17 @@
 definition is null.",
  void, (void), NULL)
 
+/* Control of eager delay slot filling in delayed-branch scheduling.  */
+DEFHOOK
+(use_eager_delay_filler_p,
+ "This predicate controls the use of the eager delay slot filler.  Targets\n\
+such as certain MIPS architectures possess both branches with and without\n\
+delay slots.  As the eager delay slot filler can increase code size,\n\
+disabling it is beneficial when ordinary branches are available.  Use of\n\
+delay slot branches filled using the basic filler is often still desirable\n\
+as the delay slot can hide a pipeline bubble.", bool, (void),
+  hook_bool_void_true)
+
 /* Create the __builtin_va_list type.  */
 DEFHOOK
 (build_builtin_va_list,
Index: gcc/testsuite/gcc.target/mips/ds-schedule-1.c
===
--- gcc/testsuite/gcc.target/mips/ds-schedule-1.c   (revision 0)
+++ gcc/testsuite/gcc.target/mips/ds-schedule-1.c   (working copy)
@@ -0,0 +1,29 @@
+/* { dg-options "isa_rev>=6 -mcompact-branches=optimal -mno-abicalls -G4" } */
+/* { dg-final { scan-assembler-not "bne\t" } } */
+/* { dg-final { scan-assembler-not "beq\t" } } */
+/* { dg-final { scan-assembler-times "\\(foo\\)" 1 } } */
+
+/* Test that when compact branches are used, that a compact branch is
+   produced in the case where code expansion would have occurred if a
+   delay slot branch would have be used.  'foo' should only be
+   referenced once in the program text.  */
+
+struct list
+{
+  struct list *next;
+  int element;
+};
+
+struct list *gr;
+
+int foo;
+
+extern void t (int, int, int*);
+
+void
+f (struct list **