The Power ISA declares the "bcl 20,31,..." instruction as the preferred
idiom for obtaining the next instruction address (NIA), which we use for
computing the address of the GOT.  This special branch and link is *not*
a subroutine call, meaning it won't be paired with a blr (subroutine return).  
Processors therefore are not supposed to update their internal link stack
when executing one of these instructions, otherwise we'll mispredict the
following blrs.

The 476 processor has a bug where it doesn't ignore these "bcl 20,31,..."
instructions, so we end up getting lots of mispredicts for -fPIC code.
The following patch adds a -mpreserve-link-stack option, enabled
automatically for -mtune={476,476fp}, that changes the two types of GOT
access code GCC produces.  The new code replaces the "bcl 20,31,..." with
a "bl..., b..., blr" triplet.  I've included some old versus new code
snippets for both types of GOT access code to illustrate how the code
has changed.


1)      Normal Code:                            New 476 Code:
==============================================================================

        bcl 20,31,$+4                           bl $+8
.L3:                                    .L3:
                                                b $+8
                                                blr
        mflr 9                                  mflr 9
        addis 9,9,.LCTOC1-.L3@ha                addis 9,9,.LCTOC1-.L3@ha
        addi 9,9,.LCTOC1-.L3@l                  addi 9,9,.LCTOC1-.L3@l



2)      Normal Code:                            New 476 Code:
==============================================================================

        bcl 20,31,$+8                           bl $+12
        .long _GLOBAL_OFFSET_TABLE_-$           b $+12
                                                .long _GLOBAL_OFFSET_TABLE_-$
                                                blr
        mflr 9                                  mflr 9
                                                addi 9,9,4
        lwz 3,0(9)                              lwz 3,0(9)


I have bootstrapped and regtested the following patch with no regressions.
To test the code even more, I modified the patch so that we default to always
using -mpreserve-link-stack and that bootstrapped and regtested with no
regressions too.

Ok for mainline?

Peter


        * config/rs6000/rs6000.opt (mpreserve-link-stack): New option.
        * config/rs6000/rs6000.c (rs6000_option_override_internal): Enable
        TARGET_LINK_STACK for -mtune=476 and -mtune=476fp.
        (rs6000_legitimize_tls_address): Emit the link stack preserving GOT
        code if TARGET_LINK_STACK.
        (rs6000_emit_load_toc_table): Likewise.
        (output_function_profiler): Likewise.
        (macho_branch_islands): Likewise.
        (machopic_output_stub): Likewise.
        * config/rs6000/rs6000.md (load_toc_v4_PIC_1, load_toc_v4_PIC_1b):
        Convert to a define_expand.
        (load_toc_v4_PIC_1_normal): New define_insn.
        (load_toc_v4_PIC_1_476): Likewise.
        (load_toc_v4_PIC_1b_normal): Likewise.
        (load_toc_v4_PIC_1b_476): Likewise.


Index: gcc/config/rs6000/rs6000.opt
===================================================================
--- gcc/config/rs6000/rs6000.opt        (revision 176007)
+++ gcc/config/rs6000/rs6000.opt        (working copy)
@@ -528,3 +528,7 @@ Use/do not use r11 to hold the static li
 msave-toc-indirect
 Target Undocumented Var(TARGET_SAVE_TOC_INDIRECT) Save Init(1)
 ; Control whether we save the TOC in the prologue for indirect calls or 
generate the save inline
+
+mpreserve-link-stack
+Target Report Var(TARGET_LINK_STACK) Init(-1) Save
+Preserve the link stack on some cpus (eg, 476) by matching up a blr with the 
bcl/bl insns used for GOT accesses
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c  (revision 176007)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -3246,6 +3246,11 @@ rs6000_option_override_internal (bool gl
     target_option_default_node = target_option_current_node
       = build_target_option_node ();
 
+  /* If not explicitly specified via option, decide whether to generate the
+     extra blr's required to preserve the link stack on some cpus (eg, 476).  
*/
+  if (TARGET_LINK_STACK == -1)
+    TARGET_LINK_STACK = (rs6000_cpu == PROCESSOR_PPC476);
+
   return ret;
 }
 
@@ -5930,6 +5935,8 @@ rs6000_legitimize_tls_address (rtx addr,
                  lab = gen_label_rtx ();
                  emit_insn (gen_load_toc_v4_PIC_1b (gsym, lab));
                  emit_move_insn (tmp1, gen_rtx_REG (Pmode, LR_REGNO));
+                 if (TARGET_LINK_STACK)
+                   emit_insn (gen_addsi3 (tmp1, tmp1, GEN_INT (4)));
                  emit_move_insn (tmp2, mem);
                  last = emit_insn (gen_addsi3 (got, tmp1, tmp2));
                  set_unique_reg_note (last, REG_EQUAL, gsym);
@@ -18927,6 +18934,8 @@ rs6000_emit_load_toc_table (int fromprol
          lab = gen_label_rtx ();
          emit_insn (gen_load_toc_v4_PIC_1b (tocsym, lab));
          emit_move_insn (dest, gen_rtx_REG (Pmode, LR_REGNO));
+         if (TARGET_LINK_STACK)
+           emit_insn (gen_addsi3 (dest, dest, GEN_INT (4)));
          emit_move_insn (temp0, gen_rtx_MEM (Pmode, dest));
        }
       emit_insn (gen_addsi3 (dest, temp0, dest));
@@ -22529,7 +22538,11 @@ output_function_profiler (FILE *file, in
        }
       else if (TARGET_SECURE_PLT && flag_pic)
        {
-         asm_fprintf (file, "\tbcl 20,31,1f\n1:\n\t{st|stw} %s,4(%s)\n",
+         if (TARGET_LINK_STACK)
+           asm_fprintf (file, "\tbl 1f\n\tb 2f\n1:\n\tblr\n2:\n");
+         else
+           asm_fprintf (file, "\tbcl 20,31,1f\n1:\n");
+         asm_fprintf (file, "\t{st|stw} %s,4(%s)\n",
                       reg_names[0], reg_names[1]);
          asm_fprintf (file, "\tmflr %s\n", reg_names[12]);
          asm_fprintf (file, "\t{cau|addis} %s,%s,",
@@ -22554,10 +22567,22 @@ output_function_profiler (FILE *file, in
          asm_fprintf (file, "\t{st|stw} %s,4(%s)\n",
                       reg_names[0], reg_names[1]);
          /* Now, we need to get the address of the label.  */
-         fputs ("\tbcl 20,31,1f\n\t.long ", file);
-         assemble_name (file, buf);
-         fputs ("-.\n1:", file);
-         asm_fprintf (file, "\tmflr %s\n", reg_names[11]);
+         if (TARGET_LINK_STACK)
+           {
+             fputs ("\tbl 1f\n\tb 2f\n\t.long ", file);
+             assemble_name (file, buf);
+             fputs ("-.\n1:\n\tblr\n2:", file);
+             asm_fprintf (file, "\tmflr %s\n", reg_names[11]);
+             asm_fprintf (file, "\taddi %s,%s,4\n",
+                          reg_names[11], reg_names[11]);
+           }
+         else
+           {
+             fputs ("\tbcl 20,31,1f\n\t.long ", file);
+             assemble_name (file, buf);
+             fputs ("-.\n1:", file);
+             asm_fprintf (file, "\tmflr %s\n", reg_names[11]);
+           }
          asm_fprintf (file, "\t{l|lwz} %s,0(%s)\n",
                       reg_names[0], reg_names[11]);
          asm_fprintf (file, "\t{cax|add} %s,%s,%s\n",
@@ -25040,11 +25065,22 @@ macho_branch_islands (void)
 #endif /* DBX_DEBUGGING_INFO || XCOFF_DEBUGGING_INFO */
       if (flag_pic)
        {
-         strcat (tmp_buf, ":\n\tmflr r0\n\tbcl 20,31,");
-         strcat (tmp_buf, label);
-         strcat (tmp_buf, "_pic\n");
-         strcat (tmp_buf, label);
-         strcat (tmp_buf, "_pic:\n\tmflr r11\n");
+         if (TARGET_LINK_STACK)
+           {
+             strcat (tmp_buf, ":\n\tmflr r0\n\tbl $+8\n");
+             strcat (tmp_buf, label);
+             strcat (tmp_buf, "_pic:\n\tb $+8\n");
+             strcat (tmp_buf, "\tblr\n");
+             strcat (tmp_buf, "\tmflr r11\n");
+           }
+         else
+           {
+             strcat (tmp_buf, ":\n\tmflr r0\n\tbcl 20,31,");
+             strcat (tmp_buf, label);
+             strcat (tmp_buf, "_pic\n");
+             strcat (tmp_buf, label);
+             strcat (tmp_buf, "_pic:\n\tmflr r11\n");
+           }
 
          strcat (tmp_buf, "\taddis r11,r11,ha16(");
          strcat (tmp_buf, name_buf);
@@ -25190,8 +25226,16 @@ machopic_output_stub (FILE *file, const
       sprintf (local_label_0, "\"L%011d$spb\"", label);
 
       fprintf (file, "\tmflr r0\n");
-      fprintf (file, "\tbcl 20,31,%s\n", local_label_0);
-      fprintf (file, "%s:\n\tmflr r11\n", local_label_0);
+      if (TARGET_LINK_STACK)
+       {
+         fprintf (file, "\tbl $+8\n");
+         fprintf (file, "%s:\n\tb $+8\n\tblr\n\tmflr r11\n", local_label_0);
+       }
+      else
+       {
+         fprintf (file, "\tbcl 20,31,%s\n", local_label_0);
+         fprintf (file, "%s:\n\tmflr r11\n", local_label_0);
+       }
       fprintf (file, "\taddis r11,r11,ha16(%s-%s)\n",
               lazy_ptr_name, local_label_0);
       fprintf (file, "\tmtlr r0\n");
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md (revision 176007)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -12081,27 +12081,65 @@ (define_insn "load_toc_v4_pic_si"
   [(set_attr "type" "branch")
    (set_attr "length" "4")])
 
-(define_insn "load_toc_v4_PIC_1"
+(define_expand "load_toc_v4_PIC_1"
+  [(parallel [(set (reg:SI LR_REGNO)
+                  (match_operand:SI 0 "immediate_operand" "s"))
+             (use (unspec [(match_dup 0)] UNSPEC_TOC))])]
+  "TARGET_ELF && DEFAULT_ABI != ABI_AIX
+   && (flag_pic == 2 || (flag_pic && TARGET_SECURE_PLT))"
+  "")
+
+(define_insn "load_toc_v4_PIC_1_normal"
   [(set (reg:SI LR_REGNO)
        (match_operand:SI 0 "immediate_operand" "s"))
    (use (unspec [(match_dup 0)] UNSPEC_TOC))]
-  "TARGET_ELF && DEFAULT_ABI != ABI_AIX
+  "!TARGET_LINK_STACK && TARGET_ELF && DEFAULT_ABI != ABI_AIX
    && (flag_pic == 2 || (flag_pic && TARGET_SECURE_PLT))"
   "bcl 20,31,%0\\n%0:"
   [(set_attr "type" "branch")
    (set_attr "length" "4")])
 
-(define_insn "load_toc_v4_PIC_1b"
+(define_insn "load_toc_v4_PIC_1_476"
+  [(set (reg:SI LR_REGNO)
+       (match_operand:SI 0 "immediate_operand" "s"))
+   (use (unspec [(match_dup 0)] UNSPEC_TOC))]
+  "TARGET_LINK_STACK && TARGET_ELF && DEFAULT_ABI != ABI_AIX
+   && (flag_pic == 2 || (flag_pic && TARGET_SECURE_PLT))"
+  "bl $+8\n%0:\n\tb $+8\n\tblr"
+  [(set_attr "type" "branch")
+   (set_attr "length" "12")])
+
+(define_expand "load_toc_v4_PIC_1b"
+  [(parallel [(set (reg:SI LR_REGNO)
+                  (unspec:SI [(match_operand:SI 0 "immediate_operand" "s")
+                              (label_ref (match_operand 1 "" ""))]
+                          UNSPEC_TOCPTR))
+             (match_dup 1)])]
+  "TARGET_ELF && DEFAULT_ABI != ABI_AIX && flag_pic == 2"
+  "")
+
+(define_insn "load_toc_v4_PIC_1b_normal"
   [(set (reg:SI LR_REGNO)
        (unspec:SI [(match_operand:SI 0 "immediate_operand" "s")
                    (label_ref (match_operand 1 "" ""))]
                UNSPEC_TOCPTR))
    (match_dup 1)]
-  "TARGET_ELF && DEFAULT_ABI != ABI_AIX && flag_pic == 2"
+  "!TARGET_LINK_STACK && TARGET_ELF && DEFAULT_ABI != ABI_AIX && flag_pic == 2"
   "bcl 20,31,$+8\;.long %0-$"
   [(set_attr "type" "branch")
    (set_attr "length" "8")])
 
+(define_insn "load_toc_v4_PIC_1b_476"
+  [(set (reg:SI LR_REGNO)
+       (unspec:SI [(match_operand:SI 0 "immediate_operand" "s")
+                   (label_ref (match_operand 1 "" ""))]
+               UNSPEC_TOCPTR))
+   (match_dup 1)]
+  "TARGET_LINK_STACK && TARGET_ELF && DEFAULT_ABI != ABI_AIX && flag_pic == 2"
+  "bl $+12\n\tb $+12\n\t.long %0-$\n\tblr"
+  [(set_attr "type" "branch")
+   (set_attr "length" "16")])
+
 (define_insn "load_toc_v4_PIC_2"
   [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
        (mem:SI (plus:SI (match_operand:SI 1 "gpc_reg_operand" "b")


Reply via email to