The Power ISA declares the "bcl 20,31,..." instruction as the preferred idiom for obtaining the next instruction address (NIA), which we use for computing the address of the GOT. This special branch and link is *not* a subroutine call, meaning it won't be paired with a blr (subroutine return). Processors therefore are not supposed to update their internal link stack when executing one of this instructions, otherwise we'll mispredict the following blrs.
The 476 processor has an bug where it doesn't ignore these "bcl 20,31,..." instructions, so we end up getting lots of mispredicts for -fPIC code. The following patch adds a -mpreserve-link-stack option that is enabled automatically for -mtune={476,476fp}, that changes the two types of GOT access code GCC produces. The new code replaces the "bcl 20,31,..." with a "bl..., b..., blr" triplet. I've included some old versus new code snipits for both types of GOT access code to illustrate how the code has changed. 1) Normal Code: New 476 Code: ============================================================================== bcl 20,31,$+4 bl $+8 .L3: .L3: b $+8 blr mflr 9 mflr 9 addis 9,9,.LCTOC1-.L3@ha addis 9,9,.LCTOC1-.L3@ha addi 9,9,.LCTOC1-.L3@l addi 9,9,.LCTOC1-.L3@l 2) Normal Code: New 476 Code: ============================================================================== bcl 20,31,$+8 bl $+12 .long _GLOBAL_OFFSET_TABLE_-$ b $+12 .long _GLOBAL_OFFSET_TABLE_-$ blr mflr 9 mflr 9 addi 9,9,4 lwz 3,0(9) lwz 3,0(9) I have bootstrapped and regtested the following patch with no regressiosn. To test the code even more, I modified the patch so that we default to always using -mpreserve-link-stack and that bootstrapped and regtested with no regressions too. Ok for mainline? Peter * config/rs6000/rs6000.opt (mpreserve-link-stack): New option. * config/rs6000/rs6000.c (rs6000_option_override_internal): Enable TARGET_LINK_STACK for -mtune=476 and -mtune=476fp. (rs6000_legitimize_tls_address): Emit the link stack preserving GOT code if TARGET_LINK_STACK. (rs6000_emit_load_toc_table): Likewise. (output_function_profiler): Likewise (macho_branch_islands): Likewise (machopic_output_stub): Likewise * config/rs6000/rs6000.md (load_toc_v4_PIC_1, load_toc_v4_PIC_1b): Convert to a define_expand. (load_toc_v4_PIC_1_normal): New define_insn. (load_toc_v4_PIC_1_476): Likewise. (load_toc_v4_PIC_1b_normal): Likewise. (load_toc_v4_PIC_1b_476): Likewise. Index: gcc/config/rs6000/rs6000.opt =================================================================== --- gcc/config/rs6000/rs6000.opt (revision 176007) +++ gcc/config/rs6000/rs6000.opt (working copy) @@ -528,3 +528,7 @@ Use/do not use r11 to hold the static li msave-toc-indirect Target Undocumented Var(TARGET_SAVE_TOC_INDIRECT) Save Init(1) ; Control whether we save the TOC in the prologue for indirect calls or generate the save inline + +mpreserve-link-stack +Target Report Var(TARGET_LINK_STACK) Init(-1) Save +Preserve the link stack on some cpus (eg, 476) by matching up a blr with the bcl/bl insns used for GOT accesses Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 176007) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -3246,6 +3246,11 @@ rs6000_option_override_internal (bool gl target_option_default_node = target_option_current_node = build_target_option_node (); + /* If not explicitly specified via option, decide whether to generate the + extra blr's required to preserve the link stack on some cpus (eg, 476). */ + if (TARGET_LINK_STACK == -1) + TARGET_LINK_STACK = (rs6000_cpu == PROCESSOR_PPC476); + return ret; } @@ -5930,6 +5935,8 @@ rs6000_legitimize_tls_address (rtx addr, lab = gen_label_rtx (); emit_insn (gen_load_toc_v4_PIC_1b (gsym, lab)); emit_move_insn (tmp1, gen_rtx_REG (Pmode, LR_REGNO)); + if (TARGET_LINK_STACK) + emit_insn (gen_addsi3 (tmp1, tmp1, GEN_INT (4))); emit_move_insn (tmp2, mem); last = emit_insn (gen_addsi3 (got, tmp1, tmp2)); set_unique_reg_note (last, REG_EQUAL, gsym); @@ -18927,6 +18934,8 @@ rs6000_emit_load_toc_table (int fromprol lab = gen_label_rtx (); emit_insn (gen_load_toc_v4_PIC_1b (tocsym, lab)); emit_move_insn (dest, gen_rtx_REG (Pmode, LR_REGNO)); + if (TARGET_LINK_STACK) + emit_insn (gen_addsi3 (dest, dest, GEN_INT (4))); emit_move_insn (temp0, gen_rtx_MEM (Pmode, dest)); } emit_insn (gen_addsi3 (dest, temp0, dest)); @@ -22529,7 +22538,11 @@ output_function_profiler (FILE *file, in } else if (TARGET_SECURE_PLT && flag_pic) { - asm_fprintf (file, "\tbcl 20,31,1f\n1:\n\t{st|stw} %s,4(%s)\n", + if (TARGET_LINK_STACK) + asm_fprintf (file, "\tbl 1f\n\tb 2f\n1:\n\tblr\n2:\n"); + else + asm_fprintf (file, "\tbcl 20,31,1f\n1:\n"); + asm_fprintf (file, "\t{st|stw} %s,4(%s)\n", reg_names[0], reg_names[1]); asm_fprintf (file, "\tmflr %s\n", reg_names[12]); asm_fprintf (file, "\t{cau|addis} %s,%s,", @@ -22554,10 +22567,22 @@ output_function_profiler (FILE *file, in asm_fprintf (file, "\t{st|stw} %s,4(%s)\n", reg_names[0], reg_names[1]); /* Now, we need to get the address of the label. */ - fputs ("\tbcl 20,31,1f\n\t.long ", file); - assemble_name (file, buf); - fputs ("-.\n1:", file); - asm_fprintf (file, "\tmflr %s\n", reg_names[11]); + if (TARGET_LINK_STACK) + { + fputs ("\tbl 1f\n\tb 2f\n\t.long ", file); + assemble_name (file, buf); + fputs ("-.\n1:\n\tblr\n2:", file); + asm_fprintf (file, "\tmflr %s\n", reg_names[11]); + asm_fprintf (file, "\taddi %s,%s,4\n", + reg_names[11], reg_names[11]); + } + else + { + fputs ("\tbcl 20,31,1f\n\t.long ", file); + assemble_name (file, buf); + fputs ("-.\n1:", file); + asm_fprintf (file, "\tmflr %s\n", reg_names[11]); + } asm_fprintf (file, "\t{l|lwz} %s,0(%s)\n", reg_names[0], reg_names[11]); asm_fprintf (file, "\t{cax|add} %s,%s,%s\n", @@ -25040,11 +25065,22 @@ macho_branch_islands (void) #endif /* DBX_DEBUGGING_INFO || XCOFF_DEBUGGING_INFO */ if (flag_pic) { - strcat (tmp_buf, ":\n\tmflr r0\n\tbcl 20,31,"); - strcat (tmp_buf, label); - strcat (tmp_buf, "_pic\n"); - strcat (tmp_buf, label); - strcat (tmp_buf, "_pic:\n\tmflr r11\n"); + if (TARGET_LINK_STACK) + { + strcat (tmp_buf, ":\n\tmflr r0\n\tbl $+8\n"); + strcat (tmp_buf, label); + strcat (tmp_buf, "_pic:\n\tb $+8\n"); + strcat (tmp_buf, "\tblr\n"); + strcat (tmp_buf, "\tmflr r11\n"); + } + else + { + strcat (tmp_buf, ":\n\tmflr r0\n\tbcl 20,31,"); + strcat (tmp_buf, label); + strcat (tmp_buf, "_pic\n"); + strcat (tmp_buf, label); + strcat (tmp_buf, "_pic:\n\tmflr r11\n"); + } strcat (tmp_buf, "\taddis r11,r11,ha16("); strcat (tmp_buf, name_buf); @@ -25190,8 +25226,16 @@ machopic_output_stub (FILE *file, const sprintf (local_label_0, "\"L%011d$spb\"", label); fprintf (file, "\tmflr r0\n"); - fprintf (file, "\tbcl 20,31,%s\n", local_label_0); - fprintf (file, "%s:\n\tmflr r11\n", local_label_0); + if (TARGET_LINK_STACK) + { + fprintf (file, "\tbl $+8\n"); + fprintf (file, "%s:\n\tb $+8\n\tblr\n\tmflr r11\n", local_label_0); + } + else + { + fprintf (file, "\tbcl 20,31,%s\n", local_label_0); + fprintf (file, "%s:\n\tmflr r11\n", local_label_0); + } fprintf (file, "\taddis r11,r11,ha16(%s-%s)\n", lazy_ptr_name, local_label_0); fprintf (file, "\tmtlr r0\n"); Index: gcc/config/rs6000/rs6000.md =================================================================== --- gcc/config/rs6000/rs6000.md (revision 176007) +++ gcc/config/rs6000/rs6000.md (working copy) @@ -12081,27 +12081,65 @@ (define_insn "load_toc_v4_pic_si" [(set_attr "type" "branch") (set_attr "length" "4")]) -(define_insn "load_toc_v4_PIC_1" +(define_expand "load_toc_v4_PIC_1" + [(parallel [(set (reg:SI LR_REGNO) + (match_operand:SI 0 "immediate_operand" "s")) + (use (unspec [(match_dup 0)] UNSPEC_TOC))])] + "TARGET_ELF && DEFAULT_ABI != ABI_AIX + && (flag_pic == 2 || (flag_pic && TARGET_SECURE_PLT))" + "") + +(define_insn "load_toc_v4_PIC_1_normal" [(set (reg:SI LR_REGNO) (match_operand:SI 0 "immediate_operand" "s")) (use (unspec [(match_dup 0)] UNSPEC_TOC))] - "TARGET_ELF && DEFAULT_ABI != ABI_AIX + "!TARGET_LINK_STACK && TARGET_ELF && DEFAULT_ABI != ABI_AIX && (flag_pic == 2 || (flag_pic && TARGET_SECURE_PLT))" "bcl 20,31,%0\\n%0:" [(set_attr "type" "branch") (set_attr "length" "4")]) -(define_insn "load_toc_v4_PIC_1b" +(define_insn "load_toc_v4_PIC_1_476" + [(set (reg:SI LR_REGNO) + (match_operand:SI 0 "immediate_operand" "s")) + (use (unspec [(match_dup 0)] UNSPEC_TOC))] + "TARGET_LINK_STACK && TARGET_ELF && DEFAULT_ABI != ABI_AIX + && (flag_pic == 2 || (flag_pic && TARGET_SECURE_PLT))" + "bl $+8\n%0:\n\tb $+8\n\tblr" + [(set_attr "type" "branch") + (set_attr "length" "12")]) + +(define_expand "load_toc_v4_PIC_1b" + [(parallel [(set (reg:SI LR_REGNO) + (unspec:SI [(match_operand:SI 0 "immediate_operand" "s") + (label_ref (match_operand 1 "" ""))] + UNSPEC_TOCPTR)) + (match_dup 1)])] + "TARGET_ELF && DEFAULT_ABI != ABI_AIX && flag_pic == 2" + "") + +(define_insn "load_toc_v4_PIC_1b_normal" [(set (reg:SI LR_REGNO) (unspec:SI [(match_operand:SI 0 "immediate_operand" "s") (label_ref (match_operand 1 "" ""))] UNSPEC_TOCPTR)) (match_dup 1)] - "TARGET_ELF && DEFAULT_ABI != ABI_AIX && flag_pic == 2" + "!TARGET_LINK_STACK && TARGET_ELF && DEFAULT_ABI != ABI_AIX && flag_pic == 2" "bcl 20,31,$+8\;.long %0-$" [(set_attr "type" "branch") (set_attr "length" "8")]) +(define_insn "load_toc_v4_PIC_1b_476" + [(set (reg:SI LR_REGNO) + (unspec:SI [(match_operand:SI 0 "immediate_operand" "s") + (label_ref (match_operand 1 "" ""))] + UNSPEC_TOCPTR)) + (match_dup 1)] + "TARGET_LINK_STACK && TARGET_ELF && DEFAULT_ABI != ABI_AIX && flag_pic == 2" + "bl $+12\n\tb $+12\n\t.long %0-$\n\tblr" + [(set_attr "type" "branch") + (set_attr "length" "16")]) + (define_insn "load_toc_v4_PIC_2" [(set (match_operand:SI 0 "gpc_reg_operand" "=r") (mem:SI (plus:SI (match_operand:SI 1 "gpc_reg_operand" "b")