> While it could be possible to rework output_set_got such that we can
> individually annotate the instructions, it's simpler to just
> admit that all processors currently being manufactured do want
> deep branch prediction, at which point all of the complication
> simply goes away.
Note that most modern CPUs special-case a call to the next instruction, so
they will work well with !X86_TUNE_DEEP_BRANCH_PREDICTION code.
Honza
> ---
> gcc/config/i386/i386.c | 105 +++++++----------------------------------------
> gcc/config/i386/i386.h | 3 -
> 2 files changed, 16 insertions(+), 92 deletions(-)
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 014401b..332e65b 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -55,7 +55,6 @@ along with GCC; see the file COPYING3. If not see
> #include "params.h"
> #include "cselib.h"
> #include "debug.h"
> -#include "dwarf2out.h"
> #include "sched-int.h"
> #include "sbitmap.h"
> #include "fibheap.h"
> @@ -1847,10 +1846,6 @@ static unsigned int
> initial_ix86_tune_features[X86_TUNE_LAST] = {
> m_486 | m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_K6
> | m_CORE2I7 | m_GENERIC,
>
> - /* X86_TUNE_DEEP_BRANCH_PREDICTION */
> - m_ATOM | m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
> - | m_CORE2I7 | m_GENERIC,
> -
> /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
> on simulation result. But after P4 was made, no performance benefit
> was observed with branch hints. It also increases the code size.
> @@ -8323,31 +8318,11 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>
> xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
>
> - if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic)
> + if (!flag_pic)
> {
> xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
>
> - if (!flag_pic)
> - output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
> - else
> - {
> - output_asm_insn ("call\t%a2", xops);
> -#ifdef DWARF2_UNWIND_INFO
> - /* The call to next label acts as a push. */
> - if (dwarf2out_do_frame ())
> - {
> - rtx insn;
> - start_sequence ();
> - insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
> - gen_rtx_PLUS (Pmode,
> - stack_pointer_rtx,
> - GEN_INT (-4))));
> - RTX_FRAME_RELATED_P (insn) = 1;
> - dwarf2out_frame_debug (insn, true);
> - end_sequence ();
> - }
> -#endif
> - }
> + output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
>
> #if TARGET_MACHO
> /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
> @@ -8358,29 +8333,6 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>
> targetm.asm_out.internal_label (asm_out_file, "L",
> CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
> -
> - if (flag_pic)
> - {
> - output_asm_insn ("pop%z0\t%0", xops);
> -#ifdef DWARF2_UNWIND_INFO
> - /* The pop is a pop and clobbers dest, but doesn't restore it
> - for unwind info purposes. */
> - if (dwarf2out_do_frame ())
> - {
> - rtx insn;
> - start_sequence ();
> - insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx));
> - dwarf2out_frame_debug (insn, true);
> - insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
> - gen_rtx_PLUS (Pmode,
> - stack_pointer_rtx,
> - GEN_INT (4))));
> - RTX_FRAME_RELATED_P (insn) = 1;
> - dwarf2out_frame_debug (insn, true);
> - end_sequence ();
> - }
> -#endif
> - }
> }
> else
> {
> @@ -8388,12 +8340,6 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
> get_pc_thunk_name (name, REGNO (dest));
> pic_labels_used |= 1 << REGNO (dest);
>
> -#ifdef DWARF2_UNWIND_INFO
> - /* Ensure all queued register saves are flushed before the
> - call. */
> - if (dwarf2out_do_frame ())
> - dwarf2out_flush_queued_reg_saves ();
> -#endif
> xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
> xops[2] = gen_rtx_MEM (QImode, xops[2]);
> output_asm_insn ("call\t%X2", xops);
> @@ -8408,13 +8354,8 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
> #endif
> }
>
> - if (TARGET_MACHO)
> - return "";
> -
> - if (!flag_pic || TARGET_DEEP_BRANCH_PREDICTION)
> + if (!TARGET_MACHO)
> output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
> - else
> - output_asm_insn ("add%z0\t{%1+[.-%a2], %0|%0, %1+(.-%a2)}", xops);
>
> return "";
> }
> @@ -10138,7 +10079,11 @@ ix86_expand_prologue (void)
> insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
> }
> else
> - insn = emit_insn (gen_set_got (pic_offset_table_rtx));
> + {
> + insn = emit_insn (gen_set_got (pic_offset_table_rtx));
> + RTX_FRAME_RELATED_P (insn) = 1;
> + add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
> + }
> }
>
> /* In the pic_reg_used case, make sure that the got load isn't deleted
> @@ -28979,12 +28924,7 @@ machopic_output_stub (FILE *file, const char *symb,
> const char *stub)
> if (MACHOPIC_ATT_STUB)
> switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
> else if (MACHOPIC_PURE)
> - {
> - if (TARGET_DEEP_BRANCH_PREDICTION)
> - switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
> - else
> - switch_to_section (darwin_sections[machopic_picsymbol_stub_section]);
> - }
> + switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
> else
> switch_to_section (darwin_sections[machopic_symbol_stub_section]);
>
> @@ -28998,19 +28938,11 @@ machopic_output_stub (FILE *file, const char *symb,
> const char *stub)
> else if (MACHOPIC_PURE)
> {
> /* PIC stub. */
> - if (TARGET_DEEP_BRANCH_PREDICTION)
> - {
> - /* 25-byte PIC stub using "CALL get_pc_thunk". */
> - rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
> - output_set_got (tmp, NULL_RTX); /* "CALL
> ___<cpu>.get_pc_thunk.cx". */
> - fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", label,
> lazy_ptr_name, label);
> - }
> - else
> - {
> - /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %eax".
> */
> - fprintf (file, "\tcall LPC$%d\nLPC$%d:\tpopl %%ecx\n", label, label);
> - fprintf (file, "\tmovl %s-LPC$%d(%%ecx),%%ecx\n", lazy_ptr_name,
> label);
> - }
> + /* 25-byte PIC stub using "CALL get_pc_thunk". */
> + rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
> + output_set_got (tmp, NULL_RTX); /* "CALL
> ___<cpu>.get_pc_thunk.cx". */
> + fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
> + label, lazy_ptr_name, label);
> fprintf (file, "\tjmp\t*%%ecx\n");
> }
> else
> @@ -29039,13 +28971,8 @@ machopic_output_stub (FILE *file, const char *symb,
> const char *stub)
> compatibility with existing dylibs. */
> if (MACHOPIC_PURE)
> {
> - /* PIC stubs. */
> - if (TARGET_DEEP_BRANCH_PREDICTION)
> - /* 25-byte PIC stub using "CALL get_pc_thunk". */
> - switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
> - else
> - /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %ebx".
> */
> - switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]);
> + /* 25-byte PIC stub using "CALL get_pc_thunk". */
> + switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
> }
> else
> /* 16-byte -mdynamic-no-pic stub. */
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 8badcbb..1452226 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -249,7 +249,6 @@ enum ix86_tune_indices {
> X86_TUNE_PUSH_MEMORY,
> X86_TUNE_ZERO_EXTEND_WITH_AND,
> X86_TUNE_UNROLL_STRLEN,
> - X86_TUNE_DEEP_BRANCH_PREDICTION,
> X86_TUNE_BRANCH_PREDICTION_HINTS,
> X86_TUNE_DOUBLE_WITH_ADD,
> X86_TUNE_USE_SAHF,
> @@ -323,8 +322,6 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> #define TARGET_ZERO_EXTEND_WITH_AND \
> ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
> #define TARGET_UNROLL_STRLEN ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
> -#define TARGET_DEEP_BRANCH_PREDICTION \
> - ix86_tune_features[X86_TUNE_DEEP_BRANCH_PREDICTION]
> #define TARGET_BRANCH_PREDICTION_HINTS \
> ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
> #define TARGET_DOUBLE_WITH_ADD
> ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]
> --
> 1.7.5.4