On Sat, Sep 21, 2013 at 3:51 PM, Xinliang David Li <davi...@google.com> wrote:
> On Sat, Sep 21, 2013 at 12:54 PM, Jan Hubicka <hubi...@ucw.cz> wrote:
>> Hi,
>> this is an updated version of the patch discussed at
>> http://gcc.gnu.org/ml/gcc-patches/2012-12/msg00841.html
>>
>> It makes the CORE tuning follow the optimization guidelines more closely.
>> In particular it removes some tuning flags for features I implemented
>> years back specifically for K7/K8 chips that ended up in the Core tuning
>> because it was based on generic.  Incrementally I plan to drop some of
>> these from generic, too.
>>
>> Compared to the previous version of the patch I left out the INC_DEC
>> change, even though Core i7+ should resolve dependencies on partial flags
>> correctly.  The optimization manual still seems to suggest not using them:
>>
>>   Assembly/Compiler Coding Rule 33. (M impact, H generality)
>>   INC and DEC instructions should be replaced with ADD or SUB
>>   instructions, because ADD and SUB overwrite all flags, whereas INC and
>>   DEC do not, therefore creating false dependencies on earlier
>>   instructions that set the flags.
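>>
>> To illustrate the false dependency (a minimal sketch, not part of the
>> patch):
>>
>>     sub  ecx, 1   ; writes all of EFLAGS, including CF
>>     inc  eax      ; writes OF/SF/ZF/AF/PF but leaves CF untouched, so its
>>                   ; flags result must be merged with the CF left by the
>>                   ; SUB above -- a false dependency on that SUB
>>     add  eax, 1   ; overwrites every flag: no merge, no false dependency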
>> The other change dropped is use_vector_fp_converts, which seems to
>> improve Core performance.
>
> I did not see this in your patch, but Wei has this tuning in this patch:

Sorry, I meant to ask why dropping this part?

David

> http://gcc.gnu.org/ml/gcc-patches/2013-09/msg00884.html
>
> thanks,
>
> David
>
>> I benchmarked the patch on SPEC2k, and earlier it was benchmarked on
>> SPEC2k6; the performance difference seems to be in the noise.  It causes
>> about a 0.3% code size reduction.  The main motivation for the patch is
>> to drop some codegen oddities that do not make sense on modern chips.
>>
>> Bootstrapped/regtested x86_64-linux, will commit it shortly.
>> Honza
>>
>>      * x86-tune.def (partial_reg_stall): Disable for Core i7 and newer.
>>      (sse_typeless_stores): Enable for Core.
>>      (sse_load0_by_pxor): Likewise.
>>      (four_jump_limit): Disable for Core.
>>      (pad_returns): Likewise.
>>      (avoid_vector_decode): Likewise.
>>      (fuse_cmp_and_branch): Enable for Cores.
>>      * i386.c (x86_accumulate_outgoing_args): Disable for Cores.
>> Index: x86-tune.def
>> ===================================================================
>> *** x86-tune.def	(revision 202812)
>> --- x86-tune.def	(working copy)
>> *************** DEF_TUNE (X86_TUNE_MOVX, "movx",
>> *** 52,58 ****
>>        and can happen in caller/callee saving sequences.  */
>>   DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
>>   DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
>> !          m_CORE_ALL | m_GENERIC)
>>   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
>>    * on 16-bit immediate moves into memory on Core2 and Corei7.  */
>>   DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
>> --- 52,58 ----
>>        and can happen in caller/callee saving sequences.  */
>>   DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
>>   DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
>> !          m_CORE2 | m_GENERIC)
>>   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
>>    * on 16-bit immediate moves into memory on Core2 and Corei7.  */
>>   DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
>> *************** DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INS
>> *** 125,132 ****
>>        maintain just lower part of scalar values in proper format leaving the
>>        upper part undefined.  */
>>   DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
>> ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", m_AMD_MULTIPLE)
>> ! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | m_P4_NOCONA)
>>   DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
>>            m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
>> --- 125,134 ----
>>        maintain just lower part of scalar values in proper format leaving the
>>        upper part undefined.  */
>>   DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
>> ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
>> !          m_AMD_MULTIPLE | m_CORE_ALL)
>> ! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
>> !          m_PPRO | m_P4_NOCONA | m_CORE_ALL)
>>   DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
>>            m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
>> *************** DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSION
>> *** 144,150 ****
>>   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
>>      than 4 branch instructions in the 16 byte window.  */
>>   DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
>> !          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM| m_AMD_MULTIPLE
>>            | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
>>            m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
>> --- 146,152 ----
>>   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
>>      than 4 branch instructions in the 16 byte window.  */
>>   DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
>> !          m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE
>>            | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
>>            m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
>> *************** DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
>> *** 154,166 ****
>>   DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
>>            ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
>>   DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
>> !          m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
>>   DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
>>            m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
>>            | m_ATHLON_K8 | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
>> !          m_CORE_ALL | m_K8 | m_GENERIC)
>>   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
>>      and SImode multiply, but 386 and 486 do HImode multiply faster.  */
>>   DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
>> --- 156,168 ----
>>   DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
>>            ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
>>   DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
>> !          m_AMD_MULTIPLE | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
>>   DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
>>            m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
>>            | m_ATHLON_K8 | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
>> !          m_K8 | m_GENERIC)
>>   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
>>      and SImode multiply, but 386 and 486 do HImode multiply faster.  */
>>   DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
>> *************** DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS,
>> *** 193,199 ****
>>   /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
>>      with a subsequent conditional jump instruction into a single
>>      compare-and-branch uop.  */
>> ! DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
>>   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>>      will impact LEA instruction selection. */
>>   DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
>> --- 195,201 ----
>>   /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
>>      with a subsequent conditional jump instruction into a single
>>      compare-and-branch uop.  */
>> ! DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER | m_CORE_ALL)
>>   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>>      will impact LEA instruction selection. */
>>   DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
>> Index: i386.c
>> ===================================================================
>> *** i386.c	(revision 202812)
>> --- i386.c	(working copy)
>> *************** static unsigned int initial_ix86_arch_fe
>> *** 1899,1905 ****
>>   };
>>
>>   static const unsigned int x86_accumulate_outgoing_args
>> !   = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
>>
>>   static const unsigned int x86_arch_always_fancy_math_387
>>     = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
>> --- 1899,1905 ----
>>   };
>>
>>   static const unsigned int x86_accumulate_outgoing_args
>> !   = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
>>
>>   static const unsigned int x86_arch_always_fancy_math_387
>>     = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
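>>
>> (Illustrative sketch, not part of the patch: the fusion that
>> X86_TUNE_FUSE_CMP_AND_BRANCH describes is the decoder combining an
>> adjacent flag-setting compare/test with the conditional jump that
>> consumes it, e.g.
>>
>>     cmp  eax, edx   ; flag-setting compare ...
>>     jne  .L2        ; ... and the jump that follows it can decode as a
>>                     ; single fused compare-and-branch uop on these cores
>>
>> so enabling the flag tells the compiler to keep such pairs adjacent.)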