On Sat, Sep 21, 2013 at 3:51 PM, Xinliang David Li <davi...@google.com> wrote:
> On Sat, Sep 21, 2013 at 12:54 PM, Jan Hubicka <hubi...@ucw.cz> wrote:
>> Hi,
>> this is an updated version of the patch discussed at
>> http://gcc.gnu.org/ml/gcc-patches/2012-12/msg00841.html
>>
>> It makes CORE tuning follow the optimization guidelines more closely.
>> In particular it removes some tuning flags for features I implemented years
>> back specifically for K7/K8 chips that ended up in Core tuning because
>> it was based on generic. Incrementally I plan to drop some of these from
>> generic, too.
>>
>> Compared to the previous version of the patch I left out the INC_DEC change,
>> even though Core I7+ should resolve dependencies on partial flags correctly.
>> The optimization manual still seems to suggest not using this:
>>
>> Assembly/Compiler Coding Rule 33. (M impact, H generality)
>> INC and DEC instructions should be replaced with ADD or SUB instructions,
>> because ADD and SUB overwrite all flags, whereas INC and DEC do not,
>> therefore creating false dependencies on earlier instructions that set
>> the flags.
>>
>> Another change dropped is use_vector_fp_converts, which seems to improve
>> Core performance.
>
> I did not see this in your patch, but Wei has this tuning in this patch:
>

Sorry, I meant to ask: why are you dropping this part?

David

> http://gcc.gnu.org/ml/gcc-patches/2013-09/msg00884.html
>
> thanks,
>
> David
>
>
>>
>> I benchmarked the patch on SPEC2k, and earlier it was benchmarked on 2k6;
>> the performance difference seems to be in the noise.  It causes about 0.3%
>> code size reduction.  The main motivation for the patch is to drop some
>> codegen oddities that do not make sense on modern chips.
>>
>> Bootstrapped/regtested x86_64-linux, will commit it shortly.
>> Honza
>>
>>         * x86-tune.def (partial_reg_stall): Disable for CoreI7 and newer.
>>         (sse_typeless_stores): Enable for core.
>>         (sse_load0_by_pxor): Likewise.
>>         (four_jump_limit): Disable for core.
>>         (pad_returns): Likewise.
>>         (avoid_vector_decode): Likewise.
>>         (fuse_cmp_and_branch): Enable for cores.
>>         * i386.c (x86_accumulate_outgoing_args): Disable for cores.
>> Index: x86-tune.def
>> ===================================================================
>> *** x86-tune.def        (revision 202812)
>> --- x86-tune.def        (working copy)
>> *************** DEF_TUNE (X86_TUNE_MOVX, "movx",
>> *** 52,58 ****
>>      and can happen in caller/callee saving sequences.  */
>>   DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
>>   DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
>> !           m_CORE_ALL | m_GENERIC)
>>   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
>>    * on 16-bit immediate moves into memory on Core2 and Corei7.  */
>>   DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
>> --- 52,58 ----
>>      and can happen in caller/callee saving sequences.  */
>>   DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
>>   DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
>> !           m_CORE2 | m_GENERIC)
>>   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
>>    * on 16-bit immediate moves into memory on Core2 and Corei7.  */
>>   DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
>> *************** DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INS
>> *** 125,132 ****
>>      maintain just lower part of scalar values in proper format leaving the
>>      upper part undefined.  */
>>   DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
>> ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", 
>> m_AMD_MULTIPLE)
>> ! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | 
>> m_P4_NOCONA)
>>   DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
>>             m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | 
>> m_GENERIC)
>>   DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
>> --- 125,134 ----
>>      maintain just lower part of scalar values in proper format leaving the
>>      upper part undefined.  */
>>   DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
>> ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
>> !         m_AMD_MULTIPLE | m_CORE_ALL)
>> ! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
>> !         m_PPRO | m_P4_NOCONA | m_CORE_ALL)
>>   DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
>>             m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | 
>> m_GENERIC)
>>   DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
>> *************** DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSION
>> *** 144,150 ****
>>   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
>>      than 4 branch instructions in the 16 byte window.  */
>>   DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
>> !           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM| 
>> m_AMD_MULTIPLE
>>             | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
>>             m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
>> --- 146,152 ----
>>   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
>>      than 4 branch instructions in the 16 byte window.  */
>>   DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
>> !           m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE
>>             | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
>>             m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
>> *************** DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
>> *** 154,166 ****
>>   DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
>>             ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
>>   DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
>> !           m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
>>   DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
>>             m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
>>             | m_ATHLON_K8 | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
>> !           m_CORE_ALL | m_K8 | m_GENERIC)
>>   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
>>      and SImode multiply, but 386 and 486 do HImode multiply faster.  */
>>   DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
>> --- 156,168 ----
>>   DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
>>             ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
>>   DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
>> !           m_AMD_MULTIPLE | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
>>   DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
>>             m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
>>             | m_ATHLON_K8 | m_GENERIC)
>>   DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
>> !           m_K8 | m_GENERIC)
>>   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
>>      and SImode multiply, but 386 and 486 do HImode multiply faster.  */
>>   DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
>> *************** DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS,
>> *** 193,199 ****
>>   /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
>>      with a subsequent conditional jump instruction into a single
>>      compare-and-branch uop.  */
>> ! DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
>>   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>>      will impact LEA instruction selection. */
>>   DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
>> --- 195,201 ----
>>   /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
>>      with a subsequent conditional jump instruction into a single
>>      compare-and-branch uop.  */
>> ! DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER | 
>> m_CORE_ALL)
>>   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>>      will impact LEA instruction selection. */
>>   DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
>> Index: i386.c
>> ===================================================================
>> *** i386.c      (revision 202812)
>> --- i386.c      (working copy)
>> *************** static unsigned int initial_ix86_arch_fe
>> *** 1899,1905 ****
>>   };
>>
>>   static const unsigned int x86_accumulate_outgoing_args
>> !   = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | 
>> m_GENERIC;
>>
>>   static const unsigned int x86_arch_always_fancy_math_387
>>     = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | 
>> m_AMD_MULTIPLE | m_GENERIC;
>> --- 1899,1905 ----
>>   };
>>
>>   static const unsigned int x86_accumulate_outgoing_args
>> !   = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
>>
>>   static const unsigned int x86_arch_always_fancy_math_387
>>     = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | 
>> m_AMD_MULTIPLE | m_GENERIC;

Reply via email to