On Mon, Nov 19, 2012 at 8:55 AM, Uros Bizjak <ubiz...@gmail.com> wrote: >> The following patch fixes >> >> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19398 >> >> Uros, there is i386.md part for which I need an approval. Without this >> change, GCC will still generate the same code even if LRA uses an >> alternative with 'm' constraint. >> >> 2012-11-18 Vladimir Makarov <vmaka...@redhat.com> >> >> PR target/19398 >> * lra-constraints.c (process_alt_operands): Discourage reloads >> through secodnary memory. >> * config/i386/i386.md (fix_trunc?f?i_sse): Remove peephole2 >> patterns. > > Thanks! > > Please note that i386.md change is not correct, it is peephole2 with > "Shorten x87->SSE reload sequences ..." comment that is not effective > anymore with your patch and should now be removed. The peephole2s that > your patch removes undo LRA transformation for targets that *do not* > benefit from MEM->REG operation for this particular FIX RTX. (Also, > please note that your patch includes movti_internal_rex64 change that > was already reverted due to better fix). > > Please remove mentioned peephole2 instead. The test from the PR will > show effects of LRA change for all targets, other than core2i7_64, k8 > and generic 64bit targets. > > The patch that removes mentioned peephole2 from i386.md is pre-approved.
I have merged my x86 target patch that removes correct peephole2 and associated defines with Vlad's patch and committed everything (including testcase) to mainline SVN. Also, the patch includes macroization of a couple of RTXes in this area. 2012-11-20 Uros Bizjak <ubiz...@gmail.com> * config/i386/i386.md (fix_trunc<MODEF:mode><SWI48:mode>_sse): Macroize insn from fix_trunc<mode>{si,di}_sse using SWI48 mode iterator. (peephole2 to avoid vector decoded forms): Macroize peephole2 using MODEF mode iterator. Use SWI48 mode iterator instead of SWI48x. 2012-11-20 Uros Bizjak <ubiz...@gmail.com> PR target/19398 * config/i386/i386.md (peephole2 to shorten x87->SSE reload sequences): Remove peephole2. * config/i386/i386.h (enum ix86_tune_indices) <IX86_TUNE_SHORTEN_X87_SSE>: Remove. (TARGET_SHORTEN_X87_SSE): Remove. * config/i386/i386.c (initial_ix86_tune_features): Update. 2012-11-20 Vladimir Makarov <vmaka...@redhat.com> PR target/19398 * lra-constraints.c (process_alt_operands): Discourage reloads through secodnary memory. testsuite/ChangeLog: 2012-11-20 Uros Bizjak <ubiz...@gmail.com> PR target/19398 * gcc.target/i386/pr19398.c: New test. Bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32} and committed to mainline SVN. Uros.
Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 193625) +++ config/i386/i386.c (working copy) @@ -1855,9 +1855,6 @@ static unsigned int initial_ix86_tune_features[X86 /* X86_TUNE_EXT_80387_CONSTANTS */ m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, - /* X86_TUNE_SHORTEN_X87_SSE */ - ~m_K8, - /* X86_TUNE_AVOID_VECTOR_DECODE */ m_CORE2I7_64 | m_K8 | m_GENERIC64, Index: config/i386/i386.h =================================================================== --- config/i386/i386.h (revision 193625) +++ config/i386/i386.h (working copy) @@ -314,7 +314,6 @@ enum ix86_tune_indices { X86_TUNE_PAD_RETURNS, X86_TUNE_PAD_SHORT_FUNCTION, X86_TUNE_EXT_80387_CONSTANTS, - X86_TUNE_SHORTEN_X87_SSE, X86_TUNE_AVOID_VECTOR_DECODE, X86_TUNE_PROMOTE_HIMODE_IMUL, X86_TUNE_SLOW_IMUL_IMM32_MEM, @@ -408,7 +407,6 @@ extern unsigned char ix86_tune_features[X86_TUNE_L ix86_tune_features[X86_TUNE_PAD_SHORT_FUNCTION] #define TARGET_EXT_80387_CONSTANTS \ ix86_tune_features[X86_TUNE_EXT_80387_CONSTANTS] -#define TARGET_SHORTEN_X87_SSE ix86_tune_features[X86_TUNE_SHORTEN_X87_SSE] #define TARGET_AVOID_VECTOR_DECODE \ ix86_tune_features[X86_TUNE_AVOID_VECTOR_DECODE] #define TARGET_TUNE_PROMOTE_HIMODE_IMUL \ Index: config/i386/i386.md =================================================================== --- config/i386/i386.md (revision 193625) +++ config/i386/i386.md (working copy) @@ -4465,61 +4465,35 @@ "operands[2] = gen_reg_rtx (SImode);") ;; When SSE is available, it is always faster to use it! -(define_insn "fix_trunc<mode>di_sse" - [(set (match_operand:DI 0 "register_operand" "=r,r") - (fix:DI (match_operand:MODEF 1 "nonimmediate_operand" "x,m")))] - "TARGET_64BIT && SSE_FLOAT_MODE_P (<MODE>mode) +(define_insn "fix_trunc<MODEF:mode><SWI48:mode>_sse" + [(set (match_operand:SWI48 0 "register_operand" "=r,r") + (fix:SWI48 (match_operand:MODEF 1 "nonimmediate_operand" "x,m")))] + "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && (!TARGET_FISTTP || TARGET_SSE_MATH)" - "%vcvtt<ssemodesuffix>2si{q}\t{%1, %0|%0, %1}" + "%vcvtt<MODEF:ssemodesuffix>2si<SWI48:rex64suffix>\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "prefix" "maybe_vex") - (set_attr "prefix_rex" "1") - (set_attr "mode" "<MODE>") + (set (attr "prefix_rex") + (if_then_else + (match_test "<SWI48:MODE>mode == DImode") + (const_string "1") + (const_string "*"))) + (set_attr "mode" "<MODEF:MODE>") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") (set_attr "bdver1_decode" "double,double")]) -(define_insn "fix_trunc<mode>si_sse" - [(set (match_operand:SI 0 "register_operand" "=r,r") - (fix:SI (match_operand:MODEF 1 "nonimmediate_operand" "x,m")))] - "SSE_FLOAT_MODE_P (<MODE>mode) - && (!TARGET_FISTTP || TARGET_SSE_MATH)" - "%vcvtt<ssemodesuffix>2si\t{%1, %0|%0, %1}" - [(set_attr "type" "sseicvt") - (set_attr "prefix" "maybe_vex") - (set_attr "mode" "<MODE>") - (set_attr "athlon_decode" "double,vector") - (set_attr "amdfam10_decode" "double,double") - (set_attr "bdver1_decode" "double,double")]) - -;; Shorten x87->SSE reload sequences of fix_trunc?f?i_sse patterns. -(define_peephole2 - [(set (match_operand:MODEF 0 "register_operand") - (match_operand:MODEF 1 "memory_operand")) - (set (match_operand:SWI48x 2 "register_operand") - (fix:SWI48x (match_dup 0)))] - "TARGET_SHORTEN_X87_SSE - && !(TARGET_AVOID_VECTOR_DECODE && optimize_insn_for_speed_p ()) - && peep2_reg_dead_p (2, operands[0])" - [(set (match_dup 2) (fix:SWI48x (match_dup 1)))]) - ;; Avoid vector decoded forms of the instruction. (define_peephole2 - [(match_scratch:DF 2 "x") - (set (match_operand:SWI48x 0 "register_operand") - (fix:SWI48x (match_operand:DF 1 "memory_operand")))] - "TARGET_SSE2 && TARGET_AVOID_VECTOR_DECODE && optimize_insn_for_speed_p ()" + [(match_scratch:MODEF 2 "x") + (set (match_operand:SWI48 0 "register_operand") + (fix:SWI48 (match_operand:MODEF 1 "memory_operand")))] + "TARGET_AVOID_VECTOR_DECODE + && SSE_FLOAT_MODE_P (<MODEF:MODE>mode) + && optimize_insn_for_speed_p ()" [(set (match_dup 2) (match_dup 1)) - (set (match_dup 0) (fix:SWI48x (match_dup 2)))]) + (set (match_dup 0) (fix:SWI48 (match_dup 2)))]) -(define_peephole2 - [(match_scratch:SF 2 "x") - (set (match_operand:SWI48x 0 "register_operand") - (fix:SWI48x (match_operand:SF 1 "memory_operand")))] - "TARGET_AVOID_VECTOR_DECODE && optimize_insn_for_speed_p ()" - [(set (match_dup 2) (match_dup 1)) - (set (match_dup 0) (fix:SWI48x (match_dup 2)))]) - (define_insn_and_split "fix_trunc<mode>_fisttp_i387_1" [(set (match_operand:SWI248x 0 "nonimmediate_operand") (fix:SWI248x (match_operand 1 "register_operand")))] Index: lra-constraints.c =================================================================== --- lra-constraints.c (revision 193625) +++ lra-constraints.c (working copy) @@ -1942,6 +1942,19 @@ process_alt_operands (int only_alternative) if (no_regs_p && REG_P (op)) reject++; +#ifdef SECONDARY_MEMORY_NEEDED + /* If reload requires moving value through secondary + memory, it will need one more insn at least. */ + if (this_alternative != NO_REGS + && REG_P (op) && (cl = get_reg_class (REGNO (op))) != NO_REGS + && ((curr_static_id->operand[nop].type != OP_OUT + && SECONDARY_MEMORY_NEEDED (cl, this_alternative, + GET_MODE (op))) + || (curr_static_id->operand[nop].type != OP_IN + && SECONDARY_MEMORY_NEEDED (this_alternative, cl, + GET_MODE (op))))) + losers++; +#endif /* Input reloads can be inherited more often than output reloads can be removed, so penalize output reloads. */ Index: testsuite/gcc.target/i386/pr19398.c =================================================================== --- testsuite/gcc.target/i386/pr19398.c (revision 0) +++ testsuite/gcc.target/i386/pr19398.c (working copy) @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -msse -mno-sse3 -mfpmath=387" } */ + +int test (float a) +{ + return (a * a); +} + +/* { dg-final { scan-assembler-not "cvttss2si\[^\\n\]*%xmm" } } */