Hello! There is no point in emitting movaps instead of movapd or movdqa when AVX is enabled: with the VEX prefix all of these instructions have the same size, so the legacy one-byte saving of MOVAPS does not apply. The attached patch fixes this oversight for TARGET_AVX.
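For reference, here is a quick way to check the size claim (a throwaway test, not part of the patch; the byte sequences in the comments are the reg-reg encodings, worth re-checking with objdump -d):

/* enc.c: build with "gcc -c enc.c" and inspect with "objdump -d enc.o".
   Legacy movaps omits the 0x66 prefix, so it is one byte shorter than
   movapd/movdqa; the VEX forms fold the prefix into the VEX byte, so
   all three are the same length.  */
void
test (void)
{
  __asm__ ("movaps %xmm1, %xmm0");   /* 0f 28 c1     -> 3 bytes */
  __asm__ ("movapd %xmm1, %xmm0");   /* 66 0f 28 c1  -> 4 bytes */
  __asm__ ("movdqa %xmm1, %xmm0");   /* 66 0f 6f c1  -> 4 bytes */
  __asm__ ("vmovaps %xmm1, %xmm0");  /* c5 f8 28 c1  -> 4 bytes */
  __asm__ ("vmovapd %xmm1, %xmm0");  /* c5 f9 28 c1  -> 4 bytes */
  __asm__ ("vmovdqa %xmm1, %xmm0");  /* c5 f9 6f c1  -> 4 bytes */
}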
2012-05-11  Uros Bizjak  <ubiz...@gmail.com>

	* config/i386/i386.md (*movti_internal_rex64): Avoid MOVAPS size
	optimization for TARGET_AVX.
	(*movti_internal_sse): Ditto.
	(*movdi_internal_rex64): Handle TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
	(*movdi_internal): Ditto.
	(*movsi_internal): Ditto.
	(*movtf_internal): Avoid MOVAPS size optimization for TARGET_AVX.
	(*movdf_internal_rex64): Ditto.
	(*movdf_internal): Ditto.
	(*movsf_internal): Ditto.
	* config/i386/sse.md (*mov<mode>_internal): Handle
	TARGET_SSE_LOAD0_BY_PXOR.

Tested on x86_64-pc-linux-gnu {,-m32}, committed to mainline SVN.

Uros.
Index: i386.md
===================================================================
--- i386.md	(revision 187372)
+++ i386.md	(working copy)
@@ -1890,12 +1890,15 @@
    (set (attr "mode")
 	(cond [(eq_attr "alternative" "0,1")
 		 (const_string "DI")
-	       (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-		    (match_test "optimize_function_for_size_p (cfun)"))
+	       (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 		 (const_string "V4SF")
 	       (and (eq_attr "alternative" "4")
 		    (match_test "TARGET_SSE_TYPELESS_STORES"))
 		 (const_string "V4SF")
+	       (match_test "TARGET_AVX")
+		 (const_string "TI")
+	       (match_test "optimize_function_for_size_p (cfun)")
+		 (const_string "V4SF")
 	      ]
 	      (const_string "TI")))])
 
@@ -1943,13 +1946,15 @@
   [(set_attr "type" "sselog1,ssemov,ssemov")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
-	(cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-		    (match_test "optimize_function_for_size_p (cfun)"))
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 		 (const_string "V4SF")
 	       (and (eq_attr "alternative" "2")
 		    (match_test "TARGET_SSE_TYPELESS_STORES"))
 		 (const_string "V4SF")
-	       (not (match_test "TARGET_SSE2"))
+	       (match_test "TARGET_AVX")
+		 (const_string "TI")
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "optimize_function_for_size_p (cfun)"))
 		 (const_string "V4SF")
 	      ]
 	      (const_string "TI")))])
@@ -1970,8 +1975,11 @@
       return "movdq2q\t{%1, %0|%0, %1}";
 
     case TYPE_SSEMOV:
-      if (get_attr_mode (insn) == MODE_TI)
+      if (get_attr_mode (insn) == MODE_V4SF)
+	return "%vmovaps\t{%1, %0|%0, %1}";
+      else if (get_attr_mode (insn) == MODE_TI)
 	return "%vmovdqa\t{%1, %0|%0, %1}";
+
       /* Handle broken assemblers that require movd instead of movq.  */
       if (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1]))
 	return "%vmovd\t{%1, %0|%0, %1}";
@@ -2048,7 +2056,20 @@
 	 (if_then_else (eq_attr "alternative" "10,11,12,13,14,15")
 	   (const_string "maybe_vex")
 	   (const_string "orig")))
-   (set_attr "mode" "SI,DI,DI,DI,SI,DI,DI,DI,DI,DI,TI,DI,TI,DI,DI,DI,DI,DI")])
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "0,4")
+		 (const_string "SI")
+	       (eq_attr "alternative" "10,12")
+		 (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+			  (const_string "V4SF")
+			(match_test "TARGET_AVX")
+			  (const_string "TI")
+			(match_test "optimize_function_for_size_p (cfun)")
+			  (const_string "V4SF")
+		       ]
+		       (const_string "TI"))
+	      ]
+	      (const_string "DI")))])
 
 ;; Reload patterns to support multi-word load/store
 ;; with non-offsetable address.
@@ -2142,7 +2163,7 @@
     case MODE_DI:
       return "%vmovq\t{%1, %0|%0, %1}";
     case MODE_V4SF:
-      return "movaps\t{%1, %0|%0, %1}";
+      return "%vmovaps\t{%1, %0|%0, %1}";
     case MODE_V2SF:
       return "movlps\t{%1, %0|%0, %1}";
     default:
@@ -2189,7 +2210,22 @@
 	 (if_then_else (eq_attr "alternative" "5,6,7,8")
 	   (const_string "maybe_vex")
 	   (const_string "orig")))
-   (set_attr "mode" "DI,DI,DI,DI,DI,TI,DI,TI,DI,V4SF,V2SF,V4SF,V2SF,DI,DI")])
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "9,11")
+		 (const_string "V4SF")
+	       (eq_attr "alternative" "10,12")
+		 (const_string "V2SF")
+	       (eq_attr "alternative" "5,7")
+		 (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+			  (const_string "V4SF")
+			(match_test "TARGET_AVX")
+			  (const_string "TI")
+			(match_test "optimize_function_for_size_p (cfun)")
+			  (const_string "V4SF")
+		       ]
+		       (const_string "TI"))
+	      ]
+	      (const_string "DI")))])
 
 (define_split
   [(set (match_operand:DI 0 "nonimmediate_operand")
@@ -2271,10 +2307,15 @@
 	(cond [(eq_attr "alternative" "2,3")
 		 (const_string "DI")
 	       (eq_attr "alternative" "6,7")
-		 (if_then_else
-		   (not (match_test "TARGET_SSE2"))
-		   (const_string "V4SF")
-		   (const_string "TI"))
+		 (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+			  (const_string "V4SF")
+			(match_test "TARGET_AVX")
+			  (const_string "TI")
+			(ior (not (match_test "TARGET_SSE2"))
+			     (match_test "optimize_function_for_size_p (cfun)"))
+			  (const_string "V4SF")
+		       ]
+		       (const_string "TI"))
 	       (and (eq_attr "alternative" "8,9,10,11")
 		    (not (match_test "TARGET_SSE2")))
 		 (const_string "SF")
@@ -2881,12 +2922,15 @@
    (set (attr "mode")
 	(cond [(eq_attr "alternative" "3,4")
 		 (const_string "DI")
-	       (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-		    (match_test "optimize_function_for_size_p (cfun)"))
+	       (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
 		 (const_string "V4SF")
 	       (and (eq_attr "alternative" "2")
 		    (match_test "TARGET_SSE_TYPELESS_STORES"))
 		 (const_string "V4SF")
+	       (match_test "TARGET_AVX")
+		 (const_string "TI")
+	       (match_test "optimize_function_for_size_p (cfun)")
+		 (const_string "V4SF")
 	      ]
 	      (const_string "TI")))])
 
@@ -3030,9 +3074,11 @@
 	      (eq_attr "alternative" "3,4,5,6,11,12")
 		(const_string "DI")
 
-	      /* xorps is one byte shorter.  */
+	      /* xorps is one byte shorter for !TARGET_AVX.  */
 	      (eq_attr "alternative" "7")
-		(cond [(match_test "optimize_function_for_size_p (cfun)")
+		(cond [(match_test "TARGET_AVX")
+			 (const_string "V2DF")
+		       (match_test "optimize_function_for_size_p (cfun)")
 			 (const_string "V4SF")
 		       (match_test "TARGET_SSE_LOAD0_BY_PXOR")
 			 (const_string "TI")
@@ -3043,14 +3089,16 @@
 		 whole SSE registers use APD move to break dependency
 		 chains, otherwise use short move to avoid extra work.
 
-		 movaps encodes one byte shorter.  */
+		 movaps encodes one byte shorter for !TARGET_AVX.  */
 	      (eq_attr "alternative" "8")
-		(cond
-		  [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-			(match_test "optimize_function_for_size_p (cfun)"))
-		     (const_string "V4SF")
-		   (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
-		     (const_string "V2DF")
+		(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+			 (const_string "V4SF")
+		       (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+			 (const_string "V2DF")
+		       (match_test "TARGET_AVX")
+			 (const_string "DF")
+		       (match_test "optimize_function_for_size_p (cfun)")
+			 (const_string "V4SF")
 		  ]
 		  (const_string "DF"))
 	      /* For architectures resolving dependencies on register
@@ -3165,9 +3213,11 @@
 		   (const_string "V4SF")
 		   (const_string "V2SF"))
 
-	      /* xorps is one byte shorter.  */
+	      /* xorps is one byte shorter for !TARGET_AVX.  */
 	      (eq_attr "alternative" "5,9")
-		(cond [(match_test "optimize_function_for_size_p (cfun)")
+		(cond [(match_test "TARGET_AVX")
+			 (const_string "V2DF")
+		       (match_test "optimize_function_for_size_p (cfun)")
 			 (const_string "V4SF")
 		       (match_test "TARGET_SSE_LOAD0_BY_PXOR")
 			 (const_string "TI")
@@ -3178,16 +3228,19 @@
 		 whole SSE registers use APD move to break dependency
 		 chains, otherwise use short move to avoid extra work.
 
-		 movaps encodes one byte shorter.  */
+		 movaps encodes one byte shorter for !TARGET_AVX.  */
 	      (eq_attr "alternative" "6,10")
-		(cond
-		  [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-			(match_test "optimize_function_for_size_p (cfun)"))
-		     (const_string "V4SF")
-		   (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
-		     (const_string "V2DF")
+		(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+			 (const_string "V4SF")
+		       (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+			 (const_string "V2DF")
+		       (match_test "TARGET_AVX")
+			 (const_string "DF")
+		       (match_test "optimize_function_for_size_p (cfun)")
+			 (const_string "V4SF")
 		  ]
 		  (const_string "DF"))
+
 	      /* For architectures resolving dependencies on register
 		 parts we may avoid extra work to zero out upper part
 		 of register.  */
@@ -3277,12 +3330,16 @@
 	(cond [(eq_attr "alternative" "3,4,9,10")
 		 (const_string "SI")
 	       (eq_attr "alternative" "5")
-		 (if_then_else
-		   (and (and (match_test "TARGET_SSE_LOAD0_BY_PXOR")
-			     (match_test "TARGET_SSE2"))
-			(not (match_test "optimize_function_for_size_p (cfun)")))
-		   (const_string "TI")
-		   (const_string "V4SF"))
+		 (cond [(match_test "TARGET_AVX")
+			  (const_string "V4SF")
+			(ior (not (match_test "TARGET_SSE2"))
+			     (match_test "optimize_function_for_size_p (cfun)"))
+			  (const_string "V4SF")
+			(match_test "TARGET_SSE_LOAD0_BY_PXOR")
+			  (const_string "TI")
+		       ]
+		       (const_string "V4SF"))
+
 	       /* For architectures resolving dependencies on whole
 		  SSE registers use APS move to break dependency
 		  chains, otherwise use short move to avoid extra work.
Index: sse.md
===================================================================
--- sse.md	(revision 187372)
+++ sse.md	(working copy)
@@ -491,6 +491,9 @@
 	      (ior (not (match_test "TARGET_SSE2"))
 		   (match_test "optimize_function_for_size_p (cfun)"))
 	        (const_string "V4SF")
+	      (and (eq_attr "alternative" "0")
+		   (match_test "TARGET_SSE_LOAD0_BY_PXOR"))
+	        (const_string "TI")
 	     ]
 	     (const_string "<sseinsnmode>")))])
 
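P.S.: To see the sse.md hunk in action, a small test like the one below (file name and options are only illustrative) can be compiled with different -mtune/-mavx settings; whether %xmm0 is cleared with pxor, xorps or their VEX forms should follow the new mode attribute:

/* zero.c: compile with e.g. "gcc -O2 -S zero.c" and vary -mtune/-mavx.  */
#include <emmintrin.h>

__m128i
zero (void)
{
  /* Expands to the standard SSE constant-0 alternative; emitted as
     pxor when TARGET_SSE_LOAD0_BY_PXOR applies, xorps otherwise.  */
  return _mm_setzero_si128 ();
}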