Hi!

As discussed in the PR, the combiner can combine an unaligned integral load (e.g. TImode) with an SSE instruction that requires an aligned load, without actually checking the alignment. For AVX, most instructions allow unaligned operands; the exceptions are a few vmov* instructions, whose patterns typically handle misaligned mems through misaligned_operand checks, and some non-temporal move insns whose UNSPECs should prevent combination. The following patch attempts to solve this by rejecting the combination of unaligned memory loads/stores into SSE insns that don't allow them. I've added an ssememalign attribute for that, but only later realized that even the insns which load/store < 16 byte memory values don't require aligned arguments at all when strict alignment checking isn't enabled in hardware. So perhaps, instead of ssememalign in bits, all we need is a boolean attribute saying whether an insn requires its pre-AVX memory operands to be as aligned as their mode or not (with the default being that it does).
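To illustrate the class of code affected (a hand-written sketch in the spirit of the new testcase below, not taken verbatim from the PR): the float array sits at a merely 4-byte-aligned offset inside B, the loop is vectorizable into a single V4SF multiply, and without the new check combine could fold the misaligned 16-byte load directly into an SSE instruction whose pattern assumes natural alignment, faulting at run time on pre-AVX hardware:

struct A { float a[4]; };
struct B { int b; A a; };	// a.a[] is only guaranteed 4-byte alignment

void
scale (B *p)
{
  // May be vectorized to one V4SF multiply; the load of p->a.a must stay
  // an unaligned move (movups) rather than being combined into an
  // alignment-assuming memory operand of e.g. mulps.
  for (int i = 0; i < 4; i++)
    p->a.a[i] *= 6.0f;
}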
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2013-12-02  Jakub Jelinek  <ja...@redhat.com>
	    Uros Bizjak  <ubiz...@gmail.com>

	PR target/59163
	* config/i386/i386.c (ix86_legitimate_combined_insn): If for
	!TARGET_AVX there is misaligned MEM operand with vector mode
	and get_attr_ssememalign is 0, return false.
	(ix86_expand_special_args_builtin): Add get_pointer_alignment
	computed alignment and for non-temporal loads/stores also at
	least GET_MODE_ALIGNMENT as MEM_ALIGN.
	* config/i386/sse.md
	(<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>,
	<sse>_storeu<ssemodesuffix><avxsizesuffix>,
	<sse2_avx_avx512f>_loaddqu<mode><mask_name>,
	<sse2_avx_avx512f>_storedqu<mode>, <sse3>_lddqu<avxsizesuffix>,
	sse_vmrcpv4sf2, sse_vmrsqrtv4sf2, sse2_cvtdq2pd, sse_movhlps,
	sse_movlhps, sse_storehps, sse_loadhps, *vec_interleave_highv2df,
	*vec_interleave_lowv2df, *vec_extractv2df_1_sse, sse2_movsd,
	sse4_1_<code>v8qiv8hi2, sse4_1_<code>v4qiv4si2,
	sse4_1_<code>v4hiv4si2, sse4_1_<code>v2qiv2di2,
	sse4_1_<code>v2hiv2di2, sse4_1_<code>v2siv2di2, sse4_2_pcmpestr,
	*sse4_2_pcmpestr_unaligned, sse4_2_pcmpestri, sse4_2_pcmpestrm,
	sse4_2_pcmpestr_cconly, sse4_2_pcmpistr,
	*sse4_2_pcmpistr_unaligned, sse4_2_pcmpistri, sse4_2_pcmpistrm,
	sse4_2_pcmpistr_cconly): Add ssememalign attribute.
	* config/i386/i386.md (ssememalign): New define_attr.

	* g++.dg/torture/pr59163.C: New test.

--- gcc/config/i386/i386.c.jj	2013-12-02 14:33:34.813367951 +0100
+++ gcc/config/i386/i386.c	2013-12-02 19:57:39.116438744 +0100
@@ -5685,6 +5685,17 @@ ix86_legitimate_combined_insn (rtx insn)
 	  bool win;
 	  int j;
 
+	  /* For pre-AVX disallow unaligned loads/stores where the
+	     instructions don't support it.  */
+	  if (!TARGET_AVX
+	      && VECTOR_MODE_P (GET_MODE (op))
+	      && misaligned_operand (op, GET_MODE (op)))
+	    {
+	      int min_align = get_attr_ssememalign (insn);
+	      if (min_align == 0)
+		return false;
+	    }
+
 	  /* A unary operator may be accepted by the predicate,
 	     but it is irrelevant for matching constraints.  */
 	  if (UNARY_P (op))
@@ -32426,11 +32437,12 @@ ix86_expand_args_builtin (const struct b
 
 static rtx
 ix86_expand_special_args_builtin (const struct builtin_description *d,
-                                  tree exp, rtx target)
+				  tree exp, rtx target)
 {
   tree arg;
   rtx pat, op;
   unsigned int i, nargs, arg_adjust, memory;
+  bool aligned_mem = false;
   struct
     {
       rtx op;
@@ -32493,6 +32505,26 @@ ix86_expand_special_args_builtin (const
       klass = store;
       /* Reserve memory operand for target.  */
       memory = ARRAY_SIZE (args);
+      switch (icode)
+	{
+	/* These builtins and instructions require the memory
+	   to be properly aligned.  */
+	case CODE_FOR_avx_movntv4di:
+	case CODE_FOR_sse2_movntv2di:
+	case CODE_FOR_avx_movntv8sf:
+	case CODE_FOR_sse_movntv4sf:
+	case CODE_FOR_sse4a_vmmovntv4sf:
+	case CODE_FOR_avx_movntv4df:
+	case CODE_FOR_sse2_movntv2df:
+	case CODE_FOR_sse4a_vmmovntv2df:
+	case CODE_FOR_sse2_movntidi:
+	case CODE_FOR_sse_movntq:
+	case CODE_FOR_sse2_movntisi:
+	  aligned_mem = true;
+	  break;
+	default:
+	  break;
+	}
       break;
     case V4SF_FTYPE_V4SF_PCV2SF:
     case V2DF_FTYPE_V2DF_PCDOUBLE:
@@ -32549,6 +32581,11 @@ ix86_expand_special_args_builtin (const
 	{
 	  op = ix86_zero_extend_to_Pmode (op);
 	  target = gen_rtx_MEM (tmode, op);
+	  unsigned int align = get_pointer_alignment (arg);
+	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
+	    align = GET_MODE_ALIGNMENT (tmode);
+	  if (MEM_ALIGN (target) < align)
+	    set_mem_align (target, align);
 	}
       else
 	target = force_reg (tmode, op);
@@ -32594,8 +32631,11 @@ ix86_expand_special_args_builtin (const
 	  /* This must be the memory operand.  */
 	  op = ix86_zero_extend_to_Pmode (op);
 	  op = gen_rtx_MEM (mode, op);
-	  gcc_assert (GET_MODE (op) == mode
-		      || GET_MODE (op) == VOIDmode);
+	  unsigned int align = get_pointer_alignment (arg);
+	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
+	    align = GET_MODE_ALIGNMENT (mode);
+	  if (MEM_ALIGN (op) < align)
+	    set_mem_align (op, align);
 	}
       else
 	{
--- gcc/config/i386/sse.md.jj	2013-11-28 16:01:05.000000000 +0100
+++ gcc/config/i386/sse.md	2013-12-02 18:02:00.325523050 +0100
@@ -931,6 +931,7 @@ (define_insn "<sse>_loadu<ssemodesuffix>
 }
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
 	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
@@ -961,6 +962,7 @@ (define_insn "<sse>_storeu<ssemodesuffix
 }
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
 	(cond [(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
@@ -1020,6 +1022,7 @@ (define_insn "<sse2_avx_avx512f>_loaddqu
 }
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
    (set (attr "prefix_data16")
      (if_then_else
        (match_test "TARGET_AVX")
@@ -1059,6 +1062,7 @@ (define_insn "<sse2_avx_avx512f>_storedq
 }
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
    (set (attr "prefix_data16")
      (if_then_else
        (match_test "TARGET_AVX")
@@ -1105,6 +1109,7 @@ (define_insn "<sse3>_lddqu<avxsizesuffix
   "%vlddqu\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
    (set (attr "prefix_data16")
      (if_then_else
        (match_test "TARGET_AVX")
@@ -1369,6 +1374,7 @@ (define_insn "sse_vmrcpv4sf2"
    vrcpss\t{%1, %2, %0|%0, %2, %k1}"
   [(set_attr "isa" "noavx,avx")
    (set_attr "type" "sse")
+   (set_attr "ssememalign" "32")
    (set_attr "atom_sse_attr" "rcp")
    (set_attr "btver2_sse_attr" "rcp")
    (set_attr "prefix" "orig,vex")
@@ -1509,6 +1515,7 @@ (define_insn "sse_vmrsqrtv4sf2"
    vrsqrtss\t{%1, %2, %0|%0, %2, %k1}"
   [(set_attr "isa" "noavx,avx")
    (set_attr "type" "sse")
+   (set_attr "ssememalign" "32")
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "SF")])
@@ -3853,6 +3860,7 @@ (define_insn "sse2_cvtdq2pd"
   "%vcvtdq2pd\t{%1, %0|%0, %q1}"
   [(set_attr "type" "ssecvt")
    (set_attr "prefix" "maybe_vex")
+   (set_attr "ssememalign" "64")
    (set_attr "mode" "V2DF")])
 
 (define_insn "<mask_codefor>avx512f_cvtpd2dq512<mask_name>"
@@ -4725,6 +4733,7 @@ (define_insn "sse_movhlps"
    %vmovhps\t{%2, %0|%q0, %2}"
   [(set_attr "isa" "noavx,avx,noavx,avx,*")
    (set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
    (set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
@@ -4770,6 +4779,7 @@ (define_insn "sse_movlhps"
    %vmovlps\t{%2, %H0|%H0, %2}"
   [(set_attr "isa" "noavx,avx,noavx,avx,*")
    (set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
    (set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
@@ -5174,6 +5184,7 @@ (define_insn "sse_storehps"
    %vmovhlps\t{%1, %d0|%d0, %1}
    %vmovlps\t{%H1, %d0|%d0, %H1}"
   [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "V2SF,V4SF,V2SF")])
@@ -5213,6 +5224,7 @@ (define_insn "sse_loadhps"
    %vmovlps\t{%2, %H0|%H0, %2}"
   [(set_attr "isa" "noavx,avx,noavx,avx,*")
    (set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
    (set_attr "mode" "V2SF,V2SF,V4SF,V4SF,V2SF")])
@@ -6224,7 +6236,8 @@ (define_insn "*vec_interleave_highv2df"
    vmovlpd\t{%H1, %2, %0|%0, %2, %H1}
    %vmovhpd\t{%1, %0|%q0, %1}"
   [(set_attr "isa" "noavx,avx,sse3,noavx,avx,*")
-   (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
+   (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix_data16" "*,*,*,1,*,1")
    (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
    (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
@@ -6368,6 +6381,7 @@ (define_insn "*vec_interleave_lowv2df"
    %vmovlpd\t{%2, %H0|%H0, %2}"
   [(set_attr "isa" "noavx,avx,sse3,noavx,avx,*")
    (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix_data16" "*,*,*,1,*,1")
    (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
    (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
@@ -6959,6 +6973,7 @@ (define_insn "*vec_extractv2df_1_sse"
    movhlps\t{%1, %0|%0, %1}
    movlps\t{%H1, %0|%0, %H1}"
   [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "mode" "V2SF,V4SF,V2SF")])
 
 ;; Avoid combining registers from different units in a single alternative,
@@ -7163,6 +7178,7 @@ (define_insn "sse2_movsd"
 	     (const_string "1")
 	     (const_string "*")))
    (set_attr "length_immediate" "*,*,*,*,*,1,*,*,*")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix" "orig,vex,orig,vex,maybe_vex,orig,orig,vex,maybe_vex")
    (set_attr "mode" "DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF")])
@@ -11459,6 +11475,7 @@ (define_insn "sse4_1_<code>v8qiv8hi2"
   "TARGET_SSE4_1"
   "%vpmov<extsuffix>bw\t{%1, %0|%0, %q1}"
   [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
@@ -11499,6 +11516,7 @@ (define_insn "sse4_1_<code>v4qiv4si2"
   "TARGET_SSE4_1"
   "%vpmov<extsuffix>bd\t{%1, %0|%0, %k1}"
   [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "32")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
@@ -11534,6 +11552,7 @@ (define_insn "sse4_1_<code>v4hiv4si2"
   "TARGET_SSE4_1"
   "%vpmov<extsuffix>wd\t{%1, %0|%0, %q1}"
   [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
@@ -11576,6 +11595,7 @@ (define_insn "sse4_1_<code>v2qiv2di2"
   "TARGET_SSE4_1"
   "%vpmov<extsuffix>bq\t{%1, %0|%0, %w1}"
   [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "16")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
@@ -11613,6 +11633,7 @@ (define_insn "sse4_1_<code>v2hiv2di2"
   "TARGET_SSE4_1"
   "%vpmov<extsuffix>wq\t{%1, %0|%0, %k1}"
   [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "32")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
@@ -11646,6 +11667,7 @@ (define_insn "sse4_1_<code>v2siv2di2"
   "TARGET_SSE4_1"
   "%vpmov<extsuffix>dq\t{%1, %0|%0, %q1}"
   [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
@@ -11939,6 +11961,7 @@ (define_insn_and_split "sse4_2_pcmpestr"
   [(set_attr "type" "sselog")
    (set_attr "prefix_data16" "1")
    (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
    (set_attr "length_immediate" "1")
    (set_attr "memory" "none,load")
    (set_attr "mode" "TI")])
@@ -12001,6 +12024,7 @@ (define_insn_and_split "*sse4_2_pcmpestr
   [(set_attr "type" "sselog")
    (set_attr "prefix_data16" "1")
    (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
    (set_attr "length_immediate" "1")
    (set_attr "memory" "load")
    (set_attr "mode" "TI")])
@@ -12028,6 +12052,7 @@ (define_insn "sse4_2_pcmpestri"
(define_insn "sse4_2_pcmpestri" (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") + (set_attr "ssememalign" "8") (set_attr "length_immediate" "1") (set_attr "btver2_decode" "vector") (set_attr "memory" "none,load") @@ -12055,6 +12080,7 @@ (define_insn "sse4_2_pcmpestrm" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "ssememalign" "8") (set_attr "length_immediate" "1") (set_attr "prefix" "maybe_vex") (set_attr "btver2_decode" "vector") @@ -12081,6 +12107,7 @@ (define_insn "sse4_2_pcmpestr_cconly" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "ssememalign" "8") (set_attr "length_immediate" "1") (set_attr "memory" "none,load,none,load") (set_attr "btver2_decode" "vector,vector,vector,vector") @@ -12134,6 +12161,7 @@ (define_insn_and_split "sse4_2_pcmpistr" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "ssememalign" "8") (set_attr "length_immediate" "1") (set_attr "memory" "none,load") (set_attr "mode" "TI")]) @@ -12187,6 +12215,7 @@ (define_insn_and_split "*sse4_2_pcmpistr [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "ssememalign" "8") (set_attr "length_immediate" "1") (set_attr "memory" "load") (set_attr "mode" "TI")]) @@ -12209,6 +12238,7 @@ (define_insn "sse4_2_pcmpistri" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "ssememalign" "8") (set_attr "length_immediate" "1") (set_attr "prefix" "maybe_vex") (set_attr "memory" "none,load") @@ -12233,6 +12263,7 @@ (define_insn "sse4_2_pcmpistrm" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "ssememalign" "8") (set_attr "length_immediate" "1") (set_attr "prefix" "maybe_vex") (set_attr "memory" "none,load") @@ -12257,6 +12288,7 @@ (define_insn "sse4_2_pcmpistr_cconly" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "ssememalign" "8") (set_attr "length_immediate" "1") (set_attr "memory" "none,load,none,load") (set_attr "prefix" "maybe_vex") --- gcc/config/i386/i386.md.jj 2013-12-02 14:33:34.000000000 +0100 +++ gcc/config/i386/i386.md 2013-12-02 16:31:11.704706128 +0100 @@ -402,6 +402,13 @@ (define_attr "unit" "integer,i387,sse,mm (const_string "unknown")] (const_string "integer"))) +;; The minimum required alignment of vector mode memory operands of the SSE +;; (non-VEX/EVEX) instruction in bits, if it is different from +;; GET_MODE_ALIGNMENT of the operand, otherwise 0. If an instruction has +;; multiple alternatives, this should be conservative maximum of those minimum +;; required alignments. +(define_attr "ssememalign" "" (const_int 0)) + ;; The (bounding maximum) length of an instruction immediate. 
(define_attr "length_immediate" "" (cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave, --- gcc/testsuite/g++.dg/torture/pr59163.C.jj 2013-12-02 16:21:27.124744788 +0100 +++ gcc/testsuite/g++.dg/torture/pr59163.C 2013-12-02 16:21:27.124744788 +0100 @@ -0,0 +1,30 @@ +// PR target/59163 +// { dg-do run } + +struct A { float a[4]; }; +struct B { int b; A a; }; + +__attribute__((noinline, noclone)) void +bar (A &a) +{ + if (a.a[0] != 36.0f || a.a[1] != 42.0f || a.a[2] != 48.0f || a.a[3] != 54.0f) + __builtin_abort (); +} + +__attribute__((noinline, noclone)) void +foo (A &a) +{ + int i; + A c = a; + for (i = 0; i < 4; i++) + c.a[i] *= 6.0f; + a = c; + bar (a); +} + +int +main () +{ + B b = { 5, { 6, 7, 8, 9 } }; + foo (b.a); +} Jakub