Hi! This is an attempt to port my recent http://gcc.gnu.org/viewcvs?rev=204219&root=gcc&view=rev http://gcc.gnu.org/viewcvs?rev=205663&root=gcc&view=rev http://gcc.gnu.org/viewcvs?rev=206090&root=gcc&view=rev changes also to AVX512F. The motivation is to get:
#include <immintrin.h> __m512i foo (void *x, void *y) { __m512i a = _mm512_loadu_si512 (x); __m512i b = _mm512_loadu_si512 (y); return _mm512_add_epi32 (a, b); } use one of the unaligned memories directly as operand to the vpaddd instruction. The first hunk is needed so that we don't regress on say: #include <immintrin.h> __m512i z; __m512i foo (void *x, void *y, int k) { __m512i a = _mm512_mask_loadu_epi32 (z, k, x); __m512i b = _mm512_mask_loadu_epi32 (z, k, y); return _mm512_add_epi32 (a, b); } __m512i bar (void *x, void *y, int k) { __m512i a = _mm512_maskz_loadu_epi32 (k, x); __m512i b = _mm512_maskz_loadu_epi32 (k, y); return _mm512_add_epi32 (a, b); } Does it matter which of vmovdqu32 vs. vmovdqu64 is used if no masking/zeroing is performed (i.e. vmovdqu32 (%rax), %zmm0 vs. vmovdqu64 (%rax), %zmm0) for performance reasons (i.e. isn't there some reinterpretation penalty)? Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2014-01-03 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (avx512f_load<mode>_mask): Emit vmovup{s,d} or vmovdqu* for misaligned_operand. (<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>, <sse2_avx_avx512f>_loaddqu<mode><mask_name>): Handle <mask_applied>. * config/i386/i386.c (ix86_expand_special_args_builtin): Set aligned_mem for AVX512F masked aligned load and store builtins and for non-temporal moves. * gcc.target/i386/avx512f-vmovdqu32-1.c: Allow vmovdqu64 instead of vmovdqu32. --- gcc/config/i386/sse.md.jj 2014-01-02 20:11:49.000000000 +0100 +++ gcc/config/i386/sse.md 2014-01-02 21:59:06.706161064 +0100 @@ -786,8 +786,12 @@ (define_insn "avx512f_load<mode>_mask" { case MODE_V8DF: case MODE_V16SF: + if (misaligned_operand (operands[1], <MODE>mode)) + return "vmovu<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"; return "vmova<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"; default: + if (misaligned_operand (operands[1], <MODE>mode)) + return "vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"; return "vmovdqa<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"; } } @@ -936,11 +940,14 @@ (define_expand "<sse>_loadu<ssemodesuffi false, still emit UNSPEC_LOADU insn to honor user's request for misaligned load. */ if (TARGET_AVX - && misaligned_operand (operands[1], <MODE>mode) - /* FIXME: Revisit after AVX512F merge is completed. */ - && !<mask_applied>) + && misaligned_operand (operands[1], <MODE>mode)) { - emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1])); + rtx src = operands[1]; + if (<mask_applied>) + src = gen_rtx_VEC_MERGE (<MODE>mode, operands[1], + operands[2 * <mask_applied>], + operands[3 * <mask_applied>]); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], src)); DONE; } }) @@ -1046,11 +1053,14 @@ (define_expand "<sse2_avx_avx512f>_loadd false, still emit UNSPEC_LOADU insn to honor user's request for misaligned load. */ if (TARGET_AVX - && misaligned_operand (operands[1], <MODE>mode) - /* FIXME: Revisit after AVX512F merge is completed. */ - && !<mask_applied>) + && misaligned_operand (operands[1], <MODE>mode)) { - emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1])); + rtx src = operands[1]; + if (<mask_applied>) + src = gen_rtx_VEC_MERGE (<MODE>mode, operands[1], + operands[2 * <mask_applied>], + operands[3 * <mask_applied>]); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], src)); DONE; } }) --- gcc/config/i386/i386.c.jj 2014-01-02 14:44:07.000000000 +0100 +++ gcc/config/i386/i386.c 2014-01-02 21:48:23.204400654 +0100 @@ -34407,6 +34408,9 @@ ix86_expand_special_args_builtin (const case CODE_FOR_sse2_movntidi: case CODE_FOR_sse_movntq: case CODE_FOR_sse2_movntisi: + case CODE_FOR_avx512f_movntv16sf: + case CODE_FOR_avx512f_movntv8df: + case CODE_FOR_avx512f_movntv8di: aligned_mem = true; break; default: @@ -34431,6 +34435,24 @@ ix86_expand_special_args_builtin (const klass = load; memory = 0; break; + case VOID_FTYPE_PV8DF_V8DF_QI: + case VOID_FTYPE_PV16SF_V16SF_HI: + case VOID_FTYPE_PV8DI_V8DI_QI: + case VOID_FTYPE_PV16SI_V16SI_HI: + switch (icode) + { + /* These builtins and instructions require the memory + to be properly aligned. */ + case CODE_FOR_avx512f_storev16sf_mask: + case CODE_FOR_avx512f_storev16si_mask: + case CODE_FOR_avx512f_storev8df_mask: + case CODE_FOR_avx512f_storev8di_mask: + aligned_mem = true; + break; + default: + break; + } + /* FALLTHRU */ case VOID_FTYPE_PV8SF_V8SI_V8SF: case VOID_FTYPE_PV4DF_V4DI_V4DF: case VOID_FTYPE_PV4SF_V4SI_V4SF: @@ -34439,10 +34461,6 @@ ix86_expand_special_args_builtin (const case VOID_FTYPE_PV4DI_V4DI_V4DI: case VOID_FTYPE_PV4SI_V4SI_V4SI: case VOID_FTYPE_PV2DI_V2DI_V2DI: - case VOID_FTYPE_PV8DF_V8DF_QI: - case VOID_FTYPE_PV16SF_V16SF_HI: - case VOID_FTYPE_PV8DI_V8DI_QI: - case VOID_FTYPE_PV16SI_V16SI_HI: case VOID_FTYPE_PDOUBLE_V2DF_QI: case VOID_FTYPE_PFLOAT_V4SF_QI: nargs = 2; @@ -34459,6 +34477,19 @@ ix86_expand_special_args_builtin (const nargs = 3; klass = load; memory = 0; + switch (icode) + { + /* These builtins and instructions require the memory + to be properly aligned. */ + case CODE_FOR_avx512f_loadv16sf_mask: + case CODE_FOR_avx512f_loadv16si_mask: + case CODE_FOR_avx512f_loadv8df_mask: + case CODE_FOR_avx512f_loadv8di_mask: + aligned_mem = true; + break; + default: + break; + } break; case VOID_FTYPE_UINT_UINT_UINT: case VOID_FTYPE_UINT64_UINT_UINT: --- gcc/testsuite/gcc.target/i386/avx512f-vmovdqu32-1.c.jj 2013-12-31 12:51:09.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-vmovdqu32-1.c 2014-01-03 01:46:14.714705390 +0100 @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ -/* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\n\]*\\)\[^\n\]*%zmm\[0-9\]\[^\{\]" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[36\]\[24\]\[ \\t\]+\[^\n\]*\\)\[^\n\]*%zmm\[0-9\]\[^\{\]" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\n\]*\\)\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\[^\{\]" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\n\]*\\)\[^\n\]*%zmm\[0-9\]\{%k\[1-7\]\}\{z\}" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu32\[ \\t\]+\[^\n\]*%zmm\[0-9\]\[^\n\]*\\)\[^\{\]" 1 } } */ Jakub