Remove the duplicated AVX2/AVX512 vec_dup patterns and use the remaining vec_dup patterns with a subreg operand instead. With this change, the code generated for gcc.target/i386/avx2-vbroadcastss_ps256-1.c changes as follows:
avx2_test: .cfi_startproc - vmovaps x(%rip), %xmm1 - vbroadcastss %xmm1, %ymm0 + vbroadcastss x(%rip), %ymm0 vmovaps %ymm0, y(%rip) vzeroupper ret .cfi_endproc gcc.target/i386/avx512vl-vbroadcast-3.c is changed by @@ -113,7 +113,7 @@ f10: .cfi_startproc vmovaps %ymm0, %ymm16 vpermilps $85, %ymm16, %ymm16 - vbroadcastss %xmm16, %ymm16 + vshuff32x4 $0x0, %ymm16, %ymm16, %ymm16 vzeroupper ret .cfi_endproc @@ -153,8 +153,7 @@ f12: f13: .LFB12: .cfi_startproc - vmovaps (%rdi), %ymm16 - vbroadcastss %xmm16, %ymm16 + vbroadcastss (%rdi), %ymm16 vzeroupper ret .cfi_endproc OK for trunk? Thanks. H.J. -- gcc/ * config/i386/i386-builtin.def: Replace CODE_FOR_avx2_vec_dupv4sf, CODE_FOR_avx2_vec_dupv8sf and CODE_FOR_avx2_vec_dupv4df with CODE_FOR_vec_dupv4sf, CODE_FOR_vec_dupv8sf and CODE_FOR_vec_dupv4df, respectively. * config/i386/i386.c (expand_vec_perm_1): Use subreg with vec_dup. * config/i386/i386.md (SF to DF splitter): Replace gen_avx512f_vec_dupv16sf_1 with gen_avx512f_vec_dupv16sf. * config/i386/sse.md (VF48_AVX512VL): New. (avx2_vec_dup<mode>): Removed. (avx2_vec_dupv8sf_1): Likewise. (avx512f_vec_dup<mode>_1): Likewise. (avx2_pbroadcast<mode>_1): Likewise. (avx2_vec_dupv4df): Likewise. (<avx512>_vec_dup<mode>_1): Likewise. (*avx_vperm_broadcast_<mode>): Replace gen_avx2_vec_dupv8sf with gen_vec_dupv8sf. gcc/testsuite/ * gcc.target/i386/avx2-vbroadcastss_ps256-1.c: Updated. * gcc.target/i386/avx512vl-vbroadcast-3.c: Likewise. 
--- gcc/config/i386/i386-builtin.def | 6 +- gcc/config/i386/i386.c | 57 ++++++++++--- gcc/config/i386/i386.md | 2 +- gcc/config/i386/sse.md | 83 +------------------ .../i386/avx2-vbroadcastss_ps256-1.c | 3 +- .../gcc.target/i386/avx512vl-vbroadcast-3.c | 5 +- 6 files changed, 56 insertions(+), 100 deletions(-) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index df0f7e975ac..d217add8ee2 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -1194,9 +1194,9 @@ BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_ BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI) BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI) BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI) -BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF) -BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF) -BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF) +BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF) +BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF) +BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF) BDESC 
(OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI) BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT) BDESC (OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 963c7fcbb34..6b95d774ad1 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -45963,28 +45963,41 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) { /* Use vpbroadcast{b,w,d}. */ rtx (*gen) (rtx, rtx) = NULL; + machine_mode smode = VOIDmode; switch (d->vmode) { case E_V64QImode: if (TARGET_AVX512BW) - gen = gen_avx512bw_vec_dupv64qi_1; + { + smode = V16QImode; + gen = gen_avx512bw_vec_dupv64qi; + } break; case E_V32QImode: - gen = gen_avx2_pbroadcastv32qi_1; + smode = V16QImode; + gen = gen_avx2_pbroadcastv32qi; break; case E_V32HImode: if (TARGET_AVX512BW) - gen = gen_avx512bw_vec_dupv32hi_1; + { + smode = V8HImode; + gen = gen_avx512bw_vec_dupv32hi; + } break; case E_V16HImode: - gen = gen_avx2_pbroadcastv16hi_1; + smode = V8HImode; + gen = gen_avx2_pbroadcastv16hi; break; case E_V16SImode: if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv16si_1; + { + smode = V4SImode; + gen = gen_avx512f_vec_dupv16si; + } break; case E_V8SImode: - gen = gen_avx2_pbroadcastv8si_1; + smode = V4SImode; + gen = gen_avx2_pbroadcastv8si; break; case E_V16QImode: gen = gen_avx2_pbroadcastv16qi; @@ -45993,19 +46006,25 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) gen = gen_avx2_pbroadcastv8hi; break; case E_V16SFmode: + smode = SFmode; if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv16sf_1; + gen = gen_avx512f_vec_dupv16sf; break; case E_V8SFmode: - gen = gen_avx2_vec_dupv8sf_1; + smode = SFmode; + gen = gen_vec_dupv8sf; break; case E_V8DFmode: + smode 
= DFmode; if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv8df_1; + gen = gen_avx512f_vec_dupv8df; break; case E_V8DImode: if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv8di_1; + { + smode = V2DImode; + gen = gen_avx512f_vec_dupv8di; + } break; /* For other modes prefer other shuffles this function creates. */ default: break; @@ -46013,7 +46032,23 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (gen != NULL) { if (!d->testing_p) - emit_insn (gen (d->target, d->op0)); + { + if (smode == VOIDmode) + emit_insn (gen (d->target, d->op0)); + else + { + rtx op = d->op0; + unsigned int oppos = 0; + if (SUBREG_P (op)) + { + op = SUBREG_REG (op); + oppos = SUBREG_BYTE (op); + } + emit_insn (gen (d->target, + gen_rtx_SUBREG (smode, op, + oppos))); + } + } return true; } } diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 7fb2b144f47..4a6fa077db5 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4399,7 +4399,7 @@ else { rtx tmp = lowpart_subreg (V16SFmode, operands[3], V4SFmode); - emit_insn (gen_avx512f_vec_dupv16sf_1 (tmp, tmp)); + emit_insn (gen_avx512f_vec_dupv16sf (tmp, tmp)); } } else diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ee73e1fdf80..90a700c154a 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -7117,42 +7117,6 @@ (set_attr "prefix" "orig,maybe_evex") (set_attr "mode" "SF")]) -(define_insn "avx2_vec_dup<mode>" - [(set (match_operand:VF1_128_256 0 "register_operand" "=v") - (vec_duplicate:VF1_128_256 - (vec_select:SF - (match_operand:V4SF 1 "register_operand" "v") - (parallel [(const_int 0)]))))] - "TARGET_AVX2" - "vbroadcastss\t{%1, %0|%0, %1}" - [(set_attr "type" "sselog1") - (set_attr "prefix" "maybe_evex") - (set_attr "mode" "<MODE>")]) - -(define_insn "avx2_vec_dupv8sf_1" - [(set (match_operand:V8SF 0 "register_operand" "=v") - (vec_duplicate:V8SF - (vec_select:SF - (match_operand:V8SF 1 "register_operand" "v") - (parallel [(const_int 0)]))))] - "TARGET_AVX2" - 
"vbroadcastss\t{%x1, %0|%0, %x1}" - [(set_attr "type" "sselog1") - (set_attr "prefix" "maybe_evex") - (set_attr "mode" "V8SF")]) - -(define_insn "avx512f_vec_dup<mode>_1" - [(set (match_operand:VF_512 0 "register_operand" "=v") - (vec_duplicate:VF_512 - (vec_select:<ssescalarmode> - (match_operand:VF_512 1 "register_operand" "v") - (parallel [(const_int 0)]))))] - "TARGET_AVX512F" - "vbroadcast<bcstscalarsuff>\t{%x1, %0|%0, %x1}" - [(set_attr "type" "sselog1") - (set_attr "prefix" "evex") - (set_attr "mode" "<MODE>")]) - ;; Although insertps takes register source, we prefer ;; unpcklps with register source since it is shorter. (define_insn "*vec_concatv2sf_sse4_1" @@ -17918,24 +17882,6 @@ (set_attr "prefix" "vex,evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx2_pbroadcast<mode>_1" - [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v") - (vec_duplicate:VI_256 - (vec_select:<ssescalarmode> - (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v") - (parallel [(const_int 0)]))))] - "TARGET_AVX2" - "@ - vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1} - vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1} - vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1} - vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}" - [(set_attr "isa" "*,*,<pbroadcast_evex_isa>,<pbroadcast_evex_isa>") - (set_attr "type" "ssemov") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "vex") - (set_attr "mode" "<sseinsnmode>")]) - (define_insn "<avx2_avx512>_permvar<mode><mask_name>" [(set (match_operand:VI48F_256_512 0 "register_operand" "=v") (unspec:VI48F_256_512 @@ -18111,32 +18057,6 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_insn "avx2_vec_dupv4df" - [(set (match_operand:V4DF 0 "register_operand" "=v") - (vec_duplicate:V4DF - (vec_select:DF - (match_operand:V2DF 1 "register_operand" "v") - (parallel [(const_int 0)]))))] - "TARGET_AVX2" - "vbroadcastsd\t{%1, %0|%0, %1}" - [(set_attr "type" "sselog1") - (set_attr "prefix" "maybe_evex") - (set_attr "mode" 
"V4DF")]) - -(define_insn "<avx512>_vec_dup<mode>_1" - [(set (match_operand:VI_AVX512BW 0 "register_operand" "=v,v") - (vec_duplicate:VI_AVX512BW - (vec_select:<ssescalarmode> - (match_operand:VI_AVX512BW 1 "nonimmediate_operand" "v,m") - (parallel [(const_int 0)]))))] - "TARGET_AVX512F" - "@ - vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1} - vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %<iptr>1}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - (define_insn "<avx512>_vec_dup<mode><mask_name>" [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v") (vec_duplicate:V48_AVX512VL @@ -18545,8 +18465,7 @@ or VSHUFF128. */ gcc_assert (<MODE>mode == V8SFmode); if ((mask & 1) == 0) - emit_insn (gen_avx2_vec_dupv8sf (op0, - gen_lowpart (V4SFmode, op0))); + emit_insn (gen_vec_dupv8sf (op0, gen_lowpart (V4SFmode, op0))); else emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0, GEN_INT (4), GEN_INT (5), diff --git a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c index dfac3916b08..3ff7497aa21 100644 --- a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c +++ b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastss_ps256-1.c @@ -1,6 +1,7 @@ /* { dg-do compile } */ /* { dg-options "-mavx2 -O2" } */ -/* { dg-final { scan-assembler "vbroadcastss\[ \\t\]+\[^\n\]*%xmm\[0-9\]" } } */ +/* { dg-final { scan-assembler "vbroadcastss\[ \\t\]+\[^\n\]*%ymm\[0-9\]" } } */ +/* { dg-final { scan-assembler-not "vmovaps\[\t \]*\[^,\]*,%xmm\[0-9\]" } } */ #include <immintrin.h> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c index 7233398cd64..1c62364dac4 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c @@ -151,8 +151,8 @@ f16 (V2 *x) } /* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%\[re\]di\[^\n\r]*%xmm16" 4 } } 
*/ -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%xmm16\[^\n\r]*%ymm16" 3 } } */ -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%\[re\]di\[^\n\r]*%ymm16" 3 } } */ +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%xmm16\[^\n\r]*%ymm16" 1 } } */ +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%\[re\]di\[^\n\r]*%ymm16" 4 } } */ /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$170\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ @@ -160,3 +160,4 @@ f16 (V2 *x) /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 1 } } */ /* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */ /* { dg-final { scan-assembler-times "vshuff32x4\[^\n\r]*\\\$3\[^\n\r]*%ymm16\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */ +/* { dg-final { scan-assembler-times "vshuff32x4\[^\n\r]*\\\$0\[^\n\r]*%ymm16\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 1 } } */ -- 2.17.2