On Thu, Oct 28, 2021 at 10:26 AM Hongtao Liu <crazy...@gmail.com> wrote: > > On Mon, Oct 25, 2021 at 4:24 PM liuhongt <hongtao....@intel.com> wrote: > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ok for trunk? > > > I'm going to check in this patch if there's no objection. Committed. > > gcc/ChangeLog: > > > > PR target/102464 > > * config/i386/i386-builtin-types.def (V8HF_FTYPE_V8HF): New > > function type. > > (V16HF_FTYPE_V16HF): Ditto. > > (V32HF_FTYPE_V32HF): Ditto. > > (V8HF_FTYPE_V8HF_ROUND): Ditto. > > (V16HF_FTYPE_V16HF_ROUND): Ditto. > > (V32HF_FTYPE_V32HF_ROUND): Ditto. > > * config/i386/i386-builtin.def ( IX86_BUILTIN_FLOORPH, > > IX86_BUILTIN_CEILPH, IX86_BUILTIN_TRUNCPH, > > IX86_BUILTIN_FLOORPH256, IX86_BUILTIN_CEILPH256, > > IX86_BUILTIN_TRUNCPH256, IX86_BUILTIN_FLOORPH512, > > IX86_BUILTIN_CEILPH512, IX86_BUILTIN_TRUNCPH512): New builtin. > > * config/i386/i386-builtins.c > > (ix86_builtin_vectorized_function): Enable vectorization for > > HFmode FLOOR/CEIL/TRUNC operation. > > * config/i386/i386-expand.c (ix86_expand_args_builtin): Handle > > new builtins. > > * config/i386/sse.md (rint<mode>2, nearbyint<mode>2): Extend > > to vector HFmodes. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/pr102464-vrndscaleph.c: New test. > > --- > > gcc/config/i386/i386-builtin-types.def | 7 ++ > > gcc/config/i386/i386-builtin.def | 11 ++ > > gcc/config/i386/i386-builtins.c | 42 +++++++ > > gcc/config/i386/i386-expand.c | 3 + > > gcc/config/i386/sse.md | 12 +- > > .../gcc.target/i386/pr102464-vrndscaleph.c | 115 ++++++++++++++++++ > > 6 files changed, 184 insertions(+), 6 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c > > > > diff --git a/gcc/config/i386/i386-builtin-types.def > > b/gcc/config/i386/i386-builtin-types.def > > index 4c355c587b5..e33f06ab30b 100644 > > --- a/gcc/config/i386/i386-builtin-types.def > > +++ b/gcc/config/i386/i386-builtin-types.def > > @@ -1380,3 +1380,10 @@ DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT) > > DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, UHI, INT) > > DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT) > > DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT) > > + > > +DEF_FUNCTION_TYPE (V8HF, V8HF) > > +DEF_FUNCTION_TYPE (V16HF, V16HF) > > +DEF_FUNCTION_TYPE (V32HF, V32HF) > > +DEF_FUNCTION_TYPE_ALIAS (V8HF_FTYPE_V8HF, ROUND) > > +DEF_FUNCTION_TYPE_ALIAS (V16HF_FTYPE_V16HF, ROUND) > > +DEF_FUNCTION_TYPE_ALIAS (V32HF_FTYPE_V32HF, ROUND) > > diff --git a/gcc/config/i386/i386-builtin.def > > b/gcc/config/i386/i386-builtin.def > > index 99217d08d37..d9eee3f373c 100644 > > --- a/gcc/config/i386/i386-builtin.def > > +++ b/gcc/config/i386/i386-builtin.def > > @@ -958,6 +958,10 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, > > CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__buil > > BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2, > > "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) > > V2DF_FTYPE_V2DF) > > BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2_vec_pack_sfix, > > "__builtin_ia32_roundpd_az_vec_pack_sfix", > > IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF) > > > > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, > > CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_floorph", > > IX86_BUILTIN_FLOORPH, (enum rtx_code) ROUND_FLOOR, (int) > > V8HF_FTYPE_V8HF_ROUND) > > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, > > CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_ceilph", > > IX86_BUILTIN_CEILPH, (enum rtx_code) ROUND_CEIL, (int) > > V8HF_FTYPE_V8HF_ROUND) > > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, > > CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_truncph", > > IX86_BUILTIN_TRUNCPH, (enum rtx_code) ROUND_TRUNC, (int) > > V8HF_FTYPE_V8HF_ROUND) > > + > > BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, > > "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) > > ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND) > > BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, > > "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, > > (int) V4SF_FTYPE_V4SF_ROUND) > > BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, > > "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) > > ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND) > > @@ -1090,6 +1094,10 @@ BDESC (OPTION_MASK_ISA_AVX, 0, > > CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia3 > > BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, > > "__builtin_ia32_floorpd_vec_pack_sfix256", > > IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) > > V8SI_FTYPE_V4DF_V4DF_ROUND) > > BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, > > "__builtin_ia32_ceilpd_vec_pack_sfix256", > > IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) > > V8SI_FTYPE_V4DF_V4DF_ROUND) > > > > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, > > CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_floorph256", > > IX86_BUILTIN_FLOORPH256, (enum rtx_code) ROUND_FLOOR, (int) > > V16HF_FTYPE_V16HF_ROUND) > > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, > > CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_ceilph256", > > IX86_BUILTIN_CEILPH256, (enum rtx_code) ROUND_CEIL, (int) > > V16HF_FTYPE_V16HF_ROUND) > > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, > > CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_truncph256", > > IX86_BUILTIN_TRUNCPH256, (enum rtx_code) ROUND_TRUNC, (int) > > V16HF_FTYPE_V16HF_ROUND) > > + > > BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, > > "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) > > ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND) > > BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, > > "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) > > ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND) > > BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, > > "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) > > ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND) > > @@ -1528,6 +1536,9 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, > > CODE_FOR_copysignv8df3, "__builtin_ia32_copy > > BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_sqrtv8df2, > > "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, UNKNOWN, (int) > > V8DF_FTYPE_V8DF) > > BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sqrtv16sf2, > > "__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) > > V16SF_FTYPE_V16SF) > > BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf, > > "__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) > > V16SF_FTYPE_V16SF) > > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, > > "__builtin_ia32_floorph512", IX86_BUILTIN_FLOORPH512, (enum rtx_code) > > ROUND_FLOOR, (int) V32HF_FTYPE_V32HF_ROUND) > > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, > > "__builtin_ia32_ceilph512", IX86_BUILTIN_CEILPH512, (enum rtx_code) > > ROUND_CEIL, (int) V32HF_FTYPE_V32HF_ROUND) > > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, > > "__builtin_ia32_truncph512", IX86_BUILTIN_TRUNCPH512, (enum rtx_code) > > ROUND_TRUNC, (int) V32HF_FTYPE_V32HF_ROUND) > > BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, > > "__builtin_ia32_floorps512", IX86_BUILTIN_FLOORPS512, (enum rtx_code) > > ROUND_FLOOR, (int) V16SF_FTYPE_V16SF_ROUND) > > BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, > > "__builtin_ia32_ceilps512", IX86_BUILTIN_CEILPS512, (enum rtx_code) > > ROUND_CEIL, (int) V16SF_FTYPE_V16SF_ROUND) > > BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, > > "__builtin_ia32_truncps512", IX86_BUILTIN_TRUNCPS512, (enum rtx_code) > > ROUND_TRUNC, (int) V16SF_FTYPE_V16SF_ROUND) > > diff --git a/gcc/config/i386/i386-builtins.c > > b/gcc/config/i386/i386-builtins.c > > index 11ce58b2574..0fb14b55712 100644 > > --- a/gcc/config/i386/i386-builtins.c > > +++ b/gcc/config/i386/i386-builtins.c > > @@ -1652,6 +1652,20 @@ ix86_builtin_vectorized_function (unsigned int fn, > > tree type_out, > > else if (out_n == 16 && in_n == 16) > > return ix86_get_builtin (IX86_BUILTIN_FLOORPS512); > > } > > + if (out_mode == HFmode && in_mode == HFmode) > > + { > > + /* V8HF/V16HF is supported in ix86_vector_mode_supported_p > > + under TARGET_AVX512FP16, TARGET_AVX512VL is needed here. */ > > + if (out_n < 32 && !TARGET_AVX512VL) > > + break; > > + > > + if (out_n == 8 && in_n == 8) > > + return ix86_get_builtin (IX86_BUILTIN_FLOORPH); > > + else if (out_n == 16 && in_n == 16) > > + return ix86_get_builtin (IX86_BUILTIN_FLOORPH256); > > + else if (out_n == 32 && in_n == 32) > > + return ix86_get_builtin (IX86_BUILTIN_FLOORPH512); > > + } > > break; > > > > CASE_CFN_CEIL: > > @@ -1677,6 +1691,20 @@ ix86_builtin_vectorized_function (unsigned int fn, > > tree type_out, > > else if (out_n == 16 && in_n == 16) > > return ix86_get_builtin (IX86_BUILTIN_CEILPS512); > > } > > + if (out_mode == HFmode && in_mode == HFmode) > > + { > > + /* V8HF/V16HF is supported in ix86_vector_mode_supported_p > > + under TARGET_AVX512FP16, TARGET_AVX512VL is needed here. */ > > + if (out_n < 32 && !TARGET_AVX512VL) > > + break; > > + > > + if (out_n == 8 && in_n == 8) > > + return ix86_get_builtin (IX86_BUILTIN_CEILPH); > > + else if (out_n == 16 && in_n == 16) > > + return ix86_get_builtin (IX86_BUILTIN_CEILPH256); > > + else if (out_n == 32 && in_n == 32) > > + return ix86_get_builtin (IX86_BUILTIN_CEILPH512); > > + } > > break; > > > > CASE_CFN_TRUNC: > > @@ -1702,6 +1730,20 @@ ix86_builtin_vectorized_function (unsigned int fn, > > tree type_out, > > else if (out_n == 16 && in_n == 16) > > return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512); > > } > > + if (out_mode == HFmode && in_mode == HFmode) > > + { > > + /* V8HF/V16HF is supported in ix86_vector_mode_supported_p > > + under TARGET_AVX512FP16, TARGET_AVX512VL is needed here. */ > > + if (out_n < 32 && !TARGET_AVX512VL) > > + break; > > + > > + if (out_n == 8 && in_n == 8) > > + return ix86_get_builtin (IX86_BUILTIN_TRUNCPH); > > + else if (out_n == 16 && in_n == 16) > > + return ix86_get_builtin (IX86_BUILTIN_TRUNCPH256); > > + else if (out_n == 32 && in_n == 32) > > + return ix86_get_builtin (IX86_BUILTIN_TRUNCPH512); > > + } > > break; > > > > CASE_CFN_FMA: > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > > index 56dd99b5511..15c4bc375d5 100644 > > --- a/gcc/config/i386/i386-expand.c > > +++ b/gcc/config/i386/i386-expand.c > > @@ -9423,6 +9423,9 @@ ix86_expand_args_builtin (const struct > > builtin_description *d, > > case V4SF_FTYPE_V4SF_ROUND: > > case V8SF_FTYPE_V8SF_ROUND: > > case V16SF_FTYPE_V16SF_ROUND: > > + case V8HF_FTYPE_V8HF_ROUND: > > + case V16HF_FTYPE_V16HF_ROUND: > > + case V32HF_FTYPE_V32HF_ROUND: > > case V4SI_FTYPE_V4SF_ROUND: > > case V8SI_FTYPE_V8SF_ROUND: > > case V16SI_FTYPE_V16SF_ROUND: > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > > index fbf056bf9e6..00ee7b58ef3 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -21758,18 +21758,18 @@ (define_insn "ptesttf2" > > (set_attr "mode" "TI")]) > > > > (define_expand "nearbyint<mode>2" > > - [(set (match_operand:VF 0 "register_operand") > > - (unspec:VF > > - [(match_operand:VF 1 "vector_operand") > > + [(set (match_operand:VFH 0 "register_operand") > > + (unspec:VFH > > + [(match_operand:VFH 1 "vector_operand") > > (match_dup 2)] > > UNSPEC_ROUND))] > > "TARGET_SSE4_1" > > "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);") > > > > (define_expand "rint<mode>2" > > - [(set (match_operand:VF 0 "register_operand") > > - (unspec:VF > > - [(match_operand:VF 1 "vector_operand") > > + [(set (match_operand:VFH 0 "register_operand") > > + (unspec:VFH > > + [(match_operand:VFH 1 "vector_operand") > > (match_dup 2)] > > UNSPEC_ROUND))] > > "TARGET_SSE4_1" > > diff --git a/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c > > b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c > > new file mode 100644 > > index 00000000000..a76d9e7e376 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c > > @@ -0,0 +1,115 @@ > > +/* PR target/102464. */ > > +/* { dg-do compile } */ > > +/* { dg-options "-Ofast -mavx512fp16 -mavx512vl -mprefer-vector-width=512" > > } */ > > +#include<math.h> > > +void > > +foo (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 8; i++) > > + a[i] = floor (b[i]); > > +} > > + > > +void > > +foo1 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 8; i++) > > + a[i] = ceil (b[i]); > > +} > > + > > +void > > +foo2 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 8; i++) > > + a[i] = trunc (b[i]); > > +} > > + > > +void > > +foo3 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 8; i++) > > + a[i] = nearbyint (b[i]); > > +} > > + > > +void > > +foo4 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 8; i++) > > + a[i] = rint (b[i]); > > +} > > + > > +void > > +foo5 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 16; i++) > > + a[i] = floor (b[i]); > > +} > > + > > +void > > +foo6 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 16; i++) > > + a[i] = ceil (b[i]); > > +} > > + > > +void > > +foo7 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 16; i++) > > + a[i] = trunc (b[i]); > > +} > > + > > +void > > +foo8 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 16; i++) > > + a[i] = nearbyint (b[i]); > > +} > > + > > +void > > +foo9 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 16; i++) > > + a[i] = rint (b[i]); > > +} > > + > > +void > > +foo10 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 32; i++) > > + a[i] = floor (b[i]); > > +} > > + > > +void > > +foo11 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 32; i++) > > + a[i] = ceil (b[i]); > > +} > > + > > +void > > +foo12 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 32; i++) > > + a[i] = trunc (b[i]); > > +} > > + > > +void > > +foo13 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 32; i++) > > + a[i] = nearbyint (b[i]); > > +} > > + > > +void > > +foo14 (_Float16* __restrict a, _Float16* b) > > +{ > > + for (int i = 0; i != 32; i++) > > + a[i] = rint (b[i]); > > +} > > + > > +/* { dg-final { scan-assembler-not "vcvtsh2s\[sd\]" } } */ > > +/* { dg-final { scan-assembler-not "vcvtph2p\[sd\]" } } */ > > +/* { dg-final { scan-assembler-not "extendhfxf" } } */ > > +/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*xmm\[0-9\]" 5 } > > } */ > > +/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*ymm\[0-9\]" 5 } > > } */ > > +/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*zmm\[0-9\]" 5 } > > } */ > > -- > > 2.18.1 > > > > > -- > BR, > Hongtao
-- BR, Hongtao