On Mon, May 25, 2020 at 8:27 AM Richard Biener <rguent...@suse.de> wrote: > > On May 25, 2020 8:12:12 AM GMT+02:00, Uros Bizjak <ubiz...@gmail.com> wrote: > >On Mon, May 25, 2020 at 7:53 AM Hongtao Liu <crazy...@gmail.com> wrote: > > > >> > We have to introduce a new expander, that will have conforming mode > >of > >> > output operand (V2SF) and will produce RTX that will match > >> > *float<floatunssuffix>v2div2sf2. A paradoxical output subreg from > >> > V2SFmode V4SFmode is needed, generated by simplify_gen_subreg as is > >> > the case with paradoxical input subreg. > >> > >> Problem`is simplify_gen_subreg (V4SFmode, operands[0], V2SFmode, 0) > >> will return NULL since > >> ---- > >> 948 /* Subregs involving floating point modes are not allowed to > >> 949 change size. Therefore (subreg:DI (reg:DF) 0) is fine, but > >> 950 (subreg:SI (reg:DF) 0) isn't. */ > > > >But, we are not changing size, we are still operating with SFmode. It > >looks to me that this limitation is too strict, the intention is to > >not expand scalar SFmode to DFmode. > > I guess so. The test probably wants to tes the component mode.
There is already some fishy x86 specific workaround in place, which I took the liberty to extend to vector modes. The attached patch that exercises this code works as expected, and produces expected code. Liu, can you please test the attached patch? Richi, is the middle end part OK? Uros.
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index fa123788a8e..b873498f3ab 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -1649,9 +1649,9 @@ BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_not BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_notruncv4dfv4si2_mask, "__builtin_ia32_cvtpd2udq256_mask", IX86_BUILTIN_CVTPD2UDQ256_MASK, UNKNOWN, (int) V4SI_FTYPE_V4DF_V4SI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_notruncv2dfv2si2_mask, "__builtin_ia32_cvtpd2udq128_mask", IX86_BUILTIN_CVTPD2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V2DF_V4SI_UQI) BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv4sfv4di2_mask, "__builtin_ia32_cvttps2qq256_mask", IX86_BUILTIN_CVTTPS2QQ256, UNKNOWN, (int) V4DI_FTYPE_V4SF_V4DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2qq128_mask", IX86_BUILTIN_CVTTPS2QQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512dq_fix_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2qq128_mask", IX86_BUILTIN_CVTTPS2QQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI) BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv4sfv4di2_mask, "__builtin_ia32_cvttps2uqq256_mask", IX86_BUILTIN_CVTTPS2UQQ256, UNKNOWN, (int) V4DI_FTYPE_V4SF_V4DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2uqq128_mask", IX86_BUILTIN_CVTTPS2UQQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512dq_fixuns_truncv2sfv2di2_mask, "__builtin_ia32_cvttps2uqq128_mask", IX86_BUILTIN_CVTTPS2UQQ128, UNKNOWN, (int) V2DI_FTYPE_V4SF_V2DI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv8sfv8si2_mask, "__builtin_ia32_cvttps2dq256_mask", IX86_BUILTIN_CVTTPS2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fix_truncv4sfv4si2_mask, "__builtin_ia32_cvttps2dq128_mask", IX86_BUILTIN_CVTTPS2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SF_V4SI_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_ufix_truncv8sfv8si2_mask, "__builtin_ia32_cvttps2udq256_mask", IX86_BUILTIN_CVTTPS2UDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF_V8SI_UQI) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index cee2b2890a5..fde65391d7d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -5795,7 +5795,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_expand "float<floatunssuffix>v2div2sf2" +(define_expand "avx512dq_float<floatunssuffix>v2div2sf2" [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_concat:V4SF (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) @@ -5803,7 +5803,7 @@ "TARGET_AVX512DQ && TARGET_AVX512VL" "operands[2] = CONST0_RTX (V2SFmode);") -(define_insn "*float<floatunssuffix>v2div2sf2" +(define_insn "*avx512dq_float<floatunssuffix>v2div2sf2" [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_concat:V4SF (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) @@ -5814,6 +5814,17 @@ (set_attr "prefix" "evex") (set_attr "mode" "V4SF")]) +(define_expand "float<floatunssuffix>v2div2sf2" + [(set (match_operand:V2SF 0 "register_operand") + (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand")))] + "TARGET_AVX512DQ && TARGET_AVX512VL" +{ + operands[0] = simplify_gen_subreg (V4SFmode, operands[0], V2SFmode, 0); + emit_insn (gen_avx512dq_float<floatunssuffix>v2div2sf2 + (operands[0], operands[1])); + DONE; +}) + (define_mode_attr vpckfloat_concat_mode [(V8DI "v16sf") (V4DI "v8sf") (V2DI "v8sf")]) (define_mode_attr vpckfloat_temp_mode @@ -5830,7 +5841,12 @@ { rtx r1 = gen_reg_rtx (<vpckfloat_temp_mode>mode); rtx r2 = gen_reg_rtx (<vpckfloat_temp_mode>mode); - rtx (*gen) (rtx, rtx) = gen_float<floatunssuffix><mode><vpckfloat_op_mode>2; + rtx (*gen) (rtx, rtx); + + if (<MODE>mode == V2DImode) + gen = gen_avx512dq_float<floatunssuffix>v2div2sf2; + else + gen = gen_float<floatunssuffix><mode><vpckfloat_op_mode>2; emit_insn (gen (r1, operands[1])); emit_insn (gen (r2, operands[2])); if (<MODE>mode == V2DImode) @@ -6217,7 +6233,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseintvecmode3>")]) -(define_insn "fix<fixunssuffix>_truncv2sfv2di2<mask_name>" +(define_insn "avx512dq_fix<fixunssuffix>_truncv2sfv2di2<mask_name>" [(set (match_operand:V2DI 0 "register_operand" "=v") (any_fix:V2DI (vec_select:V2SF @@ -6229,6 +6245,18 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_expand "fix<fixunssuffix>_truncv2sfv2di2" + [(set (match_operand:V2DI 0 "register_operand") + (any_fix:V2DI + (match_operand:V2SF 1 "register_operand")))] + "TARGET_AVX512DQ && TARGET_AVX512VL" +{ + operands[1] = simplify_gen_subreg (V4SFmode, operands[1], V2SFmode, 0); + emit_insn (gen_avx512dq_fix<fixunssuffix>_truncv2sfv2di2 + (operands[0], operands[1])); + DONE; +}) + (define_mode_attr vunpckfixt_mode [(V16SF "V8DI") (V8SF "V4DI") (V4SF "V2DI")]) (define_mode_attr vunpckfixt_model @@ -6243,14 +6271,18 @@ "TARGET_AVX512DQ" { rtx tem = operands[1]; + rtx (*gen) (rtx, rtx); + if (<MODE>mode != V4SFmode) { tem = gen_reg_rtx (<ssehalfvecmode>mode); emit_insn (gen_vec_extract_lo_<vunpckfixt_extract_mode> (tem, operands[1])); + gen = gen_fix<fixunssuffix>_trunc<ssehalfvecmodelower><vunpckfixt_model>2; } - rtx (*gen) (rtx, rtx) - = gen_fix<fixunssuffix>_trunc<ssehalfvecmodelower><vunpckfixt_model>2; + else + gen = gen_avx512dq_fix<fixunssuffix>_truncv2sfv2di2; + emit_insn (gen (operands[0], tem)); DONE; }) @@ -6262,19 +6294,22 @@ "TARGET_AVX512DQ" { rtx tem; + rtx (*gen) (rtx, rtx); + if (<MODE>mode != V4SFmode) { tem = gen_reg_rtx (<ssehalfvecmode>mode); emit_insn (gen_vec_extract_hi_<vunpckfixt_extract_mode> (tem, operands[1])); + gen = gen_fix<fixunssuffix>_trunc<ssehalfvecmodelower><vunpckfixt_model>2; } else { tem = gen_reg_rtx (V4SFmode); emit_insn (gen_avx_vpermilv4sf (tem, operands[1], GEN_INT (0x4e))); + gen = gen_avx512dq_fix<fixunssuffix>_truncv2sfv2di2; } - rtx (*gen) (rtx, rtx) - = gen_fix<fixunssuffix>_trunc<ssehalfvecmodelower><vunpckfixt_model>2; + emit_insn (gen (operands[0], tem)); DONE; }) diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c index c49628040f8..cde312e1bae 100644 --- a/gcc/emit-rtl.c +++ b/gcc/emit-rtl.c @@ -939,11 +939,15 @@ validate_subreg (machine_mode omode, machine_mode imode, && GET_MODE_INNER (imode) == omode) ; /* ??? x86 sse code makes heavy use of *paradoxical* vector subregs, - i.e. (subreg:V4SF (reg:SF) 0). This surely isn't the cleanest way to - represent this. It's questionable if this ought to be represented at - all -- why can't this all be hidden in post-reload splitters that make - arbitrarily mode changes to the registers themselves. */ - else if (VECTOR_MODE_P (omode) && GET_MODE_INNER (omode) == imode) + i.e. (subreg:V4SF (reg:SF) 0) or (subreg:V4SF (reg:V2SF) 0). This + surely isn't the cleanest way to represent this. It's questionable + if this ought to be represented at all -- why can't this all be hidden + in post-reload splitters that make arbitrarily mode changes to the + registers themselves. */ + else if (VECTOR_MODE_P (omode) + && (GET_MODE_INNER (omode) == imode + || (VECTOR_MODE_P (imode) + && (GET_MODE_INNER (omode) == GET_MODE_INNER (imode))))) ; /* Subregs involving floating point modes are not allowed to change size. Therefore (subreg:DI (reg:DF) 0) is fine, but