Re: [PATCH] i386: Fix some -mavx512vl -mno-avx512bw bugs [PR99321]

Uros Bizjak via Gcc-patches Sat, 06 Mar 2021 02:19:32 -0800

On Fri, Mar 5, 2021 at 9:51 PM Jakub Jelinek <[email protected]> wrote:
>
> Hi!
>
> As I wrote in the mail with the previous PR99321 fix, we have various
> bugs where we emit instructions that need avx512bw and avx512vl
> ISAs when compiling with -mavx512vl -mno-avx512bw.
>
> Without the following patch,
> /* PR target/99321 */
> /* Would need some effective target for GNU as that supports 
> -march=+noavx512bw etc. */
> /* { dg-do assemble } */
> /* { dg-options "-O2 -mavx512vl -mno-avx512bw -Wa,-march=+noavx512bw" } */
>
> #include <x86intrin.h>
>
> typedef unsigned char V1 __attribute__((vector_size (16)));
> typedef unsigned char V2 __attribute__((vector_size (32)));
> typedef unsigned short V3 __attribute__((vector_size (16)));
> typedef unsigned short V4 __attribute__((vector_size (32)));
>
> void f1 (void) { register V1 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a += b; __asm ("" : : "v" (a)); }
> void f2 (void) { register V2 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a += b; __asm ("" : : "v" (a)); }
> void f3 (void) { register V3 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a += b; __asm ("" : : "v" (a)); }
> void f4 (void) { register V4 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a += b; __asm ("" : : "v" (a)); }
> void f5 (void) { register V1 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a -= b; __asm ("" : : "v" (a)); }
> void f6 (void) { register V2 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a -= b; __asm ("" : : "v" (a)); }
> void f7 (void) { register V3 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a -= b; __asm ("" : : "v" (a)); }
> void f8 (void) { register V4 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a -= b; __asm ("" : : "v" (a)); }
> void f9 (void) { register V3 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a *= b; __asm ("" : : "v" (a)); }
> void f10 (void) { register V4 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a *= b; __asm ("" : : "v" (a)); }
> void f11 (void) { register V1 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V1) _mm_min_epu8 ((__m128i) a, (__m128i) b); 
> __asm ("" : : "v" (a)); }
> void f12 (void) { register V2 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V2) _mm256_min_epu8 ((__m256i) a, (__m256i) 
> b); __asm ("" : : "v" (a)); }
> void f13 (void) { register V3 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V3) _mm_min_epu16 ((__m128i) a, (__m128i) b); 
> __asm ("" : : "v" (a)); }
> void f14 (void) { register V4 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V4) _mm256_min_epu16 ((__m256i) a, (__m256i) 
> b); __asm ("" : : "v" (a)); }
> void f15 (void) { register V1 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V1) _mm_min_epi8 ((__m128i) a, (__m128i) b); 
> __asm ("" : : "v" (a)); }
> void f16 (void) { register V2 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V2) _mm256_min_epi8 ((__m256i) a, (__m256i) 
> b); __asm ("" : : "v" (a)); }
> void f17 (void) { register V3 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V3) _mm_min_epi16 ((__m128i) a, (__m128i) b); 
> __asm ("" : : "v" (a)); }
> void f18 (void) { register V4 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V4) _mm256_min_epi16 ((__m256i) a, (__m256i) 
> b); __asm ("" : : "v" (a)); }
> void f19 (void) { register V1 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V1) _mm_max_epu8 ((__m128i) a, (__m128i) b); 
> __asm ("" : : "v" (a)); }
> void f20 (void) { register V2 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V2) _mm256_max_epu8 ((__m256i) a, (__m256i) 
> b); __asm ("" : : "v" (a)); }
> void f21 (void) { register V3 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V3) _mm_max_epu16 ((__m128i) a, (__m128i) b); 
> __asm ("" : : "v" (a)); }
> void f22 (void) { register V4 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V4) _mm256_max_epu16 ((__m256i) a, (__m256i) 
> b); __asm ("" : : "v" (a)); }
> void f23 (void) { register V1 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V1) _mm_max_epi8 ((__m128i) a, (__m128i) b); 
> __asm ("" : : "v" (a)); }
> void f24 (void) { register V2 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V2) _mm256_max_epi8 ((__m256i) a, (__m256i) 
> b); __asm ("" : : "v" (a)); }
> void f25 (void) { register V3 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V3) _mm_max_epi16 ((__m128i) a, (__m128i) b); 
> __asm ("" : : "v" (a)); }
> void f26 (void) { register V4 a __asm ("%xmm16"), b __asm ("%xmm17"); __asm 
> ("" : "=v" (a), "=v" (b)); a = (V4) _mm256_max_epi16 ((__m256i) a, (__m256i) 
> b); __asm ("" : : "v" (a)); }
> test fails with:
> /tmp/ccW4PsfG.s: Assembler messages:
> /tmp/ccW4PsfG.s:9: Error: unsupported instruction `vpaddb'
> /tmp/ccW4PsfG.s:20: Error: unsupported instruction `vpaddb'
> /tmp/ccW4PsfG.s:31: Error: unsupported instruction `vpaddw'
> /tmp/ccW4PsfG.s:42: Error: unsupported instruction `vpaddw'
> /tmp/ccW4PsfG.s:53: Error: unsupported instruction `vpsubb'
> /tmp/ccW4PsfG.s:64: Error: unsupported instruction `vpsubb'
> /tmp/ccW4PsfG.s:75: Error: unsupported instruction `vpsubw'
> /tmp/ccW4PsfG.s:86: Error: unsupported instruction `vpsubw'
> /tmp/ccW4PsfG.s:97: Error: unsupported instruction `vpmullw'
> /tmp/ccW4PsfG.s:108: Error: unsupported instruction `vpmullw'
> /tmp/ccW4PsfG.s:133: Error: unsupported instruction `vpminub'
> /tmp/ccW4PsfG.s:144: Error: unsupported instruction `vpminuw'
> /tmp/ccW4PsfG.s:155: Error: unsupported instruction `vpminuw'
> /tmp/ccW4PsfG.s:166: Error: unsupported instruction `vpminsb'
> /tmp/ccW4PsfG.s:177: Error: unsupported instruction `vpminsb'
> /tmp/ccW4PsfG.s:202: Error: unsupported instruction `vpminsw'
> /tmp/ccW4PsfG.s:227: Error: unsupported instruction `vpmaxub'
> /tmp/ccW4PsfG.s:238: Error: unsupported instruction `vpmaxuw'
> /tmp/ccW4PsfG.s:249: Error: unsupported instruction `vpmaxuw'
> /tmp/ccW4PsfG.s:260: Error: unsupported instruction `vpmaxsb'
> /tmp/ccW4PsfG.s:271: Error: unsupported instruction `vpmaxsb'
> /tmp/ccW4PsfG.s:296: Error: unsupported instruction `vpmaxsw'
>
> We already have Yw constraint which is equivalent to v for
> -mavx512bw -mavx512vl and to nothing otherwise, so for
> the instructions that need both we need to use xYw and
> v for modes that don't need that.


Perhaps we should introduce another Y... constraint to return correct
SSE regset based on TARGET_... flags, instead of using compound xYw? I
think that introducing new constraint is the established approach we
should follow. The new mode_attr looks OK to me.

Uros.

> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> Do we want such a testcase in the testsuite?  I guess we'd need to
> add an effective target whether -Wa,-march=+noavx512bw can be used
> and also add effective target avx512vl.
> And I'll need to fix a lot of other instructions that have the same problem.
>
> 2021-03-05  Jakub Jelinek  <[email protected]>
>
>         PR target/99321
>         * config/i386/sse.md (v_xYw): New define_mode_attr.
>         (*<insn><mode>3, *mul<mode>3<mask_name>, *avx2_<code><mode>3,
>         *sse4_1_<code><mode>3<mask_name>): Use <v_xYw> instead of v
>         in constraints.
>
> --- gcc/config/i386/sse.md.jj   2021-02-10 19:50:04.466086143 +0100
> +++ gcc/config/i386/sse.md      2021-03-05 19:25:57.540752452 +0100
> @@ -560,6 +560,14 @@ (define_mode_attr avx512
>     (V4SF "avx512vl") (V8SF "avx512vl") (V16SF "avx512f")
>     (V2DF "avx512vl") (V4DF "avx512vl") (V8DF "avx512f")])
>
> +(define_mode_attr v_xYw
> +  [(V16QI "xYw") (V32QI "xYw") (V64QI "v")
> +   (V8HI "xYw") (V16HI "xYw") (V32HI "v")
> +   (V4SI "v") (V8SI "v") (V16SI "v")
> +   (V2DI "v") (V4DI "v") (V8DI "v")
> +   (V4SF "v") (V8SF "v") (V16SF "v")
> +   (V2DF "v") (V4DF "v") (V8DF "v")])
> +
>  (define_mode_attr sse2_avx_avx512f
>    [(V16QI "sse2") (V32QI "avx") (V64QI "avx512f")
>     (V8HI  "avx512vl") (V16HI  "avx512vl") (V32HI "avx512bw")
> @@ -11677,10 +11685,10 @@ (define_expand "<insn><mode>3_mask"
>    "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
>
>  (define_insn "*<insn><mode>3"
> -  [(set (match_operand:VI_AVX2 0 "register_operand" "=x,v")
> +  [(set (match_operand:VI_AVX2 0 "register_operand" "=x,<v_xYw>")
>         (plusminus:VI_AVX2
> -         (match_operand:VI_AVX2 1 "bcst_vector_operand" "<comm>0,v")
> -         (match_operand:VI_AVX2 2 "bcst_vector_operand" "xBm,vmBr")))]
> +         (match_operand:VI_AVX2 1 "bcst_vector_operand" "<comm>0,<v_xYw>")
> +         (match_operand:VI_AVX2 2 "bcst_vector_operand" "xBm,<v_xYw>mBr")))]
>    "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
>    "@
>     p<plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2}
> @@ -11790,9 +11798,9 @@ (define_expand "mul<mode>3<mask_name>"
>    "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
>
>  (define_insn "*mul<mode>3<mask_name>"
> -  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,v")
> -       (mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "vector_operand" "%0,v")
> -                      (match_operand:VI2_AVX2 2 "vector_operand" "xBm,vm")))]
> +  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,<v_xYw>")
> +       (mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "vector_operand" 
> "%0,<v_xYw>")
> +                      (match_operand:VI2_AVX2 2 "vector_operand" 
> "xBm,<v_xYw>m")))]
>    "TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))
>     && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
>    "@
> @@ -12618,10 +12626,10 @@ (define_expand "<code><mode>3"
>    "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
>
>  (define_insn "*avx2_<code><mode>3"
> -  [(set (match_operand:VI124_256 0 "register_operand" "=v")
> +  [(set (match_operand:VI124_256 0 "register_operand" "=<v_xYw>")
>         (maxmin:VI124_256
> -         (match_operand:VI124_256 1 "nonimmediate_operand" "%v")
> -         (match_operand:VI124_256 2 "nonimmediate_operand" "vm")))]
> +         (match_operand:VI124_256 1 "nonimmediate_operand" "%<v_xYw>")
> +         (match_operand:VI124_256 2 "nonimmediate_operand" "<v_xYw>m")))]
>    "TARGET_AVX2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
>    "vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
>    [(set_attr "type" "sseiadd")
> @@ -12745,10 +12753,10 @@ (define_expand "<code><mode>3"
>  })
>
>  (define_insn "*sse4_1_<code><mode>3<mask_name>"
> -  [(set (match_operand:VI14_128 0 "register_operand" "=Yr,*x,v")
> +  [(set (match_operand:VI14_128 0 "register_operand" "=Yr,*x,<v_xYw>")
>         (smaxmin:VI14_128
> -         (match_operand:VI14_128 1 "vector_operand" "%0,0,v")
> -         (match_operand:VI14_128 2 "vector_operand" "YrBm,*xBm,vm")))]
> +         (match_operand:VI14_128 1 "vector_operand" "%0,0,<v_xYw>")
> +         (match_operand:VI14_128 2 "vector_operand" "YrBm,*xBm,<v_xYw>m")))]
>    "TARGET_SSE4_1
>     && <mask_mode512bit_condition>
>     && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
> @@ -12830,10 +12838,10 @@ (define_expand "<code><mode>3"
>  })
>
>  (define_insn "*sse4_1_<code><mode>3<mask_name>"
> -  [(set (match_operand:VI24_128 0 "register_operand" "=Yr,*x,v")
> +  [(set (match_operand:VI24_128 0 "register_operand" "=Yr,*x,<v_xYw>")
>         (umaxmin:VI24_128
> -         (match_operand:VI24_128 1 "vector_operand" "%0,0,v")
> -         (match_operand:VI24_128 2 "vector_operand" "YrBm,*xBm,vm")))]
> +         (match_operand:VI24_128 1 "vector_operand" "%0,0,<v_xYw>")
> +         (match_operand:VI24_128 2 "vector_operand" "YrBm,*xBm,<v_xYw>m")))]
>    "TARGET_SSE4_1
>     && <mask_mode512bit_condition>
>     && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
>
>         Jakub
>

Re: [PATCH] i386: Fix some -mavx512vl -mno-avx512bw bugs [PR99321]

Reply via email to