https://gcc.gnu.org/g:de867e8da30bf5e0cb51c3946ec43c3c4778d4a0

commit r15-5071-gde867e8da30bf5e0cb51c3946ec43c3c4778d4a0
Author: liuhongt <hongtao....@intel.com>
Date:   Wed Nov 6 18:15:42 2024 -0800

    Guard truncate from vector float to vector __bf16 with !flag_rounding_math && !HONOR_NANS (BFmode).
    
    The hw instruction doesn't raise exceptions, quietly turns sNaN into
    qNaN, and always rounds to nearest (even). Output denormals are always
    flushed to zero and input denormals are always treated as zero. MXCSR
    is neither consulted nor updated.
    Without native instructions, flag_unsafe_math_optimizations is needed
    for the permutation-based expansion.
    Similarly, guard extend from vector __bf16 to vector float with
    !HONOR_NANS (BFmode).
    
    gcc/ChangeLog:
    
            * config/i386/i386.md (truncsfbf2): Add !flag_rounding_math
            to the condition, require flag_unsafe_math_optimizations when
            native instruction is not available.
            * config/i386/mmx.md (truncv2sfv2bf2): Ditto.
            (extendv2bfv2sf2): Add !HONOR_NANS (BFmode) to the condition.
            * config/i386/sse.md (truncv4sfv4bf2): Add
            !flag_rounding_math to the condition, require
            flag_unsafe_math_optimizations when native instruction is not
            available.
            (truncv8sfv8bf2): Ditto.
            (truncv16sfv16bf2): Ditto.
            (extendv4bfv4sf2): Add !HONOR_NANS (BFmode) to the condition.
            (extendv8bfv8sf2): Ditto.
            (extendv16bfv16sf2): Ditto.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/avx512bf16-truncsfbf.c: Add -ffast-math.
            * gcc.target/i386/avx512bw-extendbf2sf.c: Ditto.
            * gcc.target/i386/avx512bw-truncsfbf.c: Ditto.
            * gcc.target/i386/sse2-extendbf2sf.c: Ditto.
            * gcc.target/i386/ssse3-truncsfbf.c: Ditto.

Diff:
---
 gcc/config/i386/i386.md                              | 11 ++++++++++-
 gcc/config/i386/mmx.md                               |  8 ++++++--
 gcc/config/i386/sse.md                               | 16 ++++++++++++----
 gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c |  2 +-
 gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c |  2 +-
 gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c   |  2 +-
 gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c     |  2 +-
 gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c      |  2 +-
 8 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 34bc04622b18..f4aae80b7a95 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5698,11 +5698,20 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
 
+/* vcvtneps2bf16 doesn't honor sNaN: it quietly turns sNaN into qNaN,
+   and it always rounds to nearest even.
+   flag_unsafe_math_optimizations is needed for psrld.
+   If we expect neither qNaNs nor sNaNs and can assume rounding
+   to nearest, we can expand the conversion inline as
+   (fromi + 0x7fff + ((fromi >> 16) & 1)) >> 16.  */
 (define_insn "truncsfbf2"
   [(set (match_operand:BF 0 "register_operand" "=x,x,v,Yv")
        (float_truncate:BF
          (match_operand:SF 1 "register_operand" "0,x,v,Yv")))]
-  "TARGET_SSE2 && flag_unsafe_math_optimizations && !HONOR_NANS (BFmode)"
+  "TARGET_SSE2 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+       || TARGET_AVXNECONVERT
+       || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
   "@
   psrld\t{$16, %0|%0, 16}
   %{vex%} vcvtneps2bf16\t{%1, %0|%0, %1}
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 021ac90ae2a0..61a4f4d21ea3 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2998,7 +2998,11 @@
   [(set (match_operand:V2BF 0 "register_operand")
        (float_truncate:V2BF
          (match_operand:V2SF 1 "nonimmediate_operand")))]
-  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE
+  && !HONOR_NANS (BFmode) && !flag_rounding_math
+  && (flag_unsafe_math_optimizations
+      || TARGET_AVXNECONVERT
+      || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   rtx op1 = gen_reg_rtx (V4SFmode);
   rtx op0 = gen_reg_rtx (V4BFmode);
@@ -3016,7 +3020,7 @@
   [(set (match_operand:V2SF 0 "register_operand")
        (float_extend:V2SF
          (match_operand:V2BF 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_MMX_WITH_SSE"
+  "TARGET_SSE2 && TARGET_MMX_WITH_SSE && !HONOR_NANS (BFmode)"
 {
   rtx op0 = gen_reg_rtx (V4SFmode);
   rtx op1 = gen_reg_rtx (V4BFmode);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5eeb3ab221a1..efe32e5149fc 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30995,7 +30995,10 @@
   [(set (match_operand:V4BF 0 "register_operand")
          (float_truncate:V4BF
            (match_operand:V4SF 1 "nonimmediate_operand")))]
-  "TARGET_SSSE3"
+  "TARGET_SSSE3 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+       || TARGET_AVXNECONVERT
+       || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   if (!TARGET_AVXNECONVERT
       && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
@@ -31088,7 +31091,10 @@
   [(set (match_operand:V8BF 0 "register_operand")
        (float_truncate:V8BF
          (match_operand:V8SF 1 "nonimmediate_operand")))]
-  "TARGET_AVX2"
+  "TARGET_AVX2 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+       || TARGET_AVXNECONVERT
+       || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   if (!TARGET_AVXNECONVERT
       && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
@@ -31114,7 +31120,9 @@
   [(set (match_operand:V16BF 0 "register_operand")
        (float_truncate:V16BF
          (match_operand:V16SF 1 "nonimmediate_operand")))]
-  "TARGET_AVX512BW && TARGET_EVEX512"
+  "TARGET_AVX512BW && TARGET_EVEX512
+   && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations || TARGET_AVX512BF16)"
 {
   if (!TARGET_AVX512BF16)
     {
@@ -31127,7 +31135,7 @@
   [(set (match_operand:VF1_AVX512BW 0 "register_operand")
        (float_extend:VF1_AVX512BW
          (match_operand:<sf_cvt_bf16> 1 "nonimmediate_operand")))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && !HONOR_NANS (BFmode)"
 {
   ix86_expand_vector_bf2sf_with_vec_perm (operands[0], operands[1]);
   DONE;
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
index da31bdba21b0..1b4b62f10601 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512vl -mavx512bf16 -O2" } */
+/* { dg-options "-mavx512vl -mavx512bf16 -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)vcvtneps2bf16} 6 } } */
 
 #include "avx512bw-truncsfbf.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
index 5b59958151f7..e7c65b7ee014 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512bw -mavx512vl -O2" } */
+/* { dg-options "-mavx512bw -mavx512vl -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|vpunpcklwd)} 6 } } */
 
 typedef float v4sf __attribute__((vector_size(16)));
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
index 071db21cfb37..40802d865df6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)(?:vpermw|vpshufb)} 6 } } */
 
 typedef float v4sf __attribute__((vector_size(16)));
diff --git a/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c
index 0f007df68f6b..d7f77acd6035 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-msse2 -O2" } */
+/* { dg-options "-msse2 -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|punpcklwd)} 2 { target { ! ia32 } } } } */
 
 typedef float v2sf __attribute__((vector_size(8)));
diff --git a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
index 70840c537f19..af92f4d0befe 100644
--- a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
+++ b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)pshufb} 2 { target { ! ia32 } } } } */
 
 typedef float v2sf __attribute__((vector_size(8)));

Reply via email to