[PATCH] D96231: [X86] Always assign reassoc flag for intrinsics *reduce_add/mul_ps/pd.

Pengfei Wang via Phabricator via cfe-commits Sun, 07 Feb 2021 18:58:43 -0800

pengfei created this revision.
pengfei added reviewers: RKSimon, craig.topper, spatel.
pengfei requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.


The *reduce_add/mul_ps/pd intrinsics assume that the elements of the
vector are reassociable, so we always need to set the reassoc flag on
the calls emitted for the _mm_reduce_* intrinsics.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D96231

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/avx512fintrin.h
  clang/test/CodeGen/X86/avx512-reduceIntrin.c

Index: clang/test/CodeGen/X86/avx512-reduceIntrin.c
===================================================================
--- clang/test/CodeGen/X86/avx512-reduceIntrin.c
+++ clang/test/CodeGen/X86/avx512-reduceIntrin.c
@@ -115,25 +115,25 @@
 
 double test_mm512_reduce_add_pd(__m512d __W){
 // CHECK-LABEL: @test_mm512_reduce_add_pd(
-// CHECK:    call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
+// CHECK:    call reassoc double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
   return _mm512_reduce_add_pd(__W); 
 }
 
 double test_mm512_reduce_mul_pd(__m512d __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_pd(
-// CHECK:    call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+// CHECK:    call reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
   return _mm512_reduce_mul_pd(__W); 
 }
 
 float test_mm512_reduce_add_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_add_ps(
-// CHECK:    call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
+// CHECK:    call reassoc float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
   return _mm512_reduce_add_ps(__W); 
 }
 
 float test_mm512_reduce_mul_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_ps(
-// CHECK:    call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+// CHECK:    call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
   return _mm512_reduce_mul_ps(__W); 
 }
 
@@ -141,7 +141,7 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_add_pd(
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK:    call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
+// CHECK:    call reassoc double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
   return _mm512_mask_reduce_add_pd(__M, __W); 
 }
 
@@ -149,7 +149,7 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_pd(
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK:    call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+// CHECK:    call reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
   return _mm512_mask_reduce_mul_pd(__M, __W); 
 }
 
@@ -157,7 +157,7 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_add_ps(
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
-// CHECK:    call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
+// CHECK:    call reassoc float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
   return _mm512_mask_reduce_add_ps(__M, __W); 
 }
 
@@ -165,6 +165,6 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_ps(
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}}
-// CHECK:    call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+// CHECK:    call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
   return _mm512_mask_reduce_mul_ps(__M, __W); 
 }
Index: clang/lib/Headers/avx512fintrin.h
===================================================================
--- clang/lib/Headers/avx512fintrin.h
+++ clang/lib/Headers/avx512fintrin.h
@@ -9300,6 +9300,9 @@
  * computations. In vector-reduction arithmetic, the evaluation off is
  * independent of the order of the input elements of V.
 
+ * For floating-point types, we always assume the elements are reassociable even
+ * if -ffast-math is off.
+
  * Used bisection method. At each step, we partition the vector with previous
  * step in half, and the operation is performed on its two halves.
  * This takes log2(n) steps where n is the number of elements in the vector.
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -13826,12 +13826,14 @@
   case X86::BI__builtin_ia32_reduce_fadd_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
+    Builder.getFastMathFlags().setAllowReassoc(true);
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_fmul_pd512:
   case X86::BI__builtin_ia32_reduce_fmul_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
+    Builder.getFastMathFlags().setAllowReassoc(true);
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_mul_d512:

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D96231: [X86] Always assign reassoc flag for intrinsics *reduce_add/mul_ps/pd.

Reply via email to