https://github.com/easyonaadit updated 
https://github.com/llvm/llvm-project/pull/161816

>From f6ed1d8bbae8f05ca464cb78029297d8bc3e6d52 Mon Sep 17 00:00:00 2001
From: Aaditya <[email protected]>
Date: Tue, 30 Sep 2025 11:37:42 +0530
Subject: [PATCH 1/2] [AMDGPU] Add builtins for wave reduction intrinsics

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def |  4 +
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp  |  8 ++
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl  | 84 ++++++++++++++++++++
 3 files changed, 96 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 81e684a04a03d..0019c14052d3c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_add_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_f32, "ffZi", "nc")
 
 //===----------------------------------------------------------------------===//
 // R600-NI only builtins.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 9eab70955b6b9..573bcd79611e2 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -389,18 +389,22 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
     llvm_unreachable("Unknown BuiltinID for wave reduction");
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
     return Intrinsic::amdgcn_wave_reduce_add;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
     return Intrinsic::amdgcn_wave_reduce_sub;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
     return Intrinsic::amdgcn_wave_reduce_min;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
     return Intrinsic::amdgcn_wave_reduce_umin;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
     return Intrinsic::amdgcn_wave_reduce_max;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
@@ -423,11 +427,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   llvm::SyncScope::ID SSID;
   switch (BuiltinID) {
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index b92454de60c78..2e03fc8d8cd84 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -405,6 +405,13 @@ void test_wave_reduce_add_u64_default(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0);
 }
 
+// CHECK-LABEL: @test_wave_reduce_add_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
+void test_wave_reduce_add_f32_default(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_add_u32_iterative
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
 void test_wave_reduce_add_u32_iterative(global int* out, int in)
@@ -419,6 +426,13 @@ void test_wave_reduce_add_u64_iterative(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_add_u64(in, 1);
 }
 
+// CHECK-LABEL: @test_wave_reduce_add_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
+void test_wave_reduce_add_f32_iterative(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_add_u32_dpp
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
 void test_wave_reduce_add_u32_dpp(global int* out, int in)
@@ -433,6 +447,13 @@ void test_wave_reduce_add_u64_dpp(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_add_u64(in, 2);
 }
 
+// CHECK-LABEL: @test_wave_reduce_add_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
+void test_wave_reduce_add_f32_dpp(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_sub_u32_default
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
 void test_wave_reduce_sub_u32_default(global int* out, int in)
@@ -447,6 +468,13 @@ void test_wave_reduce_sub_u64_default(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0);
 }
 
+// CHECK-LABEL: @test_wave_reduce_sub_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32(
+void test_wave_reduce_sub_f32_default(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_sub_u32_iterative
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
 void test_wave_reduce_sub_u32_iterative(global int* out, int in)
@@ -461,6 +489,13 @@ void test_wave_reduce_sub_u64_iterative(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1);
 }
 
+// CHECK-LABEL: @test_wave_reduce_sub_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32(
+void test_wave_reduce_sub_f32_iterative(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_sub_u32_dpp
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
 void test_wave_reduce_sub_u32_dpp(global int* out, int in)
@@ -475,6 +510,13 @@ void test_wave_reduce_sub_u64_dpp(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2);
 }
 
+// CHECK-LABEL: @test_wave_reduce_sub_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32(
+void test_wave_reduce_sub_f32_dpp(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_and_b32_default
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
 void test_wave_reduce_and_b32_default(global int* out, int in)
@@ -615,6 +657,13 @@ void test_wave_reduce_min_i64_default(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0);
 }
 
+// CHECK-LABEL: @test_wave_reduce_min_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32(
+void test_wave_reduce_min_f32_default(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_min_i32_iterative
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
 void test_wave_reduce_min_i32_iterative(global int* out, int in)
@@ -629,6 +678,13 @@ void test_wave_reduce_min_i64_iterative(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_min_i64(in, 1);
 }
 
+// CHECK-LABEL: @test_wave_reduce_min_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32(
+void test_wave_reduce_min_f32_iterative(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_min_i32_dpp
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
 void test_wave_reduce_min_i32_dpp(global int* out, int in)
@@ -643,6 +699,13 @@ void test_wave_reduce_min_i64_dpp(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_min_i64(in, 2);
 }
 
+// CHECK-LABEL: @test_wave_reduce_min_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32(
+void test_wave_reduce_min_f32_dpp(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_min_u32_default
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
 void test_wave_reduce_min_u32_default(global int* out, int in)
@@ -699,6 +762,13 @@ void test_wave_reduce_max_i64_default(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0);
 }
 
+// CHECK-LABEL: @test_wave_reduce_max_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32(
+void test_wave_reduce_max_f32_default(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_max_i32_iterative
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
 void test_wave_reduce_max_i32_iterative(global int* out, int in)
@@ -713,6 +783,13 @@ void test_wave_reduce_max_i64_iterative(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_max_i64(in, 1);
 }
 
+// CHECK-LABEL: @test_wave_reduce_max_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32(
+void test_wave_reduce_max_f32_iterative(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_max_i32_dpp
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
 void test_wave_reduce_max_i32_dpp(global int* out, int in)
@@ -727,6 +804,13 @@ void test_wave_reduce_max_i64_dpp(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_max_i64(in, 2);
 }
 
+// CHECK-LABEL: @test_wave_reduce_max_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32(
+void test_wave_reduce_max_f32_dpp(global float* out, float in)
+{
+  *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0);
+}
+
 // CHECK-LABEL: @test_wave_reduce_max_u32_default
 // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
 void test_wave_reduce_max_u32_default(global int* out, int in)

>From 2ae7caad39371d578a8936b73e22f8942522df1b Mon Sep 17 00:00:00 2001
From: Aaditya <[email protected]>
Date: Wed, 12 Nov 2025 10:08:07 +0530
Subject: [PATCH 2/2] Review comments: remove the float overload.

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def |  8 +-
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp  | 20 ++--
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl  | 96 ++++++++++----------
 3 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0019c14052d3c..a3ded0f6a9983 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -402,10 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
 BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_add_f32, "ffZi", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_sub_f32, "ffZi", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_min_f32, "ffZi", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_max_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_fadd_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_fsub_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_fmin_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_fmax_f32, "ffZi", "nc")
 
 //===----------------------------------------------------------------------===//
 // R600-NI only builtins.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 573bcd79611e2..bb974e3b2adc8 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -389,23 +389,27 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
     llvm_unreachable("Unknown BuiltinID for wave reduction");
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
-  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
     return Intrinsic::amdgcn_wave_reduce_add;
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
+    return Intrinsic::amdgcn_wave_reduce_fadd;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
-  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
     return Intrinsic::amdgcn_wave_reduce_sub;
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
+    return Intrinsic::amdgcn_wave_reduce_fsub;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
-  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
     return Intrinsic::amdgcn_wave_reduce_min;
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
+    return Intrinsic::amdgcn_wave_reduce_fmin;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
     return Intrinsic::amdgcn_wave_reduce_umin;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
-  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
     return Intrinsic::amdgcn_wave_reduce_max;
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
+    return Intrinsic::amdgcn_wave_reduce_fmax;
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
   case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
     return Intrinsic::amdgcn_wave_reduce_umax;
@@ -427,15 +431,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   llvm::SyncScope::ID SSID;
   switch (BuiltinID) {
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
-  case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
-  case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
-  case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
-  case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 2e03fc8d8cd84..a5132c9114673 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -405,11 +405,11 @@ void test_wave_reduce_add_u64_default(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0);
 }
 
-// CHECK-LABEL: @test_wave_reduce_add_f32_default
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
-void test_wave_reduce_add_f32_default(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fadd_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32(
+void test_wave_reduce_fadd_f32_default(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0);
 }
 
 // CHECK-LABEL: @test_wave_reduce_add_u32_iterative
@@ -426,11 +426,11 @@ void test_wave_reduce_add_u64_iterative(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_add_u64(in, 1);
 }
 
-// CHECK-LABEL: @test_wave_reduce_add_f32_iterative
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
-void test_wave_reduce_add_f32_iterative(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fadd_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32(
+void test_wave_reduce_fadd_f32_iterative(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 1);
 }
 
 // CHECK-LABEL: @test_wave_reduce_add_u32_dpp
@@ -447,11 +447,11 @@ void test_wave_reduce_add_u64_dpp(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_add_u64(in, 2);
 }
 
-// CHECK-LABEL: @test_wave_reduce_add_f32_dpp
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
-void test_wave_reduce_add_f32_dpp(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fadd_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32(
+void test_wave_reduce_fadd_f32_dpp(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 2);
 }
 
 // CHECK-LABEL: @test_wave_reduce_sub_u32_default
@@ -468,11 +468,11 @@ void test_wave_reduce_sub_u64_default(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0);
 }
 
-// CHECK-LABEL: @test_wave_reduce_sub_f32_default
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32(
-void test_wave_reduce_sub_f32_default(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fsub_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32(
+void test_wave_reduce_fsub_f32_default(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0);
 }
 
 // CHECK-LABEL: @test_wave_reduce_sub_u32_iterative
@@ -489,11 +489,11 @@ void test_wave_reduce_sub_u64_iterative(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1);
 }
 
-// CHECK-LABEL: @test_wave_reduce_sub_f32_iterative
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32(
-void test_wave_reduce_sub_f32_iterative(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fsub_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32(
+void test_wave_reduce_fsub_f32_iterative(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 1);
 }
 
 // CHECK-LABEL: @test_wave_reduce_sub_u32_dpp
@@ -510,11 +510,11 @@ void test_wave_reduce_sub_u64_dpp(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2);
 }
 
-// CHECK-LABEL: @test_wave_reduce_sub_f32_dpp
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32(
-void test_wave_reduce_sub_f32_dpp(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fsub_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32(
+void test_wave_reduce_fsub_f32_dpp(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 2);
 }
 
 // CHECK-LABEL: @test_wave_reduce_and_b32_default
@@ -657,11 +657,11 @@ void test_wave_reduce_min_i64_default(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0);
 }
 
-// CHECK-LABEL: @test_wave_reduce_min_f32_default
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32(
-void test_wave_reduce_min_f32_default(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fmin_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32(
+void test_wave_reduce_fmin_f32_default(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0);
 }
 
 // CHECK-LABEL: @test_wave_reduce_min_i32_iterative
@@ -678,11 +678,11 @@ void test_wave_reduce_min_i64_iterative(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_min_i64(in, 1);
 }
 
-// CHECK-LABEL: @test_wave_reduce_min_f32_iterative
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32(
-void test_wave_reduce_min_f32_iterative(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fmin_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32(
+void test_wave_reduce_fmin_f32_iterative(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 1);
 }
 
 // CHECK-LABEL: @test_wave_reduce_min_i32_dpp
@@ -699,11 +699,11 @@ void test_wave_reduce_min_i64_dpp(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_min_i64(in, 2);
 }
 
-// CHECK-LABEL: @test_wave_reduce_min_f32_dpp
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32(
-void test_wave_reduce_min_f32_dpp(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fmin_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32(
+void test_wave_reduce_fmin_f32_dpp(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 2);
 }
 
 // CHECK-LABEL: @test_wave_reduce_min_u32_default
@@ -762,11 +762,11 @@ void test_wave_reduce_max_i64_default(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0);
 }
 
-// CHECK-LABEL: @test_wave_reduce_max_f32_default
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32(
-void test_wave_reduce_max_f32_default(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fmax_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32(
+void test_wave_reduce_fmax_f32_default(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0);
 }
 
 // CHECK-LABEL: @test_wave_reduce_max_i32_iterative
@@ -783,11 +783,11 @@ void test_wave_reduce_max_i64_iterative(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_max_i64(in, 1);
 }
 
-// CHECK-LABEL: @test_wave_reduce_max_f32_iterative
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32(
-void test_wave_reduce_max_f32_iterative(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fmax_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32(
+void test_wave_reduce_fmax_f32_iterative(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 1);
 }
 
 // CHECK-LABEL: @test_wave_reduce_max_i32_dpp
@@ -804,11 +804,11 @@ void test_wave_reduce_max_i64_dpp(global int* out, long in)
   *out = __builtin_amdgcn_wave_reduce_max_i64(in, 2);
 }
 
-// CHECK-LABEL: @test_wave_reduce_max_f32_dpp
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32(
-void test_wave_reduce_max_f32_dpp(global float* out, float in)
+// CHECK-LABEL: @test_wave_reduce_fmax_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32(
+void test_wave_reduce_fmax_f32_dpp(global float* out, float in)
 {
-  *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0);
+  *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 2);
 }
 
 // CHECK-LABEL: @test_wave_reduce_max_u32_default

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to