https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/161816
>From 1a0a6b8636c589a8a30c217d66a09697b88717e4 Mon Sep 17 00:00:00 2001 From: Aaditya <[email protected]> Date: Tue, 30 Sep 2025 11:37:42 +0530 Subject: [PATCH 1/2] [AMDGPU] Add builtins for wave reduction intrinsics --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8 ++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 84 ++++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 81e684a04a03d..0019c14052d3c 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_add_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_sub_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_f32, "ffZi", "nc") //===----------------------------------------------------------------------===// // R600-NI only builtins. 
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 9eab70955b6b9..573bcd79611e2 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -389,18 +389,22 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { llvm_unreachable("Unknown BuiltinID for wave reduction"); case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32: return Intrinsic::amdgcn_wave_reduce_add; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32: return Intrinsic::amdgcn_wave_reduce_sub; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32: return Intrinsic::amdgcn_wave_reduce_min; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: return Intrinsic::amdgcn_wave_reduce_umin; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32: return Intrinsic::amdgcn_wave_reduce_max; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: @@ -423,11 +427,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::SyncScope::ID SSID; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case 
AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index b92454de60c78..2e03fc8d8cd84 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -405,6 +405,13 @@ void test_wave_reduce_add_u64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_add_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32( +void test_wave_reduce_add_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_add_u32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( void test_wave_reduce_add_u32_iterative(global int* out, int in) @@ -419,6 +426,13 @@ void test_wave_reduce_add_u64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_add_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32( +void test_wave_reduce_add_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_add_u32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( void test_wave_reduce_add_u32_dpp(global int* out, int in) @@ -433,6 +447,13 @@ void test_wave_reduce_add_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_add_f32_dpp +// CHECK: 
{{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32( +void test_wave_reduce_add_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_default(global int* out, int in) @@ -447,6 +468,13 @@ void test_wave_reduce_sub_u64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_sub_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32( +void test_wave_reduce_sub_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_iterative(global int* out, int in) @@ -461,6 +489,13 @@ void test_wave_reduce_sub_u64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_sub_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32( +void test_wave_reduce_sub_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_dpp(global int* out, int in) @@ -475,6 +510,13 @@ void test_wave_reduce_sub_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_sub_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32( +void test_wave_reduce_sub_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_and_b32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( void 
test_wave_reduce_and_b32_default(global int* out, int in) @@ -615,6 +657,13 @@ void test_wave_reduce_min_i64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_min_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32( +void test_wave_reduce_min_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_i32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( void test_wave_reduce_min_i32_iterative(global int* out, int in) @@ -629,6 +678,13 @@ void test_wave_reduce_min_i64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_min_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32( +void test_wave_reduce_min_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_i32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( void test_wave_reduce_min_i32_dpp(global int* out, int in) @@ -643,6 +699,13 @@ void test_wave_reduce_min_i64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_min_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32( +void test_wave_reduce_min_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( void test_wave_reduce_min_u32_default(global int* out, int in) @@ -699,6 +762,13 @@ void test_wave_reduce_max_i64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_max_f32_default +// CHECK: {{.*}}call{{.*}} float 
@llvm.amdgcn.wave.reduce.max.f32( +void test_wave_reduce_max_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_i32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( void test_wave_reduce_max_i32_iterative(global int* out, int in) @@ -713,6 +783,13 @@ void test_wave_reduce_max_i64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_max_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32( +void test_wave_reduce_max_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_i32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( void test_wave_reduce_max_i32_dpp(global int* out, int in) @@ -727,6 +804,13 @@ void test_wave_reduce_max_i64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_max_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32( +void test_wave_reduce_max_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( void test_wave_reduce_max_u32_default(global int* out, int in) >From c5cf1b2ee54e59e3b43e2d31d9dd4bb13b6958e4 Mon Sep 17 00:00:00 2001 From: Aaditya <[email protected]> Date: Wed, 12 Nov 2025 10:08:07 +0530 Subject: [PATCH 2/2] Review comments: remove the float overload. 
--- clang/include/clang/Basic/BuiltinsAMDGPU.def | 8 +- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 20 ++-- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 96 ++++++++++---------- 3 files changed, 64 insertions(+), 60 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 0019c14052d3c..a3ded0f6a9983 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -402,10 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_add_f32, "ffZi", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_sub_f32, "ffZi", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_min_f32, "ffZi", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_max_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fadd_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fsub_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fmin_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fmax_f32, "ffZi", "nc") //===----------------------------------------------------------------------===// // R600-NI only builtins. 
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 573bcd79611e2..bb974e3b2adc8 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -389,23 +389,27 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { llvm_unreachable("Unknown BuiltinID for wave reduction"); case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: - case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32: return Intrinsic::amdgcn_wave_reduce_add; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: + return Intrinsic::amdgcn_wave_reduce_fadd; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: - case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32: return Intrinsic::amdgcn_wave_reduce_sub; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: + return Intrinsic::amdgcn_wave_reduce_fsub; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: - case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32: return Intrinsic::amdgcn_wave_reduce_min; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: + return Intrinsic::amdgcn_wave_reduce_fmin; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: return Intrinsic::amdgcn_wave_reduce_umin; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: - case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32: return Intrinsic::amdgcn_wave_reduce_max; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: + return Intrinsic::amdgcn_wave_reduce_fmax; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: case 
clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: return Intrinsic::amdgcn_wave_reduce_umax; @@ -427,15 +431,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::SyncScope::ID SSID; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: - case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 2e03fc8d8cd84..a5132c9114673 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -405,11 +405,11 @@ void test_wave_reduce_add_u64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0); } -// CHECK-LABEL: @test_wave_reduce_add_f32_default -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32( -void test_wave_reduce_add_f32_default(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fadd_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32( +void test_wave_reduce_fadd_f32_default(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0); + *out = 
__builtin_amdgcn_wave_reduce_fadd_f32(in, 0); } // CHECK-LABEL: @test_wave_reduce_add_u32_iterative @@ -426,11 +426,11 @@ void test_wave_reduce_add_u64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 1); } -// CHECK-LABEL: @test_wave_reduce_add_f32_iterative -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32( -void test_wave_reduce_add_f32_iterative(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fadd_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32( +void test_wave_reduce_fadd_f32_iterative(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 1); } // CHECK-LABEL: @test_wave_reduce_add_u32_dpp @@ -447,11 +447,11 @@ void test_wave_reduce_add_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 2); } -// CHECK-LABEL: @test_wave_reduce_add_f32_dpp -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32( -void test_wave_reduce_add_f32_dpp(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fadd_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32( +void test_wave_reduce_fadd_f32_dpp(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 2); } // CHECK-LABEL: @test_wave_reduce_sub_u32_default @@ -468,11 +468,11 @@ void test_wave_reduce_sub_u64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0); } -// CHECK-LABEL: @test_wave_reduce_sub_f32_default -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32( -void test_wave_reduce_sub_f32_default(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fsub_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32( +void test_wave_reduce_fsub_f32_default(global float* out, float in) { - *out = 
__builtin_amdgcn_wave_reduce_sub_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); } // CHECK-LABEL: @test_wave_reduce_sub_u32_iterative @@ -489,11 +489,11 @@ void test_wave_reduce_sub_u64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1); } -// CHECK-LABEL: @test_wave_reduce_sub_f32_iterative -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32( -void test_wave_reduce_sub_f32_iterative(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fsub_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32( +void test_wave_reduce_fsub_f32_iterative(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 1); } // CHECK-LABEL: @test_wave_reduce_sub_u32_dpp @@ -510,11 +510,11 @@ void test_wave_reduce_sub_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2); } -// CHECK-LABEL: @test_wave_reduce_sub_f32_dpp -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32( -void test_wave_reduce_sub_f32_dpp(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fsub_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32( +void test_wave_reduce_fsub_f32_dpp(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 2); } // CHECK-LABEL: @test_wave_reduce_and_b32_default @@ -657,11 +657,11 @@ void test_wave_reduce_min_i64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0); } -// CHECK-LABEL: @test_wave_reduce_min_f32_default -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32( -void test_wave_reduce_min_f32_default(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fmin_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32( +void test_wave_reduce_fmin_f32_default(global 
float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); } // CHECK-LABEL: @test_wave_reduce_min_i32_iterative @@ -678,11 +678,11 @@ void test_wave_reduce_min_i64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 1); } -// CHECK-LABEL: @test_wave_reduce_min_f32_iterative -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32( -void test_wave_reduce_min_f32_iterative(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fmin_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32( +void test_wave_reduce_fmin_f32_iterative(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 1); } // CHECK-LABEL: @test_wave_reduce_min_i32_dpp @@ -699,11 +699,11 @@ void test_wave_reduce_min_i64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 2); } -// CHECK-LABEL: @test_wave_reduce_min_f32_dpp -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32( -void test_wave_reduce_min_f32_dpp(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fmin_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32( +void test_wave_reduce_fmin_f32_dpp(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 2); } // CHECK-LABEL: @test_wave_reduce_min_u32_default @@ -762,11 +762,11 @@ void test_wave_reduce_max_i64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0); } -// CHECK-LABEL: @test_wave_reduce_max_f32_default -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32( -void test_wave_reduce_max_f32_default(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fmax_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32( +void 
test_wave_reduce_fmax_f32_default(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); } // CHECK-LABEL: @test_wave_reduce_max_i32_iterative @@ -783,11 +783,11 @@ void test_wave_reduce_max_i64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 1); } -// CHECK-LABEL: @test_wave_reduce_max_f32_iterative -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32( -void test_wave_reduce_max_f32_iterative(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fmax_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32( +void test_wave_reduce_fmax_f32_iterative(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 1); } // CHECK-LABEL: @test_wave_reduce_max_i32_dpp @@ -804,11 +804,11 @@ void test_wave_reduce_max_i64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 2); } -// CHECK-LABEL: @test_wave_reduce_max_f32_dpp -// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32( -void test_wave_reduce_max_f32_dpp(global float* out, float in) +// CHECK-LABEL: @test_wave_reduce_fmax_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32( +void test_wave_reduce_fmax_f32_dpp(global float* out, float in) { - *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0); + *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 2); } // CHECK-LABEL: @test_wave_reduce_max_u32_default _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
