https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/135224
__builtin_amdgcn_image_bvh8_intersect_ray, __builtin_amdgcn_image_bvh_dual_intersect_ray: For the above two builtins, the second and third return values of the intrinsics are returned through the last two pointer-type function arguments. __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn, __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn, __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: For the last builtin, the intrinsic returns `{i64, i32}`, whereas the builtin returns `<2 x i64>`; the second return value of the intrinsic is zero-extended (zext) to i64. >From d6d0f7ff400d1540db414731c0a368e309f24818 Mon Sep 17 00:00:00 2001 From: Shilei Tian <i...@tianshilei.me> Date: Thu, 10 Apr 2025 13:46:23 -0400 Subject: [PATCH] [AMDGPU][Clang] Add builtins for gfx12 ray tracing intrinsics __builtin_amdgcn_image_bvh8_intersect_ray __builtin_amdgcn_image_bvh_dual_intersect_ray For the above two builtins, the second and third return values of the intrinsics are returned through pointer-type function arguments. __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn For the last builtin, the intrinsic returns `{i64, i32}`, the builtin returns `<2 x i64>`. The second return value of the intrinsic is zext'ed. 
--- clang/include/clang/Basic/BuiltinsAMDGPU.def | 12 +++ clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 66 +++++++++++++++- .../builtins-amdgcn-raytracing.cl | 78 +++++++++++++++++++ 3 files changed, 154 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index cbef637be213a..39fef9e4601f8 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -506,6 +506,18 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "g TARGET_BUILTIN(__builtin_amdgcn_ds_bpermute_fi_b32, "iii", "nc", "gfx12-insts") +// For the following two builtins, the second and third return values of the +// intrinsics are returned through the last two pointer-type function arguments. +TARGET_BUILTIN(__builtin_amdgcn_image_bvh8_intersect_ray, "V10UiWUifUcV3fV3fUiV4UiV3f*V3f*", "nc", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_bvh_dual_intersect_ray, "V10UiWUifUcV3fV3fV2UiV4UiV3f*V3f*", "nc", "gfx12-insts") + +TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn, "V2UiUiUiV4UiIi", "n", "gfx11-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn, "V2UiUiUiV8UiIi", "n", "gfx12-insts") + +// The intrinsic returns {i64, i32}, the builtin returns <2 x i64>. +// The second return value of the intrinsic is zext'ed. +TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi", "n", "gfx12-insts") + //===----------------------------------------------------------------------===// // WMMA builtins. // Postfix w32 indicates the builtin requires wavefront size of 32. 
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index b56b739094ff3..35c9f8ae48c80 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -616,19 +616,81 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir, RayInverseDir, TextureDescr}); } + case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray: + case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray: + IID = Intrinsic::amdgcn_image_bvh8_intersect_ray; + break; + case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: + IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray; + break; + } + llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0)); + llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1)); + llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2)); + llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3)); + llvm::Value *RayDir = EmitScalarExpr(E->getArg(4)); + llvm::Value *Offset = EmitScalarExpr(E->getArg(5)); + llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6)); + + Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7)); + Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8)); + + llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID); + + llvm::CallInst *CI = Builder.CreateCall( + IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir, + Offset, TextureDescr}); + + llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0); + llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1); + llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2); + + Builder.CreateStore(RetRayOrigin, RetRayOriginPtr); + Builder.CreateStore(RetRayDir, RetRayDirPtr); + + return RetVData; + } + + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: + case 
AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn: + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn: + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_rtn; + break; + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn; + break; + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn; + break; + case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: + IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn; + break; + } - case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: { SmallVector<Value *, 4> Args; for (int i = 0, e = E->getNumArgs(); i != e; ++i) Args.push_back(EmitScalarExpr(E->getArg(i))); - Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn); + Function *F = CGM.getIntrinsic(IID); Value *Call = Builder.CreateCall(F, Args); Value *Rtn = Builder.CreateExtractValue(Call, 0); Value *A = Builder.CreateExtractValue(Call, 1); llvm::Type *RetTy = ConvertType(E->getType()); Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn, (uint64_t)0); + // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns + // <2 x i64>, zext the second value. 
+ if (A->getType()->getPrimitiveSizeInBits() < + RetTy->getScalarType()->getPrimitiveSizeInBits()) + A = Builder.CreateZExt(A, RetTy->getScalarType()); + return Builder.CreateInsertElement(I0, A, 1); } case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl index 7f73cdd61c80d..2cf7f3dc6f80e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl @@ -3,6 +3,10 @@ // RUN: -emit-llvm -cl-std=CL2.0 -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S \ // RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=ISA %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm \ +// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12 %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S \ +// RUN: -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12ISA %s // Test llvm.amdgcn.image.bvh.intersect.ray intrinsic. @@ -12,12 +16,18 @@ // Postfix l indicates the 1st argument is i64 and postfix h indicates // the 4/5-th arguments are half4. 
+typedef unsigned char uchar; typedef unsigned int uint; typedef unsigned long ulong; +typedef float float3 __attribute__((ext_vector_type(3))); typedef float float4 __attribute__((ext_vector_type(4))); typedef double double4 __attribute__((ext_vector_type(4))); typedef half half4 __attribute__((ext_vector_type(4))); +typedef uint uint2 __attribute__((ext_vector_type(2))); typedef uint uint4 __attribute__((ext_vector_type(4))); +typedef uint uint8 __attribute__((ext_vector_type(8))); +typedef uint uint10 __attribute__((ext_vector_type(10))); +typedef ulong ulong2 __attribute__((ext_vector_type(2))); // CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32 // ISA: image_bvh_intersect_ray @@ -59,3 +69,71 @@ void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr, ray_origin, ray_dir, ray_inv_dir, texture_descr); } +#if __has_builtin(__builtin_amdgcn_image_bvh8_intersect_ray) +// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh8.intersect.ray( +// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin, +// GFX12: <3 x float> %ray_dir, i32 %offset, <4 x i32> %texture_descr) +// GFX12ISA: image_bvh8_intersect_ray +void test_image_bvh8_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin, + float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask, + float3 ray_origin, float3 ray_dir, uint offset, uint4 texture_descr) +{ + *ret_vdata = __builtin_amdgcn_image_bvh8_intersect_ray(node_ptr, ray_extent, + instance_mask, ray_origin, ray_dir, offset, texture_descr, + ret_ray_origin, ret_ray_dir); +} +#endif + +#if __has_builtin(__builtin_amdgcn_image_bvh_dual_intersect_ray) +// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh.dual.intersect.ray( +// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin, +// GFX12: <3 x float> %ray_dir, <2 x i32> %offset, <4 x i32> %texture_descr) +// GFX12ISA: 
image_bvh_dual_intersect_ray +void test_builtin_amdgcn_image_bvh_dual_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin, + float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask, + float3 ray_origin, float3 ray_dir, uint2 offset, uint4 texture_descr) +{ + *ret_vdata = __builtin_amdgcn_image_bvh_dual_intersect_ray(node_ptr, ray_extent, + instance_mask, ray_origin, ray_dir, offset, texture_descr, + ret_ray_origin, ret_ray_dir); +} +#endif + +#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn) +// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn( +// GFX12: i32 %addr, i32 %data0, <4 x i32> %data1, i32 0) +// GFX12ISA: ds_bvh_stack_push4_pop1_rtn +void test_builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(uint* ret_vdst, uint* ret_addr, + uint addr, uint data0, uint4 data1) +{ + uint2 ret = __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(addr, data0, data1, /*constant offset=*/0); + *ret_vdst = ret.x; + *ret_addr = ret.y; +} +#endif + +#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn) +// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn( +// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0) +// GFX12ISA: ds_bvh_stack_push8_pop1_rtn +void test_builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(uint* ret_vdst, uint* ret_addr, + uint addr, uint data0, uint8 data1) +{ + uint2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(addr, data0, data1, /*constant offset=*/0); + *ret_vdst = ret.x; + *ret_addr = ret.y; +} +#endif + +#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn) +// GFX12: call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn( +// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0) +// GFX12ISA: ds_bvh_stack_push8_pop2_rtn +void test_builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(ulong* ret_vdst, uint* ret_addr, + uint addr, uint data0, uint8 data1) +{ + ulong2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(addr, data0, data1, /*constant offset=*/0); + 
*ret_vdst = ret.x; + *ret_addr = ret.y; +} +#endif _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits