llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-mlir-llvm Author: Krzysztof Drewniak (krzysz00) <details> <summary>Changes</summary> This PR adds an amdgcn_load_to_lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the same API and "gather from a pointer to LDS" is something of an abstract operation. This commit adds the intrinsic and its lowerings for addrspaces 1 and 7, and updates the MLIR wrappers to use it (loosening up the restrictions on loads to LDS along the way to match the ground truth from target features). It also plumbs the intrinsic through to clang. (Any clang folks know why things are broken?) --- Patch is 50.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137425.diff 21 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1) - (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+4) - (modified) clang/lib/Sema/SemaAMDGPU.cpp (+1) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl (+30) - (added) clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl (+60) - (modified) llvm/docs/ReleaseNotes.md (+8) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+21) - (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+5) - (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+20) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+7-1) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll (+75) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll (+220) - (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+18) - (modified) mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td (+7-5) - (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td (+29-6) - (modified) mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp (+7-8) - (modified) 
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp (+15-6) - (modified) mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir (+51-16) - (modified) mlir/test/Dialect/LLVMIR/rocdl.mlir (+10-7) - (modified) mlir/test/Target/LLVMIR/rocdl.mlir (+7-4) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 39fef9e4601f8..730fd15913c11 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts") //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index ad012d98635ff..a32ef1c2a5a12 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -564,6 +564,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); return Builder.CreateCall(F, {Addr}); } + case AMDGPU::BI__builtin_amdgcn_load_to_lds: { + return emitBuiltinWithOneOverloadedType<5>(*this, E, + Intrinsic::amdgcn_load_to_lds); + } case AMDGPU::BI__builtin_amdgcn_get_fpenv: { Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, {llvm::Type::getInt64Ty(getLLVMContext())}); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 
a6366aceec2a6..e6414a623b929 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds: + case AMDGPU::BI__builtin_amdgcn_load_to_lds: case AMDGPU::BI__builtin_amdgcn_global_load_lds: { constexpr const int SizeIdx = 2; llvm::APSInt Size; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl index 8251d6c213e3d..4b73347ac8155 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl @@ -1766,6 +1766,36 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed) *out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1); } +// CHECK-LABEL: @test_load_to_lds_96( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_load_to_lds_96(global void* src, local void *dst) { + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_load_to_lds_128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: store 
ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_load_to_lds_128(global void* src, local void *dst) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); +} + // CHECK-LABEL: @test_global_load_lds_96( // CHECK-NEXT: entry: // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl new file mode 100644 index 0000000000000..6cdedb33bdd80 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl @@ -0,0 +1,60 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +// CHECK-LABEL: @test_load_to_lds_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: 
[[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_load_to_lds_u32(global u32* src, local u32 *dst) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_load_to_lds_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_load_to_lds_u16(global u16* src, local u16 *dst) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_load_to_lds_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, 
addrspace(5) +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_load_to_lds_u8(global u8* src, local u8 *dst) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0); +} diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 6fb206e4df188..d86fc74fe2889 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -102,6 +102,14 @@ Changes to the AMDGPU Backend * Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6. +* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds +intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers +(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be +represented in the IR without needing to use buffer resource intrinsics directly. +This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though +buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is +optional, and there are no plans to deprecate `amdgcn.global.load.lds`. 
+ Changes to the ARM Backend -------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index a57eb4a6dba49..3c9886a01d757 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2641,6 +2641,27 @@ def int_amdgcn_perm : // GFX9 Intrinsics //===----------------------------------------------------------------------===// +/// This is a general-purpose intrinsic for all operations that take a pointer +/// a base location in LDS, and a data size and use it to perform a gather to LDS. +/// This allows abstracting over both global pointers (address space 1) and +/// the buffer-resource-wrapper pointers (address space 7 and 9). +/// TODO: add support for address space 5 and scratch_load_lds. +class AMDGPULoadToLDS : + ClangBuiltin<"__builtin_amdgcn_load_to_lds">, + Intrinsic < + [], + [llvm_anyptr_ty, // Base pointer to load from. Varies per lane. + LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform. 
+ llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) + llvm_i32_ty, // imm offset (applied to both input and LDS address) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0, + // bit 1 = sc1, + // bit 4 = scc)) + [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, + ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand]>; +def int_amdgcn_load_to_lds : AMDGPULoadToLDS; + class AMDGPUGlobalLoadLDS : ClangBuiltin<"__builtin_amdgcn_global_load_lds">, Intrinsic < diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 2fa03e3964207..907b5b7e705d7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2335,6 +2335,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_struct_buffer_load_lds: case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: return selectBufferLoadLds(I); + // Until we can store both the address space of the global and the LDS + // arguments by having tto MachineMemOperands on an intrinsic, we just trust + // that the argument is a global pointer (buffer pointers have been handled by + // a LLVM IR-level lowering). 
+ case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: return selectGlobalLoadLds(I); case Intrinsic::amdgcn_exp_compr: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 7163ad2aa7dca..f86aafdf08f9a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { case Intrinsic::memset: case Intrinsic::memset_inline: case Intrinsic::experimental_memset_pattern: + case Intrinsic::amdgcn_load_to_lds: return true; } } @@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { SplitUsers.insert(&I); return {NewRsrc, Off}; } + case Intrinsic::amdgcn_load_to_lds: { + Value *Ptr = I.getArgOperand(0); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + auto [Rsrc, Off] = getPtrParts(Ptr); + Value *LDSPtr = I.getArgOperand(1); + Value *LoadSize = I.getArgOperand(2); + Value *ImmOff = I.getArgOperand(3); + Value *Aux = I.getArgOperand(4); + Value *SOffset = IRB.getInt32(0); + Instruction *NewLoad = IRB.CreateIntrinsic( + Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {}, + {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux}); + copyMetadata(NewLoad, &I); + SplitUsers.insert(&I); + I.replaceAllUsesWith(NewLoad); + return {nullptr, nullptr}; + } } return {nullptr, nullptr}; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 1d0e81db5a5db..6085c8d584af2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3312,6 +3312,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 6); // soffset return; } + case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { 
applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 2); @@ -5273,6 +5274,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c05ba42d999e9..c686bb00bc286 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1450,6 +1450,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { Info.opc = ISD::INTRINSIC_VOID; unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); @@ -1531,6 +1532,7 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_global_load_tr_b128: Ptr = II->getArgOperand(0); break; + case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: Ptr = II->getArgOperand(1); break; @@ -10219,6 +10221,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(Load, 0); } + // Buffers are handled by LowerBufferFatPointers, and we're going to go + // for "trust me" that the remaining cases are global pointers until + // such time as we can put two mem operands on an intrinsic. 
+ case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { if (!Subtarget->hasVMemToLDSLoad()) return SDValue(); @@ -10249,7 +10255,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, break; } - auto *M = cast<MemSDNode>(Op); SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); SmallVector<SDValue, 6> Ops; @@ -10289,6 +10294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Ops.push_back(M0Val.getValue(0)); // Chain Ops.push_back(M0Val.getValue(1)); // Glue + auto *M = cast<MemSDNode>(Op); MachineMemOperand *LoadMMO = M->getMemOperand(); MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); LoadPtrI.Offset = Op->getConstantOperandVal(5); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll new file mode 100644 index 0000000000000..72ef6963c9976 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s + +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s + +; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.load.to.lds + +; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.load.to.lds), + +;; Note: this is a bare-bones test to make sure that amdgcn.load.to.lds lowers to +;; the correct intrinsic. 
+ +declare void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux) +declare void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux) + +;---------------------------------------------------------------------y +; dwordx3 +;--------------------------------------------------------------------- + +define amdgpu_ps void @global_load_lds_dwordx3_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) { +; GFX950-LABEL: global_load_lds_dwordx3_vaddr_saddr: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0 +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_vaddr_saddr(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) nocapture inreg %lptr) { +; GFX950-LABEL: buffer_load_lds_dwordx3_vaddr_saddr: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX950-NEXT: s_mov_b32 m0, s5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen offset:16 sc0 lds +; GFX950-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1) + ret void +} + +;--------------------------------------------------------------------- +; dwordx4 +;--------------------------------------------------------------------- + +define amdgpu_ps void @global_load_lds_dwordx4_vaddr_saddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture inreg %lptr) { +; GFX950-LABEL: global_load_lds_dwordx4_vaddr_saddr: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: 
global_load_lds_dwordx4 v[0:1], off offset:16 sc0 +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/137425 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits