llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-mlir-gpu Author: Joe Nash (Sisyph) <details> <summary>Changes</summary> Fix the logic in rewriteBuiltinFunctionDecl to work when the builtin has a pointer parameter with an address space and one without a fixed address space. A builtin fitting these criteria was recently added. Change the attribute string to perform type checking on it, so without the sema change compilation would fail with a wrong number of arguments error. --- Patch is 58.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/138141.diff 24 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1) - (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+5) - (modified) clang/lib/Sema/SemaAMDGPU.cpp (+1) - (modified) clang/lib/Sema/SemaExpr.cpp (+2-4) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl (+30) - (added) clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl (+60) - (added) clang/test/SemaOpenCL/builtins-amdgcn-load-to-lds-err.cl (+25) - (modified) llvm/docs/AMDGPUUsage.rst (+9-1) - (modified) llvm/docs/ReleaseNotes.md (+8) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+20) - (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+5) - (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+20) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+7-1) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll (+128) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll (+220) - (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+18) - (modified) mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td (+7-5) - (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td (+29-6) - (modified) mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp (+7-8) - (modified) mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp (+15-6) - (modified) 
mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir (+51-16) - (modified) mlir/test/Dialect/LLVMIR/rocdl.mlir (+10-7) - (modified) mlir/test/Target/LLVMIR/rocdl.mlir (+7-4) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 39fef9e4601f8..802b4be42419d 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") +TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "", "vmem-to-lds-load-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts") //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index ad012d98635ff..d1c722c9dc610 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -564,6 +564,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); return Builder.CreateCall(F, {Addr}); } + case AMDGPU::BI__builtin_amdgcn_load_to_lds: { + // Should this have asan instrumentation? 
+ return emitBuiltinWithOneOverloadedType<5>(*this, E, + Intrinsic::amdgcn_load_to_lds); + } case AMDGPU::BI__builtin_amdgcn_get_fpenv: { Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, {llvm::Type::getInt64Ty(getLLVMContext())}); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index a6366aceec2a6..e6414a623b929 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds: + case AMDGPU::BI__builtin_amdgcn_load_to_lds: case AMDGPU::BI__builtin_amdgcn_global_load_lds: { constexpr const int SizeIdx = 2; llvm::APSInt Size; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 283d910a09d54..1be03327ae915 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6311,7 +6311,8 @@ static FunctionDecl *rewriteBuiltinFunctionDecl(Sema *Sema, ASTContext &Context, return nullptr; Expr *Arg = ArgRes.get(); QualType ArgType = Arg->getType(); - if (!ParamType->isPointerType() || ParamType.hasAddressSpace() || + if (!ParamType->isPointerType() || + ParamType->getPointeeType().hasAddressSpace() || !ArgType->isPointerType() || !ArgType->getPointeeType().hasAddressSpace() || isPtrSizeAddressSpace(ArgType->getPointeeType().getAddressSpace())) { @@ -6320,9 +6321,6 @@ static FunctionDecl *rewriteBuiltinFunctionDecl(Sema *Sema, ASTContext &Context, } QualType PointeeType = ParamType->getPointeeType(); - if (PointeeType.hasAddressSpace()) - continue; - NeedsNewDecl = true; LangAS AS = ArgType->getPointeeType().getAddressSpace(); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl index 8251d6c213e3d..3d81893553c65 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl @@ -1766,6 +1766,36 @@ void 
test_cvt_sr_f16_f32(global half2 *out, float src, uint seed) *out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1); } +// CHECK-LABEL: @test_load_to_lds_96( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_load_to_lds_96(global void* src, local void *dst) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_load_to_lds_128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_load_to_lds_128(global void* src, local void *dst) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); +} + // CHECK-LABEL: @test_global_load_lds_96( // CHECK-NEXT: entry: // 
CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl new file mode 100644 index 0000000000000..6cdedb33bdd80 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl @@ -0,0 +1,60 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +// CHECK-LABEL: @test_load_to_lds_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_load_to_lds_u32(global u32* src, local u32 *dst) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0); +} + +// 
CHECK-LABEL: @test_load_to_lds_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void test_load_to_lds_u16(global u16* src, local u16 *dst) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_load_to_lds_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 +// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0) +// CHECK-NEXT: ret void +// +void 
test_load_to_lds_u8(global u8* src, local u8 *dst) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-load-to-lds-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-load-to-lds-err.cl new file mode 100644 index 0000000000000..d93d724212077 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-load-to-lds-err.cl @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -verify=gfx,expected -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx,expected -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -verify=gfx,expected -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned int u32; + +void test_load_to_lds_unsupported_size(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) { + __builtin_amdgcn_load_to_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}} + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}} + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}} + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + 
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} +} + +__attribute__((target("gfx950-insts"))) +void test_load_to_lds_via_target_feature(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) { + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); + __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); +} diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index d1535960a0257..3ee0f8cae3fc2 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1216,7 +1216,15 @@ The AMDGPU backend implements the following LLVM IR intrinsics. The format is a 64-bit concatenation of the MODE and TRAPSTS registers. :ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specifies state. - + llvm.amdgcn.load.to.lds.p<1/7> Loads values from global memory (either in the form of a global pointer or + a raw fat buffer pointer) to LDS. The size of the data copied can be 1, 2, + or 4 bytes (and gfx950 also allows 12 or 16 bytes). The LDS pointer + argument should be wavefront-uniform; the global pointer need not be. + The LDS pointer is implicitly offset by 4 * lane_id bytes for sizes <= 4 bytes + and 16 * lane_id bytes for larger sizes. This lowers to `global_load_lds`, + `buffer_load_* ... lds`, or `global_load__* ... 
lds` depending on address + space and architecture. `amdgcn.global.load.lds` has the same semantics as + `amdgcn.load.to.lds.p1`. llvm.amdgcn.readfirstlane Provides direct access to v_readfirstlane_b32. Returns the value in the lowest active lane of the input operand. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 6fb206e4df188..d86fc74fe2889 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -102,6 +102,14 @@ Changes to the AMDGPU Backend * Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6. +* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds +intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers +(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be +represented in the IR without needing to use buffer resource intrinsics directly. +This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though +buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is +optional, and there are no plans to deprecate `amdgcn.global.load.lds`. + Changes to the ARM Backend -------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index a57eb4a6dba49..7939ef0cf8620 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2641,6 +2641,26 @@ def int_amdgcn_perm : // GFX9 Intrinsics //===----------------------------------------------------------------------===// +/// This is a general-purpose intrinsic for all operations that take a pointer, +/// a base location in LDS, and a data size and use it to perform a gather to LDS. 
+/// This allows abstracting over both global pointers (address space 1) and +/// the buffer-resource-wrapper pointers (address spaces 7 and 9). +/// TODO: add support for address space 5 and scratch_load_lds. +class AMDGPULoadToLDS : + Intrinsic < + [], + [llvm_anyptr_ty, // Base pointer to load from. Varies per lane. + LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform. + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) + llvm_i32_ty, // imm offset (applied to both input and LDS address) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0, + // bit 1 = sc1, + // bit 4 = scc)) + [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, + ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand]>; +def int_amdgcn_load_to_lds : AMDGPULoadToLDS; + class AMDGPUGlobalLoadLDS : ClangBuiltin<"__builtin_amdgcn_global_load_lds">, Intrinsic < diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 2fa03e3964207..907b5b7e705d7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2335,6 +2335,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_struct_buffer_load_lds: case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: return selectBufferLoadLds(I); + // Until we can store both the address space of the global and the LDS + // arguments by having two MachineMemOperands on an intrinsic, we just trust + // that the argument is a global pointer (buffer pointers have been handled by + // an LLVM IR-level lowering). 
+ case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: return selectGlobalLoadLds(I); case Intrinsic::amdgcn_exp_compr: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 7163ad2aa7dca..f86aafdf08f9a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { case Intrinsic::memset: case Intrinsic::memset_inline: case Intrinsic::experimental_memset_pattern: + case Intrinsic::amdgcn_load_to_lds: return true; } } @@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { SplitUsers.insert(&I); return {NewRsrc, Off}; } + case Intrinsic::amdgcn_load_to_lds: { + Value *Ptr = I.getArgOperand(0); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + auto [Rsrc, Off] = getPtrParts(Ptr); + Value *LDSPtr = I.getArgOperand(1); + Value *LoadSize = I.getArgOperand(2); + Value *ImmOff = I.getArgOperand(3); + Value *Aux = I.getArgOperand(4); + Value *SOffset = IRB.getInt32(0); + Instruction *NewLoad = IRB.CreateIntrinsic( + Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {}, + {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux}); + copyMetadata(NewLoad, &I); + SplitUsers.insert(&I); + I.replaceAllUsesWith(NewLoad); + return {nullptr, nullptr}; + } } return {nullptr, nullptr}; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 1d0e81db5a5db..6085c8d584af2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3312,6 +3312,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 6); // soffset return; } + case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { 
applyDefaultMapping(OpdMapper); constrainOpWithR... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/138141 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits