llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Stanislav Mekhanoshin (rampitec) <details> <summary>Changes</summary> --- Patch is 20.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150466.diff 12 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+3) - (modified) clang/test/CodeGenOpenCL/amdgpu-features.cl (+1-1) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+19) - (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl (+6-1) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+18) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+3) - (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+19) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+3-1) - (modified) llvm/lib/TargetParser/TargetParser.cpp (+1) - (added) llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir (+33) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll (+100) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll (+100) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 878543566f0e3..0b16e1264ce6b 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -642,6 +642,9 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16 // GFX1250+ only builtins. //===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts") + TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 75e9710f96705..e96dd669788d2 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -108,7 +108,7 @@ // GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32 +// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32 // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index a21862c4a9395..81f39f987e3b6 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -440,6 +440,25 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) { *out = __builtin_amdgcn_permlane16_swap(old, src, false, true); } +// CHECK-LABEL: @test_prefetch( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[GPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[FPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR_ADDR]] to ptr +// CHECK-NEXT: [[GPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[FPTR:%.*]], ptr [[FPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[GPTR:%.*]], ptr [[GPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.flat.prefetch(ptr [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) [[TMP1]], i32 8) +// CHECK-NEXT: ret void +// +void test_prefetch(generic void *fptr, global void *gptr) { + __builtin_amdgcn_flat_prefetch(fptr, 0); + __builtin_amdgcn_global_prefetch(gptr, 8); +} + // CHECK-LABEL: @test_cvt_f32_fp8_e5m3( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl index 9711b3bdded6b..12a0f3c27fca2 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl @@ -1,5 +1,5 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s typedef int v4i __attribute__((ext_vector_type(4))); typedef int v8i __attribute__((ext_vector_type(8))); @@ -36,6 +36,11 @@ void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol) __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}} } +void test_prefetch(generic void *fptr, global void *gptr, int cpol) { + __builtin_amdgcn_flat_prefetch(fptr, cpol); // expected-error {{'__builtin_amdgcn_flat_prefetch' must be a constant integer}} + __builtin_amdgcn_global_prefetch(gptr, cpol); // expected-error {{'__builtin_amdgcn_global_prefetch' must be a constant integer}} +} + void test_cvt_f32_fp8_e5m3(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 8bfa34584c3a4..f313c6b73ce41 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3045,6 +3045,24 @@ def int_amdgcn_ds_bpermute_fi_b32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +def int_amdgcn_flat_prefetch : ClangBuiltin<"__builtin_amdgcn_flat_prefetch">, + Intrinsic<[], + [llvm_ptr_ty, // Pointer + llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope + [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, + IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>], + "", [SDNPMemOperand] + >; + +def int_amdgcn_global_prefetch : ClangBuiltin<"__builtin_amdgcn_global_prefetch">, + Intrinsic<[], + [LLVMQualPointerType<1>, // Pointer + llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope + [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, + IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>], + "", [SDNPMemOperand] + >; + //===----------------------------------------------------------------------===// // Deep learning intrinsics. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 9b05f7c339738..a10dca2f3986a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5437,6 +5437,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: + return getDefaultMappingVOP(MI); default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index db827f4fd7c46..8054ea489ac51 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -2174,6 +2174,25 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f } // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch] +multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> { + def : GCNPat < + (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol) + >; + + def : GCNPat < + (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> { + let AddedComplexity = 11; + } +} + +let SubtargetPredicate = HasVmemPrefInsts in { + // Patterns for target intrinsics + defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>; + defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>; +} // End SubtargetPredicate = HasVmemPrefInsts + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f1a8ee118356e..1d7612cc2077f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1548,7 +1548,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_s_prefetch_data: { + case Intrinsic::amdgcn_s_prefetch_data: + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); Info.ptrVal = CI.getArgOperand(0); diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 4ca7444a73b35..e5c896feb953c 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -451,6 +451,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["permlane16-swap"] = true; Features["ashr-pk-insts"] = true; Features["atomic-buffer-pk-add-bf16-inst"] = true; + Features["vmem-pref-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["atomic-buffer-global-pk-add-f16-insts"] = true; Features["atomic-flat-pk-add-16-insts"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir new file mode 100644 index 0000000000000..8007597a32fbe --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir @@ -0,0 +1,33 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12 + +--- +name: flat_prefetch_flat_load +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GFX12-LABEL: name: flat_prefetch_flat_load + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr +... + +--- +name: global_prefetch_flat_load +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GFX12-LABEL: name: global_prefetch_flat_load + ; GFX12: liveins: $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll new file mode 100644 index 0000000000000..89555d3060883 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s + +declare void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 %col) + +define amdgpu_ps void @flat_prefetch(ptr %ptr) { +; GCN-LABEL: flat_prefetch: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_prefetch_b8 v[0:1] +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0) + ret void +} + +define amdgpu_ps void @flat_prefetch_sgpr(ptr inreg %ptr) { +; GCN-LABEL: flat_prefetch_sgpr: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0) + ret void +} + +define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) { +; GCN-LABEL: flat_prefetch_offset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_prefetch_b8 v[0:1] offset:512 +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr %ptr, i32 128 + tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0) + ret void +} + +define amdgpu_ps void @flat_prefetch_sgpr_voffset(ptr inreg %ptr, i32 %offset) { +; GCN-LABEL: flat_prefetch_sgpr_voffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr %ptr, i32 %offset + tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0) + ret void +} + +define amdgpu_ps void @flat_prefetch_sgpr_voffset_offset(ptr inreg %ptr, i32 %offset) { +; GCN-LABEL: flat_prefetch_sgpr_voffset_offset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 +; GCN-NEXT: s_endpgm +entry: + %gep1 = getelementptr i8, ptr %ptr, i32 %offset + %gep2 = getelementptr i8, ptr %gep1, i32 128 + tail call void @llvm.amdgcn.flat.prefetch(ptr %gep2, i32 0) + ret void +} + +define amdgpu_ps void @flat_prefetch_se(ptr %ptr) { +; GCN-LABEL: flat_prefetch_se: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 8) + ret void +} + +define amdgpu_ps void @flat_prefetch_se_nt(ptr %ptr) { +; GCN-LABEL: flat_prefetch_se_nt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_NT scope:SCOPE_SE +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 9) + ret void +} + +define amdgpu_ps void @flat_prefetch_dev_ht(ptr %ptr) { +; GCN-LABEL: flat_prefetch_dev_ht: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_HT scope:SCOPE_DEV +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 18) + ret void +} + +define amdgpu_ps void @flat_prefetch_sys_lu(ptr %ptr) { +; GCN-LABEL: flat_prefetch_sys_lu: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 27) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll new file mode 100644 index 0000000000000..047a6ccf10d91 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s + +declare void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 %col) + +define amdgpu_ps void @global_prefetch(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_prefetch: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_prefetch_b8 v[0:1], off +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0) + ret void +} + +define amdgpu_ps void @global_prefetch_sgpr(ptr addrspace(1) inreg %ptr) { +; GCN-LABEL: global_prefetch_sgpr: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: global_prefetch_b8 v0, s[0:1] +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0) + ret void +} + +define amdgpu_ps void @global_prefetch_offset(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_prefetch_offset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_prefetch_b8 v[0:1], off offset:512 +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 128 + tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0) + ret void +} + +define amdgpu_ps void @global_prefetch_sgpr_voffset(ptr addrspace(1) inreg %ptr, i32 %offset) { +; GCN-LABEL: global_prefetch_sgpr_voffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_prefetch_b8 v0, s[0:1] +; GCN-NEXT: s_endpgm +entry: + %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset + tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0) + ret void +} + +define amdgpu_ps void @global_prefetch_sgpr_voffset_offset(ptr addrspace(1) inreg %ptr, i32 %offset) { +; GCN-LABEL: global_prefetch_sgpr_voffset_offset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 +; GCN-NEXT: s_endpgm +entry: + %gep1 = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset + %gep2 = getelementptr i8, ptr addrspace(1) %gep1, i32 128 + tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep2, i32 0) + ret void +} + +define amdgpu_ps void @global_prefetch_se(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_prefetch_se: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SE +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 8) + ret void +} + +define amdgpu_ps void @global_prefetch_se_nt(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_prefetch_se_nt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_NT scope:SCOPE_SE +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 9) + ret void +} + +define amdgpu_ps void @global_prefetch_dev_ht(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_prefetch_dev_ht: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_HT scope:SCOPE_DEV +; GCN-NEXT: s_endpgm +entry: + tail call void @llvm.amdgcn.global.prefetch(... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/150466 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits