================
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps void @prefetch_data_sgpr_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) {
+; GCN-LABEL: prefetch_data_sgpr_base_sgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0
+; GCN-NEXT: s_endpgm
+entry:
+  tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_imm_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) {
+; GCN-LABEL: prefetch_data_sgpr_imm_base_sgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x200, s2, 0
+; GCN-NEXT: s_endpgm
+entry:
+  %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
+  tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_base_imm_len(ptr addrspace(4) inreg %ptr) {
+; GCN-LABEL: prefetch_data_sgpr_base_imm_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
+; GCN-NEXT: s_endpgm
+entry:
+  tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 31)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_imm_base_imm_len(ptr addrspace(4) inreg %ptr) {
+; GCN-LABEL: prefetch_data_sgpr_imm_base_imm_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x200, null, 31
+; GCN-NEXT: s_endpgm
+entry:
+  %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
+  tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 31)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) {
+; GCN-LABEL: prefetch_data_vgpr_base_sgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: v_readfirstlane_b32 s3, v1
+; GCN-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0
+; GCN-NEXT: s_endpgm
+entry:
+  tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_imm_base_sgpr_len(ptr addrspace(4) %ptr, i32 inreg %len) {
+; SDAG-LABEL: prefetch_data_vgpr_imm_base_sgpr_len:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; SDAG-NEXT: s_prefetch_data s[2:3], 0x200, s0, 0
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: prefetch_data_vgpr_imm_base_sgpr_len:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x200, v0
+; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GISEL-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0
+; GISEL-NEXT: s_endpgm
+entry:
+  %gep = getelementptr i32, ptr addrspace(4) %ptr, i32 128
+  tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %gep, i32 %len)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_base_vgpr_len(ptr addrspace(4) inreg %ptr, i32 %len) {
+; GCN-LABEL: prefetch_data_sgpr_base_vgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, s2, 0
+; GCN-NEXT: s_endpgm
+entry:
+  tail call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) %ptr, i32 %len)
+  ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_base_imm_len_global(ptr addrspace(1) inreg %ptr) {
+; GCN-LABEL: prefetch_data_sgpr_base_imm_len_global:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 31
----------------
rampitec wrote:
It is a scalar prefetch: it takes an SGPR address and fills the scalar cache. Yes, AS1 is also isFlatGlobal, and a scalar load may be issued for AS1 as well should the address be uniform and the cache read-only. It is really up to the intrinsic user to decide whether prefetching into the scalar cache would be useful. The pointer could then be a VGPR, but that VGPR can still hold a uniform value; that is perfectly legal and does happen. To legalize this you could either issue a readfirstlane or build a waterfall loop, and I cannot imagine anyone wanting a prefetching waterfall loop. Using readfirstlane, on the other hand, is reasonable assuming the value may be dynamically uniform. In the worst case it will prefetch less than a waterfall loop would, but a prefetch is generally discardable.

https://github.com/llvm/llvm-project/pull/107133
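A minimal IR sketch of the VGPR-pointer case being discussed, assuming the addrspace(1) overload of the intrinsic legalizes the same way as the p4 tests quoted above (the function name below is hypothetical and not part of the patch): under amdgpu_ps the pointer arrives in VGPRs, but if it is dynamically uniform the expected lowering is a v_readfirstlane_b32 of each pointer half followed by s_prefetch_data, rather than a waterfall loop.

define amdgpu_ps void @prefetch_data_vgpr_base_global(ptr addrspace(1) %ptr) {
entry:
  ; %ptr is in VGPRs here; a dynamically uniform value is legal and expected,
  ; and the backend is expected to readfirstlane it before the scalar prefetch.
  tail call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) %ptr, i32 31)
  ret void
}

declare void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1), i32)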