================ @@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ---------------- cdevadas wrote:
Unfortunately, that's not happening. The IR load-store-vectorizer doesn't combine the two loads. I still see the two loads after the IR vectorizer, and they become two loads in the selected code. Can this happen because the alignment of the two loads differs and the IR vectorizer conservatively ignores them?

*** IR Dump before Selection ***
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) #0 {
  %local_atomic_fadd_v2bf16_noret.kernarg.segment = call nonnull align 16 dereferenceable(44) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
  %ptr.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %local_atomic_fadd_v2bf16_noret.kernarg.segment, i64 36, !amdgpu.uniform !0
  **%ptr.load = load ptr addrspace(3), ptr addrspace(4) %ptr.kernarg.offset**, align 4, !invariant.load !0
  %data.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %local_atomic_fadd_v2bf16_noret.kernarg.segment, i64 40, !amdgpu.uniform !0
  **%data.load = load <2 x i16>, ptr addrspace(4) %data.kernarg.offset**, align 8, !invariant.load !0
  %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr.load, <2 x i16> %data.load)
  ret void
}

# *** IR Dump After selection ***:
# Machine code for function local_atomic_fadd_v2bf16_noret: IsSSA, TracksLiveness
Function Live Ins: $sgpr0_sgpr1 in %1

bb.0 (%ir-block.0):
  liveins: $sgpr0_sgpr1
  %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
  %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.ptr.kernarg.offset, addrspace 4)
  %4:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1:sgpr_64(p4), 40, 0 :: (dereferenceable invariant load (s32) from %ir.data.kernarg.offset, align 8, addrspace 4)
  %5:vgpr_32 = COPY %3:sreg_32_xm0_xexec
  %6:vgpr_32 = COPY %4:sreg_32_xm0_xexec
  DS_PK_ADD_BF16 killed %5:vgpr_32, killed %6:vgpr_32, 0, 0, implicit $m0, implicit $exec
  S_ENDPGM 0

https://github.com/llvm/llvm-project/pull/96162
_______________________________________________
llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits