https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/96739
>From 864e3bbfc5f40bfb1e87f7689ede0d5f33aa42da Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Tue, 11 Jun 2024 11:46:15 +0200 Subject: [PATCH] AMDGPU: Remove ds_fmin/ds_fmax intrinsics These have been replaced with atomicrmw. --- llvm/docs/ReleaseNotes.rst | 5 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 14 - llvm/lib/IR/AutoUpgrade.cpp | 8 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 32 - llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 3 - .../Target/AMDGPU/AMDGPUSearchableTables.td | 2 - .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 20 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 15 +- llvm/test/Bitcode/amdgcn-atomic.ll | 52 + .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll | 371 ----- .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll | 279 ---- .../CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll | 1418 ----------------- 12 files changed, 65 insertions(+), 2154 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 416b3952f1ac4..ed7d252668850 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -132,6 +132,11 @@ Changes to the AMDGPU Backend * Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>` +* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and + ``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the + :ref:`atomicrmw <i_atomicrmw>` instruction with ``fadd``, ``fmin`` and + ``fmax`` with addrspace(3) instead. 
+ Changes to the ARM Backend -------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index d040aa8f38278..71b1e832bde3c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -523,17 +523,6 @@ def int_amdgcn_fmad_ftz : [IntrNoMem, IntrSpeculatable] >; -class AMDGPULDSIntrin : - Intrinsic<[llvm_any_ty], - [LLVMQualPointerType<3>, - LLVMMatchType<0>, - llvm_i32_ty, // ordering - llvm_i32_ty, // scope - llvm_i1_ty], // isVolatile - [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, - ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree] ->; - // FIXME: The m0 argument should be moved after the normal arguments class AMDGPUDSOrderedIntrinsic : Intrinsic< [llvm_i32_ty], @@ -571,9 +560,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; -def int_amdgcn_ds_fmin : AMDGPULDSIntrin; -def int_amdgcn_ds_fmax : AMDGPULDSIntrin; - } // TargetPrefix = "amdgcn" // New-style image intrinsics diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d7825d9b3e3e5..32076a07d30e7 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1033,8 +1033,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, break; // No other 'amdgcn.atomic.*' } - if (Name.starts_with("ds.fadd")) { - // Replaced with atomicrmw fadd, so there's no new declaration. + if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") || + Name.starts_with("ds.fmax")) { + // Replaced with atomicrmw fadd/fmin/fmax, so there's no new + // declaration. 
NewFn = nullptr; return true; } @@ -2347,6 +2349,8 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, AtomicRMWInst::BinOp RMWOp = StringSwitch<AtomicRMWInst::BinOp>(Name) .StartsWith("ds.fadd", AtomicRMWInst::FAdd) + .StartsWith("ds.fmin", AtomicRMWInst::FMin) + .StartsWith("ds.fmax", AtomicRMWInst::FMax) .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap) .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index f1254b2e9e1d2..dc165d65fa6ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5401,35 +5401,6 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, return true; } -static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { - switch (IID) { - case Intrinsic::amdgcn_ds_fmin: - return AMDGPU::G_ATOMICRMW_FMIN; - case Intrinsic::amdgcn_ds_fmax: - return AMDGPU::G_ATOMICRMW_FMAX; - default: - llvm_unreachable("not a DS FP intrinsic"); - } -} - -bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, - MachineInstr &MI, - Intrinsic::ID IID) const { - GISelChangeObserver &Observer = Helper.Observer; - Observer.changingInstr(MI); - - MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); - - // The remaining operands were used to set fields in the MemOperand on - // construction. - for (int I = 6; I > 3; --I) - MI.removeOperand(I); - - MI.removeOperand(1); // Remove the intrinsic ID. 
- Observer.changedInstr(MI); - return true; -} - // TODO: Fix pointer type handling bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, @@ -7451,9 +7422,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::amdgcn_rsq_clamp: return legalizeRsqClampIntrinsic(MI, MRI, B); - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: - return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntrinsic(MI, B); case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index ae01bb29c1108..db1c5874093a7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -175,9 +175,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; - bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, - MachineInstr &MI, Intrinsic::ID IID) const; - bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index ed5bae3e4ff61..a323f63767737 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -252,8 +252,6 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>; -def : SourceOfDivergence<int_amdgcn_ds_fmin>; -def : SourceOfDivergence<int_amdgcn_ds_fmax>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>; def : 
SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 1192b49fd1f08..8882839ed8de3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -501,9 +501,7 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const { switch (Inst->getIntrinsicID()) { case Intrinsic::amdgcn_ds_ordered_add: - case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { + case Intrinsic::amdgcn_ds_ordered_swap: { auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2)); auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4)); if (!Ordering || !Volatile) @@ -1018,8 +1016,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, Intrinsic::ID IID) const { switch (IID) { - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_is_shared: case Intrinsic::amdgcn_is_private: case Intrinsic::amdgcn_flat_atomic_fadd: @@ -1039,20 +1035,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *NewV) const { auto IntrID = II->getIntrinsicID(); switch (IntrID) { - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { - const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4)); - if (!IsVolatile->isZero()) - return nullptr; - Module *M = II->getParent()->getParent()->getParent(); - Type *DestTy = II->getType(); - Type *SrcTy = NewV->getType(); - Function *NewDecl = - Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy}); - II->setArgOperand(0, NewV); - II->setCalledFunction(NewDecl); - return II; - } case Intrinsic::amdgcn_is_shared: case Intrinsic::amdgcn_is_private: { 
unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ? diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 83bfb622ee525..16fa7266a4b7d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1279,9 +1279,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, switch (IntrID) { case Intrinsic::amdgcn_ds_ordered_add: - case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { + case Intrinsic::amdgcn_ds_ordered_swap: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1450,8 +1448,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: - case Intrinsic::amdgcn_ds_fmax: - case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_flat_atomic_fadd: @@ -8899,15 +8895,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { - MemSDNode *M = cast<MemSDNode>(Op); - unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? 
ISD::ATOMIC_LOAD_FMIN - : ISD::ATOMIC_LOAD_FMAX; - return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0), - M->getOperand(2), M->getOperand(3), - M->getMemOperand()); - } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll index 311bd8863859b..ed7b04a2f3146 100644 --- a/llvm/test/Bitcode/amdgcn-atomic.ll +++ b/llvm/test/Bitcode/amdgcn-atomic.ll @@ -248,4 +248,56 @@ define <2 x i16> @upgrade_amdgcn_ds_fadd_v2bf16__missing_args_as_i16(ptr addrspa ret <2 x i16> %result0 } +declare float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg) +declare double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) nocapture, double, i32 immarg, i32 immarg, i1 immarg) + +define float @upgrade_amdgcn_ds_fmin_f32(ptr addrspace(3) %ptr, float %val) { + ; CHECK: atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4 + %result0 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) + + ; CHECK: = atomicrmw volatile fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4 + %result1 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true) + + ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4 + %result2 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 43, i32 3, i1 false) + + ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") acquire, align 4 + %result3 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 4, i32 2, i1 false) + + ret float %result3 +} + +define double @upgrade_amdgcn_ds_fmin_f64(ptr addrspace(3) %ptr, double %val) { + ; CHECK: atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 
8 + %result0 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 false) + + ; CHECK: = atomicrmw volatile fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8 + %result1 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 true) + + ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8 + %result2 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 43, i32 3, i1 false) + + ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") acquire, align 8 + %result3 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 4, i32 2, i1 false) + + ret double %result3 +} + +declare float @llvm.amdgcn.ds.fmin(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg) + +define float @upgrade_amdgcn_ds_fmin_f32_no_suffix(ptr addrspace(3) %ptr, float %val) { + ; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4 + + %result0 = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) + ret float %result0 +} + +declare float @llvm.amdgcn.ds.fmax(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg) + +define float @upgrade_amdgcn_ds_fmax_f32_no_suffix(ptr addrspace(3) %ptr, float %val) { + ; CHECK: = atomicrmw fmax ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4 + %result0 = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) + ret float %result0 +} + attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll deleted file mode 100644 index e4c4f42b137ef..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll +++ /dev/null @@ -1,371 +0,0 @@ -; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s - -; Make sure the memory operand information is preserved. -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8-MIR %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9-MIR %s - - -define amdgpu_ps float @ds_fmax_f32_ss(ptr addrspace(3) inreg %ptr, float inreg %val) { -; GFX8-LABEL: ds_fmax_f32_ss: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: ds_fmax_f32_ss: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ; return to shader part epilog - ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss - ; GFX8-MIR: bb.1 (%ir-block.0): - ; GFX8-MIR-NEXT: liveins: $sgpr2, $sgpr3 - ; GFX8-MIR-NEXT: {{ $}} - ; GFX8-MIR-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-MIR-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX8-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX8-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GFX8-MIR-NEXT: $m0 = S_MOV_B32 -1 - ; GFX8-MIR-NEXT: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY2]], [[COPY3]], 0, 0, implicit $m0, implicit 
$exec :: (load store (s32) on %ir.ptr, addrspace 3) - ; GFX8-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_]] - ; GFX8-MIR-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss - ; GFX9-MIR: bb.1 (%ir-block.0): - ; GFX9-MIR-NEXT: liveins: $sgpr2, $sgpr3 - ; GFX9-MIR-NEXT: {{ $}} - ; GFX9-MIR-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX9-MIR-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX9-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX9-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GFX9-MIR-NEXT: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY2]], [[COPY3]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3) - ; GFX9-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]] - ; GFX9-MIR-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) - ret float %ret -} - -define amdgpu_ps float @ds_fmax_f32_ss_offset(ptr addrspace(3) inreg %ptr, float inreg %val) { -; GFX8-LABEL: ds_fmax_f32_ss_offset: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: ds_fmax_f32_ss_offset: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ; return to shader part epilog - ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_offset - ; GFX8-MIR: bb.1 (%ir-block.0): - ; GFX8-MIR-NEXT: liveins: $sgpr2, $sgpr3 - ; GFX8-MIR-NEXT: {{ $}} - ; GFX8-MIR-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-MIR-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX8-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GFX8-MIR-NEXT: $m0 = S_MOV_B32 -1 - ; GFX8-MIR-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX8-MIR-NEXT: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY3]], [[COPY2]], 512, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3) - ; GFX8-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_]] - ; GFX8-MIR-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_offset - ; GFX9-MIR: bb.1 (%ir-block.0): - ; GFX9-MIR-NEXT: liveins: $sgpr2, $sgpr3 - ; GFX9-MIR-NEXT: {{ $}} - ; GFX9-MIR-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX9-MIR-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX9-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GFX9-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX9-MIR-NEXT: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY3]], [[COPY2]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3) - ; GFX9-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]] - ; GFX9-MIR-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128 - %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false) - ret float %ret -} - -define amdgpu_ps void @ds_fmax_f32_ss_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) { -; GFX8-LABEL: ds_fmax_f32_ss_nortn: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_f32 v0, v1 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: ds_fmax_f32_ss_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: ds_max_f32 v0, v1 -; GFX9-NEXT: s_endpgm - ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_nortn - ; GFX8-MIR: bb.1 (%ir-block.0): - ; GFX8-MIR-NEXT: liveins: $sgpr2, $sgpr3 - ; GFX8-MIR-NEXT: {{ $}} - ; GFX8-MIR-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-MIR-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX8-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY 
[[COPY]] - ; GFX8-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GFX8-MIR-NEXT: $m0 = S_MOV_B32 -1 - ; GFX8-MIR-NEXT: DS_MAX_F32 [[COPY2]], [[COPY3]], 0, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3) - ; GFX8-MIR-NEXT: S_ENDPGM 0 - ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_nortn - ; GFX9-MIR: bb.1 (%ir-block.0): - ; GFX9-MIR-NEXT: liveins: $sgpr2, $sgpr3 - ; GFX9-MIR-NEXT: {{ $}} - ; GFX9-MIR-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX9-MIR-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX9-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX9-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GFX9-MIR-NEXT: DS_MAX_F32_gfx9 [[COPY2]], [[COPY3]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3) - ; GFX9-MIR-NEXT: S_ENDPGM 0 - %unused = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) - ret void -} - -define amdgpu_ps void @ds_fmax_f32_ss_offset_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) { -; GFX8-LABEL: ds_fmax_f32_ss_offset_nortn: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_f32 v1, v0 offset:512 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: ds_fmax_f32_ss_offset_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_max_f32 v1, v0 offset:512 -; GFX9-NEXT: s_endpgm - ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_offset_nortn - ; GFX8-MIR: bb.1 (%ir-block.0): - ; GFX8-MIR-NEXT: liveins: $sgpr2, $sgpr3 - ; GFX8-MIR-NEXT: {{ $}} - ; GFX8-MIR-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX8-MIR-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX8-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GFX8-MIR-NEXT: $m0 = S_MOV_B32 -1 - ; GFX8-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX8-MIR-NEXT: DS_MAX_F32 [[COPY3]], [[COPY2]], 512, 0, implicit $m0, implicit $exec :: (load 
store (s32) on %ir.gep, addrspace 3) - ; GFX8-MIR-NEXT: S_ENDPGM 0 - ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_offset_nortn - ; GFX9-MIR: bb.1 (%ir-block.0): - ; GFX9-MIR-NEXT: liveins: $sgpr2, $sgpr3 - ; GFX9-MIR-NEXT: {{ $}} - ; GFX9-MIR-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; GFX9-MIR-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; GFX9-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GFX9-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GFX9-MIR-NEXT: DS_MAX_F32_gfx9 [[COPY3]], [[COPY2]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3) - ; GFX9-MIR-NEXT: S_ENDPGM 0 - %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128 - %unused = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false) - ret void -} - -define float @ds_fmax_f32_vv(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmax_f32_vv: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: ds_fmax_f32_vv: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] - ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv - ; GFX8-MIR: bb.1 (%ir-block.0): - ; GFX8-MIR-NEXT: liveins: $vgpr0, $vgpr1 - ; GFX8-MIR-NEXT: {{ $}} - ; GFX8-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8-MIR-NEXT: $m0 = S_MOV_B32 -1 - ; GFX8-MIR-NEXT: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3) - ; GFX8-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_]] - ; GFX8-MIR-NEXT: SI_RETURN implicit $vgpr0 - ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv - ; GFX9-MIR: bb.1 (%ir-block.0): - ; GFX9-MIR-NEXT: 
liveins: $vgpr0, $vgpr1 - ; GFX9-MIR-NEXT: {{ $}} - ; GFX9-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-MIR-NEXT: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3) - ; GFX9-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]] - ; GFX9-MIR-NEXT: SI_RETURN implicit $vgpr0 - %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) - ret float %ret -} - -define float @ds_fmax_f32_vv_offset(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmax_f32_vv_offset: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: ds_fmax_f32_vv_offset: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] - ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_offset - ; GFX8-MIR: bb.1 (%ir-block.0): - ; GFX8-MIR-NEXT: liveins: $vgpr0, $vgpr1 - ; GFX8-MIR-NEXT: {{ $}} - ; GFX8-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8-MIR-NEXT: $m0 = S_MOV_B32 -1 - ; GFX8-MIR-NEXT: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 512, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3) - ; GFX8-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_]] - ; GFX8-MIR-NEXT: SI_RETURN implicit $vgpr0 - ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_offset - ; GFX9-MIR: bb.1 (%ir-block.0): - ; GFX9-MIR-NEXT: liveins: $vgpr0, $vgpr1 - ; GFX9-MIR-NEXT: {{ $}} - ; GFX9-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-MIR-NEXT: 
[[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3) - ; GFX9-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]] - ; GFX9-MIR-NEXT: SI_RETURN implicit $vgpr0 - %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128 - %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false) - ret float %ret -} - -define void @ds_fmax_f32_vv_nortn(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmax_f32_vv_nortn: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_f32 v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: ds_fmax_f32_vv_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_max_f32 v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] - ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_nortn - ; GFX8-MIR: bb.1 (%ir-block.0): - ; GFX8-MIR-NEXT: liveins: $vgpr0, $vgpr1 - ; GFX8-MIR-NEXT: {{ $}} - ; GFX8-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8-MIR-NEXT: $m0 = S_MOV_B32 -1 - ; GFX8-MIR-NEXT: DS_MAX_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3) - ; GFX8-MIR-NEXT: SI_RETURN - ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_nortn - ; GFX9-MIR: bb.1 (%ir-block.0): - ; GFX9-MIR-NEXT: liveins: $vgpr0, $vgpr1 - ; GFX9-MIR-NEXT: {{ $}} - ; GFX9-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-MIR-NEXT: DS_MAX_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store (s32) on %ir.ptr, addrspace 3) - ; GFX9-MIR-NEXT: SI_RETURN - %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) - ret void -} - -define void 
@ds_fmax_f32_vv_offset_nortn(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmax_f32_vv_offset_nortn: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_f32 v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: ds_fmax_f32_vv_offset_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_max_f32 v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] - ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_offset_nortn - ; GFX8-MIR: bb.1 (%ir-block.0): - ; GFX8-MIR-NEXT: liveins: $vgpr0, $vgpr1 - ; GFX8-MIR-NEXT: {{ $}} - ; GFX8-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8-MIR-NEXT: $m0 = S_MOV_B32 -1 - ; GFX8-MIR-NEXT: DS_MAX_F32 [[COPY]], [[COPY1]], 512, 0, implicit $m0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3) - ; GFX8-MIR-NEXT: SI_RETURN - ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_offset_nortn - ; GFX9-MIR: bb.1 (%ir-block.0): - ; GFX9-MIR-NEXT: liveins: $vgpr0, $vgpr1 - ; GFX9-MIR-NEXT: {{ $}} - ; GFX9-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-MIR-NEXT: DS_MAX_F32_gfx9 [[COPY]], [[COPY1]], 512, 0, implicit $exec :: (load store (s32) on %ir.gep, addrspace 3) - ; GFX9-MIR-NEXT: SI_RETURN - %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128 - %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false) - ret void -} - -define float @ds_fmax_f32_vv_volatile(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmax_f32_vv_volatile: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: 
ds_fmax_f32_vv_volatile: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] - ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_volatile - ; GFX8-MIR: bb.1 (%ir-block.0): - ; GFX8-MIR-NEXT: liveins: $vgpr0, $vgpr1 - ; GFX8-MIR-NEXT: {{ $}} - ; GFX8-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX8-MIR-NEXT: $m0 = S_MOV_B32 -1 - ; GFX8-MIR-NEXT: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY]], [[COPY1]], 0, 0, implicit $m0, implicit $exec :: (volatile load store (s32) on %ir.ptr, addrspace 3) - ; GFX8-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_]] - ; GFX8-MIR-NEXT: SI_RETURN implicit $vgpr0 - ; GFX9-MIR-LABEL: name: ds_fmax_f32_vv_volatile - ; GFX9-MIR: bb.1 (%ir-block.0): - ; GFX9-MIR-NEXT: liveins: $vgpr0, $vgpr1 - ; GFX9-MIR-NEXT: {{ $}} - ; GFX9-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-MIR-NEXT: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (volatile load store (s32) on %ir.ptr, addrspace 3) - ; GFX9-MIR-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]] - ; GFX9-MIR-NEXT: SI_RETURN implicit $vgpr0 - %ret = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true) - ret float %ret -} - -declare float @llvm.amdgcn.ds.fmax(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg) #0 - -attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll deleted file mode 100644 index 0f6fb5acd56ad..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll +++ /dev/null @@ -1,279 +0,0 @@ -; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s - -define amdgpu_ps float @ds_fmin_f32_ss(ptr addrspace(3) inreg %ptr, float inreg %val) { -; GFX8-LABEL: ds_fmin_f32_ss: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: ds_fmin_f32_ss: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: ds_fmin_f32_ss: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: ds_fmin_f32_ss: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ; return to shader part epilog - %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) - ret float %ret -} - -define amdgpu_ps float @ds_fmin_f32_ss_offset(ptr addrspace(3) inreg %ptr, float inreg %val) { -; GFX8-LABEL: 
ds_fmin_f32_ss_offset: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: ds_fmin_f32_ss_offset: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: ds_fmin_f32_ss_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: ds_fmin_f32_ss_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ; return to shader part epilog - %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128 - %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false) - ret float %ret -} - -define amdgpu_ps void @ds_fmin_f32_ss_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) { -; GFX8-LABEL: ds_fmin_f32_ss_nortn: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_f32 v0, v1 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: ds_fmin_f32_ss_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: ds_min_f32 v0, v1 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: ds_fmin_f32_ss_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: ds_min_f32 v0, v1 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: ds_fmin_f32_ss_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: ds_min_f32 v0, v1 -; GFX11-NEXT: s_endpgm - %unused = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) - ret void -} - -define amdgpu_ps void @ds_fmin_f32_ss_offset_nortn(ptr addrspace(3) inreg %ptr, float inreg %val) { -; GFX8-LABEL: ds_fmin_f32_ss_offset_nortn: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_f32 v1, v0 offset:512 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: ds_fmin_f32_ss_offset_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_min_f32 v1, v0 offset:512 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: ds_fmin_f32_ss_offset_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: ds_min_f32 v1, v0 offset:512 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: ds_fmin_f32_ss_offset_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: ds_min_f32 v1, v0 offset:512 -; GFX11-NEXT: s_endpgm - %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128 - %unused = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false) - ret void -} - -define float @ds_fmin_f32_vv(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmin_f32_vv: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: ds_fmin_f32_vv: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: ds_fmin_f32_vv: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10PLUS-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] - %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) - ret float %ret -} - -define float @ds_fmin_f32_vv_offset(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmin_f32_vv_offset: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: ds_fmin_f32_vv_offset: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: ds_fmin_f32_vv_offset: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 -; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128 - %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false) - ret float %ret -} - -define void @ds_fmin_f32_vv_nortn(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmin_f32_vv_nortn: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_f32 v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: ds_fmin_f32_vv_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_min_f32 v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: ds_fmin_f32_vv_nortn: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: ds_min_f32 v0, v1 -; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10PLUS-NEXT: s_setpc_b64 s[30:31] - %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false) - ret void -} - -define void @ds_fmin_f32_vv_offset_nortn(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmin_f32_vv_offset_nortn: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_f32 v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: ds_fmin_f32_vv_offset_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_min_f32 v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: ds_fmin_f32_vv_offset_nortn: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: ds_min_f32 v0, v1 offset:512 -; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128 - %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false) - ret void -} - -define float @ds_fmin_f32_vv_volatile(ptr addrspace(3) %ptr, float %val) { -; GFX8-LABEL: ds_fmin_f32_vv_volatile: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: ds_fmin_f32_vv_volatile: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: ds_fmin_f32_vv_volatile: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] - %ret = call float 
@llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true) - ret float %ret -} - -declare float @llvm.amdgcn.ds.fmin(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg) #0 - -attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll deleted file mode 100644 index 142a6ed19daf8..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll +++ /dev/null @@ -1,1418 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s - -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_SI %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=G_VI %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=G_GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX10 
%s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX11 %s - -declare float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) nocapture, float, i32, i32, i1) -declare float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) nocapture, float, i32, i32, i1) -declare double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) -declare double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) - - -define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) { -; SI-LABEL: lds_ds_fmin: -; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s7, 0xe8f000 -; SI-NEXT: s_add_u32 s4, s4, s3 -; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s3, s2, 4 -; SI-NEXT: s_lshl_b32 s2, s2, 3 -; SI-NEXT: s_add_i32 s2, s2, 32 -; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: s_mov_b32 m0, -1 -; SI-NEXT: ds_min_rtn_f32 v1, v1, v0 -; SI-NEXT: s_add_i32 s2, s3, 64 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: ds_min_f32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: ds_min_rtn_f32 v0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; SI-NEXT: s_endpgm -; -; GFX7-LABEL: lds_ds_fmin: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 -; GFX7-NEXT: s_add_u32 s4, s4, s3 -; GFX7-NEXT: s_addc_u32 s5, s5, 0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: 
s_lshl_b32 s3, s2, 3 -; GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 -; GFX7-NEXT: s_lshl_b32 s2, s2, 4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: ds_min_f32 v2, v0 offset:64 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; GFX7-NEXT: s_endpgm -; -; VI-LABEL: lds_ds_fmin: -; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s2, 3 -; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 -; VI-NEXT: s_lshl_b32 s2, s2, 4 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: ds_min_f32 v2, v0 offset:64 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: s_waitcnt lgkmcnt(1) -; VI-NEXT: ds_min_rtn_f32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: lds_ds_fmin: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; GFX9-NEXT: 
v_mov_b32_e32 v1, s0 -; GFX9-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 -; GFX9-NEXT: s_lshl_b32 s0, s4, 4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: ds_min_f32 v2, v0 offset:64 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: lds_ds_fmin: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-NEXT: s_add_u32 s8, s8, s3 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x42280000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s0, s4, 3 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 -; GFX10-NEXT: ds_min_f32 v2, v0 offset:64 -; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: ds_min_rtn_f32 v0, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: lds_ds_fmin: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s3, s2, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: s_lshl_b32 s2, s2, 4 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: ds_min_rtn_f32 v1, v1, v0 
offset:32 -; GFX11-NEXT: ds_min_f32 v2, v0 offset:64 -; GFX11-NEXT: s_waitcnt lgkmcnt(1) -; GFX11-NEXT: ds_min_rtn_f32 v0, v3, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: scratch_store_b32 off, v0, s0 -; GFX11-NEXT: s_endpgm -; -; G_SI-LABEL: lds_ds_fmin: -; G_SI: ; %bb.0: -; G_SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; G_SI-NEXT: s_mov_b32 s6, -1 -; G_SI-NEXT: s_mov_b32 s7, 0xe8f000 -; G_SI-NEXT: s_add_u32 s4, s4, s3 -; G_SI-NEXT: s_addc_u32 s5, s5, 0 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: s_add_i32 s2, s2, 4 -; G_SI-NEXT: s_lshl_b32 s3, s2, 3 -; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 -; G_SI-NEXT: s_mov_b32 m0, -1 -; G_SI-NEXT: ds_min_rtn_f32 v1, v1, v0 -; G_SI-NEXT: s_lshl_b32 s2, s2, 4 -; G_SI-NEXT: v_mov_b32_e32 v2, s2 -; G_SI-NEXT: ds_min_f32 v2, v0 -; G_SI-NEXT: v_mov_b32_e32 v0, s1 -; G_SI-NEXT: s_waitcnt lgkmcnt(1) -; G_SI-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_SI-NEXT: v_mov_b32_e32 v1, s0 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; G_SI-NEXT: s_endpgm -; -; G_GFX7-LABEL: lds_ds_fmin: -; G_GFX7: ; %bb.0: -; G_GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; G_GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; G_GFX7-NEXT: s_mov_b32 s6, -1 -; G_GFX7-NEXT: s_mov_b32 s7, 0xe8f000 -; G_GFX7-NEXT: s_add_u32 s4, s4, s3 -; G_GFX7-NEXT: s_addc_u32 s5, s5, 0 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: s_add_i32 s2, s2, 4 -; G_GFX7-NEXT: s_lshl_b32 s3, s2, 3 -; G_GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX7-NEXT: s_mov_b32 m0, -1 -; G_GFX7-NEXT: ds_min_rtn_f32 v1, v1, v0 -; G_GFX7-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX7-NEXT: ds_min_f32 v2, v0 -; 
G_GFX7-NEXT: v_mov_b32_e32 v0, s1 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; G_GFX7-NEXT: s_endpgm -; -; G_VI-LABEL: lds_ds_fmin: -; G_VI: ; %bb.0: -; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; G_VI-NEXT: s_mov_b32 s90, -1 -; G_VI-NEXT: s_mov_b32 s91, 0xe80000 -; G_VI-NEXT: s_add_u32 s88, s88, s3 -; G_VI-NEXT: s_addc_u32 s89, s89, 0 -; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: s_add_i32 s2, s2, 4 -; G_VI-NEXT: s_lshl_b32 s3, s2, 3 -; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_VI-NEXT: v_mov_b32_e32 v1, s3 -; G_VI-NEXT: s_mov_b32 m0, -1 -; G_VI-NEXT: ds_min_rtn_f32 v1, v1, v0 -; G_VI-NEXT: s_lshl_b32 s2, s2, 4 -; G_VI-NEXT: v_mov_b32_e32 v2, s2 -; G_VI-NEXT: ds_min_f32 v2, v0 -; G_VI-NEXT: v_mov_b32_e32 v0, s1 -; G_VI-NEXT: s_waitcnt lgkmcnt(1) -; G_VI-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_VI-NEXT: v_mov_b32_e32 v1, s0 -; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; G_VI-NEXT: s_endpgm -; -; G_GFX9-LABEL: lds_ds_fmin: -; G_GFX9: ; %bb.0: -; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX9-NEXT: s_mov_b32 s10, -1 -; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; G_GFX9-NEXT: s_add_u32 s8, s8, s3 -; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 -; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_add_i32 s4, s4, 4 -; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4 -; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX9-NEXT: 
ds_min_f32 v2, v1 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX9-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s2 -; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; G_GFX9-NEXT: s_endpgm -; -; G_GFX10-LABEL: lds_ds_fmin: -; G_GFX10: ; %bb.0: -; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; G_GFX10-NEXT: s_mov_b32 s6, -1 -; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000 -; G_GFX10-NEXT: s_add_u32 s4, s4, s3 -; G_GFX10-NEXT: s_addc_u32 s5, s5, 0 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_add_i32 s2, s2, 4 -; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3 -; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v0, s3 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v3, s1 -; G_GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_GFX10-NEXT: ds_min_f32 v2, v1 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX10-NEXT: ds_min_rtn_f32 v0, v3, v0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; G_GFX10-NEXT: s_endpgm -; -; G_GFX11-LABEL: lds_ds_fmin: -; G_GFX11: ; %bb.0: -; G_GFX11-NEXT: s_clause 0x1 -; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000 -; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX11-NEXT: s_add_i32 s2, s2, 4 -; G_GFX11-NEXT: v_mov_b32_e32 v3, s1 -; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3 -; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX11-NEXT: v_mov_b32_e32 v0, s3 -; G_GFX11-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1 -; G_GFX11-NEXT: ds_min_f32 v2, v1 -; G_GFX11-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX11-NEXT: ds_min_rtn_f32 v0, v3, v0 -; G_GFX11-NEXT: 
s_waitcnt lgkmcnt(0) -; G_GFX11-NEXT: scratch_store_b32 off, v0, s0 -; G_GFX11-NEXT: s_endpgm - %idx.add = add nuw i32 %idx, 4 - %shl0 = shl i32 %idx.add, 3 - %shl1 = shl i32 %idx.add, 4 - %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3) - %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3) - %a1 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptrf, float %a1, i32 0, i32 0, i1 false) - store float %a3, ptr addrspace(5) %out - ret void -} - -define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) { -; SI-LABEL: lds_ds_fmax: -; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s7, 0xe8f000 -; SI-NEXT: s_add_u32 s4, s4, s3 -; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s3, s2, 4 -; SI-NEXT: s_lshl_b32 s2, s2, 3 -; SI-NEXT: s_add_i32 s2, s2, 32 -; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: s_mov_b32 m0, -1 -; SI-NEXT: ds_max_rtn_f32 v1, v1, v0 -; SI-NEXT: s_add_i32 s2, s3, 64 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: ds_max_f32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: ds_max_rtn_f32 v0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; SI-NEXT: s_endpgm -; -; GFX7-LABEL: lds_ds_fmax: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7-NEXT: s_mov_b32 s6, -1 -; 
GFX7-NEXT: s_mov_b32 s7, 0xe8f000 -; GFX7-NEXT: s_add_u32 s4, s4, s3 -; GFX7-NEXT: s_addc_u32 s5, s5, 0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s3, s2, 3 -; GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 -; GFX7-NEXT: s_lshl_b32 s2, s2, 4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: ds_max_f32 v2, v0 offset:64 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; GFX7-NEXT: s_endpgm -; -; VI-LABEL: lds_ds_fmax: -; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s2, 3 -; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 -; VI-NEXT: s_lshl_b32 s2, s2, 4 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: ds_max_f32 v2, v0 offset:64 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: s_waitcnt lgkmcnt(1) -; VI-NEXT: ds_max_rtn_f32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: lds_ds_fmax: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: 
s_addc_u32 s9, s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 -; GFX9-NEXT: s_lshl_b32 s0, s4, 4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: ds_max_f32 v2, v0 offset:64 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: lds_ds_fmax: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-NEXT: s_add_u32 s8, s8, s3 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x42280000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s0, s4, 3 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 -; GFX10-NEXT: ds_max_f32 v2, v0 offset:64 -; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: ds_max_rtn_f32 v0, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: lds_ds_fmax: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s3, s2, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3 
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 -; GFX11-NEXT: ds_max_f32 v2, v0 offset:64 -; GFX11-NEXT: s_waitcnt lgkmcnt(1) -; GFX11-NEXT: ds_max_rtn_f32 v0, v3, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: scratch_store_b32 off, v0, s0 -; GFX11-NEXT: s_endpgm -; -; G_SI-LABEL: lds_ds_fmax: -; G_SI: ; %bb.0: -; G_SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; G_SI-NEXT: s_mov_b32 s6, -1 -; G_SI-NEXT: s_mov_b32 s7, 0xe8f000 -; G_SI-NEXT: s_add_u32 s4, s4, s3 -; G_SI-NEXT: s_addc_u32 s5, s5, 0 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: s_add_i32 s2, s2, 4 -; G_SI-NEXT: s_lshl_b32 s3, s2, 3 -; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 -; G_SI-NEXT: s_mov_b32 m0, -1 -; G_SI-NEXT: ds_max_rtn_f32 v1, v1, v0 -; G_SI-NEXT: s_lshl_b32 s2, s2, 4 -; G_SI-NEXT: v_mov_b32_e32 v2, s2 -; G_SI-NEXT: ds_max_f32 v2, v0 -; G_SI-NEXT: v_mov_b32_e32 v0, s1 -; G_SI-NEXT: s_waitcnt lgkmcnt(1) -; G_SI-NEXT: ds_max_rtn_f32 v0, v0, v1 -; G_SI-NEXT: v_mov_b32_e32 v1, s0 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; G_SI-NEXT: s_endpgm -; -; G_GFX7-LABEL: lds_ds_fmax: -; G_GFX7: ; %bb.0: -; G_GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; G_GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; G_GFX7-NEXT: s_mov_b32 s6, -1 -; G_GFX7-NEXT: s_mov_b32 s7, 0xe8f000 -; G_GFX7-NEXT: s_add_u32 s4, s4, s3 -; G_GFX7-NEXT: s_addc_u32 s5, s5, 0 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: s_add_i32 s2, s2, 4 -; G_GFX7-NEXT: s_lshl_b32 s3, s2, 3 -; G_GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX7-NEXT: s_mov_b32 m0, -1 -; G_GFX7-NEXT: 
ds_max_rtn_f32 v1, v1, v0 -; G_GFX7-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX7-NEXT: ds_max_f32 v2, v0 -; G_GFX7-NEXT: v_mov_b32_e32 v0, s1 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; G_GFX7-NEXT: s_endpgm -; -; G_VI-LABEL: lds_ds_fmax: -; G_VI: ; %bb.0: -; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; G_VI-NEXT: s_mov_b32 s90, -1 -; G_VI-NEXT: s_mov_b32 s91, 0xe80000 -; G_VI-NEXT: s_add_u32 s88, s88, s3 -; G_VI-NEXT: s_addc_u32 s89, s89, 0 -; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: s_add_i32 s2, s2, 4 -; G_VI-NEXT: s_lshl_b32 s3, s2, 3 -; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_VI-NEXT: v_mov_b32_e32 v1, s3 -; G_VI-NEXT: s_mov_b32 m0, -1 -; G_VI-NEXT: ds_max_rtn_f32 v1, v1, v0 -; G_VI-NEXT: s_lshl_b32 s2, s2, 4 -; G_VI-NEXT: v_mov_b32_e32 v2, s2 -; G_VI-NEXT: ds_max_f32 v2, v0 -; G_VI-NEXT: v_mov_b32_e32 v0, s1 -; G_VI-NEXT: s_waitcnt lgkmcnt(1) -; G_VI-NEXT: ds_max_rtn_f32 v0, v0, v1 -; G_VI-NEXT: v_mov_b32_e32 v1, s0 -; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; G_VI-NEXT: s_endpgm -; -; G_GFX9-LABEL: lds_ds_fmax: -; G_GFX9: ; %bb.0: -; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX9-NEXT: s_mov_b32 s10, -1 -; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; G_GFX9-NEXT: s_add_u32 s8, s8, s3 -; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 -; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_add_i32 s4, s4, 4 -; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; G_GFX9-NEXT: v_mov_b32_e32 v0, 
s0 -; G_GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 -; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4 -; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX9-NEXT: ds_max_f32 v2, v1 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX9-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s2 -; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; G_GFX9-NEXT: s_endpgm -; -; G_GFX10-LABEL: lds_ds_fmax: -; G_GFX10: ; %bb.0: -; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; G_GFX10-NEXT: s_mov_b32 s6, -1 -; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000 -; G_GFX10-NEXT: s_add_u32 s4, s4, s3 -; G_GFX10-NEXT: s_addc_u32 s5, s5, 0 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_add_i32 s2, s2, 4 -; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3 -; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v0, s3 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v3, s1 -; G_GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1 -; G_GFX10-NEXT: ds_max_f32 v2, v1 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX10-NEXT: ds_max_rtn_f32 v0, v3, v0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen -; G_GFX10-NEXT: s_endpgm -; -; G_GFX11-LABEL: lds_ds_fmax: -; G_GFX11: ; %bb.0: -; G_GFX11-NEXT: s_clause 0x1 -; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000 -; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX11-NEXT: s_add_i32 s2, s2, 4 -; G_GFX11-NEXT: v_mov_b32_e32 v3, s1 -; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3 -; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4 -; G_GFX11-NEXT: v_mov_b32_e32 v0, s3 -; G_GFX11-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX11-NEXT: ds_max_rtn_f32 v0, v0, 
v1 -; G_GFX11-NEXT: ds_max_f32 v2, v1 -; G_GFX11-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX11-NEXT: ds_max_rtn_f32 v0, v3, v0 -; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX11-NEXT: scratch_store_b32 off, v0, s0 -; G_GFX11-NEXT: s_endpgm - %idx.add = add nuw i32 %idx, 4 - %shl0 = shl i32 %idx.add, 3 - %shl1 = shl i32 %idx.add, 4 - %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3) - %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3) - %a1 = call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %ptrf, float %a1, i32 0, i32 0, i1 false) - store float %a3, ptr addrspace(5) %out - ret void -} - -define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) { -; SI-LABEL: lds_ds_fmin_f64: -; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s7, 0xe8f000 -; SI-NEXT: s_add_u32 s4, s4, s3 -; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s3, s2, 4 -; SI-NEXT: s_lshl_b32 s2, s2, 3 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_add_i32 s2, s2, 32 -; SI-NEXT: v_mov_b32_e32 v1, 0x40450000 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: s_mov_b32 m0, -1 -; SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s1 -; SI-NEXT: s_add_i32 s1, s3, 64 -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: ds_min_f64 v5, v[0:1] -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] -; SI-NEXT: s_add_i32 s1, s0, 4 -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v1, v3, s[4:7], 0 offen -; SI-NEXT: buffer_store_dword v0, 
v2, s[4:7], 0 offen -; SI-NEXT: s_endpgm -; -; GFX7-LABEL: lds_ds_fmin_f64: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 -; GFX7-NEXT: s_add_u32 s4, s4, s3 -; GFX7-NEXT: s_addc_u32 s5, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s3, s2, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, 0x40450000 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX7-NEXT: s_lshl_b32 s2, s2, 4 -; GFX7-NEXT: v_mov_b32_e32 v5, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: ds_min_f64 v5, v[0:1] offset:64 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] -; GFX7-NEXT: s_add_i32 s1, s0, 4 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_store_dword v1, v3, s[4:7], 0 offen -; GFX7-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen -; GFX7-NEXT: s_endpgm -; -; VI-LABEL: lds_ds_fmin_f64: -; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s2, 3 -; VI-NEXT: v_mov_b32_e32 v1, 0x40450000 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; VI-NEXT: s_lshl_b32 s2, s2, 4 -; VI-NEXT: v_mov_b32_e32 v5, s2 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: ds_min_f64 v5, v[0:1] offset:64 -; VI-NEXT: s_waitcnt 
lgkmcnt(1) -; VI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] -; VI-NEXT: s_add_i32 s1, s0, 4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen -; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: lds_ds_fmin_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x40450000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX9-NEXT: s_lshl_b32 s0, s4, 4 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: ds_min_f64 v5, v[0:1] offset:64 -; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: lds_ds_fmin_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-NEXT: s_add_u32 s8, s8, s3 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x40450000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s0, s4, 3 -; GFX10-NEXT: 
v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX10-NEXT: ds_min_f64 v4, v[0:1] offset:64 -; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 -; GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: lds_ds_fmin_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s3, s2, 3 -; GFX11-NEXT: v_mov_b32_e32 v5, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3 -; GFX11-NEXT: s_lshl_b32 s2, s2, 4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s2 -; GFX11-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX11-NEXT: ds_min_f64 v4, v[0:1] offset:64 -; GFX11-NEXT: s_waitcnt lgkmcnt(1) -; GFX11-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: scratch_store_b64 off, v[0:1], s0 -; GFX11-NEXT: s_endpgm -; -; G_SI-LABEL: lds_ds_fmin_f64: -; G_SI: ; %bb.0: -; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; G_SI-NEXT: s_mov_b32 s10, -1 -; G_SI-NEXT: s_mov_b32 s11, 0xe8f000 -; G_SI-NEXT: s_add_u32 s8, s8, s3 -; G_SI-NEXT: s_mov_b32 s2, 0 -; G_SI-NEXT: s_addc_u32 s9, s9, 0 -; G_SI-NEXT: s_mov_b32 s3, 0x40450000 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: s_add_i32 s4, s4, 4 -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: s_lshl_b32 s2, s4, 3 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 -; G_SI-NEXT: 
v_mov_b32_e32 v2, s2 -; G_SI-NEXT: s_mov_b32 m0, -1 -; G_SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_SI-NEXT: s_lshl_b32 s2, s4, 4 -; G_SI-NEXT: v_mov_b32_e32 v4, s2 -; G_SI-NEXT: ds_min_f64 v4, v[0:1] -; G_SI-NEXT: v_mov_b32_e32 v0, s1 -; G_SI-NEXT: s_waitcnt lgkmcnt(1) -; G_SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; G_SI-NEXT: v_mov_b32_e32 v2, s0 -; G_SI-NEXT: s_add_u32 s0, s0, 4 -; G_SI-NEXT: v_mov_b32_e32 v3, s0 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_SI-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen -; G_SI-NEXT: s_endpgm -; -; G_GFX7-LABEL: lds_ds_fmin_f64: -; G_GFX7: ; %bb.0: -; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX7-NEXT: s_load_dword s4, s[0:1], 0xb -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; G_GFX7-NEXT: s_mov_b32 s10, -1 -; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 -; G_GFX7-NEXT: s_add_u32 s8, s8, s3 -; G_GFX7-NEXT: s_mov_b32 s2, 0 -; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX7-NEXT: s_mov_b32 s3, 0x40450000 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: s_add_i32 s4, s4, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX7-NEXT: s_lshl_b32 s2, s4, 3 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX7-NEXT: s_mov_b32 m0, -1 -; G_GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX7-NEXT: s_lshl_b32 s2, s4, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v4, s2 -; G_GFX7-NEXT: ds_min_f64 v4, v[0:1] -; G_GFX7-NEXT: v_mov_b32_e32 v0, s1 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX7-NEXT: s_add_u32 s0, s0, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v3, s0 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen -; G_GFX7-NEXT: s_endpgm -; -; G_VI-LABEL: lds_ds_fmin_f64: -; G_VI: ; %bb.0: -; G_VI-NEXT: s_mov_b32 s88, 
SCRATCH_RSRC_DWORD0 -; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; G_VI-NEXT: s_mov_b32 s90, -1 -; G_VI-NEXT: s_mov_b32 s91, 0xe80000 -; G_VI-NEXT: s_add_u32 s88, s88, s3 -; G_VI-NEXT: s_mov_b32 s2, 0 -; G_VI-NEXT: s_addc_u32 s89, s89, 0 -; G_VI-NEXT: s_mov_b32 s3, 0x40450000 -; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: s_add_i32 s4, s4, 4 -; G_VI-NEXT: v_mov_b32_e32 v0, s2 -; G_VI-NEXT: s_lshl_b32 s2, s4, 3 -; G_VI-NEXT: v_mov_b32_e32 v1, s3 -; G_VI-NEXT: v_mov_b32_e32 v2, s2 -; G_VI-NEXT: s_mov_b32 m0, -1 -; G_VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_VI-NEXT: s_lshl_b32 s2, s4, 4 -; G_VI-NEXT: v_mov_b32_e32 v4, s2 -; G_VI-NEXT: ds_min_f64 v4, v[0:1] -; G_VI-NEXT: v_mov_b32_e32 v0, s1 -; G_VI-NEXT: s_waitcnt lgkmcnt(1) -; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; G_VI-NEXT: v_mov_b32_e32 v2, s0 -; G_VI-NEXT: s_add_u32 s0, s0, 4 -; G_VI-NEXT: v_mov_b32_e32 v3, s0 -; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; G_VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen -; G_VI-NEXT: s_endpgm -; -; G_GFX9-LABEL: lds_ds_fmin_f64: -; G_GFX9: ; %bb.0: -; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX9-NEXT: s_mov_b32 s10, -1 -; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; G_GFX9-NEXT: s_add_u32 s8, s8, s3 -; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; G_GFX9-NEXT: s_mov_b32 s0, 0 -; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 -; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_add_i32 s4, s4, 4 -; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4 -; G_GFX9-NEXT: v_mov_b32_e32 v5, s0 -; 
G_GFX9-NEXT: v_mov_b32_e32 v4, s3 -; G_GFX9-NEXT: ds_min_f64 v5, v[0:1] -; G_GFX9-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX9-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] -; G_GFX9-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 -; G_GFX9-NEXT: s_endpgm -; -; G_GFX10-LABEL: lds_ds_fmin_f64: -; G_GFX10: ; %bb.0: -; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX10-NEXT: s_mov_b32 s10, -1 -; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000 -; G_GFX10-NEXT: s_add_u32 s8, s8, s3 -; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; G_GFX10-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX10-NEXT: s_mov_b32 s0, 0 -; G_GFX10-NEXT: s_mov_b32 s1, 0x40450000 -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_add_i32 s4, s4, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v5, s3 -; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3 -; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s5 -; G_GFX10-NEXT: v_mov_b32_e32 v4, s0 -; G_GFX10-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX10-NEXT: ds_min_f64 v4, v[0:1] -; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX10-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3] -; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 -; G_GFX10-NEXT: s_endpgm -; -; G_GFX11-LABEL: lds_ds_fmin_f64: -; G_GFX11: ; %bb.0: -; G_GFX11-NEXT: s_clause 0x1 -; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX11-NEXT: s_add_i32 s4, s2, 4 -; G_GFX11-NEXT: s_mov_b32 s2, 0 -; G_GFX11-NEXT: s_mov_b32 s3, 0x40450000 -; 
G_GFX11-NEXT: s_lshl_b32 s5, s4, 3 -; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1 -; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5 -; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4 -; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; G_GFX11-NEXT: v_mov_b32_e32 v4, s2 -; G_GFX11-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX11-NEXT: ds_min_f64 v4, v[0:1] -; G_GFX11-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX11-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3] -; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s0 -; G_GFX11-NEXT: s_endpgm - %idx.add = add nuw i32 %idx, 4 - %shl0 = shl i32 %idx.add, 3 - %shl1 = shl i32 %idx.add, 4 - %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3) - %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3) - %a1 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr0, double 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr1, double 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptrf, double %a1, i32 0, i32 0, i1 false) - store double %a3, ptr addrspace(5) %out - ret void -} - -define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) { -; SI-LABEL: lds_ds_fmax_f64: -; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s7, 0xe8f000 -; SI-NEXT: s_add_u32 s4, s4, s3 -; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s3, s2, 4 -; SI-NEXT: s_lshl_b32 s2, s2, 3 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_add_i32 s2, s2, 32 -; SI-NEXT: v_mov_b32_e32 v1, 0x40450000 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: s_mov_b32 m0, -1 -; SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s1 -; SI-NEXT: s_add_i32 s1, s3, 64 -; 
SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: ds_max_f64 v5, v[0:1] -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] -; SI-NEXT: s_add_i32 s1, s0, 4 -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v1, v3, s[4:7], 0 offen -; SI-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen -; SI-NEXT: s_endpgm -; -; GFX7-LABEL: lds_ds_fmax_f64: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX7-NEXT: s_load_dword s2, s[0:1], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 -; GFX7-NEXT: s_add_u32 s4, s4, s3 -; GFX7-NEXT: s_addc_u32 s5, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b32 s3, s2, 3 -; GFX7-NEXT: v_mov_b32_e32 v1, 0x40450000 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX7-NEXT: s_lshl_b32 s2, s2, 4 -; GFX7-NEXT: v_mov_b32_e32 v5, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: ds_max_f64 v5, v[0:1] offset:64 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] -; GFX7-NEXT: s_add_i32 s1, s0, 4 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_store_dword v1, v3, s[4:7], 0 offen -; GFX7-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen -; GFX7-NEXT: s_endpgm -; -; VI-LABEL: lds_ds_fmax_f64: -; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_load_dword s2, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s90, -1 -; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s3 -; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) 
-; VI-NEXT: s_lshl_b32 s3, s2, 3 -; VI-NEXT: v_mov_b32_e32 v1, 0x40450000 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; VI-NEXT: s_lshl_b32 s2, s2, 4 -; VI-NEXT: v_mov_b32_e32 v5, s2 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: ds_max_f64 v5, v[0:1] offset:64 -; VI-NEXT: s_waitcnt lgkmcnt(1) -; VI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] -; VI-NEXT: s_add_i32 s1, s0, 4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen -; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: lds_ds_fmax_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; GFX9-NEXT: s_add_u32 s8, s8, s3 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x40450000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX9-NEXT: s_lshl_b32 s0, s4, 4 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: ds_max_f64 v5, v[0:1] offset:64 -; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: lds_ds_fmax_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-NEXT: 
s_add_u32 s8, s8, s3 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x40450000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s0, s4, 3 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX10-NEXT: ds_max_f64 v4, v[0:1] offset:64 -; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 -; GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: lds_ds_fmax_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s3, s2, 3 -; GFX11-NEXT: v_mov_b32_e32 v5, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3 -; GFX11-NEXT: s_lshl_b32 s2, s2, 4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s2 -; GFX11-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 -; GFX11-NEXT: ds_max_f64 v4, v[0:1] offset:64 -; GFX11-NEXT: s_waitcnt lgkmcnt(1) -; GFX11-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3] -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: scratch_store_b64 off, v[0:1], s0 -; GFX11-NEXT: s_endpgm -; -; G_SI-LABEL: lds_ds_fmax_f64: -; G_SI: ; %bb.0: -; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; G_SI-NEXT: s_mov_b32 s10, -1 -; G_SI-NEXT: s_mov_b32 s11, 
0xe8f000 -; G_SI-NEXT: s_add_u32 s8, s8, s3 -; G_SI-NEXT: s_mov_b32 s2, 0 -; G_SI-NEXT: s_addc_u32 s9, s9, 0 -; G_SI-NEXT: s_mov_b32 s3, 0x40450000 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: s_add_i32 s4, s4, 4 -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: s_lshl_b32 s2, s4, 3 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 -; G_SI-NEXT: v_mov_b32_e32 v2, s2 -; G_SI-NEXT: s_mov_b32 m0, -1 -; G_SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_SI-NEXT: s_lshl_b32 s2, s4, 4 -; G_SI-NEXT: v_mov_b32_e32 v4, s2 -; G_SI-NEXT: ds_max_f64 v4, v[0:1] -; G_SI-NEXT: v_mov_b32_e32 v0, s1 -; G_SI-NEXT: s_waitcnt lgkmcnt(1) -; G_SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; G_SI-NEXT: v_mov_b32_e32 v2, s0 -; G_SI-NEXT: s_add_u32 s0, s0, 4 -; G_SI-NEXT: v_mov_b32_e32 v3, s0 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_SI-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen -; G_SI-NEXT: s_endpgm -; -; G_GFX7-LABEL: lds_ds_fmax_f64: -; G_GFX7: ; %bb.0: -; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX7-NEXT: s_load_dword s4, s[0:1], 0xb -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; G_GFX7-NEXT: s_mov_b32 s10, -1 -; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 -; G_GFX7-NEXT: s_add_u32 s8, s8, s3 -; G_GFX7-NEXT: s_mov_b32 s2, 0 -; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX7-NEXT: s_mov_b32 s3, 0x40450000 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: s_add_i32 s4, s4, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX7-NEXT: s_lshl_b32 s2, s4, 3 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX7-NEXT: s_mov_b32 m0, -1 -; G_GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX7-NEXT: s_lshl_b32 s2, s4, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v4, s2 -; G_GFX7-NEXT: ds_max_f64 v4, v[0:1] -; G_GFX7-NEXT: v_mov_b32_e32 v0, s1 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 -; 
G_GFX7-NEXT: s_add_u32 s0, s0, 4 -; G_GFX7-NEXT: v_mov_b32_e32 v3, s0 -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen -; G_GFX7-NEXT: s_endpgm -; -; G_VI-LABEL: lds_ds_fmax_f64: -; G_VI: ; %bb.0: -; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; G_VI-NEXT: s_mov_b32 s90, -1 -; G_VI-NEXT: s_mov_b32 s91, 0xe80000 -; G_VI-NEXT: s_add_u32 s88, s88, s3 -; G_VI-NEXT: s_mov_b32 s2, 0 -; G_VI-NEXT: s_addc_u32 s89, s89, 0 -; G_VI-NEXT: s_mov_b32 s3, 0x40450000 -; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: s_add_i32 s4, s4, 4 -; G_VI-NEXT: v_mov_b32_e32 v0, s2 -; G_VI-NEXT: s_lshl_b32 s2, s4, 3 -; G_VI-NEXT: v_mov_b32_e32 v1, s3 -; G_VI-NEXT: v_mov_b32_e32 v2, s2 -; G_VI-NEXT: s_mov_b32 m0, -1 -; G_VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_VI-NEXT: s_lshl_b32 s2, s4, 4 -; G_VI-NEXT: v_mov_b32_e32 v4, s2 -; G_VI-NEXT: ds_max_f64 v4, v[0:1] -; G_VI-NEXT: v_mov_b32_e32 v0, s1 -; G_VI-NEXT: s_waitcnt lgkmcnt(1) -; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; G_VI-NEXT: v_mov_b32_e32 v2, s0 -; G_VI-NEXT: s_add_u32 s0, s0, 4 -; G_VI-NEXT: v_mov_b32_e32 v3, s0 -; G_VI-NEXT: s_waitcnt lgkmcnt(0) -; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; G_VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen -; G_VI-NEXT: s_endpgm -; -; G_GFX9-LABEL: lds_ds_fmax_f64: -; G_GFX9: ; %bb.0: -; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX9-NEXT: s_mov_b32 s10, -1 -; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 -; G_GFX9-NEXT: s_add_u32 s8, s8, s3 -; G_GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; G_GFX9-NEXT: s_mov_b32 s0, 0 -; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 -; G_GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_add_i32 s4, s4, 4 -; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX9-NEXT: s_lshl_b32 s0, s4, 3 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 -; G_GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX9-NEXT: s_lshl_b32 s0, s4, 4 -; G_GFX9-NEXT: v_mov_b32_e32 v5, s0 -; G_GFX9-NEXT: v_mov_b32_e32 v4, s3 -; G_GFX9-NEXT: ds_max_f64 v5, v[0:1] -; G_GFX9-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX9-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] -; G_GFX9-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 -; G_GFX9-NEXT: s_endpgm -; -; G_GFX10-LABEL: lds_ds_fmax_f64: -; G_GFX10: ; %bb.0: -; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; G_GFX10-NEXT: s_mov_b32 s10, -1 -; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000 -; G_GFX10-NEXT: s_add_u32 s8, s8, s3 -; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; G_GFX10-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX10-NEXT: s_mov_b32 s0, 0 -; G_GFX10-NEXT: s_mov_b32 s1, 0x40450000 -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_add_i32 s4, s4, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v5, s3 -; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3 -; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s5 -; G_GFX10-NEXT: v_mov_b32_e32 v4, s0 -; G_GFX10-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX10-NEXT: ds_max_f64 v4, v[0:1] -; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX10-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3] -; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 -; G_GFX10-NEXT: s_endpgm -; -; 
G_GFX11-LABEL: lds_ds_fmax_f64: -; G_GFX11: ; %bb.0: -; G_GFX11-NEXT: s_clause 0x1 -; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c -; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX11-NEXT: s_add_i32 s4, s2, 4 -; G_GFX11-NEXT: s_mov_b32 s2, 0 -; G_GFX11-NEXT: s_mov_b32 s3, 0x40450000 -; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3 -; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1 -; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5 -; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4 -; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; G_GFX11-NEXT: v_mov_b32_e32 v4, s2 -; G_GFX11-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_GFX11-NEXT: ds_max_f64 v4, v[0:1] -; G_GFX11-NEXT: s_waitcnt lgkmcnt(1) -; G_GFX11-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3] -; G_GFX11-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s0 -; G_GFX11-NEXT: s_endpgm - %idx.add = add nuw i32 %idx, 4 - %shl0 = shl i32 %idx.add, 3 - %shl1 = shl i32 %idx.add, 4 - %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3) - %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3) - %a1 = call double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) %ptr0, double 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) %ptr1, double 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) %ptrf, double %a1, i32 0, i32 0, i1 false) - store double %a3, ptr addrspace(5) %out - ret void -} _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits