Author: Kerry McLaughlin
Date: 2020-12-07T13:20:19Z
New Revision: 111f559bbd12c59b0ac450ea2feb8f6981705647

URL: https://github.com/llvm/llvm-project/commit/111f559bbd12c59b0ac450ea2feb8f6981705647
DIFF: https://github.com/llvm/llvm-project/commit/111f559bbd12c59b0ac450ea2feb8f6981705647.diff

LOG: [SVE][CodeGen] Call refineIndexType & refineUniformBase from visitMGATHER

The refineIndexType & refineUniformBase functions added by D90942 can
also be used to improve CodeGen of masked gathers.

These changes were split out from D91092

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D92319

Added:


Modified:
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
    llvm/test/CodeGen/X86/masked_gather_scatter.ll

Removed:


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5481c52a5b12..96baaabdb813 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9410,13 +9410,13 @@ bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) {
 }
 
 // Fold sext/zext of index into index type.
-bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
-                     SelectionDAG &DAG) {
+bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index,
+                     bool Scaled, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   if (Index.getOpcode() == ISD::ZERO_EXTEND) {
     SDValue Op = Index.getOperand(0);
-    MSC->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
+    MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
     if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
       Index = Op;
       return true;
@@ -9425,7 +9425,7 @@ bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
 
   if (Index.getOpcode() == ISD::SIGN_EXTEND) {
     SDValue Op = Index.getOperand(0);
-    MSC->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
+    MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
     if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
       Index = Op;
       return true;
@@ -9494,11 +9494,30 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
 SDValue DAGCombiner::visitMGATHER(SDNode *N) {
   MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
   SDValue Mask = MGT->getMask();
+  SDValue Chain = MGT->getChain();
+  SDValue Index = MGT->getIndex();
+  SDValue Scale = MGT->getScale();
+  SDValue PassThru = MGT->getPassThru();
+  SDValue BasePtr = MGT->getBasePtr();
   SDLoc DL(N);
 
   // Zap gathers with a zero mask.
   if (ISD::isBuildVectorAllZeros(Mask.getNode()))
-    return CombineTo(N, MGT->getPassThru(), MGT->getChain());
+    return CombineTo(N, PassThru, MGT->getChain());
+
+  if (refineUniformBase(BasePtr, Index, DAG)) {
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+                               PassThru.getValueType(), DL, Ops,
+                               MGT->getMemOperand(), MGT->getIndexType());
+  }
+
+  if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) {
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+                               PassThru.getValueType(), DL, Ops,
+                               MGT->getMemOperand(), MGT->getIndexType());
+  }
 
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d729252c92d9..517f5e965157 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3894,6 +3894,9 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
 
   SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
 
+  if (getGatherScatterIndexIsExtended(Index))
+    Index = Index.getOperand(0);
+
   SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
   return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
                      VTs, Ops);
 
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
index 747468ae3cf4..32dca0d26cdc 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -8,8 +8,6 @@
 define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
@@ -22,8 +20,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
 define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
 ; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
@@ -36,8 +32,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
 define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
@@ -48,8 +42,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32>
 define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr half, half* %base, <vscale x 2 x i32> %offsets
@@ -60,8 +52,6 @@ define <vscale x 2 x half>
@masked_gather_nxv2f16(half* %base, <vscale x 2 x i32 define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] ; CHECK-NEXT: ret %ptrs = getelementptr float, float* %base, <vscale x 2 x i32> %offsets @@ -72,8 +62,6 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] ; CHECK-NEXT: ret %ptrs = getelementptr double, double* %base, <vscale x 2 x i32> %offsets @@ -84,10 +72,9 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] -; CHECK-NEXT: sxth z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxth z0.d, p0/m, z0.d ; CHECK-NEXT: ret %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef) @@ -98,10 +85,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: ret %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll index b214fcf15911..1fc048a3adf7 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll @@ -8,8 +8,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: and z0.d, z0.d, #0xff ; CHECK-NEXT: ret @@ -22,12 +20,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: 
ld1h { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets @@ -40,12 +33,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> % define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets @@ -58,12 +46,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> % define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*> @@ -74,12 +57,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> % define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*> @@ -90,12 +68,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*> @@ -106,12 +79,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale 
x 2 x double*> @@ -122,10 +90,9 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32 define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] -; CHECK-NEXT: sxtb z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtb z0.d, p0/m, z0.d ; CHECK-NEXT: ret %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef) @@ -136,13 +103,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> % define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] -; CHECK-NEXT: sxth z0.d, p1/m, z0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxth z0.d, p0/m, z0.d ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*> @@ -154,13 +117,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*> @@ -188,18 +147,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets @@ -212,18 +160,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> % define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s 
-; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d] -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*> @@ -234,18 +171,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> % define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*> @@ -256,18 +182,7 @@ define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d] -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*> @@ -291,19 +206,8 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> % define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d] +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: sxth z0.s, p0/m, z0.s ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll index d938567beb04..ada49b7fecbc 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll @@ -8,8 +8,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] ; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> @@ -22,8 +21,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> @@ -36,8 +34,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets.zext @@ -48,8 +45,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets.zext @@ -60,8 +56,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32 define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets.zext @@ -72,8 +67,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets.zext @@ -84,8 +78,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* 
%base, <vscale x 2 x define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxth z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -99,8 +92,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -118,14 +110,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> @@ -138,14 +123,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2] -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %offsets.zext @@ -156,14 +134,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> %ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %offsets.zext @@ -174,14 +145,7 @@ define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32 define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> 
%mask) { ; CHECK-LABEL: masked_gather_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2] -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> %ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %offsets.zext @@ -192,15 +156,8 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: sxth z0.s, p0/m, z0.s ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll index 7a47311484f8..61b8e3e53e23 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll @@ -8,8 +8,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: and z0.d, z0.d, #0xff ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> @@ -22,11 +21,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> @@ -40,11 +35,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> % define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> @@ -58,11 +49,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* 
%base, <vscale x 2 x i32> % define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext @@ -74,11 +61,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> % define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext @@ -90,11 +73,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext @@ -106,11 +85,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64> %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext @@ -122,8 +97,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32 define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtb z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -137,11 +111,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> % define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { 
z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxth z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -156,11 +126,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -179,14 +145,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw] -; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> @@ -199,18 +158,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d] -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> @@ -224,18 +172,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> % define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d] -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext @@ -247,18 +184,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> % define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: 
masked_gather_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d] -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext @@ -270,18 +196,7 @@ define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d] -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext @@ -293,15 +208,8 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw] -; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw] +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: sxtb z0.s, p0/m, z0.s ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> @@ -314,19 +222,8 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> % define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d] -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: sxth z0.s, p0/m, z0.s ; CHECK-NEXT: ret %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll index be8909201a83..3f4f54c5d839 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll @@ -16,10 +16,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %o define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets @@ -32,10 +29,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> % define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets @@ -48,10 +42,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> % define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*> @@ -62,10 +53,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i64> % define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*> @@ -76,10 +64,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i64> define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*> @@ -90,10 +75,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i64> define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_gather_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d] ; 
CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*> @@ -117,10 +99,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> % define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxth z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -134,10 +113,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 948928099d38..995e39f56355 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -765,45 +765,41 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) { define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) { ; KNL_64-LABEL: test14: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0 -; KNL_64-NEXT: vmovd %esi, %xmm1 -; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1 -; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1 -; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; KNL_64-NEXT: vmovq %xmm0, %rax +; KNL_64-NEXT: vmovd %esi, %xmm0 +; KNL_64-NEXT: vpbroadcastd %xmm0, %ymm0 +; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL_64-NEXT: vpsllq $2, %zmm0, %zmm0 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 -; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} +; KNL_64-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1} ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test14: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0 +; KNL_32-NEXT: vmovd %xmm0, %eax ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 -; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1 -; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1} +; KNL_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: test14: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq %xmm0, %zmm0 -; SKX-NEXT: vpbroadcastd %esi, %ymm1 -; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 -; SKX-NEXT: vpsllq $2, %zmm1, %zmm1 -; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: vpbroadcastd %esi, %ymm0 +; SKX-NEXT: vpmovsxdq %ymm0, %zmm0 +; SKX-NEXT: vpsllq $2, %zmm0, %zmm0 ; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} +; SKX-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1} ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test14: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0 +; SKX_32-NEXT: vmovd %xmm0, %eax ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 -; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 -; SKX_32-NEXT: 
vgatherdps (,%zmm1), %zmm0 {%k1}
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm1), %zmm0 {%k1}
 ; SKX_32-NEXT:    retl
   %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
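
For readers skimming the diff, a minimal self-contained sketch of the pattern this combine now improves, adapted from the masked gather tests in sve-masked-gather-32b-signed-scaled.ll above. The declare line and the exact intrinsic mangling are assumptions added here for completeness and are not part of the diff itself.

; With this change, the sign-extension of the 32-bit %offsets is folded into
; the gather's index type, so AArch64 selects
;   ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; without the separate "ptrue"/"sxtw z0.d" sequence that was emitted before
; (compare the removed CHECK lines in the first test diff above).
define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.zext
}

; Assumed declaration matching the call above; the checked-in test may use a
; different mangled name for the intrinsic.
declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)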