Author: Simon Pilgrim
Date: 2021-01-20T14:34:54Z
New Revision: 19d02842ee56089b9208875ce4582e113e08fb6d
URL: https://github.com/llvm/llvm-project/commit/19d02842ee56089b9208875ce4582e113e08fb6d
DIFF: https://github.com/llvm/llvm-project/commit/19d02842ee56089b9208875ce4582e113e08fb6d.diff

LOG: [X86][AVX] Fold extract_subvector(VSRLI/VSHLI(x,32)) -> VSRLI/VSHLI(extract_subvector(x),32)

As discussed on D56387, if we're shifting to extract the upper/lower half of a vXi64 vector then we're actually better off performing this at the subvector level, as it's very likely to fold into something.

combineConcatVectorOps can perform this in reverse if necessary.

Added: 


Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/combine-sra.ll
    llvm/test/CodeGen/X86/pmul.ll
    llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll

Removed: 


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0ee671710219..0b52b2021c73 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49799,8 +49799,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
 
   // If we're extracting the lowest subvector and we're the only user,
   // we may be able to perform this with a smaller vector width.
+  unsigned InOpcode = InVec.getOpcode();
   if (IdxVal == 0 && InVec.hasOneUse()) {
-    unsigned InOpcode = InVec.getOpcode();
     if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
       // v2f64 CVTDQ2PD(v4i32).
       if (InOpcode == ISD::SINT_TO_FP &&
@@ -49853,6 +49853,17 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Always split vXi64 logical shifts where we're extracting the upper 32-bits
+  // as this is very likely to fold into a shuffle/truncation.
+  if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
+      InVecVT.getScalarSizeInBits() == 64 &&
+      InVec.getConstantOperandAPInt(1) == 32) {
+    SDLoc DL(N);
+    SDValue Ext =
+        extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+    return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 28a73cdb6a41..453a61b8565e 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -207,9 +207,8 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; AVX2-SLOW-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index db6009f273d2..56476eea323e 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1150,9 +1150,8 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
 ; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm1
 ; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vpsllq $32, %ymm0, %ymm0
 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index a274baefc1ef..f0cb46e63d8f 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -834,19 +834,20 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ;
 ; AVX2-64-LABEL: uitofp_v4i64_v4f64:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vpsrlq $32, %ymm0, %ymm1
-; AVX2-64-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-64-NEXT:    vpsrlq $32, %xmm1, %xmm1
+; AVX2-64-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX2-64-NEXT:    vmovq %xmm1, %rax
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-64-NEXT:    vpsrlq $32, %xmm0, %xmm2
 ; AVX2-64-NEXT:    vpextrq $1, %xmm2, %rax
 ; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
 ; AVX2-64-NEXT:    vmovq %xmm2, %rax
 ; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
 ; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
-; AVX2-64-NEXT:    vmovq %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm1
-; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX2-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9]
 ; AVX2-64-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
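For illustration only (not part of the commit): a minimal LLVM IR sketch of the kind of pattern the new fold targets. The function name below is invented, and this is not one of the modified tests; it simply shows a vXi64 logical shift by 32 whose result is then truncated, so the upper 32 bits of each lane are extracted. As the combine-sra.ll diff above shows, performing the shift on the extracted 128-bit halves lets it fold into the truncating shuffle instead of requiring a separate full-width vpsrlq.

define <4 x i32> @upper32_of_v4i64(<4 x i64> %x) {
  ; Shift each 64-bit lane right by 32 (VSRLI by 32), keeping its upper half.
  %srl = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  ; Truncating back to i32 becomes a subvector extract + shuffle on AVX2,
  ; which is where the shift can now be performed at the subvector level.
  %trunc = trunc <4 x i64> %srl to <4 x i32>
  ret <4 x i32> %trunc
}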