Author: Vitaly Buka Date: 2025-02-22T16:02:11-08:00 New Revision: 1aacd3108d3fb66c1f2483e973b52a97006eba9b
URL: https://github.com/llvm/llvm-project/commit/1aacd3108d3fb66c1f2483e973b52a97006eba9b DIFF: https://github.com/llvm/llvm-project/commit/1aacd3108d3fb66c1f2483e973b52a97006eba9b.diff LOG: Revert "[X86] combineBROADCAST_LOAD - merge across chains (#128209)" This reverts commit e21a1737f3523488a04169096fa27d0914a142a7. Added: Modified: llvm/lib/Target/X86/X86ISelLowering.cpp llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll Removed: ################################################################################ diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a4357197e2843..2bc76d3814792 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -59360,14 +59360,21 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt); } -// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract. +// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract +// from. Limit this to cases where the loads have the same input chain and the +// output chains are unused. This avoids any memory ordering issues. static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD || N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) && "Unknown broadcast load type"); + // Only do this if the chain result is unused. + if (N->hasAnyUseOfValue(1)) + return SDValue(); + auto *MemIntrin = cast<MemIntrinsicSDNode>(N); + SDValue Ptr = MemIntrin->getBasePtr(); SDValue Chain = MemIntrin->getChain(); EVT VT = N->getSimpleValueType(0); @@ -59381,15 +59388,12 @@ static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, cast<MemIntrinsicSDNode>(User)->getChain() == Chain && cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() && + !User->hasAnyUseOfValue(1) && User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) { - assert(cast<MemIntrinsicSDNode>(User)->isSimple() && - MemIntrin->isSimple() && "Illegal broadcast load type"); SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), VT.getSizeInBits()); Extract = DAG.getBitcast(VT, Extract); - Extract = DCI.CombineTo(N, Extract, SDValue(User, 1)); - DAG.makeEquivalentMemoryOrdering(SDValue(N, 1), Extract.getValue(1)); - return Extract; + return DCI.CombineTo(N, Extract, SDValue(User, 1)); } return SDValue(); diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index ca589e63b671a..d617cfb6aedee 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1888,14 +1888,15 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; ; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2111,14 +2112,15 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; ; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2235,29 +2237,33 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; ; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2266,8 +2272,9 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2332,14 +2339,15 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; ; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2454,29 +2462,33 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2485,8 +2497,9 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2775,13 +2788,14 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2976,13 +2990,14 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3093,25 +3108,27 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 7f206fd3bf2aa..dbd9df36239bd 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1906,6 +1906,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -1921,6 +1922,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -1931,13 +1933,14 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; ; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -2888,43 +2891,46 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits