Author: Vitaly Buka
Date: 2025-02-22T16:15:48-08:00
New Revision: dde78a2e84edc0fe987266a7fcae774dd36532c3
URL: https://github.com/llvm/llvm-project/commit/dde78a2e84edc0fe987266a7fcae774dd36532c3 DIFF: https://github.com/llvm/llvm-project/commit/dde78a2e84edc0fe987266a7fcae774dd36532c3.diff LOG: Revert "Revert "[X86] combineBROADCAST_LOAD - merge across chains" (#128380)" This reverts commit 50b0669e8468279518ae0be27c8b6a134c4d95d1. Added: Modified: llvm/lib/Target/X86/X86ISelLowering.cpp llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll Removed: ################################################################################ diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2bc76d3814792..a4357197e2843 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -59360,21 +59360,14 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt); } -// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract -// from. Limit this to cases where the loads have the same input chain and the -// output chains are unused. This avoids any memory ordering issues. +// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract. static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD || N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) && "Unknown broadcast load type"); - // Only do this if the chain result is unused. 
- if (N->hasAnyUseOfValue(1)) - return SDValue(); - auto *MemIntrin = cast<MemIntrinsicSDNode>(N); - SDValue Ptr = MemIntrin->getBasePtr(); SDValue Chain = MemIntrin->getChain(); EVT VT = N->getSimpleValueType(0); @@ -59388,12 +59381,15 @@ static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, cast<MemIntrinsicSDNode>(User)->getChain() == Chain && cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() && - !User->hasAnyUseOfValue(1) && User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) { + assert(cast<MemIntrinsicSDNode>(User)->isSimple() && + MemIntrin->isSimple() && "Illegal broadcast load type"); SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), VT.getSizeInBits()); Extract = DAG.getBitcast(VT, Extract); - return DCI.CombineTo(N, Extract, SDValue(User, 1)); + Extract = DCI.CombineTo(N, Extract, SDValue(User, 1)); + DAG.makeEquivalentMemoryOrdering(SDValue(N, 1), Extract.getValue(1)); + return Extract; } return SDValue(); diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index d617cfb6aedee..ca589e63b671a 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1888,15 +1888,14 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; ; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2112,15 +2111,14 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; ; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2237,33 +2235,29 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; ; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = 
mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2272,9 +2266,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2339,15 +2332,14 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; ; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = 
[0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2462,33 +2454,29 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 
32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2497,9 +2485,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2788,14 +2775,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2990,14 +2976,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3108,27 +3093,25 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index dbd9df36239bd..7f206fd3bf2aa 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ 
b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1906,7 +1906,6 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -1922,7 +1921,6 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -1933,14 +1931,13 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; ; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm0 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -2891,46 +2888,43 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2 
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; _______________________________________________ llvm-branch-commits 
mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits