https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103771
--- Comment #11 from Hongtao.liu <crazylht at gmail dot com> --- (In reply to Hongtao.liu from comment #10) > with > @@ -12120,7 +12120,8 @@ supportable_narrowing_operation (enum tree_code code, > c1 = VEC_PACK_TRUNC_EXPR; > if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype) > && VECTOR_BOOLEAN_TYPE_P (vectype) > - && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype) > + && (TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype) > + || known_lt (TYPE_VECTOR_SUBPARTS (vectype), BITS_PER_UNIT)) > && SCALAR_INT_MODE_P (TYPE_MODE (vectype))) > optab1 = vec_pack_sbool_trunc_optab; > else > @@ -12213,6 +12214,7 @@ supportable_narrowing_operation (enum tree_code code, > if (VECTOR_BOOLEAN_TYPE_P (intermediate_type) > && VECTOR_BOOLEAN_TYPE_P (prev_type) > && intermediate_mode == prev_mode > + && known_lt (TYPE_VECTOR_SUBPARTS (intermediate_type), BITS_PER_UNIT) > && SCALAR_INT_MODE_P (prev_mode)) > interm_optab = vec_pack_sbool_trunc_optab; > else > > -march=icelake-server -O3 -mprefer-vector-width=128 now can get a vectorized > loop. 
> > > vmovdqu8 (%rsi,%rax), %xmm0 > vpmovzxbw %xmm0, %xmm2 > vpmovzxwd %xmm2, %xmm1 > vpsrldq $8, %xmm0, %xmm0 > vpsrldq $8, %xmm2, %xmm2 > vpmovzxbw %xmm0, %xmm0 > vpmovzxwd %xmm2, %xmm2 > vpmulld %xmm9, %xmm1, %xmm1 > vpmulld %xmm9, %xmm2, %xmm2 > vpmovzxwd %xmm0, %xmm4 > vpsrldq $8, %xmm0, %xmm0 > vpmovzxwd %xmm0, %xmm0 > vpmulld %xmm9, %xmm4, %xmm4 > vpmulld %xmm9, %xmm0, %xmm0 > vpcmpud $6, %xmm6, %xmm1, %k0 > vpsubd %xmm1, %xmm7, %xmm3 > vpcmpud $6, %xmm6, %xmm2, %k1 > vpsubd %xmm2, %xmm7, %xmm5 > vpsrad $31, %xmm5, %xmm5 > vpsrad $31, %xmm3, %xmm3 > vpermt2w %xmm5, %xmm8, %xmm3 > vpsubd %xmm0, %xmm7, %xmm10 > vpsubd %xmm4, %xmm7, %xmm5 > kshiftlb $4, %k1, %k1 > vpcmpud $6, %xmm6, %xmm0, %k2 > vpsrad $31, %xmm5, %xmm5 > vpsrad $31, %xmm10, %xmm10 > kandb %k3, %k0, %k0 > korb %k1, %k0, %k0 > vpcmpud $6, %xmm6, %xmm4, %k1 > vpermt2w %xmm10, %xmm8, %xmm5 > vpermt2w %xmm2, %xmm8, %xmm1 > vpermt2w %xmm0, %xmm8, %xmm4 > vpermt2b %xmm5, %xmm11, %xmm3 > vpermt2b %xmm4, %xmm11, %xmm1 > kandb %k3, %k1, %k1 > kshiftlb $4, %k2, %k2 > korb %k2, %k1, %k1 > kunpckbw %k0, %k1, %k1 > vmovdqu8 %xmm3, %xmm1{%k1} > vmovdqu8 %xmm1, (%rdi,%rax) > addq $16, %rax > cmpq %rax, %r8 > jne .L4 But this is still not as good as before: in the original version we only needed to pack the data produced by the vec_cond_expr, whereas now the mask additionally needs to be packed. 
before # x_24 = PHI <x_16(9), 0(21)> # vectp_src.11_73 = PHI <vectp_src.11_74(9), src_11(D)(21)> # vectp_dst.23_112 = PHI <vectp_dst.23_113(9), dst_13(D)(21)> # ivtmp_115 = PHI <ivtmp_116(9), 0(21)> # DEBUG x => NULL # DEBUG BEGIN_STMT _1 = (sizetype) x_24; _2 = src_11(D) + _1; vect__3.13_75 = MEM <vector(16) unsigned char> [(uint8_t *)vectp_src.11_73]; _3 = *_2; vect__4.15_76 = [vec_unpack_lo_expr] vect__3.13_75; vect__4.15_77 = [vec_unpack_hi_expr] vect__3.13_75; vect__4.14_78 = [vec_unpack_lo_expr] vect__4.15_76; vect__4.14_79 = [vec_unpack_hi_expr] vect__4.15_76; vect__4.14_80 = [vec_unpack_lo_expr] vect__4.15_77; vect__4.14_81 = [vec_unpack_hi_expr] vect__4.15_77; _4 = (int) _3; vect__5.16_83 = vect__4.14_78 * vect_cst__82; vect__5.16_84 = vect__4.14_79 * vect_cst__82; vect__5.16_85 = vect__4.14_80 * vect_cst__82; vect__5.16_86 = vect__4.14_81 * vect_cst__82; _5 = _4 * i_scale_12(D); _6 = dst_13(D) + _1; # DEBUG x => NULL # DEBUG INLINE_ENTRY x264_clip_uint8 # DEBUG BEGIN_STMT vect__14.17_88 = vect__5.16_83 & vect_cst__87; vect__14.17_89 = vect__5.16_84 & vect_cst__87; vect__14.17_90 = vect__5.16_85 & vect_cst__87; vect__14.17_91 = vect__5.16_86 & vect_cst__87; _14 = _5 & -256; vect__17.18_92 = -vect__5.16_83; vect__17.18_93 = -vect__5.16_84; vect__17.18_94 = -vect__5.16_85; vect__17.18_95 = -vect__5.16_86; _17 = -_5; vect__18.19_96 = vect__17.18_92 >> 31; vect__18.19_97 = vect__17.18_93 >> 31; vect__18.19_98 = vect__17.18_94 >> 31; vect__18.19_99 = vect__17.18_95 >> 31; _18 = _17 >> 31; iftmp.0_19 = (unsigned char) _18; iftmp.0_20 = (unsigned char) _5; _101 = vect__14.17_88 != vect_cst__100; vect_patt_40.20_102 = VEC_COND_EXPR <_101, vect__18.19_96, vect__5.16_83>; _103 = vect__14.17_89 != vect_cst__100; vect_patt_40.20_104 = VEC_COND_EXPR <_103, vect__18.19_97, vect__5.16_84>; _105 = vect__14.17_90 != vect_cst__100; vect_patt_40.20_106 = VEC_COND_EXPR <_105, vect__18.19_98, vect__5.16_85>; _107 = vect__14.17_91 != vect_cst__100; vect_patt_40.20_108 = 
VEC_COND_EXPR <_107, vect__18.19_99, vect__5.16_86>; vect_patt_41.22_109 = VEC_PACK_TRUNC_EXPR <vect_patt_40.20_102, vect_patt_40.20_104>; vect_patt_41.22_110 = VEC_PACK_TRUNC_EXPR <vect_patt_40.20_106, vect_patt_40.20_108>; vect_patt_41.21_111 = VEC_PACK_TRUNC_EXPR <vect_patt_41.22_109, vect_patt_41.22_110>; iftmp.0_21 = _14 != 0 ? iftmp.0_19 : iftmp.0_20; # DEBUG x => NULL MEM <vector(16) unsigned char> [(uint8_t *)vectp_dst.23_112] = vect_patt_41.21_111; # DEBUG BEGIN_STMT x_16 = x_24 + 1; # DEBUG x => x_16 # DEBUG BEGIN_STMT vectp_src.11_74 = vectp_src.11_73 + 16; vectp_dst.23_113 = vectp_dst.23_112 + 16; ivtmp_116 = ivtmp_115 + 1; after # x_24 = PHI <x_16(9), 0(21)> # vectp_src.12_78 = PHI <vectp_src.12_79(9), src_11(D)(21)> # vectp_dst.30_123 = PHI <vectp_dst.30_124(9), dst_13(D)(21)> # ivtmp_126 = PHI <ivtmp_127(9), 0(21)> _1 = (sizetype) x_24; _2 = src_11(D) + _1; vect__3.14_80 = MEM <vector(16) unsigned char> [(uint8_t *)vectp_src.12_78]; _3 = *_2; vect__4.16_81 = [vec_unpack_lo_expr] vect__3.14_80; vect__4.16_82 = [vec_unpack_hi_expr] vect__3.14_80; vect__4.15_83 = [vec_unpack_lo_expr] vect__4.16_81; vect__4.15_84 = [vec_unpack_hi_expr] vect__4.16_81; vect__4.15_85 = [vec_unpack_lo_expr] vect__4.16_82; vect__4.15_86 = [vec_unpack_hi_expr] vect__4.16_82; _4 = (int) _3; vect__5.17_88 = vect__4.15_83 * vect_cst__87; vect__5.17_89 = vect__4.15_84 * vect_cst__87; vect__5.17_90 = vect__4.15_85 * vect_cst__87; vect__5.17_91 = vect__4.15_86 * vect_cst__87; _5 = _4 * i_scale_12(D); _6 = dst_13(D) + _1; vect_x.18_92 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(vect__5.17_88); vect_x.18_93 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(vect__5.17_89); vect_x.18_94 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(vect__5.17_90); vect_x.18_95 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(vect__5.17_91); x.1_14 = (unsigned int) _5; vect__41.19_96 = -vect_x.18_92; vect__41.19_97 = -vect_x.18_93; vect__41.19_98 = -vect_x.18_94; vect__41.19_99 = -vect_x.18_95; _41 = -x.1_14; 
vect__17.20_100 = VIEW_CONVERT_EXPR<vector(4) int>(vect__41.19_96); vect__17.20_101 = VIEW_CONVERT_EXPR<vector(4) int>(vect__41.19_97); vect__17.20_102 = VIEW_CONVERT_EXPR<vector(4) int>(vect__41.19_98); vect__17.20_103 = VIEW_CONVERT_EXPR<vector(4) int>(vect__41.19_99); _17 = (int) _41; vect__18.21_104 = vect__17.20_100 >> 31; vect__18.21_105 = vect__17.20_101 >> 31; vect__18.21_106 = vect__17.20_102 >> 31; vect__18.21_107 = vect__17.20_103 >> 31; _18 = _17 >> 31; vect_iftmp.23_108 = VEC_PACK_TRUNC_EXPR <vect__18.21_104, vect__18.21_105>; vect_iftmp.23_109 = VEC_PACK_TRUNC_EXPR <vect__18.21_106, vect__18.21_107>; vect_iftmp.22_110 = VEC_PACK_TRUNC_EXPR <vect_iftmp.23_108, vect_iftmp.23_109>; iftmp.0_19 = (unsigned char) _18; vect_iftmp.25_111 = VEC_PACK_TRUNC_EXPR <vect__5.17_88, vect__5.17_89>; vect_iftmp.25_112 = VEC_PACK_TRUNC_EXPR <vect__5.17_90, vect__5.17_91>; vect_iftmp.24_113 = VEC_PACK_TRUNC_EXPR <vect_iftmp.25_111, vect_iftmp.25_112>; iftmp.0_20 = (unsigned char) _5; mask_patt_40.26_115 = vect_x.18_92 > { 255, 255, 255, 255 }; mask_patt_40.26_116 = vect_x.18_93 > { 255, 255, 255, 255 }; mask_patt_40.26_117 = vect_x.18_94 > { 255, 255, 255, 255 }; mask_patt_40.26_118 = vect_x.18_95 > { 255, 255, 255, 255 }; mask_patt_42.28_119 = VEC_PACK_TRUNC_EXPR <mask_patt_40.26_115, mask_patt_40.26_116>; mask_patt_42.28_120 = VEC_PACK_TRUNC_EXPR <mask_patt_40.26_117, mask_patt_40.26_118>; mask_patt_42.27_121 = VEC_PACK_TRUNC_EXPR <mask_patt_42.28_119, mask_patt_42.28_120>; vect_patt_43.29_122 = VEC_COND_EXPR <mask_patt_42.27_121, vect_iftmp.22_110, vect_iftmp.24_113>; iftmp.0_21 = x.1_14 > 255 ? iftmp.0_19 : iftmp.0_20; MEM <vector(16) unsigned char> [(uint8_t *)vectp_dst.30_123] = vect_patt_43.29_122; x_16 = x_24 + 1; vectp_src.12_79 = vectp_src.12_78 + 16; vectp_dst.30_124 = vectp_dst.30_123 + 16; ivtmp_127 = ivtmp_126 + 1;