https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103771

--- Comment #11 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Hongtao.liu from comment #10)
> with
> @@ -12120,7 +12120,8 @@ supportable_narrowing_operation (enum tree_code code,
>        c1 = VEC_PACK_TRUNC_EXPR;
>        if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype)
>         && VECTOR_BOOLEAN_TYPE_P (vectype)
> -       && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
> +       && (TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
> +           || known_lt (TYPE_VECTOR_SUBPARTS (vectype), BITS_PER_UNIT))
>         && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
>       optab1 = vec_pack_sbool_trunc_optab;
>        else
> @@ -12213,6 +12214,7 @@ supportable_narrowing_operation (enum tree_code code,
>        if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
>         && VECTOR_BOOLEAN_TYPE_P (prev_type)
>         && intermediate_mode == prev_mode
> +       && known_lt (TYPE_VECTOR_SUBPARTS (intermediate_type), BITS_PER_UNIT)
>         && SCALAR_INT_MODE_P (prev_mode))
>       interm_optab = vec_pack_sbool_trunc_optab;
>        else
> 
> -march=icelake-server -O3 -mprefer-vector-width=128 now can get vectorized
> loop.
> 
> 
>       vmovdqu8        (%rsi,%rax), %xmm0
>       vpmovzxbw       %xmm0, %xmm2
>       vpmovzxwd       %xmm2, %xmm1
>       vpsrldq $8, %xmm0, %xmm0
>       vpsrldq $8, %xmm2, %xmm2
>       vpmovzxbw       %xmm0, %xmm0
>       vpmovzxwd       %xmm2, %xmm2
>       vpmulld %xmm9, %xmm1, %xmm1
>       vpmulld %xmm9, %xmm2, %xmm2
>       vpmovzxwd       %xmm0, %xmm4
>       vpsrldq $8, %xmm0, %xmm0
>       vpmovzxwd       %xmm0, %xmm0
>       vpmulld %xmm9, %xmm4, %xmm4
>       vpmulld %xmm9, %xmm0, %xmm0
>       vpcmpud $6, %xmm6, %xmm1, %k0
>       vpsubd  %xmm1, %xmm7, %xmm3
>       vpcmpud $6, %xmm6, %xmm2, %k1
>       vpsubd  %xmm2, %xmm7, %xmm5
>       vpsrad  $31, %xmm5, %xmm5
>       vpsrad  $31, %xmm3, %xmm3
>       vpermt2w        %xmm5, %xmm8, %xmm3
>       vpsubd  %xmm0, %xmm7, %xmm10
>       vpsubd  %xmm4, %xmm7, %xmm5
>       kshiftlb        $4, %k1, %k1
>       vpcmpud $6, %xmm6, %xmm0, %k2
>       vpsrad  $31, %xmm5, %xmm5
>       vpsrad  $31, %xmm10, %xmm10
>       kandb   %k3, %k0, %k0
>       korb    %k1, %k0, %k0
>       vpcmpud $6, %xmm6, %xmm4, %k1
>       vpermt2w        %xmm10, %xmm8, %xmm5
>       vpermt2w        %xmm2, %xmm8, %xmm1
>       vpermt2w        %xmm0, %xmm8, %xmm4
>       vpermt2b        %xmm5, %xmm11, %xmm3
>       vpermt2b        %xmm4, %xmm11, %xmm1
>       kandb   %k3, %k1, %k1
>       kshiftlb        $4, %k2, %k2
>       korb    %k2, %k1, %k1
>       kunpckbw        %k0, %k1, %k1
>       vmovdqu8        %xmm3, %xmm1{%k1}
>       vmovdqu8        %xmm1, (%rdi,%rax)
>       addq    $16, %rax
>       cmpq    %rax, %r8
>       jne     .L4

But it is still not as good as before: in the original version we only needed to
pack the data produced by the VEC_COND_EXPR, but now we additionally need to pack the mask.

before

  # x_24 = PHI <x_16(9), 0(21)>
  # vectp_src.11_73 = PHI <vectp_src.11_74(9), src_11(D)(21)>
  # vectp_dst.23_112 = PHI <vectp_dst.23_113(9), dst_13(D)(21)>
  # ivtmp_115 = PHI <ivtmp_116(9), 0(21)>
  # DEBUG x => NULL
  # DEBUG BEGIN_STMT
  _1 = (sizetype) x_24;
  _2 = src_11(D) + _1;
  vect__3.13_75 = MEM <vector(16) unsigned char> [(uint8_t *)vectp_src.11_73];
  _3 = *_2;
  vect__4.15_76 = [vec_unpack_lo_expr] vect__3.13_75;
  vect__4.15_77 = [vec_unpack_hi_expr] vect__3.13_75;
  vect__4.14_78 = [vec_unpack_lo_expr] vect__4.15_76;
  vect__4.14_79 = [vec_unpack_hi_expr] vect__4.15_76;
  vect__4.14_80 = [vec_unpack_lo_expr] vect__4.15_77;
  vect__4.14_81 = [vec_unpack_hi_expr] vect__4.15_77;
  _4 = (int) _3;
  vect__5.16_83 = vect__4.14_78 * vect_cst__82;
  vect__5.16_84 = vect__4.14_79 * vect_cst__82;
  vect__5.16_85 = vect__4.14_80 * vect_cst__82;
  vect__5.16_86 = vect__4.14_81 * vect_cst__82;
  _5 = _4 * i_scale_12(D);
  _6 = dst_13(D) + _1;
  # DEBUG x => NULL
  # DEBUG INLINE_ENTRY x264_clip_uint8
  # DEBUG BEGIN_STMT
  vect__14.17_88 = vect__5.16_83 & vect_cst__87;
  vect__14.17_89 = vect__5.16_84 & vect_cst__87;
  vect__14.17_90 = vect__5.16_85 & vect_cst__87;
  vect__14.17_91 = vect__5.16_86 & vect_cst__87;
  _14 = _5 & -256;
  vect__17.18_92 = -vect__5.16_83;
  vect__17.18_93 = -vect__5.16_84;
  vect__17.18_94 = -vect__5.16_85;
  vect__17.18_95 = -vect__5.16_86;
  _17 = -_5;
  vect__18.19_96 = vect__17.18_92 >> 31;
  vect__18.19_97 = vect__17.18_93 >> 31;
  vect__18.19_98 = vect__17.18_94 >> 31;
  vect__18.19_99 = vect__17.18_95 >> 31;
  _18 = _17 >> 31;
  iftmp.0_19 = (unsigned char) _18;
  iftmp.0_20 = (unsigned char) _5;
  _101 = vect__14.17_88 != vect_cst__100;
  vect_patt_40.20_102 = VEC_COND_EXPR <_101, vect__18.19_96, vect__5.16_83>;
  _103 = vect__14.17_89 != vect_cst__100;
  vect_patt_40.20_104 = VEC_COND_EXPR <_103, vect__18.19_97, vect__5.16_84>;
  _105 = vect__14.17_90 != vect_cst__100;
  vect_patt_40.20_106 = VEC_COND_EXPR <_105, vect__18.19_98, vect__5.16_85>;
  _107 = vect__14.17_91 != vect_cst__100;
  vect_patt_40.20_108 = VEC_COND_EXPR <_107, vect__18.19_99, vect__5.16_86>;
  vect_patt_41.22_109 = VEC_PACK_TRUNC_EXPR <vect_patt_40.20_102,
vect_patt_40.20_104>;
  vect_patt_41.22_110 = VEC_PACK_TRUNC_EXPR <vect_patt_40.20_106,
vect_patt_40.20_108>;
  vect_patt_41.21_111 = VEC_PACK_TRUNC_EXPR <vect_patt_41.22_109,
vect_patt_41.22_110>;
  iftmp.0_21 = _14 != 0 ? iftmp.0_19 : iftmp.0_20;
  # DEBUG x => NULL
  MEM <vector(16) unsigned char> [(uint8_t *)vectp_dst.23_112] =
vect_patt_41.21_111;
  # DEBUG BEGIN_STMT
  x_16 = x_24 + 1;
  # DEBUG x => x_16
  # DEBUG BEGIN_STMT
  vectp_src.11_74 = vectp_src.11_73 + 16;
  vectp_dst.23_113 = vectp_dst.23_112 + 16;
  ivtmp_116 = ivtmp_115 + 1;

after

  # x_24 = PHI <x_16(9), 0(21)>
  # vectp_src.12_78 = PHI <vectp_src.12_79(9), src_11(D)(21)>
  # vectp_dst.30_123 = PHI <vectp_dst.30_124(9), dst_13(D)(21)>
  # ivtmp_126 = PHI <ivtmp_127(9), 0(21)>
  _1 = (sizetype) x_24;
  _2 = src_11(D) + _1;
  vect__3.14_80 = MEM <vector(16) unsigned char> [(uint8_t *)vectp_src.12_78];
  _3 = *_2;
  vect__4.16_81 = [vec_unpack_lo_expr] vect__3.14_80;
  vect__4.16_82 = [vec_unpack_hi_expr] vect__3.14_80;
  vect__4.15_83 = [vec_unpack_lo_expr] vect__4.16_81;
  vect__4.15_84 = [vec_unpack_hi_expr] vect__4.16_81;
  vect__4.15_85 = [vec_unpack_lo_expr] vect__4.16_82;
  vect__4.15_86 = [vec_unpack_hi_expr] vect__4.16_82;
  _4 = (int) _3;
  vect__5.17_88 = vect__4.15_83 * vect_cst__87;
  vect__5.17_89 = vect__4.15_84 * vect_cst__87;
  vect__5.17_90 = vect__4.15_85 * vect_cst__87;
  vect__5.17_91 = vect__4.15_86 * vect_cst__87;
  _5 = _4 * i_scale_12(D);
  _6 = dst_13(D) + _1;
  vect_x.18_92 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(vect__5.17_88);
  vect_x.18_93 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(vect__5.17_89);
  vect_x.18_94 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(vect__5.17_90);
  vect_x.18_95 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(vect__5.17_91);
  x.1_14 = (unsigned int) _5;
  vect__41.19_96 = -vect_x.18_92;
  vect__41.19_97 = -vect_x.18_93;
  vect__41.19_98 = -vect_x.18_94;
  vect__41.19_99 = -vect_x.18_95;
  _41 = -x.1_14;
  vect__17.20_100 = VIEW_CONVERT_EXPR<vector(4) int>(vect__41.19_96);
  vect__17.20_101 = VIEW_CONVERT_EXPR<vector(4) int>(vect__41.19_97);
  vect__17.20_102 = VIEW_CONVERT_EXPR<vector(4) int>(vect__41.19_98);
  vect__17.20_103 = VIEW_CONVERT_EXPR<vector(4) int>(vect__41.19_99);
  _17 = (int) _41;
  vect__18.21_104 = vect__17.20_100 >> 31;
  vect__18.21_105 = vect__17.20_101 >> 31;
  vect__18.21_106 = vect__17.20_102 >> 31;
  vect__18.21_107 = vect__17.20_103 >> 31;
  _18 = _17 >> 31;
  vect_iftmp.23_108 = VEC_PACK_TRUNC_EXPR <vect__18.21_104, vect__18.21_105>;
  vect_iftmp.23_109 = VEC_PACK_TRUNC_EXPR <vect__18.21_106, vect__18.21_107>;
  vect_iftmp.22_110 = VEC_PACK_TRUNC_EXPR <vect_iftmp.23_108,
vect_iftmp.23_109>;
  iftmp.0_19 = (unsigned char) _18;
  vect_iftmp.25_111 = VEC_PACK_TRUNC_EXPR <vect__5.17_88, vect__5.17_89>;
  vect_iftmp.25_112 = VEC_PACK_TRUNC_EXPR <vect__5.17_90, vect__5.17_91>;
  vect_iftmp.24_113 = VEC_PACK_TRUNC_EXPR <vect_iftmp.25_111,
vect_iftmp.25_112>;
  iftmp.0_20 = (unsigned char) _5;
  mask_patt_40.26_115 = vect_x.18_92 > { 255, 255, 255, 255 };
  mask_patt_40.26_116 = vect_x.18_93 > { 255, 255, 255, 255 };
  mask_patt_40.26_117 = vect_x.18_94 > { 255, 255, 255, 255 };
  mask_patt_40.26_118 = vect_x.18_95 > { 255, 255, 255, 255 };
  mask_patt_42.28_119 = VEC_PACK_TRUNC_EXPR <mask_patt_40.26_115,
mask_patt_40.26_116>;
  mask_patt_42.28_120 = VEC_PACK_TRUNC_EXPR <mask_patt_40.26_117,
mask_patt_40.26_118>;
  mask_patt_42.27_121 = VEC_PACK_TRUNC_EXPR <mask_patt_42.28_119,
mask_patt_42.28_120>;
  vect_patt_43.29_122 = VEC_COND_EXPR <mask_patt_42.27_121, vect_iftmp.22_110,
vect_iftmp.24_113>;
  iftmp.0_21 = x.1_14 > 255 ? iftmp.0_19 : iftmp.0_20;
  MEM <vector(16) unsigned char> [(uint8_t *)vectp_dst.30_123] =
vect_patt_43.29_122;
  x_16 = x_24 + 1;
  vectp_src.12_79 = vectp_src.12_78 + 16;
  vectp_dst.30_124 = vectp_dst.30_123 + 16;
  ivtmp_127 = ivtmp_126 + 1;

Reply via email to