https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103771

--- Comment #10 from Hongtao.liu <crazylht at gmail dot com> ---
With the following patch:
@@ -12120,7 +12120,8 @@ supportable_narrowing_operation (enum tree_code code,
       c1 = VEC_PACK_TRUNC_EXPR;
       if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype)
          && VECTOR_BOOLEAN_TYPE_P (vectype)
-         && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
+         && (TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
+             || known_lt (TYPE_VECTOR_SUBPARTS (vectype), BITS_PER_UNIT))
          && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
        optab1 = vec_pack_sbool_trunc_optab;
       else
@@ -12213,6 +12214,7 @@ supportable_narrowing_operation (enum tree_code code,
       if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
          && VECTOR_BOOLEAN_TYPE_P (prev_type)
          && intermediate_mode == prev_mode
+         && known_lt (TYPE_VECTOR_SUBPARTS (intermediate_type), BITS_PER_UNIT)
          && SCALAR_INT_MODE_P (prev_mode))
        interm_optab = vec_pack_sbool_trunc_optab;
       else

-march=icelake-server -O3 -mprefer-vector-width=128 now produces a vectorized
loop.
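
For reference, a minimal sketch of a loop with this shape (hypothetical names;
the actual reproducer is attached to the PR): the vpmulld below is the
src[x] * scale multiply, vpcmpud $6 is the unsigned greater-than range check,
the vpsubd / vpsrad $31 pair computes (-x) >> 31 for the out-of-range lanes,
and the masked vmovdqu8 selects between the two results.

#include <stdint.h>

/* Clip an int to the uint8_t range.  For x > 255, -x is negative, so
   (-x) >> 31 is all-ones and truncates to 255; in-range x is returned
   unchanged.  */
static uint8_t clip_uint8 (int x)
{
  return (x & ~255) ? (-x) >> 31 : x;
}

void
scale_row (uint8_t *restrict dst, uint8_t *restrict src, int n, int scale)
{
  for (int x = 0; x < n; x++)
    dst[x] = clip_uint8 (src[x] * scale);
}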
        vmovdqu8        (%rsi,%rax), %xmm0
        vpmovzxbw       %xmm0, %xmm2
        vpmovzxwd       %xmm2, %xmm1
        vpsrldq $8, %xmm0, %xmm0
        vpsrldq $8, %xmm2, %xmm2
        vpmovzxbw       %xmm0, %xmm0
        vpmovzxwd       %xmm2, %xmm2
        vpmulld %xmm9, %xmm1, %xmm1
        vpmulld %xmm9, %xmm2, %xmm2
        vpmovzxwd       %xmm0, %xmm4
        vpsrldq $8, %xmm0, %xmm0
        vpmovzxwd       %xmm0, %xmm0
        vpmulld %xmm9, %xmm4, %xmm4
        vpmulld %xmm9, %xmm0, %xmm0
        vpcmpud $6, %xmm6, %xmm1, %k0
        vpsubd  %xmm1, %xmm7, %xmm3
        vpcmpud $6, %xmm6, %xmm2, %k1
        vpsubd  %xmm2, %xmm7, %xmm5
        vpsrad  $31, %xmm5, %xmm5
        vpsrad  $31, %xmm3, %xmm3
        vpermt2w        %xmm5, %xmm8, %xmm3
        vpsubd  %xmm0, %xmm7, %xmm10
        vpsubd  %xmm4, %xmm7, %xmm5
        kshiftlb        $4, %k1, %k1
        vpcmpud $6, %xmm6, %xmm0, %k2
        vpsrad  $31, %xmm5, %xmm5
        vpsrad  $31, %xmm10, %xmm10
        kandb   %k3, %k0, %k0
        korb    %k1, %k0, %k0
        vpcmpud $6, %xmm6, %xmm4, %k1
        vpermt2w        %xmm10, %xmm8, %xmm5
        vpermt2w        %xmm2, %xmm8, %xmm1
        vpermt2w        %xmm0, %xmm8, %xmm4
        vpermt2b        %xmm5, %xmm11, %xmm3
        vpermt2b        %xmm4, %xmm11, %xmm1
        kandb   %k3, %k1, %k1
        kshiftlb        $4, %k2, %k2
        korb    %k2, %k1, %k1
        kunpckbw        %k0, %k1, %k1
        vmovdqu8        %xmm3, %xmm1{%k1}
        vmovdqu8        %xmm1, (%rdi,%rax)
        addq    $16, %rax
        cmpq    %rax, %r8
        jne     .L4
