https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103750

--- Comment #11 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Thiago Macieira from comment #6)
> It got worse. Now I'm seeing:
> 
> .L807:
>         vmovdqu16       (%rsi), %ymm2
>         vmovdqu16       32(%rsi), %ymm3
>         vpcmpuw $6, %ymm0, %ymm2, %k2
>         vpcmpuw $6, %ymm0, %ymm3, %k3
>         kmovw   %k2, %eax
>         kmovw   %k3, %edx
>         kmovd   %eax, %k4
>         kmovd   %edx, %k5
>         kortestd        %k5, %k4
>         je      .L814
> 
(define_insn "*zero_extend<mode>si2"
  [(set (match_operand:SI 0 "register_operand" "=r,*r,*k")
        (zero_extend:SI
          (match_operand:SWI12 1 "nonimmediate_operand" "<r>m,*k,*km")))]
  "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))"

As the pattern above shows, zero_extendhisi is supported with a k alternative, so the code should be optimized to:

         vmovdqu16       (%rsi), %ymm2
         vmovdqu16       32(%rsi), %ymm3
         vpcmpuw $6, %ymm0, %ymm2, %k2
         vpcmpuw $6, %ymm0, %ymm3, %k3
         kmovw   %k2, %k4
         kmovw   %k3, %k5
         kortestd        %k5, %k4

And since vpcmpuw implicitly zero-extends k2 and k3, it can be further
optimized to:

         vmovdqu16       (%rsi), %ymm2
         vmovdqu16       32(%rsi), %ymm3
         vpcmpuw $6, %ymm0, %ymm2, %k2
         vpcmpuw $6, %ymm0, %ymm3, %k3
         kortestd        %k3, %k2

> Code snippet:
> 
>         auto loadAndCompare = [maxval](const Char *ptr, unsigned mask = ~0U)
> {
>             if constexpr (sizeof(Char) == 1) {
>                 __m256i mval = _mm256_set1_epi8(maxval);
>                 __m256i data = _mm256_maskz_loadu_epi8(mask, ptr);
>                 return _mm256_cmpgt_epu8_mask(data, mval);
>             } else if constexpr (sizeof(Char) == 2) {
>                 __m256i mval = _mm256_set1_epi16(maxval);
>                 __m256i data = _mm256_maskz_loadu_epi16(mask, ptr);
>                 return _mm256_cmpgt_epu16_mask(data, mval);
>             } else if constexpr (sizeof(Char) == 4) {
>                 __m256i mval = _mm256_set1_epi32(maxval);
>                 __m256i data = _mm256_maskz_loadu_epi32(mask, ptr);
>                 return _mm256_cmpgt_epu32_mask(data, mval);
>             }
>         };
> /*...*/
>             auto mask1 = loadAndCompare(n1);
>             auto mask2 = loadAndCompare(n2);
> 
> I can make a compilable version if you need me to.

Yes, please.

Reply via email to