https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92492
Bug ID: 92492 Summary: [AVX512F] Icc generate much better code for loop vectorization Product: gcc Version: 10.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: crazylht at gmail dot com CC: hjl.tools at gmail dot com Target Milestone: --- Target: i386, x86-64 Created attachment 47231 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=47231&action=edit testcase cut from 525.x264_r. For a simple testcase, cat test.c: typedef unsigned char uint8_t; static inline uint8_t x264_clip_uint8( int x ) { return x&(~63) ? (-x)>>7 : x; } void mc_weight( uint8_t *dst, uint8_t *src, int i_width, int i_height ) { for( int x = 0; x < i_width; x++ ) dst[x] = x264_clip_uint8(src[x]); } Refer to https://godbolt.org/z/TnACA- ICC generates much better code by using vptestmd and maskmov. gcc loop: ------------ vmovdqu8 (%rsi,%rax), %ymm6 vpmovzxbw %xmm6, %ymm3 vpmovzxwd %xmm3, %ymm0 vextracti128 $0x1, %ymm3, %xmm3 vpand %ymm5, %ymm0, %ymm9 vpmovzxwd %xmm3, %ymm3 vextracti128 $0x1, %ymm6, %xmm2 vpcmpd $0, %ymm4, %ymm9, %k0 vpmovzxbw %xmm2, %ymm2 vpand %ymm5, %ymm3, %ymm9 vpcmpd $0, %ymm4, %ymm9, %k1 vpmovzxwd %xmm2, %ymm1 vextracti128 $0x1, %ymm2, %xmm2 vpand %ymm5, %ymm1, %ymm9 vpmovzxwd %xmm2, %ymm2 vpcmpd $0, %ymm4, %ymm9, %k2 vpand %ymm5, %ymm2, %ymm9 kunpckbw %k0, %k1, %k0 vpcmpd $0, %ymm4, %ymm9, %k1 vpsubd %ymm0, %ymm4, %ymm0 vpsubd %ymm3, %ymm4, %ymm3 vpsubd %ymm1, %ymm4, %ymm1 vpsubd %ymm2, %ymm4, %ymm2 kunpckbw %k2, %k1, %k1 kunpckwd %k0, %k1, %k1 vpermt2w %ymm3, %ymm8, %ymm0 vpermt2w %ymm2, %ymm8, %ymm1 vpsraw $7, %ymm0, %ymm0 vpsraw $7, %ymm1, %ymm1 vpand %ymm0, %ymm7, %ymm0 vpand %ymm1, %ymm7, %ymm1 vpackuswb %ymm1, %ymm0, %ymm0 vpermq $216, %ymm0, %ymm0 vmovdqu8 %ymm6, %ymm0{%k1} vmovdqu8 %ymm0, (%rdi,%rax) addq $32, %rax cmpq %rcx, %rax ------------ icc loop: ---------- vpmovzxbd (%rsi,%r8), %ymm3 #12.31 vptestmd %ymm1, %ymm3, %k1 #5.12 vpsubd %ymm3, %ymm0, %ymm2 #12.31 vpsrad $7, %ymm2, %ymm3{%k1} #12.31 
vpmovdb %ymm3, (%r8,%rdi) #12.6 addq $8, %r8 #11.2 cmpq %rcx, %r8 #11.2 jb ..B1.7 # Prob 82% #11.2 ---------- The original case is cut from SPEC2017 525.x264_r; refer to the attachment.