https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98934
--- Comment #2 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
This is really poor even with -mavx512f. We should be able to do it like this (which is what LLVM does):

    vpmovzxbd %xmm1, %zmm1
    vpmovzxbd %xmm0, %zmm0
    vpsravd %zmm1, %zmm0, %zmm0
    vpmovdb %zmm0, %xmm0

Basically, zero-extend from char to int, then do the shift, and then truncate back down to char. We can emulate this as follows:

    typedef char __attribute__((vector_size(16))) v16i8;
    typedef int __attribute__((vector_size(16*sizeof(int)))) v16i32;
    typedef int __attribute__((vector_size(4*sizeof(int)))) v4i32;
    typedef char __attribute__((vector_size(4))) v4i8;

    v16i8 f1(v16i8 x, v16i8 y)
    {
      v16i32 x1, y1;
      x1 = __builtin_convertvector(x, __typeof(x1));
      y1 = __builtin_convertvector(y, __typeof(y1));
      x1 = x1 >> y1;
      x = __builtin_convertvector(x1, __typeof(x));
      return x;
    }