https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117568
Bug ID: 117568
Summary: z13: Use vector instructions for fixed length memcmp
Product: gcc
Version: 13.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: jens.seifert at de dot ibm.com
Target Milestone: ---

#include <memory.h>
#include <vecintrin.h>

For fixed lengths of up to 16 bytes, consider using vector instructions for memcmp. This is not required for lengths of 1, 2, 4, or 8 bytes, but it is for the remaining lengths.

For general memcmp: for memcmp == 0, use lochi to get the result of clc; otherwise, maybe use 2x lochi instead of ipm + 2 shifts after clc, but I am not 100% sure whether lochi is faster.

bool eq15(const unsigned char *a, const unsigned char *b)
{
   return memcmp(a, b, 15) == 0;
}

bool eq15_vec(const unsigned char *a, const unsigned char *b)
{
   const int len = 15;
   vector unsigned char va = vec_load_len(a, len-1);
   vector unsigned char vb = vec_load_len(b, len-1);
   return vec_all_eq(va, vb);
}

eq15(unsigned char const*, unsigned char const*):
        clc 0(15,%r3),0(%r2)
        ipm %r2
        sll %r2,2
        sra %r2,30
        lpr %r0,%r2
        ahi %r0,-1
        risbgn %r2,%r0,64-1,128+63,32+1
        br %r14
eq15_vec(unsigned char const*, unsigned char const*):
        lhi %r1,14
        vll %v0,%r1,0(%r2)
        vll %v2,%r1,0(%r3)
        lghi %r2,0
        vceqbs %v0,%v0,%v2
        locghie %r2,1
        br %r14

int compare15(const unsigned char *a, const unsigned char *b)
{
   return memcmp(a, b, 15);
}

int compare15_vec(const unsigned char *a, const unsigned char *b)
{
   const int len = 15;
   vector unsigned char va = vec_load_len(a, len-1);
   vector unsigned char vb = vec_load_len(b, len-1);
   vector int le = (vector int)vec_subc_u128(va, vb);
   vector int ge = (vector int)vec_subc_u128(vb, va);
   return vec_extract(le - ge, 3);
}

compare15(unsigned char const*, unsigned char const*):
        clc 0(15,%r3),0(%r2)
        ipm %r2
        sllg %r0,%r2,34
        srag %r2,%r0,62
        br %r14
compare15_vec(unsigned char const*, unsigned char const*):
        lhi %r1,14
        vll %v4,%r1,0(%r2)
        vll %v0,%r1,0(%r3)
        vscbiq %v2,%v4,%v0
        vscbiq %v6,%v0,%v4
        vsf %v1,%v2,%v6
        vlgvf %r2,%v1,3
        lgfr %r2,%r2
        br %r14

=> For general memcmp, vectorization does not pay off.