https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102265
Bug ID: 102265
Summary: s390: Inefficient code for __builtin_ctzll
Product: gcc
Version: 10.2.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: jens.seifert at de dot ibm.com
Target Milestone: ---

unsigned long long ctzll(unsigned long long x)
{
  return __builtin_ctzll(x);
}

creates:

  lcgr %r1,%r2
  ngr %r2,%r1
  lghi %r1,63
  flogr %r2,%r2
  sgrk %r2,%r1,%r2
  lgfr %r2,%r2
  br %r14

The optimal sequence for z15 uses population count; for all other architecture levels, use "^ 63" instead of "63 -".

unsigned long long ctzll_opt(unsigned long long x)
{
#if __ARCH__ >= 13
  return __builtin_popcountll((x-1) & ~x);
#else
  return __builtin_clzll(x & -x) ^ 63;
#endif
}

Before z15 (< z15):

  lcgr %r1,%r2
  ngr %r2,%r1
  flogr %r2,%r2
  xilf %r2,63
  lgfr %r2,%r2
  br %r14

=> 1 instruction saved.

z15:

  .cfi_startproc
  lay %r1,-1(%r2)
  ncgrk %r2,%r1,%r2
  popcnt %r2,%r2,8
  br %r14
  .cfi_endproc

=> On z15 only 3 instructions are required.