https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102265

            Bug ID: 102265
           Summary: s390: Inefficient code for __builtin_ctzll
           Product: gcc
           Version: 10.2.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jens.seifert at de dot ibm.com
  Target Milestone: ---

/*
 * Count the trailing zero bits of x with the compiler builtin.
 * Per the GCC documentation the builtin's result is undefined for x == 0.
 */
unsigned long long ctzll(unsigned long long x)
{
   const int trailing_zeros = __builtin_ctzll(x);

   return trailing_zeros;
}

creates:
        lcgr    %r1,%r2
        ngr     %r2,%r1
        lghi    %r1,63
        flogr   %r2,%r2
        sgrk    %r2,%r1,%r2
        lgfr    %r2,%r2
        br      %r14


The optimal sequence for z15 uses population count; for all other targets,
compute the result as clz ^ 63 instead of 63 - clz.

/*
 * Trailing-zero count written without __builtin_ctzll.
 *
 * With __ARCH__ >= 13 the bits below the lowest set bit are selected and
 * counted with popcount; otherwise the lowest set bit is isolated and its
 * leading-zero count is XORed with 63.
 */
unsigned long long ctzll_opt(unsigned long long x)
{
#if __ARCH__ >= 13
   const unsigned long long below_lowest_set = ~x & (x - 1);

   return __builtin_popcountll(below_lowest_set);
#else
   const unsigned long long lowest_set = x & -x;

   return __builtin_clzll(lowest_set) ^ 63;
#endif
}

Pre-z15 (__ARCH__ < 13):
        lcgr    %r1,%r2
        ngr     %r2,%r1
        flogr   %r2,%r2
        xilf    %r2,63
        lgfr    %r2,%r2
        br      %r14

=> 1 instruction saved.

z15:
        .cfi_startproc
        lay     %r1,-1(%r2)
        ncgrk   %r2,%r1,%r2
        popcnt  %r2,%r2,8
        br      %r14
        .cfi_endproc

=> On z15, only 3 instructions are required.
  • [Bug target/102265] New: s390:... jens.seifert at de dot ibm.com via Gcc-bugs

Reply via email to