https://gcc.gnu.org/bugzilla/show_bug.cgi?id=27663

Georg-Johann Lay <gjl at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
      Known to work|                            |14.0
         Resolution|---                         |FIXED
             Status|NEW                         |RESOLVED

--- Comment #10 from Georg-Johann Lay <gjl at gcc dot gnu.org> ---
bswap recognition appears to have improved since then.  When I compile the
following with -Os -mmcu=atmega8 -dp using 14.0.0 20230530:

/* Memory test case: assemble a value from the four bytes P[1]..P[4], with
   P[1] placed in bits 31..24 and P[4] in bits 7..0.  P[0] is not read.
   The OR terms are listed from the most significant byte down.  */
unsigned long f_mem1 (unsigned char *P)
{
  unsigned long c;
  c = ((unsigned long) P[1] << 24)
    | ((unsigned long) P[2] << 16)
    | ((unsigned long) P[3] << 8)
    | ((unsigned long) P[4] << 0);
  return c;
}

/* Memory test case: same bytes and same bit positions as f_mem1 (P[1] in
   bits 31..24 down to P[4] in bits 7..0), but with the OR terms written in
   the opposite order, least significant byte first.  */
unsigned long f_mem2 (unsigned char *P)
{
  unsigned long c;
  c = ((unsigned long) P[4] << 0)
    | ((unsigned long) P[3] << 8)
    | ((unsigned long) P[2] << 16)
    | ((unsigned long) P[1] << 24);
  return c;
}

/* Register test case: OR together four shifted copies of p -- p << 24,
   bits 23..16 of p << 8, bits 15..8 of p >> 8, and p >> 24.
   When unsigned long is 32 bits wide this reverses the byte order of p;
   with a wider unsigned long the unmasked p << 24 and p >> 24 terms keep
   additional bits, so the result is no longer a plain byte swap.  */
unsigned long f_reg (unsigned long p)
{
  unsigned long c;
  c = (p << 24)
    | (0xff0000 & (p << 8))
    | (0xff00 & (p >> 8))
    | (p >> 24);
  return c;
}


I am getting:


f_mem1:
        movw r30,r24     ;  33  [c=4 l=1]  *movhi/0
        ldd r22,Z+1      ;  34  [c=16 l=4]  *movsi/2
        ldd r23,Z+2
        ldd r24,Z+3
        ldd r25,Z+4
        rcall __bswapsi2         ;  35  [c=16 l=1]  *bswapsi2.libgcc
        ret              ;  38  [c=0 l=1]  return

f_mem2:
        movw r30,r24     ;  33  [c=4 l=1]  *movhi/0
        ldd r22,Z+1      ;  34  [c=16 l=4]  *movsi/2
        ldd r23,Z+2
        ldd r24,Z+3
        ldd r25,Z+4
        rcall __bswapsi2         ;  35  [c=16 l=1]  *bswapsi2.libgcc
        ret              ;  38  [c=0 l=1]  return

f_reg:
        rcall __bswapsi2         ;  42  [c=16 l=1]  *bswapsi2.libgcc
        ret              ;  45  [c=0 l=1]  return

The RCALL + RET will be optimized to RJMP by -mrelax.

Splitting into a zoo of subregs is usually not a good idea because the register
allocator won't manage to optimize them, resulting in a bunch of moves and
register pressure that are more expensive than the bswap itself.

Notice the calls are transparent, i.e. their footprint is just as big as
required:

;; Byte-swap the SImode value held in hard register 22, in place, by emitting
;; a call to __bswapsi2.  Besides the result in (reg:SI 22), the only effect
;; the pattern declares is a clobber of REG_CC.  The insn condition restricts
;; it to after reload (reload_completed).
;; NOTE(review): "%~" presumably selects between the rcall and call forms
;; (the atmega8 listing above shows "rcall") -- confirm against the AVR backend.
(define_insn "*bswapsi2.libgcc"
  [(set (reg:SI 22)
        (bswap:SI (reg:SI 22)))
   (clobber (reg:CC REG_CC))]
  "reload_completed"
  "%~call __bswapsi2"
  [(set_attr "type" "xcall")])

Reply via email to