https://gcc.gnu.org/bugzilla/show_bug.cgi?id=27663
Georg-Johann Lay <gjl at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Known to work| |14.0 Resolution|--- |FIXED Status|NEW |RESOLVED --- Comment #10 from Georg-Johann Lay <gjl at gcc dot gnu.org> --- bswap recognition appears to have improved since then. When I compile with -Os -mmcu=atmega8 -dp using 14.0.0 20230530: unsigned long f_mem1 (unsigned char *P) { unsigned long c; c = ((unsigned long) P[1] << 24) | ((unsigned long) P[2] << 16) | ((unsigned long) P[3] << 8) | ((unsigned long) P[4] << 0); return c; } unsigned long f_mem2 (unsigned char *P) { unsigned long c; c = ((unsigned long) P[4] << 0) | ((unsigned long) P[3] << 8) | ((unsigned long) P[2] << 16) | ((unsigned long) P[1] << 24); return c; } unsigned long f_reg (unsigned long p) { unsigned long c; c = (p << 24) | (0xff0000 & (p << 8)) | (0xff00 & (p >> 8)) | (p >> 24); return c; } I am getting: f_mem1: movw r30,r24 ; 33 [c=4 l=1] *movhi/0 ldd r22,Z+1 ; 34 [c=16 l=4] *movsi/2 ldd r23,Z+2 ldd r24,Z+3 ldd r25,Z+4 rcall __bswapsi2 ; 35 [c=16 l=1] *bswapsi2.libgcc ret ; 38 [c=0 l=1] return f_mem2: movw r30,r24 ; 33 [c=4 l=1] *movhi/0 ldd r22,Z+1 ; 34 [c=16 l=4] *movsi/2 ldd r23,Z+2 ldd r24,Z+3 ldd r25,Z+4 rcall __bswapsi2 ; 35 [c=16 l=1] *bswapsi2.libgcc ret ; 38 [c=0 l=1] return f_reg: rcall __bswapsi2 ; 42 [c=16 l=1] *bswapsi2.libgcc ret ; 45 [c=0 l=1] return The RCALL + RET will be optimized to RJMP by -mrelax. Splitting into a zoo of subregs is usually not a good idea because the register allocator won't manage to optimize them, resulting in a bunch of moves and register pressure that are more expensive than the bswap itself. Notice the calls are transparent, i.e. their footprint is just as big as required: (define_insn "*bswapsi2.libgcc" [(set (reg:SI 22) (bswap:SI (reg:SI 22))) (clobber (reg:CC REG_CC))] "reload_completed" "%~call __bswapsi2" [(set_attr "type" "xcall")])