The simplest way (in C) to load a word from an address of unknown alignment is:
// Wrapper struct whose single member is marked packed, so an access through
// it does not rely on the natural alignment of `unsigned`.
typedef struct { unsigned value __attribute__((packed)); } unaligned_uint;
// Returns the `unsigned` stored at `ptr`; `ptr` may have any alignment.
unsigned load(const unaligned_uint* ptr) { return ptr->value; }
// load_little_endian: assemble a 32-bit word byte by byte, lowest address = least significant byte.
// In:      r0 = address (any alignment)
// Out:     r0 = [addr+0] | [addr+1] << 8 | [addr+2] << 16 | [addr+3] << 24
// Scratch: r1, r2, r3
load_little_endian: // cycle times
ldrb r3, [r0, #0] // r3 = byte 0 (bits 7..0); 1S + 1S + 1I
ldrb r2, [r0, #1] // r2 = byte 1; 1S + 1S + 1I
ldrb r1, [r0, #2] // r1 = byte 2; 1S + 1S + 1I
orr r3, r3, r2, asl #8 // r3 |= byte 1 << 8; 1S
ldrb r0, [r0, #3] // r0 = byte 3 (the address in r0 is no longer needed); 1S + 1S + 1I
orr r3, r3, r1, asl #16 // r3 |= byte 2 << 16; 1S
orr r0, r3, r0, asl #24 // r0 = r3 | byte 3 << 24 (result); 1S
= // 11S + 4I (sum of the instructions above; bx lr is not counted)
bx lr
// load_big_endian: assemble a 32-bit word byte by byte, lowest address = most significant byte.
// In:      r0 = address (any alignment)
// Out:     r0 = [addr+0] << 24 | [addr+1] << 16 | [addr+2] << 8 | [addr+3]
// Scratch: r1, r2, r3, ip
load_big_endian: // cycle times
mov r3, r0 // keep the address in r3 because r0 is overwritten next; 1S
ldrb r0, [r0, #1] // r0 = byte 1; 1S + 1S + 1I
ldrb r1, [r3, #0] // r1 = byte 0; 1S + 1S + 1I
ldrb ip, [r3, #2] // ip = byte 2; 1S + 1S + 1I
mov r0, r0, asl #16 // r0 = byte 1 << 16; 1S
orr r0, r0, r1, asl #24 // r0 |= byte 0 << 24; 1S
ldrb r2, [r3, #3] // r2 = byte 3; 1S + 1S + 1I
orr r0, r0, ip, asl #8 // r0 |= byte 2 << 8; 1S
orr r0, r2, r0 // r0 |= byte 3 (result); 1S
= // 13S + 4I (sum of the instructions above; bx lr is not counted)
bx lr
The routine below is optimized for both size and speed. It always uses 4 registers
and has the same cycle count for the little-endian and big-endian variants.
enter with address in Ra, uses Rb, Rc (d < c: ldmia loads the lowest-numbered register from the lowest address). result in Rd.
example:
[LE]: memory @0x40
offset : @0x40: +0 +1 +2 +3 +4 +5 +6 +7
dump : 00 11 22 33 44 00 00 00
e.g. Ra = 0x41
// opt_load_little_endian: load a 32-bit little-endian word from an address of any alignment.
// In:      Ra = address
// Out:     Rd = loaded word
// Scratch: Rb, Rc
// Reads the two aligned words that contain the data and merges them by shifting.
// ldmia always reads 8 bytes from the aligned-down address, even when Ra is already aligned.
// The values in the trailing comments trace the example Ra = 0x41.
opt_load_little_endian:
bic Rb, Ra, #3 // Rb = Ra rounded down to a word boundary = 0x40
ldmia Rb, {Rd,Rc} // Rd = *0x40 = 0x33221100, Rc = *0x44 = 0x00000044
and Rb, Ra, #3 // Rb = byte offset within the word = 0x01
movs Rb, Rb, lsl #3 // Rb = offset in bits = 0x08, set NE (EQ when aligned: Rd stays as loaded)
movne Rd, Rd, lsr Rb // drop the bytes below the address: Rd = 0x00332211
rsbne Rb, Rb, #32 // Rb = 32 - bit offset = 0x18
orrne Rd, Rd, Rc, lsl Rb // Rd |= (0x00000044 << 0x18) = 0x44332211
[BE]: memory @0x40
offset : @0x40: +0 +1 +2 +3 +4 +5 +6 +7
dump : 00 44 33 22 11 00 00 00
e.g. Ra = 0x41
// opt_load_big_endian: load a 32-bit big-endian word from an address of any alignment.
// In:      Ra = address
// Out:     Rd = loaded word
// Scratch: Rb, Rc
// Same scheme as the little-endian variant, with the two shift directions swapped.
// ldmia always reads 8 bytes from the aligned-down address, even when Ra is already aligned.
// The values in the trailing comments trace the example Ra = 0x41.
opt_load_big_endian:
bic Rb, Ra, #3 // Rb = Ra rounded down to a word boundary = 0x40
ldmia Rb, {Rd,Rc} // Rd = *0x40 = 0x00443322, Rc = *0x44 = 0x11000000
and Rb, Ra, #3 // Rb = byte offset within the word = 0x01
movs Rb, Rb, lsl #3 // Rb = offset in bits = 0x08, set NE (EQ when aligned: Rd stays as loaded)
movne Rd, Rd, lsl Rb // drop the bytes below the address: Rd = 0x44332200
rsbne Rb, Rb, #32 // Rb = 32 - bit offset = 0x18
orrne Rd, Rd, Rc, lsr Rb // Rd |= (0x11000000 >> 0x18) = 0x44332211
opt. cycle times:
// Per-instruction cycle counts for the optimized routine; ls{r,l} / ls{l,r}
// stand for the shift used by the little-/big-endian variant respectively.
// NOTE(review): S and I presumably denote sequential and internal cycles in
// ARM7TDMI-style timing notation - confirm against the core's manual.
bic Rb, Ra, #3 // 1S
ldmia Rb, {Rd,Rc} // 2S + 1S + 1I
and Rb, Ra, #3 // 1S
movs Rb, Rb, lsl #3 // 1S
movne Rd, Rd, ls{r,l} Rb // 1S + 1I
rsbne Rb, Rb, #32 // 1S
orrne Rd, Rd, Rc, ls{l,r} Rb // 1S + 1I
= // 9S + 3I (sum of the seven instructions above)
Finally, we get a smaller and faster routine.
--
Summary: [missed-optimization] loading a word from an unknown
alignment.
Product: gcc
Version: 4.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P2
Component: other
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: pluto at agmk dot net
CC: gcc-bugs at gcc dot gnu dot org
GCC build triplet: *
GCC host triplet: *
GCC target triplet: arm-linux-eabi
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=23066