The simplest way (in C) to load a word from an address of unknown alignment is:

typedef struct { unsigned value __attribute__((packed)); } unaligned_uint;
unsigned load(const unaligned_uint* ptr) { return ptr->value; }

The resulting code assembles the word from four separate byte loads:

load_little_endian:                     // cycle times
        ldrb    r3, [r0, #0]            // 1S + 1S + 1I
        ldrb    r2, [r0, #1]            // 1S + 1S + 1I
        ldrb    r1, [r0, #2]            // 1S + 1S + 1I
        orr     r3, r3, r2, asl #8      // 1S
        ldrb    r0, [r0, #3]            // 1S + 1S + 1I
        orr     r3, r3, r1, asl #16     // 1S
        orr     r0, r3, r0, asl #24     // 1S
                                        // = 11S + 4I
        bx      lr

load_big_endian:                        // cycle times
        mov     r3, r0                  // 1S
        ldrb    r0, [r0, #1]            // 1S + 1S + 1I
        ldrb    r1, [r3, #0]            // 1S + 1S + 1I
        ldrb    ip, [r3, #2]            // 1S + 1S + 1I
        mov     r0, r0, asl #16         // 1S
        orr     r0, r0, r1, asl #24     // 1S
        ldrb    r2, [r3, #3]            // 1S + 1S + 1I
        orr     r0, r0, ip, asl #8      // 1S
        orr     r0, r2, r0              // 1S
                                        // = 13S + 4I
        bx      lr

The routine below, optimized for both size and speed, always uses four
registers and has the same cycle count in its little-endian and big-endian
variants. On entry the address is in Ra; Rb and Rc are scratch registers; the
result is returned in Rd. Because ldmia loads the lowest-numbered register
from the lowest address, Rd must have a lower register number than Rc
(d < c). When the address is already word-aligned, the shift amount is zero,
so the conditional (ne) instructions are skipped and Rd already holds the
result.

Example:

[LE]: memory @0x40

offset : @0x40: +0 +1 +2 +3 +4 +5 +6 +7
dump   :        00 11 22 33 44 00 00 00

e.g. Ra = 0x41

opt_load_little_endian:
        bic     Rb, Ra, #3              // Rb = 0x40
        ldmia   Rb, {Rd,Rc}             // Rd = *0x40 = 0x33221100, Rc = *0x44 = 0x00000044
        and     Rb, Ra, #3              // Rb = 0x01
        movs    Rb, Rb, lsl #3          // Rb = 0x08, set NE
        movne   Rd, Rd, lsr Rb          // Rd = 0x00332211
        rsbne   Rb, Rb, #32             // Rb = 0x18
        orrne   Rd, Rd, Rc, lsl Rb      // Rd |= (0x00000044 << 0x18) = 0x44332211

[BE]: memory @0x40

offset : @0x40: +0 +1 +2 +3 +4 +5 +6 +7
dump   :        00 44 33 22 11 00 00 00

e.g. Ra = 0x41

opt_load_big_endian:
        bic     Rb, Ra, #3              // Rb = 0x40
        ldmia   Rb, {Rd,Rc}             // Rd = *0x40 = 0x00443322, Rc = *0x44 = 0x11000000
        and     Rb, Ra, #3              // Rb = 0x01
        movs    Rb, Rb, lsl #3          // Rb = 0x08, set NE
        movne   Rd, Rd, lsl Rb          // Rd = 0x44332200
        rsbne   Rb, Rb, #32             // Rb = 0x18
        orrne   Rd, Rd, Rc, lsr Rb      // Rd |= (0x11000000 >> 0x18) = 0x44332211

Optimized routine
cycle times:

        bic     Rb, Ra, #3              // 1S
        ldmia   Rb, {Rd,Rc}             // 2S + 1S + 1I
        and     Rb, Ra, #3              // 1S
        movs    Rb, Rb, lsl #3          // 1S
        movne   Rd, Rd, ls{r,l} Rb      // 1S + 1I
        rsbne   Rb, Rb, #32             // 1S
        orrne   Rd, Rd, Rc, ls{l,r} Rb  // 1S + 1I
                                        // = 9S + 3I

The result is a smaller and faster routine: 9S + 3I, compared with 11S + 4I
for the little-endian and 13S + 4I for the big-endian byte-by-byte version.
-- Summary: [missed-optimization] loading a word from an unknown alignment. Product: gcc Version: 4.1.0 Status: UNCONFIRMED Severity: normal Priority: P2 Component: other AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: pluto at agmk dot net CC: gcc-bugs at gcc dot gnu dot org GCC build triplet: * GCC host triplet: * GCC target triplet: arm-linux-eabi http://gcc.gnu.org/bugzilla/show_bug.cgi?id=23066