The simplest way in C to load a word from an address of unknown alignment is:
 
/* Wrapper struct whose single unsigned member carries the packed attribute. */
typedef struct { unsigned value __attribute__((packed)); } unaligned_uint; 
/* Returns the unsigned value read through ptr->value. */
unsigned load(const unaligned_uint* ptr) { return ptr->value; } 
 
// load_little_endian: byte-wise load of a 32-bit word, little-endian order
// (presumably the code the compiler emits for load() - confirm).
// In:      r0 = address of the word
// Out:     r0 = word, with the byte at [r0+0] in bits 7..0
// Scratch: r1, r2, r3
load_little_endian:                     // cycle times 
        ldrb    r3, [r0, #0]            // 1S + 1S + 1I   r3 = byte 0
        ldrb    r2, [r0, #1]            // 1S + 1S + 1I   r2 = byte 1
        ldrb    r1, [r0, #2]            // 1S + 1S + 1I   r1 = byte 2
        orr     r3, r3, r2, asl #8      // 1S             r3 |= byte 1 << 8
        ldrb    r0, [r0, #3]            // 1S + 1S + 1I   r0 = byte 3 (address no longer needed)
        orr     r3, r3, r1, asl #16     // 1S             r3 |= byte 2 << 16
        orr     r0, r3, r0, asl #24     // 1S             r0 = r3 | byte 3 << 24
                                      = // 11S + 4I       total, not counting the return below
        bx      lr 
 
// load_big_endian: byte-wise load of a 32-bit word, big-endian order
// (presumably the code the compiler emits for load() - confirm).
// In:      r0 = address of the word
// Out:     r0 = word, with the byte at [r0+0] in bits 31..24
// Scratch: r1, r2, r3, ip
load_big_endian:                        // cycle times 
        mov     r3, r0                  // 1S             keep the address in r3; r0 is reused for data
        ldrb    r0, [r0, #1]            // 1S + 1S + 1I   r0 = byte 1
        ldrb    r1, [r3, #0]            // 1S + 1S + 1I   r1 = byte 0
        ldrb    ip, [r3, #2]            // 1S + 1S + 1I   ip = byte 2
        mov     r0, r0, asl #16         // 1S             r0 = byte 1 << 16
        orr     r0, r0, r1, asl #24     // 1S             r0 |= byte 0 << 24
        ldrb    r2, [r3, #3]            // 1S + 1S + 1I   r2 = byte 3
        orr     r0, r0, ip, asl #8      // 1S             r0 |= byte 2 << 8
        orr     r0, r2, r0              // 1S             r0 |= byte 3
                                      = // 13S + 4I       total, not counting the return below
        bx      lr 
 
The routine optimized for both size and speed always uses 4 registers
and has the same cycle count for the little-endian and big-endian variants.
 
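Enter with the address in Ra; Rb and Rc are used as scratch; the result is in Rd.
Rd must have a lower register number than Rc (d < c), because ldmia loads the
lowest-numbered register from the lowest address.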
 
example: 
 
[LE]: memory @0x40 
 
offset : @0x40: +0 +1 +2 +3 +4 +5 +6 +7 
dump   :        00 11 22 33 44 00 00 00 
 
e.g. Ra = 0x41 
 
// opt_load_little_endian: proposed word load from any alignment, little-endian.
// In: Ra = address.  Out: Rd = word.  Scratch: Rb, Rc.  Sets the flags (movs).
// Always reads two aligned words (8 bytes starting at Ra & ~3), even when
// Ra is already word-aligned, then shifts and merges them.
// NOTE(review): ldmia fills the lowest-numbered register from the lowest
// address, so the values shown below need Rd numbered below Rc - confirm.
opt_load_little_endian: 
 
bic   Rb, Ra, #3         // Rb = 0x40 (address rounded down to a word boundary)
ldmia Rb, {Rd,Rc}        // Rd = *0x40 = 0x33221100, Rc = *0x44 = 0x00000044 
and   Rb, Ra, #3         // Rb = 0x01 (byte offset within the word)
movs  Rb, Rb, lsl #3     // Rb = 0x08 (offset in bits), set NE; EQ when aligned, so the ne ops below are skipped
movne Rd, Rd, lsr Rb     // Rd = 0x00332211 (drop the bytes below the address)
rsbne Rb, Rb, #32        // Rb = 0x18 (32 - bit offset)
orrne Rd, Rd, Rc, lsl Rb // Rd |= (0x00000044 << 0x18) = 0x44332211 
 
[BE]: memory @0x40 
 
offset : @0x40: +0 +1 +2 +3 +4 +5 +6 +7 
dump   :        00 44 33 22 11 00 00 00 
 
e.g. Ra = 0x41 
 
// opt_load_big_endian: proposed word load from any alignment, big-endian.
// In: Ra = address.  Out: Rd = word.  Scratch: Rb, Rc.  Sets the flags (movs).
// Same sequence as the little-endian variant with the two shift directions
// swapped.  Always reads two aligned words (8 bytes starting at Ra & ~3).
// NOTE(review): ldmia fills the lowest-numbered register from the lowest
// address, so the values shown below need Rd numbered below Rc - confirm.
opt_load_big_endian: 
 
bic   Rb, Ra, #3         // Rb = 0x40 (address rounded down to a word boundary)
ldmia Rb, {Rd,Rc}        // Rd = *0x40 = 0x00443322, Rc = *0x44 = 0x11000000 
and   Rb, Ra, #3         // Rb = 0x01 (byte offset within the word)
movs  Rb, Rb, lsl #3     // Rb = 0x08 (offset in bits), set NE; EQ when aligned, so the ne ops below are skipped
movne Rd, Rd, lsl Rb     // Rd = 0x44332200 (drop the bytes below the address)
rsbne Rb, Rb, #32        // Rb = 0x18 (32 - bit offset)
orrne Rd, Rd, Rc, lsr Rb // Rd |= (0x11000000 >> 0x18) = 0x44332211 
 
opt. cycle times: 
 
bic   Rb, Ra, #3             // 1S 
ldmia Rb, {Rd,Rc}            // 2S + 1S + 1I 
and   Rb, Ra, #3             // 1S 
movs  Rb, Rb, lsl #3         // 1S 
movne Rd, Rd, ls{r,l} Rb     // 1S + 1I 
rsbne Rb, Rb, #32            // 1S 
orrne Rd, Rd, Rc, ls{l,r} Rb // 1S + 1I 
                           = // 9S + 3I 
 
In the end we get a smaller and faster routine.

-- 
           Summary: [missed-optimization] loading a word from an unknown
                    alignment.
           Product: gcc
           Version: 4.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P2
         Component: other
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: pluto at agmk dot net
                CC: gcc-bugs at gcc dot gnu dot org
 GCC build triplet: *
  GCC host triplet: *
GCC target triplet: arm-linux-eabi


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=23066

Reply via email to