https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64306
            Bug ID: 64306
           Summary: [SH] Improve unaligned loads
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: olegendo at gcc dot gnu.org
            Target: sh*-*-*

On SH4A the movua.l insn can be used to do 32 bit unaligned loads (currently
defunct, see PR 52480).  It could also be used to do 16 bit unaligned loads
with fewer insns, if over-reading adjacent bytes is OK to do (usually it's not
safe, but that could be relaxed and enabled by a -m option).

struct __attribute__((packed)) x
{
  int val32;
  short val_s16;
  unsigned short val_u16;
};

int load_unaligned_s16 (const x& xx)
{
  return xx.val_s16;
}

currently compiles to

little endian:
        mov.b   @(4,r4),r0
        extu.b  r0,r1
        mov.b   @(5,r4),r0
        extu.b  r0,r4
        swap.b  r4,r4
        or      r1,r4
        exts.w  r4,r0

big endian:
        mov.b   @(4,r4),r0
        mov     r0,r1
        mov.b   @(5,r4),r0
        extu.b  r0,r4
        extu.b  r1,r0
        swap.b  r0,r0
        or      r4,r0
        exts.w  r0,r0

better:
        mov.b   @({4|5},r4),r0
        extu.b  r0,r1
        mov.b   @({5|4},r4),r0
        shll8   r0
        or      r1,r0

SH4A little endian (unsafe):
        movua.l @r4,r0
        exts.w  r0,r0

SH4A big endian (unsafe):
        movua.l @r4,r0
        shlr16  r0
        exts.w  r0,r0

int load_unaligned_u16 (const x& xx)
{
  return xx.val_u16;
}

currently compiles to

little endian:
        mov.b   @(6,r4),r0
        extu.b  r0,r1
        mov.b   @(7,r4),r0
        extu.b  r0,r4
        swap.b  r4,r0
        or      r1,r0

big endian:
        mov.b   @(6,r4),r0
        mov     r0,r1
        mov.b   @(7,r4),r0
        extu.b  r0,r4
        extu.b  r1,r0
        swap.b  r0,r0
        or      r4,r0

better (uses fewer regs):
        mov.b   @({6|7},r4),r0
        extu.b  r0,r1
        mov.b   @({7|6},r4),r0
        shll8   r0
        or      r1,r0
        extu.w  r0,r0

int load_unaligned32 (const x& xx)
{
  return xx.val32;
}

currently compiles to

little endian:
        mov.b   @(1,r4),r0
        mov.b   @r4,r2
        extu.b  r0,r1
        mov.b   @(2,r4),r0
        extu.b  r2,r3
        swap.b  r1,r2
        or      r3,r2
        extu.b  r0,r3
        mov     r3,r0
        shll16  r0
        mov     r0,r1
        mov.b   @(3,r4),r0
        or      r2,r1
        shll16  r0
        shll8   r0
        or      r1,r0

better:
        mov.b   @r4+,r0   ! r0 = xx.xx.xx.aa
        mov.b   @r4+,r1   ! r1 = xx.xx.xx.bb
        extu.b  r0,r0     ! r0 = 00.00.00.aa
        mov.b   @r4+,r2   ! r2 = xx.xx.xx.cc
        shll8   r1        ! r1 = xx.xx.bb.00
        or      r1,r0     ! r0 = xx.xx.bb.aa
        mov.b   @r4+,r3   ! r3 = xx.xx.xx.dd
        extu.b  r2,r2     ! r2 = 00.00.00.cc
        shll16  r0        ! r0 = bb.aa.00.00
        shll8   r3        ! r3 = xx.xx.dd.00
        or      r3,r2     ! r2 = xx.xx.dd.cc
        xtrct   r2,r0     ! r0 = dd.cc.bb.aa

which is two unaligned signed 16 bit loads + shll16 + xtrct.

If the (mis)alignment offset value is known, it can be even more compact:

   x0.x1.aa.bb.cc.dd.y0.y1
         ^^^^^^^^^^^

        add     #-2,r4
        mov.l   @r4,r0      ! r0 = bb.aa.x1.x0
        mov.l   @(4,r4),r1  ! r1 = y1.y0.dd.cc
        xtrct   r1,r0       ! r0 = dd.cc.bb.aa

   x0.aa.bb.cc.dd.y0.y1.y2
      ^^^^^^^^^^^

        add     #-1,r4
        mov.l   @r4,r0      ! r0 = cc.bb.aa.x0
        mov.l   @(4,r4),r1  ! r1 = y2.y1.y0.dd
                            ! r1:r0 = y2.y1.y0.dd : cc.bb.aa.x0
        mov     r0,r2
        xtrct   r1,r2       ! r2 = y0.dd.cc.bb
        shlr8   r2          ! r2 = 00.y0.dd.cc
        shll8   r0          ! r0 = bb.aa.x0.00
        xtrct   r2,r0       ! r0 = dd.cc.bb.aa
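
Just to spell out what the "better" load sequences above compute, here is a
rough C-level sketch of the byte-wise composition (little-endian byte order
shown; the helper names are made up for illustration only and are not part of
any proposed change):

    // Illustrative only -- not proposed source changes.
    // 16 bit case: zero-extend the low byte (extu.b), combine it with the
    // high byte shifted left by 8 (shll8 + or); the final sign extension is
    // what the sign-extending mov.b + shll8 of the high byte gives for free.
    static inline int
    load_s16_le (const unsigned char* p)
    {
      unsigned v = (unsigned)p[0] | ((unsigned)p[1] << 8);
      return (short)v;
    }

    // 32 bit case: two such 16 bit composites merged, corresponding to the
    // shll16 + xtrct in the sequence above.
    static inline int
    load_32_le (const unsigned char* p)
    {
      unsigned lo = (unsigned)load_s16_le (p) & 0xffffu;
      unsigned hi = (unsigned)load_s16_le (p + 2) & 0xffffu;
      return (int)((hi << 16) | lo);
    }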
void store_unaligned16 (x& xx, int val)
{
  xx.val_s16 = val;
}

currently compiles to

little endian:
        extu.w  r5,r0
        mov.b   r0,@(4,r4)
        shlr8   r0
        mov.b   r0,@(5,r4)

big endian:
        extu.w  r5,r5
        mov     r5,r1
        shlr8   r1
        mov     r1,r0
        mov.b   r0,@(4,r4)
        mov     r5,r0
        mov.b   r0,@(5,r4)

better (eliminate unnecessary extu.w):
        mov     r5,r0
        mov.b   r0,@({4|5},r4)
        shlr8   r0
        mov.b   r0,@({5|4},r4)

void store_unaligned32 (x& xx, int val)
{
  xx.val32 = val;
}

currently compiles to

little endian:
        mov     r5,r0
        shlr8   r0
        mov.b   r5,@r4
        mov.b   r0,@(1,r4)
        mov     r5,r0
        shlr16  r0
        mov.b   r0,@(2,r4)
        mov     r5,r0
        shlr16  r0
        shlr8   r0
        mov.b   r0,@(3,r4)

big endian:
        mov     r5,r1
        mov     r5,r0
        shlr16  r1
        shlr16  r0
        shlr8   r1
        mov.b   r0,@(1,r4)
        mov     r5,r0
        shlr8   r0
        mov.b   r0,@(2,r4)
        mov     r5,r0
        mov.b   r1,@r4
        mov.b   r0,@(3,r4)

better:
        mov     r5,r0
        mov.b   r0,@({0|3},r4)
        shlr8   r0
        mov.b   r0,@({1|2},r4)
        shlr8   r0
        mov.b   r0,@({2|1},r4)
        shlr8   r0
        mov.b   r0,@({3|0},r4)
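
The same idea on the store side, again only as an illustrative C-level sketch
(the helper name is invented here): a single running shlr8 between the byte
stores is enough, instead of re-deriving every byte from the original value:

    // Illustrative only; little-endian byte order shown.
    static inline void
    store_32_le (unsigned char* p, unsigned val)
    {
      p[0] = (unsigned char)val;  val >>= 8;  // mov.b r0,@(0,r4); shlr8 r0
      p[1] = (unsigned char)val;  val >>= 8;  // mov.b r0,@(1,r4); shlr8 r0
      p[2] = (unsigned char)val;  val >>= 8;  // mov.b r0,@(2,r4); shlr8 r0
      p[3] = (unsigned char)val;               // mov.b r0,@(3,r4)
    }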