https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64306
            Bug ID: 64306
           Summary: [SH] Improve unaligned loads
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: olegendo at gcc dot gnu.org
            Target: sh*-*-*

On SH4A the movua.l insn can be used to do 32 bit unaligned loads (currently
defunct, see PR 52480).  It could also be used to do 16 bit unaligned loads
with fewer insns, if over-reading adjacent bytes is OK to do (usually it's not
safe, but that could be relaxed and enabled by a -m option).

struct __attribute__((packed)) x
{
  int val32;
  short val_s16;
  unsigned short val_u16;
};

int load_unaligned_s16 (const x& xx)
{
  return xx.val_s16;
}

currently compiles to

little endian:
        mov.b   @(4,r4),r0
        extu.b  r0,r1
        mov.b   @(5,r4),r0
        extu.b  r0,r4
        swap.b  r4,r4
        or      r1,r4
        exts.w  r4,r0

big endian:
        mov.b   @(4,r4),r0
        mov     r0,r1
        mov.b   @(5,r4),r0
        extu.b  r0,r4
        extu.b  r1,r0
        swap.b  r0,r0
        or      r4,r0
        exts.w  r0,r0

better:
        mov.b   @({4|5},r4),r0
        extu.b  r0,r1
        mov.b   @({5|4},r4),r0
        shll8   r0
        or      r1,r0

SH4A little endian (unsafe):
        movua.l @r4,r0
        exts.w  r0,r0

SH4A big endian (unsafe):
        movua.l @r4,r0
        shlr16  r0
        exts.w  r0,r0

int load_unaligned_u16 (const x& xx)
{
  return xx.val_u16;
}

currently compiles to

little endian:
        mov.b   @(6,r4),r0
        extu.b  r0,r1
        mov.b   @(7,r4),r0
        extu.b  r0,r4
        swap.b  r4,r0
        or      r1,r0

big endian:
        mov.b   @(6,r4),r0
        mov     r0,r1
        mov.b   @(7,r4),r0
        extu.b  r0,r4
        extu.b  r1,r0
        swap.b  r0,r0
        or      r4,r0

better (uses fewer regs):
        mov.b   @({6|7},r4),r0
        extu.b  r0,r1
        mov.b   @({7|6},r4),r0
        shll8   r0
        or      r1,r0
        extu.w  r0,r0

int load_unaligned32 (const x& xx)
{
  return xx.val32;
}

currently compiles to

little endian:
        mov.b   @(1,r4),r0
        mov.b   @r4,r2
        extu.b  r0,r1
        mov.b   @(2,r4),r0
        extu.b  r2,r3
        swap.b  r1,r2
        or      r3,r2
        extu.b  r0,r3
        mov     r3,r0
        shll16  r0
        mov     r0,r1
        mov.b   @(3,r4),r0
        or      r2,r1
        shll16  r0
        shll8   r0
        or      r1,r0

better:
        mov.b   @r4+,r0   ! r0 = xx.xx.xx.aa
        mov.b   @r4+,r1   ! r1 = xx.xx.xx.bb
        extu.b  r0,r0     ! r0 = 00.00.00.aa
        mov.b   @r4+,r2   ! r2 = xx.xx.xx.cc
        shll8   r1        ! r1 = xx.xx.bb.00
        or      r1,r0     ! r0 = xx.xx.bb.aa
        mov.b   @r4+,r3   ! r3 = xx.xx.xx.dd
        extu.b  r2,r2     ! r2 = 00.00.00.cc
        shll16  r0        ! r0 = bb.aa.00.00
        shll8   r3        ! r3 = xx.xx.dd.00
        or      r3,r2     ! r2 = xx.xx.dd.cc
        xtrct   r2,r0     ! r0 = dd.cc.bb.aa

which is two unaligned signed 16 bit loads + shll16 + xtrct.

If the (mis)alignment offset value is known, it can be even more compact:

   x0.x1.aa.bb.cc.dd.y0.y1
         ^^^^^^^^^^^

        add     #-2,r4
        mov.l   @r4,r0      ! r0 = bb.aa.x1.x0
        mov.l   @(4,r4),r1  ! r1 = y1.y0.dd.cc
        xtrct   r1,r0       ! r0 = dd.cc.bb.aa

   x0.aa.bb.cc.dd.y0.y1.y2
      ^^^^^^^^^^^

        add     #-1,r4
        mov.l   @r4,r0      ! r0 = cc.bb.aa.x0
        mov.l   @(4,r4),r1  ! r1 = y2.y1.y0.dd
                            ! r1:r0 = y2.y1.y0.dd : cc.bb.aa.x0
        mov     r0,r2
        xtrct   r1,r2       ! r2 = y0.dd.cc.bb
        shlr8   r2          ! r2 = 00.y0.dd.cc
        shll8   r0          ! r0 = bb.aa.x0.00
        xtrct   r2,r0       ! r0 = dd.cc.bb.aa
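
Just to spell out what the "better" load sequences above compute, here is a
rough C-level sketch of the byte-wise composition (little-endian byte order
shown; the helper names are made up for illustration only and are not part of
any proposed change):

    // Illustrative only -- not proposed source changes.
    // 16 bit case: zero-extend the low byte (extu.b), combine it with the
    // high byte shifted left by 8 (shll8 + or); the final sign extension is
    // what the sign-extending mov.b + shll8 of the high byte gives for free.
    static inline int
    load_s16_le (const unsigned char* p)
    {
      unsigned v = (unsigned)p[0] | ((unsigned)p[1] << 8);
      return (short)v;
    }

    // 32 bit case: two such 16 bit composites merged, corresponding to the
    // shll16 + xtrct in the sequence above.
    static inline int
    load_32_le (const unsigned char* p)
    {
      unsigned lo = (unsigned)load_s16_le (p) & 0xffffu;
      unsigned hi = (unsigned)load_s16_le (p + 2) & 0xffffu;
      return (int)((hi << 16) | lo);
    }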
void store_unaligned16 (x& xx, int val)
{
  xx.val_s16 = val;
}

currently compiles to

little endian:
        extu.w  r5,r0
        mov.b   r0,@(4,r4)
        shlr8   r0
        mov.b   r0,@(5,r4)

big endian:
        extu.w  r5,r5
        mov     r5,r1
        shlr8   r1
        mov     r1,r0
        mov.b   r0,@(4,r4)
        mov     r5,r0
        mov.b   r0,@(5,r4)

better (eliminate unnecessary extu.w):
        mov     r5,r0
        mov.b   r0,@({4|5},r4)
        shlr8   r0
        mov.b   r0,@({5|4},r4)

void store_unaligned32 (x& xx, int val)
{
  xx.val32 = val;
}

currently compiles to

little endian:
        mov     r5,r0
        shlr8   r0
        mov.b   r5,@r4
        mov.b   r0,@(1,r4)
        mov     r5,r0
        shlr16  r0
        mov.b   r0,@(2,r4)
        mov     r5,r0
        shlr16  r0
        shlr8   r0
        mov.b   r0,@(3,r4)

big endian:
        mov     r5,r1
        mov     r5,r0
        shlr16  r1
        shlr16  r0
        shlr8   r1
        mov.b   r0,@(1,r4)
        mov     r5,r0
        shlr8   r0
        mov.b   r0,@(2,r4)
        mov     r5,r0
        mov.b   r1,@r4
        mov.b   r0,@(3,r4)

better:
        mov     r5,r0
        mov.b   r0,@({0|3},r4)
        shlr8   r0
        mov.b   r0,@({1|2},r4)
        shlr8   r0
        mov.b   r0,@({2|1},r4)
        shlr8   r0
        mov.b   r0,@({3|0},r4)
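
The same idea on the store side, again only as an illustrative C-level sketch
(the helper name is invented here): a single running shlr8 between the byte
stores is enough, instead of re-deriving every byte from the original value:

    // Illustrative only; little-endian byte order shown.
    static inline void
    store_32_le (unsigned char* p, unsigned val)
    {
      p[0] = (unsigned char)val;  val >>= 8;  // mov.b r0,@(0,r4); shlr8 r0
      p[1] = (unsigned char)val;  val >>= 8;  // mov.b r0,@(1,r4); shlr8 r0
      p[2] = (unsigned char)val;  val >>= 8;  // mov.b r0,@(2,r4); shlr8 r0
      p[3] = (unsigned char)val;               // mov.b r0,@(3,r4)
    }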