https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100926

            Bug ID: 100926
           Summary: PPCLE: Inefficient code for vec_xl_be(unsigned short
                    *) < P9
           Product: gcc
           Version: 8.3.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jens.seifert at de dot ibm.com
  Target Milestone: ---

Input:

vector unsigned short load_be(unsigned short *c)
{
   return vec_xl_be(0L, c);
}

creates:
_Z7load_bePt:
.LFB6:
        .cfi_startproc
.LCF6:
0:      addis 2,12,.TOC.-.LCF6@ha
        addi 2,2,.TOC.-.LCF6@l
        .localentry     _Z7load_bePt,.-_Z7load_bePt
        addis 9,2,.LC4@toc@ha
        lxvw4x 34,0,3
        addi 9,9,.LC4@toc@l
        lvx 0,0,9
        vperm 2,2,2,0
        blr


Optimal sequence:

vector unsigned short load_be_opt2(unsigned short *c)
{
   vector signed int vneg16;
   __asm__("vspltisw %0,-16":"=v"(vneg16));
   vector unsigned int tmp = vec_xl_be(0L, (unsigned int *)c);
   tmp = vec_rl(tmp, (vector unsigned int)vneg16);
   return (vector unsigned short)tmp;
}

creates:
_Z12load_be_opt2Pt:
.LFB8:
        .cfi_startproc
        lxvw4x 34,0,3
#APP
 # 77 "vec.C" 1
        vspltisw 0,-16
 # 0 "" 2
#NO_APP
        vrlw 2,2,0
        blr

rotate left by -16 = rotate right by +16, since vrlw only evaluates the low 5 bits of each shift amount.

Please note that the inline assembly is required, because vec_splats(-16) gets
converted into a very inefficient constant-generation sequence.

vector unsigned short load_be_opt(unsigned short *c)
{
   vector signed int vneg16 = vec_splats(-16);
   vector unsigned int tmp = vec_xl_be(0L, (unsigned int *)c);
   tmp = vec_rl(tmp, (vector unsigned int)vneg16);
   return (vector unsigned short)tmp;
}

creates:
_Z11load_be_optPt:
.LFB7:
        .cfi_startproc
        li 9,48
        lxvw4x 34,0,3
        vspltisw 0,0
        mtvsrd 33,9
        xxspltw 33,33,1
        vsubuwm 0,0,1
        vrlw 2,2,0
        blr

Reply via email to