https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100926
Bug ID: 100926 Summary: PPCLE: Inefficient code for vec_xl_be(unsigned short *) < P9 Product: gcc Version: 8.3.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: jens.seifert at de dot ibm.com Target Milestone: --- Input: vector unsigned short load_be(unsigned short *c) { return vec_xl_be(0L, c); } creates: _Z7load_bePt: .LFB6: .cfi_startproc .LCF6: 0: addis 2,12,.TOC.-.LCF6@ha addi 2,2,.TOC.-.LCF6@l .localentry _Z7load_bePt,.-_Z7load_bePt addis 9,2,.LC4@toc@ha lxvw4x 34,0,3 addi 9,9,.LC4@toc@l lvx 0,0,9 vperm 2,2,2,0 blr Optimal sequence: vector unsigned short load_be_opt2(unsigned short *c) { vector signed int vneg16; __asm__("vspltisw %0,-16":"=v"(vneg16)); vector unsigned int tmp = vec_xl_be(0L, (unsigned int *)c); tmp = vec_rl(tmp, (vector unsigned int)vneg16); return (vector unsigned short)tmp; } creates: _Z12load_be_opt2Pt: .LFB8: .cfi_startproc lxvw4x 34,0,3 #APP # 77 "vec.C" 1 vspltisw 0,-16 # 0 "" 2 #NO_APP vrlw 2,2,0 blr rotate left (-16) = rotate right (+16), since only the low 5 bits of the rotate amount are evaluated. Please note that the inline assembly is required, because vec_splats(-16) gets converted into a very inefficient constant-generation sequence. vector unsigned short load_be_opt(unsigned short *c) { vector signed int vneg16 = vec_splats(-16); vector unsigned int tmp = vec_xl_be(0L, (unsigned int *)c); tmp = vec_rl(tmp, (vector unsigned int)vneg16); return (vector unsigned short)tmp; } creates: _Z11load_be_optPt: .LFB7: .cfi_startproc li 9,48 lxvw4x 34,0,3 vspltisw 0,0 mtvsrd 33,9 xxspltw 33,33,1 vsubuwm 0,0,1 vrlw 2,2,0 blr