https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99398
Bug ID: 99398
Summary: Miss to optimize vector permutation fed by CTOR and CTOR/CST
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: linkw at gcc dot gnu.org
Target Milestone: ---

Test case:

    #include "altivec.h"

    vector long long foo(long long a, long long b) {
      vector long long v1 = {a, 0};
      vector long long v2 = {b, 0};
      vector unsigned char vc = {0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
      vector long long vres = (vector long long)vec_perm ((vector unsigned char)v1, (vector unsigned char)v2, vc);
      return vres;
    }

When compiled with gcc -Ofast -mcpu=power9, GCC generates the following code (the assembly shown is for big-endian):

    mtvsrdd 32,3,9
    mtvsrdd 33,4,9
    lxv 34,0(10)
    vperm 2,0,1,2
    blr

But it could be optimized into:

    mtvsrdd 34,3,4
    blr

The GIMPLE in the optimized dump looks like this:

    __vector long foo (long long int a, long long int b)
    {
      __vector long vres;
      __vector long v2;
      __vector long v1;
      __vector unsigned char _5;
      __vector unsigned char _6;
      __vector unsigned char _7;

      <bb 2> [local count: 1073741824]:
      v1_2 = {a_1(D), 0};
      v2_4 = {b_3(D), 0};
      _5 = VIEW_CONVERT_EXPR<__vector unsigned char>(v1_2);
      _6 = VIEW_CONVERT_EXPR<__vector unsigned char>(v2_4);
      _7 = VEC_PERM_EXPR <_5, _6, { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }>;
      vres_8 = VIEW_CONVERT_EXPR<__vector long>(_7);
      return vres_8;
    }

Ideally, it would instead look like this:

    __vector long foo (long long int a, long long int b)
    {
      vector(2) long long int _10;

      <bb 2> [local count: 1073741824]:
      _10 = {a_1(D), b_3(D)};
      return _10;
    }