https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99398
Bug ID: 99398
Summary: Missed optimization of vector permutation fed by CTOR and
CTOR/CST
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: linkw at gcc dot gnu.org
Target Milestone: ---
#include "altivec.h"
/* Reproducer: build two vectors {a, 0} and {b, 0}, then combine them with a
   byte permutation whose selector is a compile-time constant.  The report
   expects this to fold to a single vector constructor {a, b}. */
vector long long foo(long long a, long long b) {
vector long long v1 = {a, 0};
vector long long v2 = {b, 0};
/* Selector bytes 0-7 take the first doubleword of v1; bytes 16-23 take the
   first doubleword of v2 (vec_perm indexes the concatenation of its two
   inputs, so 16 is byte 0 of the second operand). */
vector unsigned char vc = {0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
/* vec_perm is applied to the byte-vector views of v1/v2, so the operands are
   cast to vector unsigned char and the result is cast back. */
vector long long vres = (vector long long)vec_perm ((vector unsigned char)v1,
(vector unsigned char)v2, vc);
return vres;
}
With gcc -Ofast -mcpu=power9, it generates the following (asm shown for big-endian):
mtvsrdd 32,3,9
mtvsrdd 33,4,9
lxv 34,0(10)
vperm 2,0,1,2
blr
But it can be optimized into:
mtvsrdd 34,3,4
blr
The GIMPLE in the optimized dump looks like:
__vector long foo (long long int a, long long int b)
{
__vector long vres;
__vector long v2;
__vector long v1;
__vector unsigned char _5;
__vector unsigned char _6;
__vector unsigned char _7;
<bb 2> [local count: 1073741824]:
v1_2 = {a_1(D), 0};
v2_4 = {b_3(D), 0};
_5 = VIEW_CONVERT_EXPR<__vector unsigned char>(v1_2);
_6 = VIEW_CONVERT_EXPR<__vector unsigned char>(v2_4);
_7 = VEC_PERM_EXPR <_5, _6, { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21,
22, 23 }>;
vres_8 = VIEW_CONVERT_EXPR<__vector long>(_7);
return vres_8;
}
But it can look like:
__vector long foo (long long int a, long long int b)
{
vector(2) long long int _10;
<bb 2> [local count: 1073741824]:
_10 = {a_1(D), b_3(D)};
return _10;
}