https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99718
--- Comment #13 from luoxhu at gcc dot gnu.org ---
The performance data in #c11 is for vec_insert with an int variable in 32-bit
mode. vec_insert with a float variable in 32-bit mode is a bit slower, but
much better than the original (extra stfs+lwz from insns #17 and #18 in
expand, which move the SF register to an SI register by hex value):
46.677s -> 8.723s
test.c
#include <altivec.h>
#define TYPE float
/* Reproducer: return a copy of vector U with scalar I inserted at the
   element position given by the run-time (non-constant) index N, using
   the AltiVec vec_insert built-in.  TYPE selects the element type.  */
vector TYPE
test (vector TYPE u, TYPE i, signed int n){
return vec_insert (i, u, n);
}
Expand:
1: NOTE_INSN_DELETED
6: NOTE_INSN_BASIC_BLOCK 2
2: r122:V4SF=%2:V4SF
3: r123:SF=%1:SF
4: r124:SI=%3:SI
5: NOTE_INSN_FUNCTION_BEG
8: r120:V4SF=r122:V4SF
9: r125:SI=r124:SI&0x3
10: r126:V4SF=r120:V4SF
11: r128:SI=r125:SI<<0x2
12: {r128:SI=0x14-r128:SI;clobber ca:SI;}
13: r132:SI=high(`*.LC0')
14: r131:SI=r132:SI+low(`*.LC0')
REG_EQUAL `*.LC0'
15: r130:V2DI=[r131:SI]
REG_EQUAL const_vector
16: r129:V16QI=r130:V2DI#0
17: [r112:SI]=r123:SF
18: r133:SI=[r112:SI]
19: r136:DI#4=r133:SI
22: {r137:SI=r133:SI>>0x1f;clobber ca:SI;}
23: r136:DI#0=r137:SI
24: r138:DI=0
25: r135:V2DI=vec_concat(r136:DI,r138:DI)
26: r134:V16QI=r135:V2DI#0
27: r139:V16QI=unspec[r128:SI] 151
28: r140:V16QI=unspec[r134:V16QI,r134:V16QI,r139:V16QI] 236
29: r141:V16QI=unspec[r129:V16QI,r129:V16QI,r139:V16QI] 236
30: r126:V4SF#0={(r141:V16QI!=const_vector)?r140:V16QI:r126:V4SF#0}
31: r119:V4SF=r126:V4SF
32: r120:V4SF=r119:V4SF
ASM:
.LFB0:
.cfi_startproc
stwu 1,-16(1)
.cfi_def_cfa_offset 16
lis 9,.LC0@ha
rlwinm 3,3,2,28,29
xxlxor 0,0,0
la 9,.LC0@l(9)
subfic 3,3,20
lxvd2x 33,0,9
lvsl 13,0,3
stfs 1,8(1)
vperm 1,1,1,13
ori 2,2,0
lwz 9,8(1)
addi 1,1,16
.cfi_def_cfa_offset 0
srawi 10,9,31
mtvsrwz 13,9
mtvsrwz 12,10
fmrgow 11,12,13
xxpermdi 32,11,0,0
vperm 0,0,0,13
xxsel 34,34,32,33
blr