https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117990
--- Comment #5 from Li Pan <pan2.li at intel dot com> --- The tree-optimized dump looks right up to a point. 5 │ int main () 6 │ { 7 │ vector(8) int vect__4.8; 8 │ vector(8) char vect__3.7; 9 │ vector(8) char D.2823; 10 │ int _5; 11 │ int _8; 12 │ vector(8) char _26(D); 13 │ 14 │ <bb 2> [local count: 97603129]: 15 │ d[0] = 9; 16 │ d[10] = 9; 17 │ d[20] = 9; 18 │ d[30] = 9; 19 │ d[40] = 9; 20 │ d[50] = 9; 21 │ d[60] = 9; 22 │ d[70] = 9; 23 │ d[80] = 9; 24 │ d[90] = 9; 25 │ vect__3.7_27 = .MASK_LEN_STRIDED_LOAD (&MEM <char[225]> [(void *)&d + 30B], 10, { 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 }, _26(D), 7, 0); 26 │ vect__4.8_28 = (vector(8) int) vect__3.7_27; 27 │ .MASK_LEN_STORE (&MEM <int[10]> [(void *)&e + 12B], 32B, { -1, -1, -1, -1, -1, -1, -1, -1 }, 7, 0, vect__4.8_28); 28 │ _5 = e[5]; 29 │ if (_5 != 9) 30 │ goto <bb 3>; [51.11%] 31 │ else 32 │ goto <bb 4>; [48.89%] 33 │ 34 │ <bb 3> [local count: 49884959]: 35 │ 36 │ <bb 4> [local count: 97603128]: 37 │ # _8 = PHI <123(3), 0(2)> 38 │ return _8; 39 │ 40 │ } But the asm dump does not look correct. main: lui a5,%hi(.LANCHOR0) addi a5,a5,%lo(.LANCHOR0) li a4,9 sb a4,30(a5) addi a3,a5,30 vsetivli zero,7,e32,m1,ta,ma li a2,10 vlse8.v v2,0(a3),a2 // depends on the stores to 30(a5), 40(a5), ... 90(a5), but only the store to 30(a5) has been placed before the vlse8.v, so the memory dependencies look incorrect. addi a3,a5,252 sb a4,0(a5) sb a4,10(a5) sb a4,20(a5) sb a4,40(a5) vzext.vf4 v1,v2 sb a4,50(a5) sb a4,60(a5) vse32.v v1,0(a3) li a0,0 sb a4,70(a5) sb a4,80(a5) sb a4,90(a5) lw a5,260(a5) beq a5,a4,.L4 li a0,123