https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63679
alalaw01 at gcc dot gnu.org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |alalaw01 at gcc dot gnu.org

--- Comment #32 from alalaw01 at gcc dot gnu.org ---
Is the SRA approach going to work? I have hacked up my SRA so that it generates this:

foo ()
{
  int sum;
  int i;
  const int a[8];
  unsigned int i.0_7;
  int _8;
  unsigned int i.0_19;

  <bb 2>:
  MEM[(int[8] *)&a] = 0;
  MEM[(int[8] *)&a + 4B] = 1;
  MEM[(int[8] *)&a + 8B] = 2;
  MEM[(int[8] *)&a + 12B] = 3;
  MEM[(int[8] *)&a + 16B] = 4;
  MEM[(int[8] *)&a + 20B] = 5;
  MEM[(int[8] *)&a + 24B] = 6;
  MEM[(int[8] *)&a + 28B] = 7;
  i.0_19 = 0;
  if (i.0_19 != 8)
    goto <bb 3>;
  else
    goto <bb 4>;

  <bb 3>:
  # i_20 = PHI <i_10(3), 0(2)>
  # sum_21 = PHI <sum_9(3), 0(2)>
  _8 = a[i_20];
  sum_9 = sum_21 + _8;
  i_10 = i_20 + 1;
  i.0_7 = (unsigned int) i_10;
  if (i.0_7 != 8)
    goto <bb 3>;
  else
    goto <bb 4>;

  <bb 4>:
  # sum_22 = PHI <sum_9(3), 0(2)>
  a ={v} {CLOBBER};
  return sum_22;

}

the vectorizer then transforms to:

...
  <bb 2>:
  MEM[(int[8] *)&a] = 0;
  MEM[(int[8] *)&a + 4B] = 1;
  MEM[(int[8] *)&a + 8B] = 2;
  MEM[(int[8] *)&a + 12B] = 3;
  MEM[(int[8] *)&a + 16B] = 4;
  MEM[(int[8] *)&a + 20B] = 5;
  MEM[(int[8] *)&a + 24B] = 6;
  MEM[(int[8] *)&a + 28B] = 7;

  <bb 3>:
  # i_20 = PHI <0(2), i_10(4)>
  # sum_21 = PHI <0(2), sum_9(4)>
  # ivtmp_19 = PHI <8(2), ivtmp_22(4)>
  # vectp_a.1_1 = PHI <&a(2), vectp_a.1_2(4)>
  # vect_sum_9.4_17 = PHI <{ 0, 0, 0, 0 }(2), vect_sum_9.4_23(4)>
  # ivtmp_27 = PHI <0(2), ivtmp_28(4)>
  vect__8.3_18 = MEM[(int *)vectp_a.1_1];
  _8 = a[i_20];
  vect_sum_9.4_23 = vect__8.3_18 + vect_sum_9.4_17;
  sum_9 = _8 + sum_21;
  i_10 = i_20 + 1;
  ivtmp_22 = ivtmp_19 - 1;
  vectp_a.1_2 = vectp_a.1_1 + 16;
  ivtmp_28 = ivtmp_27 + 1;
  if (ivtmp_28 < 2)
    goto <bb 4>;
  else
    goto <bb 5>;

  <bb 4>:
  goto <bb 3>;

  <bb 5>:
  # sum_7 = PHI <sum_9(3)>
  # vect_sum_9.4_24 = PHI <vect_sum_9.4_23(3)>
  stmp_sum_9.5_25 = [reduc_plus_expr] vect_sum_9.4_24;
  vect_sum_9.6_26 = stmp_sum_9.5_25 + 0;
  a ={v} {CLOBBER};
  return vect_sum_9.6_26;

}

and the
optimized tree is:

foo ()
{
  int vect_sum_9.6;
  int stmp_sum_9.5;
  vector(4) int vect_sum_9.4;
  const vector(4) int vect__8.3;
  const int a[8];

  <bb 2>:
  MEM[(int[8] *)&a] = { 0, 1, 2, 3 };
  MEM[(int[8] *)&a + 16B] = { 4, 5, 6, 7 };
  vect__8.3_20 = MEM[(int *)&a];
  vect__8.3_18 = MEM[(int *)&a + 16B];
  vect_sum_9.4_23 = vect__8.3_18 + vect__8.3_20;
  stmp_sum_9.5_25 = [reduc_plus_expr] vect_sum_9.4_23;
  vect_sum_9.6_26 = stmp_sum_9.5_25;
  a ={v} {CLOBBER};
  return vect_sum_9.6_26;

}

final assembly is:

        ldr     q1, .LC1
        sub     sp, sp, #32
        ldr     q0, .LC2
        add     sp, sp, 32
        add     v0.4s, v0.4s, v1.4s
        addv    s0, v0.4s
        umov    w0, v0.s[0]
        ret

which is a slight improvement, but not really what we are looking for...