https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63679

alalaw01 at gcc dot gnu.org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |alalaw01 at gcc dot gnu.org

--- Comment #32 from alalaw01 at gcc dot gnu.org ---
Is the SRA approach going to work? I have hacked up my SRA so that it generates
this:

foo ()
{
  int sum;
  int i;
  const int a[8];
  unsigned int i.0_7;
  int _8;
  unsigned int i.0_19;

  <bb 2>:
  MEM[(int[8] *)&a] = 0;
  MEM[(int[8] *)&a + 4B] = 1;
  MEM[(int[8] *)&a + 8B] = 2;
  MEM[(int[8] *)&a + 12B] = 3;
  MEM[(int[8] *)&a + 16B] = 4;
  MEM[(int[8] *)&a + 20B] = 5;
  MEM[(int[8] *)&a + 24B] = 6;
  MEM[(int[8] *)&a + 28B] = 7;
  i.0_19 = 0;
  if (i.0_19 != 8)
    goto <bb 3>;
  else
    goto <bb 4>;

  <bb 3>:
  # i_20 = PHI <i_10(3), 0(2)>
  # sum_21 = PHI <sum_9(3), 0(2)>
  _8 = a[i_20];
  sum_9 = sum_21 + _8;
  i_10 = i_20 + 1;
  i.0_7 = (unsigned int) i_10;
  if (i.0_7 != 8)
    goto <bb 3>;
  else
    goto <bb 4>;

  <bb 4>:
  # sum_22 = PHI <sum_9(3), 0(2)>
  a ={v} {CLOBBER};
  return sum_22;
}

The vectorizer then transforms this to:
...
  <bb 2>:
  MEM[(int[8] *)&a] = 0;
  MEM[(int[8] *)&a + 4B] = 1;
  MEM[(int[8] *)&a + 8B] = 2;
  MEM[(int[8] *)&a + 12B] = 3;
  MEM[(int[8] *)&a + 16B] = 4;
  MEM[(int[8] *)&a + 20B] = 5;
  MEM[(int[8] *)&a + 24B] = 6;
  MEM[(int[8] *)&a + 28B] = 7;

  <bb 3>:
  # i_20 = PHI <0(2), i_10(4)>
  # sum_21 = PHI <0(2), sum_9(4)>
  # ivtmp_19 = PHI <8(2), ivtmp_22(4)>
  # vectp_a.1_1 = PHI <&a(2), vectp_a.1_2(4)>
  # vect_sum_9.4_17 = PHI <{ 0, 0, 0, 0 }(2), vect_sum_9.4_23(4)>
  # ivtmp_27 = PHI <0(2), ivtmp_28(4)>
  vect__8.3_18 = MEM[(int *)vectp_a.1_1];
  _8 = a[i_20];
  vect_sum_9.4_23 = vect__8.3_18 + vect_sum_9.4_17;
  sum_9 = _8 + sum_21;
  i_10 = i_20 + 1;
  ivtmp_22 = ivtmp_19 - 1;
  vectp_a.1_2 = vectp_a.1_1 + 16;
  ivtmp_28 = ivtmp_27 + 1;
  if (ivtmp_28 < 2)
    goto <bb 4>;
  else
    goto <bb 5>;

  <bb 4>:
  goto <bb 3>;

  <bb 5>:
  # sum_7 = PHI <sum_9(3)>
  # vect_sum_9.4_24 = PHI <vect_sum_9.4_23(3)>
  stmp_sum_9.5_25 = [reduc_plus_expr] vect_sum_9.4_24;
  vect_sum_9.6_26 = stmp_sum_9.5_25 + 0;
  a ={v} {CLOBBER};
  return vect_sum_9.6_26;

}

and the optimized tree is:

foo ()
{
  int vect_sum_9.6;
  int stmp_sum_9.5;
  vector(4) int vect_sum_9.4;
  const vector(4) int vect__8.3;
  const int a[8];

  <bb 2>:
  MEM[(int[8] *)&a] = { 0, 1, 2, 3 };
  MEM[(int[8] *)&a + 16B] = { 4, 5, 6, 7 };
  vect__8.3_20 = MEM[(int *)&a];
  vect__8.3_18 = MEM[(int *)&a + 16B];
  vect_sum_9.4_23 = vect__8.3_18 + vect__8.3_20;
  stmp_sum_9.5_25 = [reduc_plus_expr] vect_sum_9.4_23;
  vect_sum_9.6_26 = stmp_sum_9.5_25;
  a ={v} {CLOBBER};
  return vect_sum_9.6_26;
}

The final assembly is:
        ldr     q1, .LC1
        sub     sp, sp, #32
        ldr     q0, .LC2
        add     sp, sp, 32
        add     v0.4s, v0.4s, v1.4s
        addv    s0, v0.4s
        umov    w0, v0.s[0]
        ret
which is a slight improvement, but not really what we are looking for...

Reply via email to