https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69710

--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
For the double-precision case, in the .optimized dump on trunk for aarch64
(--with-cpu=thunderx) I get:

  <bb 11>:
  # ivtmp.22_60 = PHI <0(10), ivtmp.22_59(11)>
  # ivtmp.25_75 = PHI <0(10), ivtmp.25_79(11)>
  vect__12.14_86 = MEM[base: vectp_dy.13_82, index: ivtmp.25_75, offset: 0B];
  vect__15.17_91 = MEM[base: vectp_dx.16_87, index: ivtmp.25_75, offset: 0B];
  vect__17.19_94 = vect__15.17_91 * vect_cst__92 + vect__12.14_86;
  MEM[base: vectp_dy.13_82, index: ivtmp.25_75, offset: 0B] = vect__17.19_94;
  ivtmp.22_59 = ivtmp.22_60 + 1;
  ivtmp.25_79 = ivtmp.25_75 + 16;
  if (bnd.9_55 > ivtmp.22_59)
    goto <bb 11>;
  else

So yes, there are two IVs: one for the memory accesses and one for the comparison.

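At the C level that loop structure corresponds roughly to the sketch below
(hypothetical names, a model rather than the actual test case): one counter IV
checked against the bound and one byte-offset IV stepping by 16 for the
addresses.

/* Rough scalar model of the vectorized loop structure above.  */
void
daxpy_model (double *dy, const double *dx, double da, unsigned long bnd)
{
  unsigned long ivtmp_cnt = 0;   /* counter IV, compared against the bound (bnd.9)  */
  unsigned long ivtmp_off = 0;   /* byte-offset IV used for the memory accesses     */

  do
    {
      double *py = (double *) ((char *) dy + ivtmp_off);
      const double *px = (const double *) ((const char *) dx + ivtmp_off);

      /* One 2-double vector per iteration (the fmla below).  */
      py[0] += px[0] * da;
      py[1] += px[1] * da;

      ivtmp_cnt += 1;    /* ivtmp.22: +1 per vector iteration   */
      ivtmp_off += 16;   /* ivtmp.25: +16 bytes per iteration   */
    }
  while (bnd > ivtmp_cnt);
}
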
The assembly code for the above loop (without -funroll-all-loops):
.L8:
        ldr     q1, [x7, x0]
        add     w5, w5, 1
        ldr     q2, [x2, x0]
        fmla    v1.2d, v2.2d, v3.2d
        str     q1, [x7, x0]
        add     x0, x0, 16
        cmp     w6, w5
        bhi     .L8


For single precision (SP) I get:

  <bb 13>:
  # ivtmp.22_80 = PHI <0(12), ivtmp.22_79(13)>
  # ivtmp.25_113 = PHI <0(12), ivtmp.25_112(13)>
  vect__12.14_87 = MEM[base: vectp_dy.13_83, index: ivtmp.25_113, offset: 0B];
  vect__15.17_92 = MEM[base: vectp_dx.16_88, index: ivtmp.25_113, offset: 0B];
  vect__17.19_95 = vect__15.17_92 * vect_cst__93 + vect__12.14_87;
  MEM[base: vectp_dy.13_83, index: ivtmp.25_113, offset: 0B] = vect__17.19_95;
  ivtmp.22_79 = ivtmp.22_80 + 1;
  ivtmp.25_112 = ivtmp.25_113 + 16;
  if (bnd.9_55 > ivtmp.22_79)
    goto <bb 13>;
  else
    goto <bb 15>;


The assembly code for the SP loop:
.L8:
        ldr     q1, [x8, x4]
        add     w7, w7, 1
        ldr     q2, [x2, x4]
        fmla    v1.4s, v2.4s, v3.4s
        str     q1, [x8, x4]
        add     x4, x4, 16
        cmp     w5, w7
        bhi     .L8

So it works correctly for me on aarch64.

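For reference, the dumps above presumably come from the standard axpy kernel;
a minimal sketch of the SP variant (not the exact test case attached to the
bug):

/* Presumed shape of the kernel (sketch only, hypothetical names).  */
void
saxpy (int n, float da, const float * restrict dx, float * restrict dy)
{
  for (int i = 0; i < n; i++)
    dy[i] += da * dx[i];
}

Something like "gcc -O3 -mcpu=thunderx -S -fdump-tree-optimized" on that
kernel should reproduce dumps of this shape (the exact options are an
assumption; my compiler is configured with --with-cpu=thunderx).
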
With -funroll-all-loops I get the following (the DP case looks similar):
.L8:
        ldr     q7, [x15, x4]
        add     x7, x4, 16
        ldr     q16, [x16, x4]
        add     x9, x4, 32
        ldr     q17, [x16, x7]
        add     x8, x4, 48
        ldr     q19, [x16, x9]
        add     x11, x4, 64
        ldr     q22, [x16, x8]
        add     x14, x4, 80
        fmla    v7.4s, v16.4s, v21.4s
        ldr     q24, [x16, x11]
        ldr     q26, [x16, x14]
        add     x12, x4, 96
        ldr     q28, [x16, x12]
        add     x17, x4, 112
        ldr     q30, [x16, x17]
        add     w2, w2, 8
        str     q7, [x15, x4]
        add     x4, x4, 128
        ldr     q18, [x15, x7]
        fmla    v18.4s, v17.4s, v21.4s
        str     q18, [x15, x7]
        ldr     q20, [x15, x9]
        fmla    v20.4s, v19.4s, v21.4s
        str     q20, [x15, x9]
        ldr     q23, [x15, x8]
        fmla    v23.4s, v22.4s, v21.4s
        str     q23, [x15, x8]
        ldr     q25, [x15, x11]
        fmla    v25.4s, v24.4s, v21.4s
        str     q25, [x15, x11]
        ldr     q27, [x15, x14]
        fmla    v27.4s, v26.4s, v21.4s
        str     q27, [x15, x14]
        ldr     q29, [x15, x12]
        fmla    v29.4s, v28.4s, v21.4s
        str     q29, [x15, x12]
        ldr     q31, [x15, x17]
        fmla    v31.4s, v30.4s, v21.4s
        str     q31, [x15, x17]
        cmp     w13, w2
        bhi     .L8

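That is, the vector loop gets unrolled eight times: the byte-offset IV now
advances by 128 per iteration (8 vectors x 16 bytes) and the counter by 8,
with the intermediate offsets materialized by the separate add instructions.
A rough C model of that structure (hypothetical names, a sketch rather than
the actual compiler output):

void
saxpy_unrolled_model (float *dy, const float *dx, float da,
                      unsigned long bnd)
{
  unsigned long cnt = 0, off = 0;

  do
    {
      /* Eight unrolled copies; offsets off+16, off+32, ..., off+112
         correspond to the "add xN, x4, #imm" instructions above.  */
      for (unsigned long k = 0; k < 8; k++)
        {
          float *py = (float *) ((char *) dy + off + 16 * k);
          const float *px = (const float *) ((const char *) dx + off + 16 * k);

          for (int l = 0; l < 4; l++)   /* one 4-float vector (fmla) */
            py[l] += px[l] * da;
        }

      cnt += 8;      /* counter IV advances by 8 iterations  */
      off += 128;    /* offset IV advances by 8 * 16 bytes   */
    }
  while (bnd > cnt);
}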