ARM GCC 8.x Performance Dropping Compared to Linaro GCC 7.x

Yupeng Chang Sun, 18 Aug 2019 21:16:04 -0700

Hi Dear Linaro Team,
I recently found a very strange issue regarding the code performance.
I have a loop written with GCC NEON intrinsics.
The binary of this code generated by Linaro GCC 7.x is much faster than the
one generated by ARM GCC 8.x.


My CPU is ARM Cortex-A53 AARCH64.
The compile options are:
-Wall -O3 -mcpu=cortex-a53+crypto

The code is as follows:
     for (uint32 c = 0; c < channels; c += 16, roi_result += 16) {
         int32x4_t       S1, S2, S3, S4;
         int16x4_t       DT;

         DT = vld1_s16(feature1 + c + 0);
         S1 = vmull_lane_s16(DT, SZ, 0);
         DT = vld1_s16(feature1 + c + 4);
         S2 = vmull_lane_s16(DT, SZ, 0);
         DT = vld1_s16(feature1 + c + 8);
         S3 = vmull_lane_s16(DT, SZ, 0);
         DT = vld1_s16(feature1 + c + 12);
         S4 = vmull_lane_s16(DT, SZ, 0);

         DT = vld1_s16(feature2 + c + 0);
         S1 = vmlal_lane_s16(S1, DT, SZ, 1);
         DT = vld1_s16(feature2 + c + 4);
         S2 = vmlal_lane_s16(S2, DT, SZ, 1);
         DT = vld1_s16(feature2 + c + 8);
         S3 = vmlal_lane_s16(S3, DT, SZ, 1);
         DT = vld1_s16(feature2 + c + 12);
         S4 = vmlal_lane_s16(S4, DT, SZ, 1);

         DT = vld1_s16(feature3 + c + 0);
         S1 = vmlal_lane_s16(S1, DT, SZ, 2);
         DT = vld1_s16(feature3 + c + 4);
         S2 = vmlal_lane_s16(S2, DT, SZ, 2);
         DT = vld1_s16(feature3 + c + 8);
         S3 = vmlal_lane_s16(S3, DT, SZ, 2);
         DT = vld1_s16(feature3 + c + 12);
         S4 = vmlal_lane_s16(S4, DT, SZ, 2);

         DT = vld1_s16(feature4 + c + 0);
         S1 = vmlal_lane_s16(S1, DT, SZ, 3);
         DT = vld1_s16(feature4 + c + 4);
         S2 = vmlal_lane_s16(S2, DT, SZ, 3);
         DT = vld1_s16(feature4 + c + 8);
         S3 = vmlal_lane_s16(S3, DT, SZ, 3);
         DT = vld1_s16(feature4 + c + 12);
         S4 = vmlal_lane_s16(S4, DT, SZ, 3);

         DT = vrshrn_n_s32(S1, Q_VALUE);
         vst1_s16(roi_result + 0, DT);
         DT = vrshrn_n_s32(S2, Q_VALUE);
         vst1_s16(roi_result + 4, DT);
         DT = vrshrn_n_s32(S3, Q_VALUE);
         vst1_s16(roi_result + 8, DT);
         DT = vrshrn_n_s32(S4, Q_VALUE);
         vst1_s16(roi_result + 12, DT);
     }

Code generated by GCC7:
  294:   6b10031f    cmp w24, w16
  298:   fc606959    ldr d25, [x10, x0]
  29c:   fc686922    ldr d2, [x9, x8]
  2a0:   fc676921    ldr d1, [x9, x7]
  2a4:   fc666920    ldr d0, [x9, x6]
  2a8:   fc686958    ldr d24, [x10, x8]
  2ac:   fc676957    ldr d23, [x10, x7]
  2b0:   fc666956    ldr d22, [x10, x6]
  2b4:   fc606855    ldr d21, [x2, x0]
  2b8:   fc686854    ldr d20, [x2, x8]
  2bc:   fc676853    ldr d19, [x2, x7]
  2c0:   fc666852    ldr d18, [x2, x6]
  2c4:   fc606891    ldr d17, [x4, x0]
  2c8:   fc686890    ldr d16, [x4, x8]
  2cc:   fc676887    ldr d7, [x4, x7]
  2d0:   fc666885    ldr d5, [x4, x6]
  2d4:   0f44a063    smull   v3.4s, v3.4h, v4.h[0]
  2d8:   0f44a042    smull   v2.4s, v2.4h, v4.h[0]
  2dc:   0f44a021    smull   v1.4s, v1.4h, v4.h[0]
  2e0:   0f44a000    smull   v0.4s, v0.4h, v4.h[0]
  2e4:   0f542323    smlal   v3.4s, v25.4h, v4.h[1]
  2e8:   0f542302    smlal   v2.4s, v24.4h, v4.h[1]
  2ec:   0f5422e1    smlal   v1.4s, v23.4h, v4.h[1]
  2f0:   0f5422c0    smlal   v0.4s, v22.4h, v4.h[1]
  2f4:   0f6422a3    smlal   v3.4s, v21.4h, v4.h[2]
  2f8:   0f642282    smlal   v2.4s, v20.4h, v4.h[2]
  2fc:   0f642261    smlal   v1.4s, v19.4h, v4.h[2]
  300:   0f642240    smlal   v0.4s, v18.4h, v4.h[2]
  304:   0f742223    smlal   v3.4s, v17.4h, v4.h[3]
  308:   0f742202    smlal   v2.4s, v16.4h, v4.h[3]
  30c:   0f7420e1    smlal   v1.4s, v7.4h, v4.h[3]
  310:   0f7420a0    smlal   v0.4s, v5.4h, v4.h[3]
  314:   0f138c63    rshrn   v3.4h, v3.4s, #13
  318:   0f138c42    rshrn   v2.4h, v2.4s, #13
  31c:   0f138c21    rshrn   v1.4h, v1.4s, #13
  320:   0f138c00    rshrn   v0.4h, v0.4s, #13
  324:   6d3e0a63    stp d3, d2, [x19, #-32]
  328:   6d3f0261    stp d1, d0, [x19, #-16]

Code generated by GCC8:

  26c:   6b0b02ff    cmp w23, w11
  270:   fc606922    ldr d2, [x9, x0]
  274:   fc666941    ldr d1, [x10, x6]
  278:   fc666920    ldr d0, [x9, x6]
  27c:   0f44a000    smull   v0.4s, v0.4h, v4.h[0]
  280:   0f542020    smlal   v0.4s, v1.4h, v4.h[1]
  284:   fc6668e1    ldr d1, [x7, x6]
  288:   0f642020    smlal   v0.4s, v1.4h, v4.h[2]
  28c:   fc646945    ldr d5, [x10, x4]
  290:   fc666901    ldr d1, [x8, x6]
  294:   0f742020    smlal   v0.4s, v1.4h, v4.h[3]
  298:   fc646921    ldr d1, [x9, x4]
  29c:   0f44a021    smull   v1.4s, v1.4h, v4.h[0]
  2a0:   0f5420a1    smlal   v1.4s, v5.4h, v4.h[1]
  2a4:   fc626945    ldr d5, [x10, x2]
  2a8:   0f138c03    rshrn   v3.4h, v0.4s, #13
  2ac:   fc626920    ldr d0, [x9, x2]
  2b0:   0f44a000    smull   v0.4s, v0.4h, v4.h[0]
  2b4:   0f5420a0    smlal   v0.4s, v5.4h, v4.h[1]
  2b8:   fc606945    ldr d5, [x10, x0]
  2bc:   0f44a042    smull   v2.4s, v2.4h, v4.h[0]
  2c0:   0f5420a2    smlal   v2.4s, v5.4h, v4.h[1]
  2c4:   fc6468e5    ldr d5, [x7, x4]
  2c8:   0f6420a1    smlal   v1.4s, v5.4h, v4.h[2]
  2cc:   fc6268e5    ldr d5, [x7, x2]
  2d0:   0f6420a0    smlal   v0.4s, v5.4h, v4.h[2]
  2d4:   fc6068e5    ldr d5, [x7, x0]
  2d8:   0f6420a2    smlal   v2.4s, v5.4h, v4.h[2]
  2dc:   fc646905    ldr d5, [x8, x4]
  2e0:   0f7420a1    smlal   v1.4s, v5.4h, v4.h[3]
  2e4:   fc626905    ldr d5, [x8, x2]
  2e8:   0f138c21    rshrn   v1.4h, v1.4s, #13
  2ec:   0f7420a0    smlal   v0.4s, v5.4h, v4.h[3]
  2f0:   0f138c00    rshrn   v0.4h, v0.4s, #13
  2f4:   fc606905    ldr d5, [x8, x0]
  2f8:   0f7420a2    smlal   v2.4s, v5.4h, v4.h[3]
  2fc:   0f138c42    rshrn   v2.4h, v2.4s, #13
  300:   6d000e62    stp d2, d3, [x19]
  304:   6d010261    stp d1, d0, [x19, #16]
  308:   91008273    add x19, x19, #0x20

I ran some tests with different compile options and found that the option
"-fschedule-insns" makes GCC7 generate code that runs faster; if I disable
"-fschedule-insns", GCC7 generates the same code as GCC8.
However, this option does not seem to work on GCC8: if I enable
"-fschedule-insns" with GCC8, the code generated by GCC8 is even slower. If
I disable "-fschedule-insns" with GCC8, the generated code simply follows
the order of the statements in the C code.

I compiled my code with -O3, which means -fschedule-insns will be enabled
by default.

With this option enabled, GCC7 reschedules the instructions, and it seems
that GCC7 groups instructions of the same kind together (all the loads, then
all the multiplies, then the narrowing shifts, then the stores), but GCC8
doesn't do that, or GCC8 reschedules the instructions in a worse way.

My question is: is this behavior expected in GCC8, GCC9 and future
versions?
Is this change in GCC's instruction scheduling related to the fix of
"spectre and mitigation"?

If I want the same instruction scheduling behavior in GCC8, what can I do?

Thank you for looking into this.

Looking forward to your reply!

Tomas Chang
Aug 19, 2019
_______________________________________________
linaro-toolchain mailing list
linaro-toolchain@lists.linaro.org
https://lists.linaro.org/mailman/listinfo/linaro-toolchain

ARM GCC 8.x Performance Dropping Compared to Linaro GCC 7.x

Reply via email to