stp instructions emitted at -O3

xiezhiheng at huawei dot com Wed, 01 Apr 2020 05:57:38 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94442


            Bug ID: 94442
           Summary: [AArch64] Redundant ldp/stp instructions emitted at
                    -O3
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: xiezhiheng at huawei dot com
  Target Milestone: ---
            Target: aarch64

Test case:

#include <arm_neon.h>

struct __m256i
{
  int8x16_t vect_s8[2];
};

__attribute__((inline)) __m256i _mm256_adds_epi8(__m256i a, __m256i b)
{
    __m256i res_m256i;
    res_m256i.vect_s8[0] = vqaddq_s8(a.vect_s8[0], b.vect_s8[0]);
    res_m256i.vect_s8[1] = vqaddq_s8(a.vect_s8[1], b.vect_s8[1]);
    return res_m256i;
}

void PerfTest1(__m256i *output, unsigned caseCount)
{
    unsigned loopCount = caseCount;
    __m256i& a = output[0];
    __m256i& b = output[1];
    __m256i& c = output[2];
    for (unsigned i = 0; i < loopCount; i++) {
        a = _mm256_adds_epi8(b, c);
        b = _mm256_adds_epi8(a, c);
        c = _mm256_adds_epi8(c, b);
        a = _mm256_adds_epi8(b, c);
        b = _mm256_adds_epi8(a, c);
        c = _mm256_adds_epi8(c, b);
        a = _mm256_adds_epi8(b, c);
        b = _mm256_adds_epi8(a, c);
        c = _mm256_adds_epi8(c, b);
        b = _mm256_adds_epi8(a, c);
    }
}

Command line (GCC version 10.0): aarch64-linux-gnu-g++ -S -O3 a.c

.L6:
        ldp     q3, q2, [x2]
        add     w4, w4, 1
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        stp     q1, q0, [x0]
        ldp     q3, q2, [x2]
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        stp     q1, q0, [x0, 32]
        ldp     q3, q2, [x2]
        sqadd   v3.16b, v3.16b, v1.16b
        sqadd   v2.16b, v2.16b, v0.16b
        stp     q3, q2, [x0, 64]
        ldp     q1, q0, [x3]
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        stp     q1, q0, [x0]
        ldp     q3, q2, [x2]
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        stp     q1, q0, [x0, 32]
        ldp     q3, q2, [x2]
        sqadd   v3.16b, v3.16b, v1.16b
        sqadd   v2.16b, v2.16b, v0.16b
        stp     q3, q2, [x0, 64]
        ldp     q1, q0, [x3]
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        stp     q1, q0, [x0]
        ldp     q2, q3, [x2]
        sqadd   v4.16b, v1.16b, v2.16b
        sqadd   v5.16b, v0.16b, v3.16b
        stp     q4, q5, [x0, 32]
        ldp     q2, q3, [x2]
        sqadd   v3.16b, v3.16b, v5.16b
        sqadd   v2.16b, v2.16b, v4.16b
        sqadd   v0.16b, v0.16b, v3.16b
        sqadd   v1.16b, v1.16b, v2.16b
        stp     q2, q3, [x0, 64]
        stp     q1, q0, [x0, 32]
        cmp     w1, w4
        bne     .L6

And command line (GCC version 10.0): aarch64-linux-gnu-g++ -S -O1 a.c
Or (GCC version 9.2.0): aarch64-linux-gnu-g++ -S -O3 a.c

.L4:
        ldr     q0, [x0, 48]
        ldr     q2, [x0, 80]
        ldr     q1, [x0, 32]
        ldr     q3, [x0, 64]
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        sqadd   v3.16b, v3.16b, v1.16b
        sqadd   v2.16b, v2.16b, v0.16b
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        sqadd   v3.16b, v3.16b, v1.16b
        sqadd   v2.16b, v2.16b, v0.16b
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        str     q1, [x0]
        str     q0, [x0, 16]
        sqadd   v5.16b, v1.16b, v3.16b
        sqadd   v4.16b, v0.16b, v2.16b
        sqadd   v3.16b, v3.16b, v5.16b
        sqadd   v2.16b, v2.16b, v4.16b
        str     q3, [x0, 64]
        str     q2, [x0, 80]
        sqadd   v1.16b, v1.16b, v3.16b
        sqadd   v0.16b, v0.16b, v2.16b
        str     q1, [x0, 32]
        str     q0, [x0, 48]
        add     w3, w3, 1
        cmp     w1, w3
        bne     .L4

This issue triggers after commit
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3b47da42de621c6c3bf7d2f9245df989aa7eb5a1

This commit changes the gimple from
  a = MEM[(const struct __m256i &)output_5(D) + 32];
  a$vect_s8$0_4 = MEM <int8x16_t> [(const struct __m256i &)output_5(D) + 32];
  a$vect_s8$1_6 = MEM <int8x16_t> [(const struct __m256i &)output_5(D) + 48];
  b = MEM[(const struct __m256i &)output_5(D) + 64];
  b$vect_s8$0_9 = MEM <int8x16_t> [(const struct __m256i &)output_5(D) + 64];
  b$vect_s8$1_11 = MEM <int8x16_t> [(const struct __m256i &)output_5(D) + 80];
  _76 = a$vect_s8$0_4;
  _77 = b$vect_s8$0_9;
To
  a = MEM[(const struct __m256i &)output_5(D) + 32];
  a$vect_s8$0_4 = MEM[(const struct __m256i &)output_5(D) + 32].vect_s8[0]; 
<========
  a$vect_s8$1_6 = MEM[(const struct __m256i &)output_5(D) + 32].vect_s8[1]; 
<========
  b = MEM[(const struct __m256i &)output_5(D) + 64];
  b$vect_s8$0_9 = MEM[(const struct __m256i &)output_5(D) + 64].vect_s8[0]; 
<========
  b$vect_s8$1_11 = MEM[(const struct __m256i &)output_5(D) + 64].vect_s8[1]; 
<========
  _76 = a$vect_s8$0_4;
  _77 = b$vect_s8$0_9;

When expand to RTL, the latter form will emit two insns.
(insn 23 22 24 6 (set (reg/f:DI 140)
        (plus:DI (reg/v/f:DI 133 [ output ])
            (const_int 64 [0x40]))) -1
     (nil))
(insn 24 23 25 6 (set (reg:V16QI 94 [ b$vect_s8$1 ])
        (mem:V16QI (plus:DI (reg/f:DI 140)
                (const_int 16 [0x10])) [0 MEM[(const struct __m256i
&)output_5(D) + 64]+16 S16 A128])) -1
     (nil))

And later in rtl pre pass, insn 23 will be extracted outside the loop as a
common subexpression.
This will cause in dse pass it cannot determine whether the following two insns
reference the same location.
(insn 33 32 36 5 (set (mem:V16QI (plus:DI (reg/v/f:DI 133 [ output ])
                (const_int 16 [0x10])) [1 MEM <int8x16_t> [(struct __m256i
*)output_5(D) + 16B]+0 S16 A128])
        (reg:V16QI 114 [ _35 ])) "a.c":23:34 1203 {*aarch64_simd_movv16qi}
     (nil))
(insn 36 33 41 5 (set (reg:V16QI 116 [ b$vect_s8$1 ])
        (mem:V16QI (plus:DI (reg/f:DI 194)
                (const_int 16 [0x10])) [0 MEM[(const struct __m256i
&)output_5(D) + 64]+16 S16 A128])) 1203 {*aarch64_simd_movv16qi}
     (nil))

Because insn
(insn 140 5 130 4 (set (reg/f:DI 194)
        (plus:DI (reg/v/f:DI 133 [ output ])
            (const_int 64 [0x40]))) 121 {*adddi3_aarch64}
     (nil))

has just be extracted to another bb in rtl pre pass and dse pass is unable to
get this information.
Thus dse pass cannot eliminate these extra STRs.

I would like to solve this problem by propagating insn 23 to its use in fwprop
pass.
However, there exists some restrictions here. I try to modify like this:
diff --git a/gcc/fwprop.c b/gcc/fwprop.c
index 705d2885aae..0edbbc65047 100644
--- a/gcc/fwprop.c
+++ b/gcc/fwprop.c
@@ -416,7 +416,7 @@ should_replace_address (rtx old_rtx, rtx new_rtx,
machine_mode mode,
     gain = (set_src_cost (new_rtx, VOIDmode, speed)
            - set_src_cost (old_rtx, VOIDmode, speed));

-  return (gain > 0);
+  return (gain >= 0);
 }


@@ -1573,10 +1573,14 @@ fwprop (bool fwprop_addr_p)
       df_ref use = DF_USES_GET (i);
       if (use)
        {
+         df_ref def = get_def_for_use (use);
          if (DF_REF_TYPE (use) == DF_REF_REG_USE
              || DF_REF_BB (use)->loop_father == NULL
              /* The outer most loop is not really a loop.  */
-             || loop_outer (DF_REF_BB (use)->loop_father) == NULL)
+             || loop_outer (DF_REF_BB (use)->loop_father) == NULL
+             || (def && (DF_REF_BB (def)->loop_father == DF_REF_BB
(use)->loop_father
+                         || flow_loop_nested_p (DF_REF_BB(use)->loop_father,
+                                               
DF_REF_BB(def)->loop_father))))
            forward_propagate_into (use, fwprop_addr_p);

          else if (fwprop_addr_p)

some discussion mails here
https://gcc.gnu.org/pipermail/gcc/2020-March/231980.html

[Bug rtl-optimization/94442] New: [AArch64] Redundant ldp/stp instructions emitted at -O3

Reply via email to