https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119384

            Bug ID: 119384
           Summary: Extra move in tight loop with SIMD and subregs
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

We have an aarch64 workload using the SIMDe translation layer that results
in slower code than Clang due to an extra move emitted in a hot, tight loop.
The reduced source code is:

#include <arm_neon.h>

#define simde_mm_set1_epi8(x)      (vreinterpretq_s64_u8(vdupq_n_u8(x)))
#define simde_mm_load_si128(x) \
    (vreinterpretq_s64_s32(vld1q_s32((reinterpret_cast<int32_t const*>(x)))))
#define simde_mm_adds_epu8(x, y) \
    (vreinterpretq_s64_u8(vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))))
#define simde_mm_subs_epu8(x, y) \
    (vreinterpretq_s64_u8(vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))))
#define simde_mm_max_epu8(x, y) \
    (vreinterpretq_s64_u8(vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))))
#define simde_mm_store_si128(x, y) \
    (vst1q_s32((reinterpret_cast<int32_t*>(x)), vreinterpretq_s32_s64(y)))

void foobar(
    int64x2_t* query_profile_byte, unsigned char *db_sequence, int8_t ref_dir,
    int32_t db_length, int32_t query_length, int64x2_t* pvHStore,
    int64x2_t* pvHLoad, int64x2_t* pvE, uint8_t gap_open, uint8_t gap_extend,
    uint8_t bias, uint8_t * maxColumn)
{
    const int SIMD_SIZE = 4 * 4;
    int32_t segLen = (query_length + SIMD_SIZE-1) / SIMD_SIZE;

    int64x2_t vZero = simde_mm_set1_epi8(0);
    int64x2_t vGapO = simde_mm_set1_epi8(gap_open);
    int64x2_t vGapE = simde_mm_set1_epi8(gap_extend);
    int64x2_t vBias = simde_mm_set1_epi8(bias);
    int64x2_t vTemp;

    #pragma GCC unroll 1
    for (int i = 0; __builtin_expect((i != db_length),1); i++) {
        int64x2_t e, vF = vZero, vMaxColumn = vZero;

        int64x2_t vH = vZero;
        const int64x2_t* vP = query_profile_byte + db_sequence[i] * segLen;

        int64x2_t* pv = pvHLoad;
        pvHLoad = pvHStore;
        pvHStore = pv;

        // loop of interest
        #pragma GCC unroll 1
        for (int j = 0; __builtin_expect((j < segLen),1); ++j)
        {
            int64x2_t score = simde_mm_set1_epi8(0);
            score = simde_mm_load_si128(vP + j);

            vH = simde_mm_adds_epu8(vH, score);
            vH = simde_mm_subs_epu8(vH, vBias);

            e = simde_mm_load_si128(pvE + j);
            vH = simde_mm_max_epu8(vH, e);
            vH = simde_mm_max_epu8(vH, vF);
            vMaxColumn = simde_mm_max_epu8(vMaxColumn, vH);

            simde_mm_store_si128(pvHStore + j, vH);

            vH = simde_mm_subs_epu8(vH, vGapO);

            e = simde_mm_subs_epu8(e, vGapE);
            e = simde_mm_max_epu8(e, vH);
            simde_mm_store_si128(pvE + j, e);

            vF = simde_mm_subs_epu8(vF, vGapE);
            vF = simde_mm_max_epu8(vF, vH);

            vH = simde_mm_load_si128(pvHLoad + j);
        }

        // commenting out this line makes the loop above get emitted without the extra MOVs:
        simde_mm_store_si128(pvHStore, simde_mm_subs_epu8(vF, vH));
    }
}

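For reference, the assembly below should be reproducible with an invocation
along these lines, where reduced.cpp is only a placeholder name for the
testcase above:

  g++ -S -O3 -mcpu=neoverse-v2 reduced.cpp
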
Compiling with -O3 -mcpu=neoverse-v2, for example, produces the loop:
.L4:
        ldr     q26, [x4, x2]
        uqsub   v24.16b, v27.16b, v30.16b
        add     w3, w3, 1
        ldr     q0, [x7, x2]
        uqadd   v26.16b, v28.16b, v26.16b
        uqsub   v26.16b, v26.16b, v29.16b
        umax    v26.16b, v26.16b, v0.16b
        uqsub   v0.16b, v0.16b, v30.16b
        umax    v26.16b, v26.16b, v27.16b
        str     q26, [x6, x2]
        uqsub   v26.16b, v26.16b, v31.16b
        umax    v0.16b, v0.16b, v26.16b
        umax    v24.16b, v24.16b, v26.16b
        str     q0, [x7, x2]
        mov     v27.16b, v24.16b   // Superfluous mov
        ldr     q25, [x5, x2]
        add     x2, x2, 16
        mov     v28.16b, v25.16b
        cmp     w8, w3
        bgt     .L4

Whereas Clang emits a tighter sequence:
.LBB0_4:
        ldr     q5, [x14, x13]
        uqadd   v4.16b, v4.16b, v5.16b
        ldr     q5, [x7, x13]
        uqsub   v4.16b, v4.16b, v2.16b
        umax    v4.16b, v4.16b, v5.16b
        uqsub   v5.16b, v5.16b, v1.16b
        umax    v4.16b, v4.16b, v3.16b
        uqsub   v3.16b, v3.16b, v1.16b
        str     q4, [x6, x13]
        uqsub   v4.16b, v4.16b, v0.16b
        umax    v5.16b, v5.16b, v4.16b
        umax    v3.16b, v3.16b, v4.16b
        str     q5, [x7, x13]
        ldr     q4, [x12, x13]
        add     x13, x13, #16
        cmp     x11, x13
        b.ne    .LBB0_4

This ends up hurting performance considerably in the original application,
where this sequence is hot.