https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119384

            Bug ID: 119384
           Summary: Extra move in tight loop with SIMD and subregs
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

We have a workload for aarch64 using the SIMDe translation layer that results in slower code than Clang due to an extra move emitted in a hot, tight loop.

The reduced source code is:

#include <arm_neon.h>

#define simde_mm_set1_epi8(x) (vreinterpretq_s64_u8(vdupq_n_u8(x)))
#define simde_mm_load_si128(x) (vreinterpretq_s64_s32(vld1q_s32((reinterpret_cast<int32_t const*>(x)))))
#define simde_mm_adds_epu8(x, y) (vreinterpretq_s64_u8(vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))))
#define simde_mm_subs_epu8(x, y) (vreinterpretq_s64_u8(vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))))
#define simde_mm_max_epu8(x, y) (vreinterpretq_s64_u8(vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))))
#define simde_mm_store_si128(x, y) (vst1q_s32((reinterpret_cast<int32_t*>(x)), vreinterpretq_s32_s64(y)))

void foobar(
    int64x2_t* query_profile_byte,
    unsigned char *db_sequence,
    int8_t ref_dir,
    int32_t db_length,
    int32_t query_length,
    int64x2_t* pvHStore,
    int64x2_t* pvHLoad,
    int64x2_t* pvE,
    uint8_t gap_open,
    uint8_t gap_extend,
    uint8_t bias,
    uint8_t * maxColumn)
{
  const int SIMD_SIZE = 4 * 4;
  int32_t segLen = (query_length + SIMD_SIZE - 1) / SIMD_SIZE;
  int64x2_t vZero = simde_mm_set1_epi8(0);
  int64x2_t vGapO = simde_mm_set1_epi8(gap_open);
  int64x2_t vGapE = simde_mm_set1_epi8(gap_extend);
  int64x2_t vBias = simde_mm_set1_epi8(bias);
  int64x2_t vTemp;

#pragma GCC unroll 1
  for (int i = 0; __builtin_expect((i != db_length), 1); i++) {
    int64x2_t e, vF = vZero, vMaxColumn = vZero;
    int64x2_t vH = vZero;
    const int64x2_t* vP = query_profile_byte + db_sequence[i] * segLen;
    int64x2_t* pv = pvHLoad;
    pvHLoad = pvHStore;
    pvHStore = pv;

    // loop of interest
#pragma GCC unroll 1
    for (int j = 0; __builtin_expect((j < segLen), 1); ++j) {
      int64x2_t score = simde_mm_set1_epi8(0);
      score = simde_mm_load_si128(vP + j);
      vH = simde_mm_adds_epu8(vH, score);
      vH = simde_mm_subs_epu8(vH, vBias);
      e = simde_mm_load_si128(pvE + j);
      vH = simde_mm_max_epu8(vH, e);
      vH = simde_mm_max_epu8(vH, vF);
      vMaxColumn = simde_mm_max_epu8(vMaxColumn, vH);
      simde_mm_store_si128(pvHStore + j, vH);
      vH = simde_mm_subs_epu8(vH, vGapO);
      e = simde_mm_subs_epu8(e, vGapE);
      e = simde_mm_max_epu8(e, vH);
      simde_mm_store_si128(pvE + j, e);
      vF = simde_mm_subs_epu8(vF, vGapE);
      vF = simde_mm_max_epu8(vF, vH);
      vH = simde_mm_load_si128(pvHLoad + j);
    }

    // comment out this line to get the previous loop without the MOVs:
    simde_mm_store_si128(pvHStore, simde_mm_subs_epu8(vF, vH));
  }
}
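For reference, a complete invocation with the flags mentioned below could look like this (the file name is my placeholder, not from the report):

  g++ -O3 -mcpu=neoverse-v2 -S reduced.cpp -o reduced.s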
Compiled with -O3 -mcpu=neoverse-v2, for example, this produces the loop:

.L4:
        ldr     q26, [x4, x2]
        uqsub   v24.16b, v27.16b, v30.16b
        add     w3, w3, 1
        ldr     q0, [x7, x2]
        uqadd   v26.16b, v28.16b, v26.16b
        uqsub   v26.16b, v26.16b, v29.16b
        umax    v26.16b, v26.16b, v0.16b
        uqsub   v0.16b, v0.16b, v30.16b
        umax    v26.16b, v26.16b, v27.16b
        str     q26, [x6, x2]
        uqsub   v26.16b, v26.16b, v31.16b
        umax    v0.16b, v0.16b, v26.16b
        umax    v24.16b, v24.16b, v26.16b
        str     q0, [x7, x2]
        mov     v27.16b, v24.16b   // Superfluous mov
        ldr     q25, [x5, x2]
        add     x2, x2, 16
        mov     v28.16b, v25.16b
        cmp     w8, w3
        bgt     .L4

Whereas Clang emits a tighter sequence:

.LBB0_4:
        ldr     q5, [x14, x13]
        uqadd   v4.16b, v4.16b, v5.16b
        ldr     q5, [x7, x13]
        uqsub   v4.16b, v4.16b, v2.16b
        umax    v4.16b, v4.16b, v5.16b
        uqsub   v5.16b, v5.16b, v1.16b
        umax    v4.16b, v4.16b, v3.16b
        uqsub   v3.16b, v3.16b, v1.16b
        str     q4, [x6, x13]
        uqsub   v4.16b, v4.16b, v0.16b
        umax    v5.16b, v5.16b, v4.16b
        umax    v3.16b, v3.16b, v4.16b
        str     q5, [x7, x13]
        ldr     q4, [x12, x13]
        add     x13, x13, #16
        cmp     x11, x13
        b.ne    .LBB0_4

This ends up hurting performance considerably in the original application, where this sequence is hot.
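For illustration only (not part of the original report): below is the same inner-loop body written directly on uint8x16_t. It makes explicit that every simde_mm_* macro above is a single unsigned-saturating NEON op plus a bitcast between int64x2_t and uint8x16_t in the same 128-bit register, so the extra mov in GCC's loop moves data that never changes representation. The function name, pointer types and the trailing (void) use are my assumptions, and I have not checked whether this variant avoids the mov.

// Illustration only: inner loop on native uint8x16_t, no int64x2_t reinterprets.
#include <arm_neon.h>
#include <cstdint>

void inner_loop_u8(const uint8_t* vP, uint8_t* pvE, uint8_t* pvHStore,
                   const uint8_t* pvHLoad, int32_t segLen,
                   uint8_t gap_open, uint8_t gap_extend, uint8_t bias)
{
  uint8x16_t vGapO = vdupq_n_u8(gap_open);
  uint8x16_t vGapE = vdupq_n_u8(gap_extend);
  uint8x16_t vBias = vdupq_n_u8(bias);
  uint8x16_t vH = vdupq_n_u8(0);
  uint8x16_t vF = vdupq_n_u8(0);
  uint8x16_t vMaxColumn = vdupq_n_u8(0);

#pragma GCC unroll 1
  for (int j = 0; j < segLen; ++j) {
    uint8x16_t score = vld1q_u8(vP + 16 * j);   // simde_mm_load_si128
    vH = vqaddq_u8(vH, score);                  // simde_mm_adds_epu8
    vH = vqsubq_u8(vH, vBias);                  // simde_mm_subs_epu8
    uint8x16_t e = vld1q_u8(pvE + 16 * j);
    vH = vmaxq_u8(vH, e);                       // simde_mm_max_epu8
    vH = vmaxq_u8(vH, vF);
    vMaxColumn = vmaxq_u8(vMaxColumn, vH);
    vst1q_u8(pvHStore + 16 * j, vH);            // simde_mm_store_si128
    vH = vqsubq_u8(vH, vGapO);
    e = vqsubq_u8(e, vGapE);
    e = vmaxq_u8(e, vH);
    vst1q_u8(pvE + 16 * j, e);
    vF = vqsubq_u8(vF, vGapE);
    vF = vmaxq_u8(vF, vH);
    vH = vld1q_u8(pvHLoad + 16 * j);
  }

  // Mirrors the store after the inner loop that exposes the extra mov above.
  vst1q_u8(pvHStore, vqsubq_u8(vF, vH));
  (void)vMaxColumn;  // consumed by the surrounding code in the original
}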