https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119384
Bug ID: 119384
Summary: Extra move in tight loop with SIMD and subregs
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: ktkachov at gcc dot gnu.org
Target Milestone: ---
Target: aarch64
We have a workload for aarch64 using the SIMDe translation layer that results
in slower code than Clang due to an extra move emitted in a hot, tight loop.
The reduced source code is:
#include <arm_neon.h>
#define simde_mm_set1_epi8(x) (vreinterpretq_s64_u8(vdupq_n_u8(x)))
#define simde_mm_load_si128(x) \
(vreinterpretq_s64_s32(vld1q_s32((reinterpret_cast<int32_t const*>(x)))))
#define simde_mm_adds_epu8(x, y) \
(vreinterpretq_s64_u8(vqaddq_u8(vreinterpretq_u8_s64(x), \
vreinterpretq_u8_s64(y))))
#define simde_mm_subs_epu8(x, y) \
(vreinterpretq_s64_u8(vqsubq_u8(vreinterpretq_u8_s64(x), \
vreinterpretq_u8_s64(y))))
#define simde_mm_max_epu8(x, y) \
(vreinterpretq_s64_u8(vmaxq_u8(vreinterpretq_u8_s64(x), \
vreinterpretq_u8_s64(y))))
#define simde_mm_store_si128(x, y) (vst1q_s32((reinterpret_cast<int32_t*>(x)), \
vreinterpretq_s32_s64(y)))
void foobar(
int64x2_t* query_profile_byte, unsigned char *db_sequence, int8_t ref_dir,
int32_t db_length, int32_t query_length, int64x2_t* pvHStore,
int64x2_t* pvHLoad, int64x2_t* pvE, uint8_t gap_open, uint8_t gap_extend,
uint8_t bias, uint8_t * maxColumn)
{
const int SIMD_SIZE = 4 * 4;
int32_t segLen = (query_length + SIMD_SIZE-1) / SIMD_SIZE;
int64x2_t vZero = simde_mm_set1_epi8(0);
int64x2_t vGapO = simde_mm_set1_epi8(gap_open);
int64x2_t vGapE = simde_mm_set1_epi8(gap_extend);
int64x2_t vBias = simde_mm_set1_epi8(bias);
int64x2_t vTemp;
#pragma GCC unroll 1
for (int i = 0; __builtin_expect((i != db_length),1); i++) {
int64x2_t e, vF = vZero, vMaxColumn = vZero;
int64x2_t vH = vZero;
const int64x2_t* vP = query_profile_byte + db_sequence[i] * segLen;
int64x2_t* pv = pvHLoad;
pvHLoad = pvHStore;
pvHStore = pv;
// loop of interest
#pragma GCC unroll 1
for (int j = 0; __builtin_expect((j < segLen),1); ++j)
{
int64x2_t score = simde_mm_set1_epi8(0);
score = simde_mm_load_si128(vP + j);
vH = simde_mm_adds_epu8(vH, score);
vH = simde_mm_subs_epu8(vH, vBias);
e = simde_mm_load_si128(pvE + j);
vH = simde_mm_max_epu8(vH, e);
vH = simde_mm_max_epu8(vH, vF);
vMaxColumn = simde_mm_max_epu8(vMaxColumn, vH);
simde_mm_store_si128(pvHStore + j, vH);
vH = simde_mm_subs_epu8(vH, vGapO);
e = simde_mm_subs_epu8(e, vGapE);
e = simde_mm_max_epu8(e, vH);
simde_mm_store_si128(pvE + j, e);
vF = simde_mm_subs_epu8(vF, vGapE);
vF = simde_mm_max_epu8(vF, vH);
vH = simde_mm_load_si128(pvHLoad + j);
}
// comment this line to have previous loop without MOVs:
simde_mm_store_si128(pvHStore, simde_mm_subs_epu8(vF, vH));
}
}
Compiled with -O3 -mcpu=neoverse-v2, for example produces the loop:
.L4:
ldr q26, [x4, x2]
uqsub v24.16b, v27.16b, v30.16b
add w3, w3, 1
ldr q0, [x7, x2]
uqadd v26.16b, v28.16b, v26.16b
uqsub v26.16b, v26.16b, v29.16b
umax v26.16b, v26.16b, v0.16b
uqsub v0.16b, v0.16b, v30.16b
umax v26.16b, v26.16b, v27.16b
str q26, [x6, x2]
uqsub v26.16b, v26.16b, v31.16b
umax v0.16b, v0.16b, v26.16b
umax v24.16b, v24.16b, v26.16b
str q0, [x7, x2]
mov v27.16b, v24.16b // Superfluous mov
ldr q25, [x5, x2]
add x2, x2, 16
mov v28.16b, v25.16b
cmp w8, w3
bgt .L4
Whereas Clang emits a tighter sequence:
.LBB0_4:
ldr q5, [x14, x13]
uqadd v4.16b, v4.16b, v5.16b
ldr q5, [x7, x13]
uqsub v4.16b, v4.16b, v2.16b
umax v4.16b, v4.16b, v5.16b
uqsub v5.16b, v5.16b, v1.16b
umax v4.16b, v4.16b, v3.16b
uqsub v3.16b, v3.16b, v1.16b
str q4, [x6, x13]
uqsub v4.16b, v4.16b, v0.16b
umax v5.16b, v5.16b, v4.16b
umax v3.16b, v3.16b, v4.16b
str q5, [x7, x13]
ldr q4, [x12, x13]
add x13, x13, #16
cmp x11, x13
b.ne .LBB0_4
This ends up hurting performance considerably in the original application where
this sequence is hot.