https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121463
Bug ID: 121463 Summary: Suboptimal scalar absdiff codegen Product: gcc Version: 15.0 Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: ktkachov at gcc dot gnu.org Target Milestone: --- Target: aarch64 The C++ testcases: #include <stdbool.h> #include <stdint.h> #include <utility> typedef uint8_t u8; typedef uint16_t u16; typedef uint32_t u32; typedef uint64_t u64; typedef __uint128_t u128; typedef int8_t i8; typedef int16_t i16; typedef int32_t i32; typedef int64_t i64; typedef __int128_t i128; template <typename T> auto src(T x, T y) -> T { T diff1 = x - y; T diff2 = y - x; return x > y ? diff1 : diff2; } template <typename T> auto tgt(T x, T y) -> T { T diff = x - y; return x > y ? diff : -diff; } extern "C" { auto src_u8(u8 x, u8 y) -> u8 { return src(x, y); } auto tgt_u8(u8 x, u8 y) -> u8 { return tgt(x, y); } auto src_i8(i8 x, i8 y) -> i8 { return src(x, y); } auto tgt_i8(i8 x, i8 y) -> i8 { return tgt(x, y); } auto src_u16(u16 x, u16 y) -> u16 { return src(x, y); } auto tgt_u16(u16 x, u16 y) -> u16 { return tgt(x, y); } auto src_i16(i16 x, i16 y) -> i16 { return src(x, y); } auto tgt_i16(i16 x, i16 y) -> i16 { return tgt(x, y); } auto src_u32(u32 x, u32 y) -> u32 { return src(x, y); } auto tgt_u32(u32 x, u32 y) -> u32 { return tgt(x, y); } auto src_i32(i32 x, i32 y) -> i32 { return src(x, y); } auto tgt_i32(i32 x, i32 y) -> i32 { return tgt(x, y); } auto src_u64(u64 x, u64 y) -> u64 { return src(x, y); } auto tgt_u64(u64 x, u64 y) -> u64 { return tgt(x, y); } auto src_i64(i64 x, i64 y) -> i64 { return src(x, y); } auto tgt_i64(i64 x, i64 y) -> i64 { return tgt(x, y); } auto src_u128(u128 x, u128 y) -> u128 { return src(x, y); } auto tgt_u128(u128 x, u128 y) -> u128 { return tgt(x, y); } auto src_i128(i128 x, i128 y) -> i128 { return src(x, y); } auto tgt_i128(i128 x, i128 y) -> i128 { return tgt(x, y); } } generate suboptimal aarch64 code with GCC for aarch64 with 
-O3 -fwrapv: src_u8: and w0, w0, 255 and w1, w1, 255 sub w2, w0, w1 sub w3, w1, w0 cmp w0, w1 and w2, w2, 255 and w0, w3, 255 csel w0, w0, w2, ls ret tgt_u8: and w0, w0, 255 and w1, w1, 255 sub w2, w0, w1 sub w3, w1, w0 cmp w0, w1 and w2, w2, 255 and w0, w3, 255 csel w0, w0, w2, ls ret src_i8: sxtb w0, w0 sxtb w1, w1 sub w2, w0, w1 sub w3, w1, w0 cmp w0, w1 sxtb w2, w2 sxtb w0, w3 csel w0, w0, w2, le ret tgt_i8: sxtb w0, w0 sxtb w2, w1 subs w1, w0, w2 sxtb w1, w1 neg w0, w1 sxtb w0, w0 csel w0, w0, w1, le ret src_u16: and w0, w0, 65535 and w1, w1, 65535 sub w2, w0, w1 sub w3, w1, w0 cmp w0, w1 and w2, w2, 65535 and w0, w3, 65535 csel w0, w0, w2, ls ret tgt_u16: and w0, w0, 65535 and w1, w1, 65535 sub w2, w0, w1 sub w3, w1, w0 cmp w0, w1 and w2, w2, 65535 and w0, w3, 65535 csel w0, w0, w2, ls ret src_i16: sxth w0, w0 sxth w1, w1 sub w2, w0, w1 sub w3, w1, w0 cmp w0, w1 sxth w2, w2 sxth w0, w3 csel w0, w0, w2, le ret tgt_i16: sxth w0, w0 sxth w2, w1 subs w1, w0, w2 sxth w1, w1 neg w0, w1 sxth w0, w0 csel w0, w0, w1, le ret src_u32: subs w2, w0, w1 sub w0, w1, w0 csel w0, w0, w2, ls ret tgt_u32: subs w2, w0, w1 sub w0, w1, w0 csel w0, w0, w2, ls ret src_i32: subs w2, w0, w1 sub w0, w1, w0 csel w0, w0, w2, le ret tgt_i32: subs w2, w0, w1 sub w0, w1, w0 csel w0, w0, w2, le ret src_u64: subs x2, x0, x1 sub x0, x1, x0 csel x0, x0, x2, ls ret tgt_u64: subs x2, x0, x1 sub x0, x1, x0 csel x0, x0, x2, ls ret src_i64: subs x2, x0, x1 sub x0, x1, x0 csel x0, x0, x2, le ret tgt_i64: subs x2, x0, x1 sub x0, x1, x0 csel x0, x0, x2, le ret src_u128: cmp x1, x3 bhi .L52 beq .L53 .L49: subs x0, x2, x0 sbc x1, x3, x1 ret .L53: cmp x0, x2 bls .L49 .L52: subs x0, x0, x2 sbc x1, x1, x3 ret tgt_u128: cmp x1, x3 bhi .L58 beq .L59 .L55: subs x0, x2, x0 sbc x1, x3, x1 ret .L59: cmp x0, x2 bls .L55 .L58: subs x0, x0, x2 sbc x1, x1, x3 ret src_i128: cmp x1, x3 bgt .L64 beq .L65 .L61: subs x0, x2, x0 sbc x1, x3, x1 ret .L65: cmp x0, x2 bls .L61 .L64: subs x0, x0, x2 sbc x1, x1, x3 ret tgt_i128: 
cmp x1, x3 bgt .L70 beq .L71 .L67: subs x0, x2, x0 sbc x1, x3, x1 ret .L71: cmp x0, x2 bls .L67 .L70: subs x0, x0, x2 sbc x1, x1, x3 ret LLVM generates more compact and branchless sequences: https://godbolt.org/z/38x8r3zsT I've marked this as a target bug, but it may also be caused by missing mid-end optimisations.