[Bug target/121463] New: Suboptimal scalar absdiff codegen

ktkachov at gcc dot gnu.org via Gcc-bugs Fri, 08 Aug 2025 02:39:03 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121463


            Bug ID: 121463
           Summary: Suboptimal scalar absdiff codegen
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

The C++ testcases:
#include <stdbool.h>
#include <stdint.h>

#include <utility>

// Short aliases for the unsigned integer types exercised by the wrappers below.
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef __uint128_t u128;

// Short aliases for the corresponding signed integer types.
typedef int8_t i8;
typedef int16_t i16;
typedef int32_t i32;
typedef int64_t i64;
typedef __int128_t i128;

// First form of the absolute difference under test: evaluates both x - y and
// y - x, then returns x - y when x > y and y - x otherwise.
// Each difference is stored back into T before the selection.
template <typename T>
auto src(T x, T y) -> T {
    T diff1 = x - y;
    T diff2 = y - x;
    return x > y ? diff1 : diff2;
}

// Second form of the absolute difference under test: evaluates x - y once,
// then returns it when x > y and its negation otherwise.
// NOTE(review): presumably meant as the rewritten counterpart of src(); the
// listings in this report were produced with -fwrapv.
template <typename T>
auto tgt(T x, T y) -> T {
    T diff = x - y;
    return x > y ? diff : -diff;
}

// Non-template wrappers that call src() and tgt() once for each integer width
// and signedness. They are declared with C linkage so that each one is emitted
// under its plain name (src_u8, tgt_u8, ...), matching the labels in the
// assembly listing.
extern "C" {
auto src_u8(u8 x, u8 y) -> u8 { return src(x, y); }
auto tgt_u8(u8 x, u8 y) -> u8 { return tgt(x, y); }

auto src_i8(i8 x, i8 y) -> i8 { return src(x, y); }
auto tgt_i8(i8 x, i8 y) -> i8 { return tgt(x, y); }

auto src_u16(u16 x, u16 y) -> u16 { return src(x, y); }
auto tgt_u16(u16 x, u16 y) -> u16 { return tgt(x, y); }

auto src_i16(i16 x, i16 y) -> i16 { return src(x, y); }
auto tgt_i16(i16 x, i16 y) -> i16 { return tgt(x, y); }

auto src_u32(u32 x, u32 y) -> u32 { return src(x, y); }
auto tgt_u32(u32 x, u32 y) -> u32 { return tgt(x, y); }

auto src_i32(i32 x, i32 y) -> i32 { return src(x, y); }
auto tgt_i32(i32 x, i32 y) -> i32 { return tgt(x, y); }

auto src_u64(u64 x, u64 y) -> u64 { return src(x, y); }
auto tgt_u64(u64 x, u64 y) -> u64 { return tgt(x, y); }

auto src_i64(i64 x, i64 y) -> i64 { return src(x, y); }
auto tgt_i64(i64 x, i64 y) -> i64 { return tgt(x, y); }

auto src_u128(u128 x, u128 y) -> u128 { return src(x, y); }
auto tgt_u128(u128 x, u128 y) -> u128 { return tgt(x, y); }

auto src_i128(i128 x, i128 y) -> i128 { return src(x, y); }
auto tgt_i128(i128 x, i128 y) -> i128 { return tgt(x, y); }
}

generate suboptimal code with GCC for aarch64 at -O3 -fwrapv:
src_u8:
        and     w0, w0, 255
        and     w1, w1, 255
        sub     w2, w0, w1
        sub     w3, w1, w0
        cmp     w0, w1
        and     w2, w2, 255
        and     w0, w3, 255
        csel    w0, w0, w2, ls
        ret
tgt_u8:
        and     w0, w0, 255
        and     w1, w1, 255
        sub     w2, w0, w1
        sub     w3, w1, w0
        cmp     w0, w1
        and     w2, w2, 255
        and     w0, w3, 255
        csel    w0, w0, w2, ls
        ret
src_i8:
        sxtb    w0, w0
        sxtb    w1, w1
        sub     w2, w0, w1
        sub     w3, w1, w0
        cmp     w0, w1
        sxtb    w2, w2
        sxtb    w0, w3
        csel    w0, w0, w2, le
        ret
tgt_i8:
        sxtb    w0, w0
        sxtb    w2, w1
        subs    w1, w0, w2
        sxtb    w1, w1
        neg     w0, w1
        sxtb    w0, w0
        csel    w0, w0, w1, le
        ret
src_u16:
        and     w0, w0, 65535
        and     w1, w1, 65535
        sub     w2, w0, w1
        sub     w3, w1, w0
        cmp     w0, w1
        and     w2, w2, 65535
        and     w0, w3, 65535
        csel    w0, w0, w2, ls
        ret
tgt_u16:
        and     w0, w0, 65535
        and     w1, w1, 65535
        sub     w2, w0, w1
        sub     w3, w1, w0
        cmp     w0, w1
        and     w2, w2, 65535
        and     w0, w3, 65535
        csel    w0, w0, w2, ls
        ret
src_i16:
        sxth    w0, w0
        sxth    w1, w1
        sub     w2, w0, w1
        sub     w3, w1, w0
        cmp     w0, w1
        sxth    w2, w2
        sxth    w0, w3
        csel    w0, w0, w2, le
        ret
tgt_i16:
        sxth    w0, w0
        sxth    w2, w1
        subs    w1, w0, w2
        sxth    w1, w1
        neg     w0, w1
        sxth    w0, w0
        csel    w0, w0, w1, le
        ret
src_u32:
        subs    w2, w0, w1
        sub     w0, w1, w0
        csel    w0, w0, w2, ls
        ret
tgt_u32:
        subs    w2, w0, w1
        sub     w0, w1, w0
        csel    w0, w0, w2, ls
        ret
src_i32:
        subs    w2, w0, w1
        sub     w0, w1, w0
        csel    w0, w0, w2, le
        ret
tgt_i32:
        subs    w2, w0, w1
        sub     w0, w1, w0
        csel    w0, w0, w2, le
        ret
src_u64:
        subs    x2, x0, x1
        sub     x0, x1, x0
        csel    x0, x0, x2, ls
        ret
tgt_u64:
        subs    x2, x0, x1
        sub     x0, x1, x0
        csel    x0, x0, x2, ls
        ret
src_i64:
        subs    x2, x0, x1
        sub     x0, x1, x0
        csel    x0, x0, x2, le
        ret
tgt_i64:
        subs    x2, x0, x1
        sub     x0, x1, x0
        csel    x0, x0, x2, le
        ret
src_u128:
        cmp     x1, x3
        bhi     .L52
        beq     .L53
.L49:
        subs    x0, x2, x0
        sbc     x1, x3, x1
        ret
.L53:
        cmp     x0, x2
        bls     .L49
.L52:
        subs    x0, x0, x2
        sbc     x1, x1, x3
        ret
tgt_u128:
        cmp     x1, x3
        bhi     .L58
        beq     .L59
.L55:
        subs    x0, x2, x0
        sbc     x1, x3, x1
        ret
.L59:
        cmp     x0, x2
        bls     .L55
.L58:
        subs    x0, x0, x2
        sbc     x1, x1, x3
        ret
src_i128:
        cmp     x1, x3
        bgt     .L64
        beq     .L65
.L61:
        subs    x0, x2, x0
        sbc     x1, x3, x1
        ret
.L65:
        cmp     x0, x2
        bls     .L61
.L64:
        subs    x0, x0, x2
        sbc     x1, x1, x3
        ret
tgt_i128:
        cmp     x1, x3
        bgt     .L70
        beq     .L71
.L67:
        subs    x0, x2, x0
        sbc     x1, x3, x1
        ret
.L71:
        cmp     x0, x2
        bls     .L67
.L70:
        subs    x0, x0, x2
        sbc     x1, x1, x3
        ret

LLVM generates more compact and branchless sequences:
https://godbolt.org/z/38x8r3zsT

I've marked this as a target bug, but there may be missing mid-end optimisations too.

[Bug target/121463] New: Suboptimal scalar absdiff codegen

Reply via email to