https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124546

            Bug ID: 124546
           Summary: Missed optimisation of 128-bit widening multiply
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---

Input:

#include <cstdint>

unsigned char add_uint64(uint64_t op1, uint64_t op2, uint64_t *r) {
  // Add with carry-out: *r = op1 + op2, return 1 if the addition wrapped.
  *r = op1 + op2;
  return static_cast<unsigned char>(*r < op1);
}

void mul_uint64 (uint64_t op1, uint64_t op2, uint64_t *r) {
  // Schoolbook 64x64->128 multiply built from 32-bit limbs.
  auto op1_c_r = op1 & 0x00000000FFFFFFFF;  // low halves
  auto op2_c_r = op2 & 0x00000000FFFFFFFF;
  op1 >>= 32;                               // high halves
  op2 >>= 32;

  auto m1 = op1 * op2_c_r;
  uint64_t m;
  // m holds the sum of the two cross products; add_uint64 captures its carry,
  // which is shifted into place and added to the high partial product.
  auto left = op1 * op2 +
              (static_cast<uint64_t>(add_uint64(m1, op2 * op1_c_r, &m)) << 32);
  auto rt = op1_c_r * op2_c_r;              // low partial product
  auto ts = (rt >> 32) + (m & 0x00000000FFFFFFFF);

  r[1] = static_cast<uint64_t>(left + (m >> 32) + (ts >> 32));            // high 64 bits
  r[0] = static_cast<uint64_t>((ts << 32) | (rt & 0x00000000FFFFFFFF));   // low 64 bits
}

On aarch64 with -O3 we generate:
add_uint64(unsigned long, unsigned long, unsigned long*):
        adds    x1, x0, x1
        str     x1, [x2]
        cset    w0, cs
        ret
mul_uint64(unsigned long, unsigned long, unsigned long*):
        lsr     x4, x0, 32
        lsr     x6, x1, 32
        umull   x3, w0, w1
        umull   x1, w1, w4
        and     x5, x3, 4294967295
        umaddl  x0, w0, w6, x1
        cmp     x1, x0
        and     x1, x0, 4294967295
        lsr     x0, x0, 32
        add     x3, x1, x3, lsr 32
        cset    x1, hi
        orr     x5, x5, x3, lsl 32
        umaddl  x4, w4, w6, x0
        extr    x0, x1, x3, 32
        add     x0, x0, x4
        stp     x5, x0, [x2]
        ret

whereas Clang detects the widening multiply:
add_uint64(unsigned long, unsigned long, unsigned long*):
        adds    x8, x1, x0
        cset    w0, hs
        str     x8, [x2]
        ret

mul_uint64(unsigned long, unsigned long, unsigned long*):
        umulh   x8, x1, x0
        mul     x9, x0, x1
        stp     x9, x8, [x2]
        ret

This causes GCC to be much slower than Clang on a crypto workload where this
pattern appears.
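
For reference, a minimal sketch of the expected form, written with the
unsigned __int128 extension (available on aarch64; the helper name
mul_uint64_128 is only for illustration). GCC already expands this form to the
mul/umulh pair:

#include <cstdint>

// Sketch only: form the full 128-bit product once and split it into its
// low and high 64-bit halves.
void mul_uint64_128 (uint64_t op1, uint64_t op2, uint64_t *r) {
  unsigned __int128 p = static_cast<unsigned __int128>(op1) * op2;
  r[0] = static_cast<uint64_t>(p);        // low 64 bits  -> mul
  r[1] = static_cast<uint64_t>(p >> 64);  // high 64 bits -> umulh
}

Recognising the limb-based idiom in mul_uint64 as this form would allow the
same mul/umulh sequence to be emitted.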
