[Bug target/91681] New: Missed optimization for 128 bit arithmetic operations

antoshkka at gmail dot com Fri, 06 Sep 2019 03:12:08 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91681


            Bug ID: 91681
           Summary: Missed optimization for 128 bit arithmetic operations
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: antoshkka at gmail dot com
  Target Milestone: ---

Consider the function:

// Reproducer: forms four full 64x64 -> 128-bit products (assuming a 64-bit
// unsigned long) and stores two 128-bit sums through `o`:
//   o[0] = a*c + (upper 64 bits of b*c)
//   o[1] = a*d + (upper 64 bits of b*d)
void multiply128x64x2_3 ( 
    const unsigned long a, 
    const unsigned long b, 
    const unsigned long c, 
    const unsigned long d, 
    __uint128_t o[2]
  ) noexcept
{
    // One operand is widened to __uint128_t before each multiplication so the
    // product is computed in 128 bits rather than truncated to unsigned long.
    __uint128_t B0 = __uint128_t{ b } * c;
    __uint128_t B2 = __uint128_t{ a } * c;
    __uint128_t B1 = __uint128_t{ b } * d;
    __uint128_t B3 = __uint128_t{ a } * d;

    // Only the upper halves of B0 and B1 contribute; their low 64 bits are
    // shifted out and discarded.
    o[0] = B2 + (B0 >> 64);
    o[1] = B3 + (B1 >> 64);
}


With compilation flags "-O2 -std=c++17 -mavx" the following assembly is
produced:

# Listing as produced by the compiler (Intel syntax). Incoming registers follow
# the System V AMD64 argument order: rdi = a, rsi = b, rdx = c, rcx = d, r8 = o.
# Each `mul` multiplies rax by its operand and writes the result to rdx:rax.
multiply128x64x2_3(unsigned long, unsigned long, unsigned long, unsigned long,
unsigned __int128*):
  mov rax, rdx                    # rax = c
  push rbx                        # save callee-saved rbx (stack access)
  mov rbx, rdx                    # rbx = c, kept because mul overwrites rdx
  mov r9, rdi                     # r9 = a, kept because rdi is zeroed below
  mul rsi                         # rdx:rax = c * b
  mov rax, rdx                    # rax = high 64 bits of b*c
  xor edx, edx                    # rdx = 0
  mov r10, rax                    # r10 = high 64 bits of b*c
  mov rax, rbx                    # rax = c
  mov r11, rdx                    # r11 = 0, so r11:r10 = (b*c) >> 64
  pop rbx                         # restore rbx (stack access)
  mul rdi                         # rdx:rax = c * a
  add rax, r10                    # low half of a*c + ((b*c) >> 64)
  adc rdx, r11                    # high half: adds 0 plus the carry
  mov QWORD PTR [r8], rax         # o[0], low 64 bits
  mov rax, rsi                    # rax = b
  xor edi, edi                    # rdi = 0 (upper half of (b*d) >> 64)
  mov QWORD PTR [r8+8], rdx       # o[0], high 64 bits
  mul rcx                         # rdx:rax = b * d
  mov rax, rcx                    # rax = d
  mov rsi, rdx                    # rsi = high 64 bits of b*d
  mul r9                          # rdx:rax = d * a
  add rsi, rax                    # low half of a*d + ((b*d) >> 64)
  adc rdi, rdx                    # high half: 0 + high(a*d) + carry
  mov QWORD PTR [r8+16], rsi      # o[1], low 64 bits
  mov QWORD PTR [r8+24], rdi      # o[1], high 64 bits
  ret

However, this is sub-optimal. Touching the stack is not necessary, and the same
result can be achieved with fewer instructions:

# Proposed shorter sequence (Intel syntax). Incoming registers follow the
# System V AMD64 argument order: rdi = a, rsi = b, rdx = c, rcx = d, r8 = o.
# No callee-saved register is used, so nothing is pushed or popped.
# Each `mul` multiplies rax by its operand and writes the result to rdx:rax.
multiply128x64x2_3(unsigned long, unsigned long, unsigned long, unsigned long,
unsigned __int128*):
  mov r9, r8                      # r9 = o, freeing r8
  mov r8, rdx                     # r8 = c, kept because mul overwrites rdx
  mov rax, rsi                    # rax = b
  mul r8                          # rdx:rax = b * c
  mov rax, r8                     # rax = c
  mov r10, rdx                    # r10 = high 64 bits of b*c
  mul rdi                         # rdx:rax = c * a
  add r10, rax                    # low half of a*c + ((b*c) >> 64); sets CF
  mov rax, rsi                    # rax = b (mov leaves CF intact)
  mov QWORD PTR [r9], r10         # o[0], low 64 bits
  adc rdx, 0                      # high half of a*c plus the carry from the add
  mov QWORD PTR [8+r9], rdx       # o[0], high 64 bits
  mul rcx                         # rdx:rax = b * d
  mov rax, rdi                    # rax = a
  mov r11, rdx                    # r11 = high 64 bits of b*d
  mul rcx                         # rdx:rax = a * d
  add r11, rax                    # low half of a*d + ((b*d) >> 64); sets CF
  mov QWORD PTR [16+r9], r11      # o[1], low 64 bits (mov leaves CF intact)
  adc rdx, 0                      # high half of a*d plus the carry from the add
  mov QWORD PTR [24+r9], rdx      # o[1], high 64 bits
  ret

[Bug target/91681] New: Missed optimization for 128 bit arithmetic operations

Reply via email to