https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125474

            Bug ID: 125474
           Summary: LoongArch: missing usdot_prodMN patterns
           Product: gcc
           Version: 17.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: xry111 at gcc dot gnu.org
  Target Milestone: ---

Code simplified from SPEC 2026 706.stockfish_r:

#include <cstdint>

using IndexType = std::uint32_t;

// Reduced matrix-vector kernel: for each of the 0x1000 rows, output[i] is
// biases[i] plus the dot product of row i of `weights` with `input`.
//
//   output  - receives 0x1000 32-bit sums, one per row; declared __restrict__.
//   weights - signed 8-bit values, indexed as weights[i * 0x1000 + j].
//   biases  - 0x1000 32-bit starting values, one per output element.
//   input   - 0x1000 unsigned 8-bit values, reused for every row.
void transform(std::int32_t *__restrict__ output, const std::int8_t *weights,
               const std::int32_t *biases, const std::uint8_t *input) {
  for (IndexType i = 0; i < 0x1000; ++i) {
    // Index of the first element of row i in the flat weights array.
    const IndexType offset = i * 0x1000;

    // The accumulator starts from the bias rather than from zero.
    std::int32_t sum = biases[i];
    for (IndexType j = 0; j < 0x1000; ++j) {
      // Signed 8-bit times unsigned 8-bit, accumulated into 32 bits: the
      // mixed-signedness reduction this report is about.
      sum += weights[offset + j] * input[j];
    }
    output[i] = sum;
  }
}

The loop body is compiled to:

.L2:
        xvldx   $xr3,$r13,$r12
        xvldx   $xr1,$r7,$r12
        xvmulwod.h.b    $xr2,$xr3,$xr6
        xvmulwev.h.b    $xr0,$xr3,$xr6
        xvhaddw.w.h     $xr4,$xr2,$xr2
        xvhaddw.w.h     $xr0,$xr0,$xr0
        addi.d  $r12,$r12,32
        xvadd.w $xr0,$xr0,$xr4
        xvadd.w $xr5,$xr0,$xr5
        xvadd.b $xr1,$xr1,$xr7
        xvadd.w $xr0,$xr0,$xr5
        xvmulwev.h.b    $xr2,$xr1,$xr3
        xvmulwod.h.b    $xr1,$xr1,$xr3
        xvhaddw.w.h     $xr2,$xr2,$xr2
        xvhaddw.w.h     $xr1,$xr1,$xr1
        xvadd.w $xr2,$xr2,$xr1
        xvadd.w $xr5,$xr2,$xr0
        bne     $r12,$r14,.L2

But if the LoongArch backend provided the usdot_prod patterns, we could generate much better code:

.L2:
        xvldx   $xr0,$r7,$r12
        xvldx   $xr2,$r13,$r12
        addi.d  $r12,$r12,32
        xvmulwev.h.bu.b $xr1,$xr0,$xr2
        xvmulwod.h.bu.b $xr0,$xr0,$xr2
        xvhaddw.w.h     $xr1,$xr1,$xr1
        xvhaddw.w.h     $xr0,$xr0,$xr0
        xvadd.w $xr1,$xr1,$xr0
        xvadd.w $xr3,$xr1,$xr3
        bne     $r12,$r14,.L2

Reply via email to