https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112105

            Bug ID: 112105
           Summary: [14 Regression] vector by lane operation costing
                    broken since
                    g:21416caf221fae4351319ef8ca8d41c0234bdfa7
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
                CC: rsandifo at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64-*

After this commit g:21416caf221fae4351319ef8ca8d41c0234bdfa7

commit 21416caf221fae4351319ef8ca8d41c0234bdfa7
Author: Richard Sandiford <richard.sandif...@arm.com>
Date:   Tue Oct 24 11:01:52 2023 +0100

    aarch64: Define TARGET_INSN_COST

    This patch adds a bare-bones TARGET_INSN_COST.  See the comment
    in the patch for the rationale.

we now fail to form by-lane instructions when the lane operand has more than one use:

> cat test.c

#include <arm_neon.h>
/* Record of two consecutive floats; the field names suggest the real and
   imaginary parts of a single-precision complex number.  */
typedef struct {
  float re;
  float im;
} cmplx_f32_t;

/* Reduced test case: loads from p_src_a and p_src_b with vld2_f32, multiplies
   both vectors loaded from p_src_a by lane 0 of the same vector loaded from
   p_src_b, and stores the two products to p_dst with vst2_f32.

   The code is deliberately left in its reduced form; do not tidy it up, as
   the exact shape is what the report is about.  */
void test2x2_f32(const cmplx_f32_t *p_src_a,
             const cmplx_f32_t *p_src_b,
             cmplx_f32_t *p_dst) {
  /* View the struct arrays as flat float arrays for the intrinsics.  */
  const float32_t *a_ptr = (const float32_t *)p_src_a;
  const float32_t *b_ptr = (const float32_t *)p_src_b;
  float32_t *out_ptr = (float32_t *)p_dst;

  /* Only element [0] of each of these arrays is used below.  */
  float32x2x2_t a_col[2];
  float32x2x2_t b[2];
  float32x2x2_t result[2];

  a_col[0] = vld2_f32(a_ptr);
  b[0] = vld2_f32(b_ptr);

  /* Both multiplies select lane 0 of b[0].val[0], so that lane operand has
     two uses rather than one.  */
  result[0].val[0] = vmul_lane_f32(a_col[0].val[0], b[0].val[0], 0);
  result[0].val[1] = vmul_lane_f32(a_col[0].val[1], b[0].val[0], 0);

  vst2_f32(out_ptr, result[0]);
  /* out_ptr is not read again after this increment.  */
  out_ptr = out_ptr + 4;
}

---
> ./bin/gcc test.c -O1 -S -o -
...
        test2x2_f32:
        ld2     {v27.2s - v28.2s}, [x0]
        ld2     {v30.2s - v31.2s}, [x1]
        dup     v31.2s, v30.s[0]
        fmul    v29.2s, v31.2s, v27.2s
        fmul    v30.2s, v31.2s, v28.2s
        st2     {v29.2s - v30.2s}, [x2]
        ret

which has an unneeded dup.  Before this commit we generated:

test2x2_f32:
        ld2     {v0.2s - v1.2s}, [x1]
        ld2     {v4.2s - v5.2s}, [x0]
        fmul    v2.2s, v4.2s, v0.s[0]
        fmul    v3.2s, v5.2s, v0.s[0]
        st2     {v2.2s - v3.2s}, [x2]
        ret

Reply via email to