https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121240

            Bug ID: 121240
           Summary: missing support for section anchors to FP or vector
                    constants
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

I couldn't find an existing ticket for this, but the following example

const double b[4] = {0.2435334343f, 0.2233535343f, 0.4232433f, 0.34343434f};
typedef double v2df __attribute__ ((vector_size (16)));
typedef double v2df __attribute__ ((vector_size (16)));

v2df f (v2df c1, v2df c2)
{
   v2df a1 = *(v2df *)&b[0];
   v2df a2 = *(v2df *)&b[2];
   return (a1 * c1) + (a2 * c2);
}

at -O3 on AArch64 generates:

f:
        adrp    x0, .LC0
        ldr     q30, [x0, #:lo12:.LC0]
        adrp    x0, .LC1
        ldr     q31, [x0, #:lo12:.LC1]
        fmul    v1.2d, v1.2d, v30.2d
        fmla    v1.2d, v0.2d, v31.2d
        mov     v0.16b, v1.16b
        ret
        .global b
        .section        .rodata.cst16,"aM",@progbits,16
        .align  4
.LC0:
        .word   536870912
        .word   1071322731
        .word   0
        .word   1070987988
        .align  4
.LC1:
        .word   -2147483648
        .word   1070541850
        .word   1073741824
        .word   1070372569

where we generate two different labels instead of using
an anchor + offset.

This means that in functions using lots of FP or vector constants we
do quite a lot of unneeded address computations, and we miss pairwise
loads because all the constants have different bases.

I wonder whether this can be implemented by extending force_const_mem to
support "global" constant pools and reusing the target hooks from
use_anchored_address.

The expected code to come out of this is the same as what is generated for:


v2df f (v2df c1, v2df c2)
{
   double *l = &b[0];
   asm volatile ("" : "+r"(l));
   v2df a1 = *(v2df *)l;
   v2df a2 = *(v2df *)(l+2);
   return (a1 * c1) + (a2 * c2);
}

which generates:

f:
        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
        ldp     q31, q30, [x0]
        fmul    v1.2d, v1.2d, v30.2d
        fmla    v1.2d, v31.2d, v0.2d
        mov     v0.16b, v1.16b
        ret
        .global b
        .section        .rodata
        .align  4
        .set    .LANCHOR0,. + 0
        .type   b, %object
        .size   b, 32
b:
        .word   -2147483648
        .word   1070541850
        .word   1073741824
        .word   1070372569
        .word   536870912
        .word   1071322731
        .word   0
        .word   1070987988

Reply via email to