https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119681

            Bug ID: 119681
           Summary: extraneous move instructions when unrolling
                    core_list_reverse ()
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: artemiy at synopsys dot com
  Target Milestone: ---

When unrolling the loop in core_list_reverse() from Coremark (with an unroll
pragma):

/* minimal stand-in for the Coremark list node; only the 'next'
   field matters for this report */
typedef struct list_head_s {
    struct list_head_s *next;
} list_head;

list_head *
core_list_reverse(list_head *list)
{
    list_head *next = 0, *tmp;
    #pragma GCC unroll 4
    while (list)
    {
        tmp        = list->next;
        list->next = next;
        next       = list;
        list       = tmp;
    }
    return next;
}

gcc (15, or any other version) does not split the variable 'next' across the
unrolled copies of the loop body, so a move into the return register x0 is
emitted at every iteration:

core_list_reverse:
        cbz     x0, .L2
        ldr     x1, [x0]
        mov     x6, 0
        str     x6, [x0]
        mov     x3, x0
        cbz     x1, .L2
.L4:
        ldr     x2, [x1]
        str     x3, [x1]
        mov     x0, x1
        cbz     x2, .L2
        ldr     x4, [x2]
        str     x1, [x2]
        mov     x0, x2
        cbz     x4, .L2
        ldr     x5, [x4]
        str     x2, [x4]
        mov     x0, x4
        mov     x6, x4
        cbz     x5, .L2
        mov     x0, x5
        mov     x3, x0
        ldr     x1, [x0]
        str     x6, [x0]
        cbnz    x1, .L4
.L2:
        ret
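
At the source level, splitting 'next' corresponds to giving each unrolled
step its own temporary and its own early return, roughly like the
hand-unrolled sketch below (the temporaries and the rotation scheme are
mine, not from Coremark; this is approximately what LLVM's code further
down implements):

list_head *
core_list_reverse_unrolled(list_head *list)
{
    list_head *next = 0;
    while (list)
    {
        /* step 1: reverse one link; exit directly if the list ends */
        list_head *t0 = list->next;
        list->next    = next;
        if (!t0) return list;
        /* step 2 */
        list_head *t1 = t0->next;
        t0->next      = list;
        if (!t1) return t0;
        /* step 3 */
        list_head *t2 = t1->next;
        t1->next      = t0;
        if (!t2) return t1;
        /* step 4: rotate the roles for the next trip around the loop */
        list_head *t3 = t2->next;
        t2->next      = t1;
        next = t2;
        list = t3;
    }
    return next;
}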

LLVM's output, for comparison (the per-iteration moves into x0 are sunk
into dedicated exit blocks):

core_list_reverse:
        cbz     x0, .LBB0_6
        mov     x8, x0
        mov     x0, xzr
.LBB0_2:
        ldr     x10, [x8]
        str     x0, [x8]
        cbz     x10, .LBB0_7
        ldr     x9, [x10]
        str     x8, [x10]
        cbz     x9, .LBB0_8
        ldr     x0, [x9]
        str     x10, [x9]
        cbz     x0, .LBB0_9
        ldr     x8, [x0]
        str     x9, [x0]
        cbnz    x8, .LBB0_2
.LBB0_6:
        ret
.LBB0_7:
        mov     x0, x8
        ret
.LBB0_8:
        mov     x0, x10
        ret
.LBB0_9:
        mov     x0, x9
        ret

Under certain conditions (the load does not fully hide the latency of the
mov, the loop runs a sufficient number of iterations, etc.), the
multiple-exit version is faster.
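
For what it's worth, a minimal timing harness along these lines can show
the difference on real hardware (a sketch only: the list length, the
repetition count, and the clock_gettime()-based timing are my choices, not
part of the report):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* assumes the list_head definition and core_list_reverse() above */

int main(void)
{
    enum { NODES = 1024, REPS = 100000 };
    list_head *pool = malloc(NODES * sizeof *pool);
    if (!pool)
        return 1;
    /* build a simple NODES-long chain */
    for (int i = 0; i < NODES; i++)
        pool[i].next = (i + 1 < NODES) ? &pool[i + 1] : 0;

    list_head *list = &pool[0];
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int r = 0; r < REPS; r++)
        list = core_list_reverse(list);  /* an even REPS restores order */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    /* printing the final head keeps the result observable */
    printf("%d reversals of %d nodes: %.3f s (head=%p)\n",
           REPS, NODES, secs, (void *)list);
    free(pool);
    return 0;
}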

godbolt for convenience: https://godbolt.org/z/W965qqWKe

$ aarch64-unknown-linux-gnu-gcc -v
Using built-in specs.
COLLECT_GCC=/home/art/install/aarch64-gcc/bin/aarch64-unknown-linux-gnu-gcc
COLLECT_LTO_WRAPPER=/home/art/install/aarch64-gcc/libexec/gcc/aarch64-unknown-linux-gnu/15.0.1/lto-wrapper
Target: aarch64-unknown-linux-gnu
Configured with: ../../src/gcc/configure --enable-checking --disable-bootstrap
--enable-languages=c,c++ --prefix=/home/art/install/aarch64-gcc
Thread model: posix
Supported LTO compression algorithms: zlib
gcc version 15.0.1 20250408 (experimental) (GCC)
