https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121315

            Bug ID: 121315
           Summary: Missed LDP/STP fusion opportunity
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
                CC: acoplan at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

We have some C++ code that implements various reversed memcpy-like operations.
A reproducer is:
#include <cstdlib>
#include <cstdint>
#include <bit>
#include <iostream>

// Maps a size in bytes to the unsigned integer type of exactly that size via
// the nested `type` alias. The primary template is only declared here; the
// specializations are produced by the GEN expansions below.
template <size_t Size>
struct uint_types_by_size;

// For a width of `bits`, GEN emits
//   * the specialization uint_types_by_size<bits / 8>, whose `type` is
//     uint<bits>_t, and
//   * a byteswap_gen overload taking and returning uint<bits>_t that
//     forwards its argument to `swap_fn`.
#define GEN(bits, swap_fn)                                          \
  template <>                                                       \
  struct uint_types_by_size<bits / 8> {                             \
    using type = uint##bits##_t;                                    \
  };                                                                \
  static inline uint##bits##_t byteswap_gen(uint##bits##_t value) { \
    return swap_fn(value);                                          \
  }

// A single byte has nothing to swap, so its "swap" is just a uint8_t cast.
GEN(8, uint8_t)
GEN(16, __builtin_bswap16)
GEN(32, __builtin_bswap32)
GEN(64, __builtin_bswap64)

#undef GEN

// Compile-time byte-order flags taken from the compiler's predefined
// __BYTE_ORDER__ macro. Anything that is not little-endian counts as
// big-endian, so exactly one of the two flags is true.
constexpr bool kIsLittleEndian = (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__);
constexpr bool kIsBigEndian = (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__);

template <typename T>
struct Endian {
    static T swap(T x) {
        constexpr auto s = sizeof(T);
        using B = typename uint_types_by_size<s>::type;
        return std::bit_cast<T>(byteswap_gen(std::bit_cast<B>(x)));
    }
    static T big(T x) { return kIsLittleEndian ? swap(x) : x; }
    static T little(T x) { return kIsBigEndian ? swap(x) : x; }
};

// Copies `len` elements from `src` to `dst`, passing each element through
// Endian<T>::big on the way. Element order is preserved; only the bytes
// inside each element are affected (and only when kIsLittleEndian is true).
//
// The noinline attribute asks the compiler not to inline this function into
// its caller, so its loop can be inspected as a separate function body.
//
// NOTE(review): `i` is an int while `len` is a size_t, so `i < len` is a
// mixed signed/unsigned comparison and `i` cannot index past INT_MAX.
// This is deliberately left untouched: the report is about the code GCC
// generates for exactly this loop, and changing the index type could change
// the induction variable the compiler selects.
template <typename T>
void __attribute__((noinline)) copyReverseGeneric(T* dst, T* src, size_t len) {
    for(int i = 0; i < len; ++i) {
      dst[i] = Endian<T>::big(src[i]);
    }
}

// Driver for the reproducer: fills an N-element int buffer with 0..N-1 and
// then calls copyReverseGeneric on it `iterations` times.
// argc and argv are not used.
int main(int argc, char** argv) {
    constexpr size_t N = 10000;
    constexpr int iterations = 100000;
    // Both buffers are requested with 16-byte alignment.
    // NOTE(review): the aligned_alloc results are not checked for null
    // before being written to below.
    int* src = static_cast<int*>(aligned_alloc(16, N * sizeof(int)));
    int* dst = static_cast<int*>(aligned_alloc(16, N * sizeof(int)));

    // Each element holds its own index (size_t narrowed to int; N fits).
    for (size_t i = 0; i < N; ++i) {
        src[i] = i;
    }

    // volatile, so the accumulation into it below cannot be optimized away.
    volatile int sink = 0; // Prevent compiler optimization

    for (int i = 0; i < iterations; ++i) {
        copyReverseGeneric(dst, src, N);
        // Reading dst after every call makes the copied data observable.
        sink += dst[0]; // Force memory access
    }

    free(src);
    free(dst);
    return 0;
}

Compiled with e.g. -std=c++20 -O3 -mcpu=neoverse-v2, GCC generates the
following loop for copyReverseGeneric:
...
        add     x5, x1, 16
        add     x4, x0, 16
        mov     x3, 40000
        .p2align 5,,15
.L3:
        ldr     q31, [x1, x2]
        ldr     q30, [x5, x2]
        rev32   v31.16b, v31.16b
        rev32   v30.16b, v30.16b
        str     q31, [x0, x2]
        str     q30, [x4, x2]
        add     x2, x2, 32
        cmp     x2, x3
        bne     .L3
...

The two LDRs and the two STRs should be merged into an LDP and an STP, but I
guess the register addressing mode blocks this. Maybe this is something
induction variable selection needs to take into account, or maybe the ldp/stp
fusion analysis can reason about the offset increments?

Reply via email to