https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322

--- Comment #19 from Mathieu Malaterre <malat at debian dot org> ---
Without hwy dependency:

 % more Makefile bytes.cc demo.cc
::::::::::::::
Makefile
::::::::::::::
CXXFLAGS := -O2

demo: demo.o bytes.o
        $(CXX) $(CXXFLAGS) -o $@ $^

clean:
        rm -f bytes.o demo.o
::::::::::::::
bytes.cc
::::::::::::::
#include <cstring>

bool BytesEqual(const void *bytes1, const void *bytes2, const size_t size) {
  return memcmp(bytes1, bytes2, size) == 0;
}
::::::::::::::
demo.cc
::::::::::::::
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>

#define HWY_ALIGNMENT 64
constexpr size_t kAlignment = HWY_ALIGNMENT;
constexpr size_t kAlias = kAlignment * 4;

bool BytesEqual(const void *p1, const void *p2, const size_t size);

namespace hwy {
namespace N_EMU128 {
template <typename T, size_t N = 16 / sizeof(T)> struct Vec128 {
  T raw[16 / sizeof(T)] = {};
};
} // namespace N_EMU128
} // namespace hwy

template <typename T, size_t N>
static void Store(const hwy::N_EMU128::Vec128<T, N> v,
                  T *__restrict__ aligned) {
  __builtin_memcpy(aligned, v.raw, sizeof(T) * N);
}

template <typename T, size_t N>
static hwy::N_EMU128::Vec128<T, N> Load(const T *__restrict__ aligned) {
  hwy::N_EMU128::Vec128<T, N> v;
  __builtin_memcpy(v.raw, aligned, sizeof(T) * N);
  return v;
}

template <size_t N>
static hwy::N_EMU128::Vec128<uint16_t, N>
MulHigh(hwy::N_EMU128::Vec128<uint16_t, N> a,
        const hwy::N_EMU128::Vec128<uint16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    // Cast to uint32_t first to prevent overflow. Otherwise the result of
    // uint16_t * uint16_t is in "int" which may overflow. In practice the
    // result is the same but this way it is also defined.
    a.raw[i] = static_cast<uint16_t>(
        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
        16);
  }
  return a;
}

#define HWY_ASSERT(condition) assert((condition))
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))

#pragma pack(push, 1)
struct AllocationHeader {
  void *allocated;
  size_t payload_size;
};
#pragma pack(pop)

static void FreeAlignedBytes(const void *aligned_pointer) {
  HWY_ASSERT(aligned_pointer != nullptr);
  if (aligned_pointer == nullptr)
    return;

  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
  HWY_ASSERT(payload % kAlignment == 0);
  const AllocationHeader *header =
      reinterpret_cast<const AllocationHeader *>(payload) - 1;

  free(header->allocated);
}

class AlignedFreer {
public:
  template <typename T> void operator()(T *aligned_pointer) const {
    FreeAlignedBytes(aligned_pointer);
  }
};

template <typename T>
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;

static inline constexpr size_t ShiftCount(size_t n) {
  return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
}

namespace {
static size_t NextAlignedOffset() {
  static std::atomic<uint32_t> next{0};
  constexpr uint32_t kGroups = kAlias / kAlignment;
  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) %
kGroups;
  const size_t offset = kAlignment * group;
  HWY_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
  //  std::cerr << "O: " << offset << std::endl;
  return offset;
}
} // namespace

static void *AllocateAlignedBytes(const size_t payload_size) {
  HWY_ASSERT(payload_size != 0); // likely a bug in caller
  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
    HWY_ASSERT(false && "payload_size too large");
    return nullptr;
  }

  size_t offset = NextAlignedOffset();

  // What: | misalign | unused | AllocationHeader |payload
  // Size: |<= kAlias | offset                    |payload_size
  //       ^allocated.^aligned.^header............^payload
  // The header must immediately precede payload, which must remain aligned.
  // To avoid wasting space, the header resides at the end of `unused`,
  // which therefore cannot be empty (offset == 0).
  if (offset == 0) {
    offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
    static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
  }

  const size_t allocated_size = kAlias + offset + payload_size;
  void *allocated = malloc(allocated_size);
  HWY_ASSERT(allocated != nullptr);
  if (allocated == nullptr)
    return nullptr;
  // Always round up even if already aligned - we already asked for kAlias
  // extra bytes and there's no way to give them back.
  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
  aligned &= ~(kAlias - 1);

  const uintptr_t payload = aligned + offset; // still aligned

  // Stash `allocated` and payload_size inside header for FreeAlignedBytes().
  // The allocated_size can be reconstructed from the payload_size.
  AllocationHeader *header = reinterpret_cast<AllocationHeader *>(payload) - 1;
  header->allocated = allocated;
  header->payload_size = payload_size;

  //printf("%d-byte aligned addr: %p\n", kAlignment,
reinterpret_cast<void*>(payload));
  return HWY_ASSUME_ALIGNED(reinterpret_cast<void *>(payload), kAlignment);
}

template <typename T> static T *AllocateAlignedItems(size_t items) {
  constexpr size_t size = sizeof(T);

  constexpr bool is_pow2 = (size & (size - 1)) == 0;
  constexpr size_t bits = ShiftCount(size);
  static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");

  const size_t bytes = is_pow2 ? items << bits : items * size;
  const size_t check = is_pow2 ? bytes >> bits : bytes / size;
  if (check != items) {
    return nullptr; // overflowed
  }
  return static_cast<T *>(AllocateAlignedBytes(bytes));
}

template <typename T>
static AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
  return AlignedFreeUniquePtr<T[]>(AllocateAlignedItems<T>(items),
                                   AlignedFreer());
}

int main() {
  AlignedFreeUniquePtr<uint16_t[]> in_lanes = AllocateAligned<uint16_t>(2);
  uint16_t expected_lanes[2];
  in_lanes[0] = 65535;
  in_lanes[1] = 32767;
  expected_lanes[0] = 65534;
  expected_lanes[1] = 16383;
  hwy::N_EMU128::Vec128<uint16_t, 2> v = Load<uint16_t, 2>(in_lanes.get());
  hwy::N_EMU128::Vec128<uint16_t, 2> actual = MulHigh(v, v);
  {
    auto actual_lanes = AllocateAligned<uint16_t>(2);
    Store(actual, actual_lanes.get());
    const uint8_t *expected_array =
        reinterpret_cast<const uint8_t *>(expected_lanes);
    const uint8_t *actual_array =
        reinterpret_cast<const uint8_t *>(actual_lanes.get());
    for (size_t i = 0; i < 2; ++i) {
      const uint8_t *expected_ptr = expected_array + i * 2;
      const uint8_t *actual_ptr = actual_array + i * 2;
#if 1
      // trigger bug
      if (!BytesEqual(expected_ptr, actual_ptr, 2)) {
#else
      // no bug
      if (std::memcmp(expected_ptr, actual_ptr, 2) != 0) {
#endif
        abort();
      }
    }
  }
}

Reply via email to