https://github.com/doshimili updated https://github.com/llvm/llvm-project/pull/71558
>From 6c313955185c0d59564f6535b6f1580dca168bea Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Tue, 24 Oct 2023 21:15:23 +0000 Subject: [PATCH 01/17] Add software prefetching to memset --- libc/src/string/memory_utils/op_generic.h | 19 +++++++++++++++++++ .../memory_utils/x86_64/inline_memset.h | 12 +++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index fd71ca30e24b936..54af7ea10e25e46 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -163,6 +163,25 @@ template <typename T> struct Memset { } while (offset < count - SIZE); tail(dst, value, count); } + + template <size_t prefetch_distance, size_t prefetch_degree> + LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, + size_t count) { + Memset<uint512_t>::block(dst, value); + Memset<uint256_t>::block(dst + 64, value); + size_t offset = 96; + while (offset + prefetch_degree + kSize <= count) { + for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i) + PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i); + for (size_t i = 0; i < prefetch_degree; i += kSize, offset += kSize) + block(dst + offset, value); + } + while (offset + kSize < count) { + block(dst + offset, value); + offset += kSize; + } + tail(dst, value, count); + } }; template <typename T, typename... TS> struct MemsetSequence { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 6436594856b0eaf..da463bc0029f9aa 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -17,6 +17,11 @@ namespace LIBC_NAMESPACE { +static constexpr size_t kCachelineSize = 64; + +// prefetch for write +static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } + [[maybe_unused]] LIBC_INLINE static void inline_memset_x86(Ptr dst, uint8_t value, size_t count) { #if defined(__AVX512F__) @@ -53,12 +58,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { return generic::Memset<uint128_t>::head_tail(dst, value, count); if (count <= 64) return generic::Memset<uint256_t>::head_tail(dst, value, count); + PrefetchW(dst + kCachelineSize); if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); + PrefetchW(dst + kCachelineSize * 2); // Aligned loop generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); - return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); + if (count <= 192) { + return Memset<uint256_t>::loop_and_tail(dst, value, count); + } + return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count); } } // namespace LIBC_NAMESPACE >From 15cbd0a0c851fa3ac5315e796bb69c1bf791e956 Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Tue, 24 Oct 2023 21:15:23 +0000 Subject: [PATCH 02/17] Add software prefetching to memset --- libc/src/string/CMakeLists.txt | 1 + .../memory_utils/x86_64/inline_memset.h | 32 ++++++++++++++++--- .../llvm-project-overlay/libc/BUILD.bazel | 1 + 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 67675b682081c67..aa69bff7a8cfada 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2) add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2) add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F) + add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING) add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memset(memset) elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index da463bc0029f9aa..f3ad04930c52c64 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -16,12 +16,34 @@ #include <stddef.h> // size_t namespace LIBC_NAMESPACE { +namespace x86 { static constexpr size_t kCachelineSize = 64; +LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching = + LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); + +} // namespace x86 + // prefetch for write static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } +[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + PrefetchW(dst + kCachelineSize); + if (count <= 128) + return generic::Memset<uint512_t>::head_tail(dst, value, count); + PrefetchW(dst + kCachelineSize * 2); + // Aligned loop + generic::Memset<uint256_t>::block(dst, value); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return Memset<uint256_t>::loop_and_tail(dst, value, count); + } + else { + return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count); + } +} + [[maybe_unused]] LIBC_INLINE static void inline_memset_x86(Ptr dst, uint8_t value, size_t count) { #if defined(__AVX512F__) @@ -58,17 +80,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { return generic::Memset<uint128_t>::head_tail(dst, value, count); if (count <= 64) return generic::Memset<uint256_t>::head_tail(dst, value, count); - PrefetchW(dst + kCachelineSize); + if constexpr (x86::kUseSoftwarePrefetching) { + return inline_memset_x86_sw_prefetching(dst, value, count); + } if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); - PrefetchW(dst + kCachelineSize * 2); // Aligned loop generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); - if (count <= 192) { - return Memset<uint256_t>::loop_and_tail(dst, value, count); + else { + return Memset<uint256_t>::loop_and_tail(dst, value, count); } - return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count); } } // namespace LIBC_NAMESPACE diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 3ae68193dccd2b2..dea21fd77182605 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -33,6 +33,7 @@ PRINTF_COPTS = [ MEMORY_COPTS = [ # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0", # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING", + # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING", ] # A flag to pick which `mpfr` to use for math tests. >From abb9debc49b7e171eae14a98320b9a49779c808c Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Fri, 27 Oct 2023 17:55:47 +0000 Subject: [PATCH 03/17] Fix formatting --- libc/src/string/memory_utils/x86_64/inline_memset.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index f3ad04930c52c64..e82b600bf66ab96 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -28,7 +28,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching = // prefetch for write static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } -[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { PrefetchW(dst + kCachelineSize); if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); @@ -38,9 +39,9 @@ static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } align_to_next_boundary<32>(dst, count); if (count <= 192) { return Memset<uint256_t>::loop_and_tail(dst, value, count); - } - else { - return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count); + } else { + return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, + count); } } @@ -89,7 +90,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); else { - return Memset<uint256_t>::loop_and_tail(dst, value, count); + return Memset<uint256_t>::loop_and_tail(dst, value, count); } } } // namespace LIBC_NAMESPACE >From 2155db70066c2c220160c4178bd73237e1372d45 Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Mon, 30 Oct 2023 14:53:56 +0000 Subject: [PATCH 04/17] Fix build errors --- libc/src/string/memory_utils/op_generic.h | 15 +++++++++------ .../string/memory_utils/x86_64/inline_memset.h | 7 ++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 54af7ea10e25e46..4ba137c97ec9a9a 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -87,6 +87,9 @@ template <class T, size_t N> struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {}; template <typename T> constexpr size_t array_size_v = array_size<T>::value; +// Size of a cacheline for software prefetching +static constexpr size_t kCachelineSize = 64; + // Generic operations for the above type categories. template <typename T> T load(CPtr src) { @@ -167,18 +170,18 @@ template <typename T> struct Memset { template <size_t prefetch_distance, size_t prefetch_degree> LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, size_t count) { - Memset<uint512_t>::block(dst, value); - Memset<uint256_t>::block(dst + 64, value); + Memset<64>::block(dst, value); + Memset<32>::block(dst + 64, value); size_t offset = 96; - while (offset + prefetch_degree + kSize <= count) { + while (offset + prefetch_degree + SIZE <= count) { for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i) PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i); - for (size_t i = 0; i < prefetch_degree; i += kSize, offset += kSize) + for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) block(dst + offset, value); } - while (offset + kSize < count) { + while (offset + SIZE < count) { block(dst + offset, value); - offset += kSize; + offset += SIZE; } tail(dst, value, count); } diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index e82b600bf66ab96..fca48e9658a752d 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -17,9 +17,6 @@ namespace LIBC_NAMESPACE { namespace x86 { - -static constexpr size_t kCachelineSize = 64; - LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching = LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); @@ -30,10 +27,10 @@ static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } [[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { - PrefetchW(dst + kCachelineSize); + PrefetchW(dst + generic::kCachelineSize); if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); - PrefetchW(dst + kCachelineSize * 2); + PrefetchW(dst + generic::kCachelineSize * 2); // Aligned loop generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); >From 9fe0041c2bb8ba1d522538c79ac1ebae7d0632bb Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Mon, 30 Oct 2023 17:09:15 +0000 Subject: [PATCH 05/17] Fix build errors --- libc/src/string/memory_utils/op_generic.h | 15 +++--- .../memory_utils/x86_64/inline_memset.h | 50 +++++++++---------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 4ba137c97ec9a9a..12eeb65a1edc52e 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32))); using generic_v512 = uint8_t __attribute__((__vector_size__(64))); } // namespace LIBC_NAMESPACE +namespace sw_prefetch { + // Size of a cacheline for software prefetching +static constexpr size_t kCachelineSize = 64; + // prefetch for write +static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } +} + namespace LIBC_NAMESPACE::generic { // We accept three types of values as elements for generic operations: @@ -87,9 +94,6 @@ template <class T, size_t N> struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {}; template <typename T> constexpr size_t array_size_v = array_size<T>::value; -// Size of a cacheline for software prefetching -static constexpr size_t kCachelineSize = 64; - // Generic operations for the above type categories. template <typename T> T load(CPtr src) { @@ -167,12 +171,9 @@ template <typename T> struct Memset { tail(dst, value, count); } - template <size_t prefetch_distance, size_t prefetch_degree> + template <size_t prefetch_distance, size_t prefetch_degree, size_t offset> LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, size_t count) { - Memset<64>::block(dst, value); - Memset<32>::block(dst + 64, value); - size_t offset = 96; while (offset + prefetch_degree + SIZE <= count) { for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i) PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i); diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index fca48e9658a752d..bc7a6162f77b9cd 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -17,33 +17,11 @@ namespace LIBC_NAMESPACE { namespace x86 { -LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching = +LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); } // namespace x86 -// prefetch for write -static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } - -[[maybe_unused]] LIBC_INLINE static void -inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { - PrefetchW(dst + generic::kCachelineSize); - if (count <= 128) - return generic::Memset<uint512_t>::head_tail(dst, value, count); - PrefetchW(dst + generic::kCachelineSize * 2); - // Aligned loop - generic::Memset<uint256_t>::block(dst, value); - align_to_next_boundary<32>(dst, count); - if (count <= 192) { - return Memset<uint256_t>::loop_and_tail(dst, value, count); - } else { - return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, - count); - } -} - -[[maybe_unused]] LIBC_INLINE static void -inline_memset_x86(Ptr dst, uint8_t value, size_t count) { #if defined(__AVX512F__) using uint128_t = generic_v128; using uint256_t = generic_v256; @@ -62,6 +40,28 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { using uint512_t = cpp::array<uint64_t, 8>; #endif +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + sw_prefetch::PrefetchW(dst + generic::kCachelineSize); + if (count <= 128) + return generic::Memset<uint512_t>::head_tail(dst, value, count); + sw_prefetch::PrefetchW(dst + generic::kCachelineSize * 2); + // Aligned loop + generic::Memset<uint256_t>::block(dst, value); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return Memset<uint256_t>::loop_and_tail(dst, value, count); + } else { + // Warm up memset + generic::Memset<uint256_t>::block(dst, value); + generic::Memset<uint128_t>::block(dst + 64, value); + return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128, 96>(dst, value, + count); + } +} + +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86(Ptr dst, uint8_t value, size_t count) { if (count == 0) return; if (count == 1) @@ -78,7 +78,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { return generic::Memset<uint128_t>::head_tail(dst, value, count); if (count <= 64) return generic::Memset<uint256_t>::head_tail(dst, value, count); - if constexpr (x86::kUseSoftwarePrefetching) { + if constexpr (x86::kUseSoftwarePrefetchingMemset) { return inline_memset_x86_sw_prefetching(dst, value, count); } if (count <= 128) @@ -87,7 +87,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); else { - return Memset<uint256_t>::loop_and_tail(dst, value, count); + return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); } } } // namespace LIBC_NAMESPACE >From 52aad858a4a8652f95a3e1120e4dd7bd2f45d225 Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Mon, 30 Oct 2023 17:39:04 +0000 Subject: [PATCH 06/17] Fix formatting --- libc/src/string/memory_utils/op_generic.h | 6 +- .../memory_utils/x86_64/inline_memset.h | 92 +++++++++---------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 12eeb65a1edc52e..af6a814be1542a4 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -49,11 +49,11 @@ using generic_v512 = uint8_t __attribute__((__vector_size__(64))); } // namespace LIBC_NAMESPACE namespace sw_prefetch { - // Size of a cacheline for software prefetching +// Size of a cacheline for software prefetching static constexpr size_t kCachelineSize = 64; - // prefetch for write +// prefetch for write static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } -} +} // namespace sw_prefetch namespace LIBC_NAMESPACE::generic { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index bc7a6162f77b9cd..9000aa03019d291 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -40,56 +40,56 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = using uint512_t = cpp::array<uint64_t, 8>; #endif -[[maybe_unused]] LIBC_INLINE static void -inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { - sw_prefetch::PrefetchW(dst + generic::kCachelineSize); - if (count <= 128) - return generic::Memset<uint512_t>::head_tail(dst, value, count); - sw_prefetch::PrefetchW(dst + generic::kCachelineSize * 2); - // Aligned loop - generic::Memset<uint256_t>::block(dst, value); - align_to_next_boundary<32>(dst, count); - if (count <= 192) { - return Memset<uint256_t>::loop_and_tail(dst, value, count); - } else { - // Warm up memset + [[maybe_unused]] LIBC_INLINE static void + inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + sw_prefetch::PrefetchW(dst + generic::kCachelineSize); + if (count <= 128) + return generic::Memset<uint512_t>::head_tail(dst, value, count); + sw_prefetch::PrefetchW(dst + generic::kCachelineSize * 2); + // Aligned loop generic::Memset<uint256_t>::block(dst, value); - generic::Memset<uint128_t>::block(dst + 64, value); - return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128, 96>(dst, value, - count); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return Memset<uint256_t>::loop_and_tail(dst, value, count); + } else { + // Warm up memset + generic::Memset<uint256_t>::block(dst, value); + generic::Memset<uint128_t>::block(dst + 64, value); + return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128, 96>( + dst, value, count); + } } -} -[[maybe_unused]] LIBC_INLINE static void -inline_memset_x86(Ptr dst, uint8_t value, size_t count) { - if (count == 0) - return; - if (count == 1) - return generic::Memset<uint8_t>::block(dst, value); - if (count == 2) - return generic::Memset<uint16_t>::block(dst, value); - if (count == 3) - return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value); - if (count <= 8) - return generic::Memset<uint32_t>::head_tail(dst, value, count); - if (count <= 16) - return generic::Memset<uint64_t>::head_tail(dst, value, count); - if (count <= 32) - return generic::Memset<uint128_t>::head_tail(dst, value, count); - if (count <= 64) - return generic::Memset<uint256_t>::head_tail(dst, value, count); - if constexpr (x86::kUseSoftwarePrefetchingMemset) { - return inline_memset_x86_sw_prefetching(dst, value, count); - } - if (count <= 128) - return generic::Memset<uint512_t>::head_tail(dst, value, count); - // Aligned loop - generic::Memset<uint256_t>::block(dst, value); - align_to_next_boundary<32>(dst, count); - else { - return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); + [[maybe_unused]] LIBC_INLINE static void + inline_memset_x86(Ptr dst, uint8_t value, size_t count) { + if (count == 0) + return; + if (count == 1) + return generic::Memset<uint8_t>::block(dst, value); + if (count == 2) + return generic::Memset<uint16_t>::block(dst, value); + if (count == 3) + return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value); + if (count <= 8) + return generic::Memset<uint32_t>::head_tail(dst, value, count); + if (count <= 16) + return generic::Memset<uint64_t>::head_tail(dst, value, count); + if (count <= 32) + return generic::Memset<uint128_t>::head_tail(dst, value, count); + if (count <= 64) + return generic::Memset<uint256_t>::head_tail(dst, value, count); + if constexpr (x86::kUseSoftwarePrefetchingMemset) { + return inline_memset_x86_sw_prefetching(dst, value, count); + } + if (count <= 128) + return generic::Memset<uint512_t>::head_tail(dst, value, count); + // Aligned loop + generic::Memset<uint256_t>::block(dst, value); + align_to_next_boundary<32>(dst, count); + else { + return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); + } } -} } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H >From efbfcd19cecbd3e27b72523d06d8cff3a5bbbafa Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Mon, 30 Oct 2023 19:12:19 +0000 Subject: [PATCH 07/17] Fix formatting --- libc/src/string/memory_utils/op_generic.h | 24 +++++++++++-------- .../memory_utils/x86_64/inline_memset.h | 6 ++--- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index af6a814be1542a4..ae221e0fa380655 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -48,12 +48,12 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32))); using generic_v512 = uint8_t __attribute__((__vector_size__(64))); } // namespace LIBC_NAMESPACE -namespace sw_prefetch { +namespace LIBC_NAMESPACE::sw_prefetch { // Size of a cacheline for software prefetching static constexpr size_t kCachelineSize = 64; // prefetch for write static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } -} // namespace sw_prefetch +} // namespace LIBC_NAMESPACE::sw_prefetch namespace LIBC_NAMESPACE::generic { @@ -174,15 +174,19 @@ template <typename T> struct Memset { template <size_t prefetch_distance, size_t prefetch_degree, size_t offset> LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, size_t count) { - while (offset + prefetch_degree + SIZE <= count) { - for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i) - PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i); - for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) - block(dst + offset, value); + size_t prefetch_offset = offset; + + while (prefetch_offset + prefetch_degree + SIZE <= count) { + for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) + PrefetchW(dst + prefetch_offset + prefetch_distance + + sw_prefetch::kCachelineSize * i); + for (size_t i = 0; i < prefetch_degree; + i += SIZE, prefetch_offset += SIZE) + block(dst + prefetch_offset, value); } - while (offset + SIZE < count) { - block(dst + offset, value); - offset += SIZE; + while (prefetch_offset + SIZE < count) { + block(dst + prefetch_offset, value); + prefetch_offset += SIZE; } tail(dst, value, count); } diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 9000aa03019d291..fc00f86fc0fb34e 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -42,15 +42,15 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = [[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { - sw_prefetch::PrefetchW(dst + generic::kCachelineSize); + sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); - sw_prefetch::PrefetchW(dst + generic::kCachelineSize * 2); + sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); // Aligned loop generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); if (count <= 192) { - return Memset<uint256_t>::loop_and_tail(dst, value, count); + return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); } else { // Warm up memset generic::Memset<uint256_t>::block(dst, value); >From d97c8c0d17c860a892c029059469b70962e4a201 Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Mon, 30 Oct 2023 20:21:49 +0000 Subject: [PATCH 08/17] Fix formatting --- libc/src/string/memory_utils/op_generic.h | 4 ++-- libc/src/string/memory_utils/x86_64/inline_memset.h | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index ae221e0fa380655..35f74b544bb3598 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -178,8 +178,8 @@ template <typename T> struct Memset { while (prefetch_offset + prefetch_degree + SIZE <= count) { for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) - PrefetchW(dst + prefetch_offset + prefetch_distance + - sw_prefetch::kCachelineSize * i); + sw_prefetch::PrefetchW(dst + prefetch_offset + prefetch_distance + + sw_prefetch::kCachelineSize * i); for (size_t i = 0; i < prefetch_degree; i += SIZE, prefetch_offset += SIZE) block(dst + prefetch_offset, value); diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index fc00f86fc0fb34e..50ba2fb2e37cfab 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -86,9 +86,7 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = // Aligned loop generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); - else { - return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); - } + return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); } } // namespace LIBC_NAMESPACE >From 34d572e81b561b4450022dc358f6e3a91632224f Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Mon, 30 Oct 2023 20:21:49 +0000 Subject: [PATCH 09/17] Fix formatting --- libc/src/string/memory_utils/op_generic.h | 18 +++++++++--------- .../string/memory_utils/x86_64/inline_memset.h | 5 +---- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 35f74b544bb3598..f36c3acafff5665 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -171,22 +171,22 @@ template <typename T> struct Memset { tail(dst, value, count); } - template <size_t prefetch_distance, size_t prefetch_degree, size_t offset> + template <size_t prefetch_distance, size_t prefetch_degree> LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, size_t count) { - size_t prefetch_offset = offset; + size_t offset = 0; - while (prefetch_offset + prefetch_degree + SIZE <= count) { + while (offset + prefetch_degree + SIZE <= count) { for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) - sw_prefetch::PrefetchW(dst + prefetch_offset + prefetch_distance + + sw_prefetch::PrefetchW(dst + offset + prefetch_distance + sw_prefetch::kCachelineSize * i); for (size_t i = 0; i < prefetch_degree; - i += SIZE, prefetch_offset += SIZE) - block(dst + prefetch_offset, value); + i += SIZE, offset += SIZE) + block(dst + offset, value); } - while (prefetch_offset + SIZE < count) { - block(dst + prefetch_offset, value); - prefetch_offset += SIZE; + while (offset + SIZE < count) { + block(dst + offset, value); + offset += SIZE; } tail(dst, value, count); } diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 50ba2fb2e37cfab..4834968c0b99f38 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -52,10 +52,7 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = if (count <= 192) { return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); } else { - // Warm up memset - generic::Memset<uint256_t>::block(dst, value); - generic::Memset<uint128_t>::block(dst + 64, value); - return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128, 96>( + return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>( dst, value, count); } } >From f363ce21eedc44821e5163ad9472396856a096c1 Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Mon, 30 Oct 2023 20:21:49 +0000 Subject: [PATCH 10/17] Fix formatting --- libc/src/string/memory_utils/op_generic.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index f36c3acafff5665..4063de1d5f5832a 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -180,8 +180,7 @@ template <typename T> struct Memset { for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) sw_prefetch::PrefetchW(dst + offset + prefetch_distance + sw_prefetch::kCachelineSize * i); - for (size_t i = 0; i < prefetch_degree; - i += SIZE, offset += SIZE) + for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) block(dst + offset, value); } while (offset + SIZE < count) { >From 1110c4e6237dcae26351d7678ee91de0e7fe3791 Mon Sep 17 00:00:00 2001 From: doshimili <milido...@google.com> Date: Tue, 31 Oct 2023 15:42:32 -0400 Subject: [PATCH 11/17] Sw prefetch in memset (#2) * Add software prefetching to memset * Add software prefetching to memset * Fix formatting * Fix build errors * Fix build errors * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Fix formatting --- libc/src/string/CMakeLists.txt | 1 + libc/src/string/memory_utils/op_generic.h | 26 +++++++ .../memory_utils/x86_64/inline_memset.h | 75 ++++++++++++------- .../llvm-project-overlay/libc/BUILD.bazel | 1 + 4 files changed, 78 insertions(+), 25 deletions(-) diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 67675b682081c67..aa69bff7a8cfada 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2) add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2) add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F) + add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING) add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memset(memset) elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index fd71ca30e24b936..4063de1d5f5832a 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32))); using generic_v512 = uint8_t __attribute__((__vector_size__(64))); } // namespace LIBC_NAMESPACE +namespace LIBC_NAMESPACE::sw_prefetch { +// Size of a cacheline for software prefetching +static constexpr size_t kCachelineSize = 64; +// prefetch for write +static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } +} // namespace LIBC_NAMESPACE::sw_prefetch + namespace LIBC_NAMESPACE::generic { // We accept three types of values as elements for generic operations: @@ -163,6 +170,25 @@ template <typename T> struct Memset { } while (offset < count - SIZE); tail(dst, value, count); } + + template <size_t prefetch_distance, size_t prefetch_degree> + LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, + size_t count) { + size_t offset = 0; + + while (offset + prefetch_degree + SIZE <= count) { + for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) + sw_prefetch::PrefetchW(dst + offset + prefetch_distance + + sw_prefetch::kCachelineSize * i); + for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) + block(dst + offset, value); + } + while (offset + SIZE < count) { + block(dst + offset, value); + offset += SIZE; + } + tail(dst, value, count); + } }; template <typename T, typename... TS> struct MemsetSequence { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 6436594856b0eaf..4834968c0b99f38 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -16,9 +16,12 @@ #include <stddef.h> // size_t namespace LIBC_NAMESPACE { +namespace x86 { +LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = + LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); + +} // namespace x86 -[[maybe_unused]] LIBC_INLINE static void -inline_memset_x86(Ptr dst, uint8_t value, size_t count) { #if defined(__AVX512F__) using uint128_t = generic_v128; using uint256_t = generic_v256; @@ -37,29 +40,51 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { using uint512_t = cpp::array<uint64_t, 8>; #endif - if (count == 0) - return; - if (count == 1) - return generic::Memset<uint8_t>::block(dst, value); - if (count == 2) - return generic::Memset<uint16_t>::block(dst, value); - if (count == 3) - return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value); - if (count <= 8) - return generic::Memset<uint32_t>::head_tail(dst, value, count); - if (count <= 16) - return generic::Memset<uint64_t>::head_tail(dst, value, count); - if (count <= 32) - return generic::Memset<uint128_t>::head_tail(dst, value, count); - if (count <= 64) - return generic::Memset<uint256_t>::head_tail(dst, value, count); - if (count <= 128) - return generic::Memset<uint512_t>::head_tail(dst, value, count); - // Aligned loop - generic::Memset<uint256_t>::block(dst, value); - align_to_next_boundary<32>(dst, count); - return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); -} + [[maybe_unused]] LIBC_INLINE static void + inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); + if (count <= 128) + return generic::Memset<uint512_t>::head_tail(dst, value, count); + sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); + // Aligned loop + generic::Memset<uint256_t>::block(dst, value); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); + } else { + return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>( + dst, value, count); + } + } + + [[maybe_unused]] LIBC_INLINE static void + inline_memset_x86(Ptr dst, uint8_t value, size_t count) { + if (count == 0) + return; + if (count == 1) + return generic::Memset<uint8_t>::block(dst, value); + if (count == 2) + return generic::Memset<uint16_t>::block(dst, value); + if (count == 3) + return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value); + if (count <= 8) + return generic::Memset<uint32_t>::head_tail(dst, value, count); + if (count <= 16) + return generic::Memset<uint64_t>::head_tail(dst, value, count); + if (count <= 32) + return generic::Memset<uint128_t>::head_tail(dst, value, count); + if (count <= 64) + return generic::Memset<uint256_t>::head_tail(dst, value, count); + if constexpr (x86::kUseSoftwarePrefetchingMemset) { + return inline_memset_x86_sw_prefetching(dst, value, count); + } + if (count <= 128) + return generic::Memset<uint512_t>::head_tail(dst, value, count); + // Aligned loop + generic::Memset<uint256_t>::block(dst, value); + align_to_next_boundary<32>(dst, count); + return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); + } } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 3ae68193dccd2b2..dea21fd77182605 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -33,6 +33,7 @@ PRINTF_COPTS = [ MEMORY_COPTS = [ # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0", # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING", + # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING", ] # A flag to pick which `mpfr` to use for math tests. >From 6c96e79b76f2a09f908af1fc323b7986e871ceec Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Tue, 31 Oct 2023 20:35:35 +0000 Subject: [PATCH 12/17] Add warmup to memset --- libc/src/string/memory_utils/op_generic.h | 3 +-- libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 4063de1d5f5832a..2844501a7459044 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -174,8 +174,7 @@ template <typename T> struct Memset { template <size_t prefetch_distance, size_t prefetch_degree> LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, size_t count) { - size_t offset = 0; - + size_t offset = 96; while (offset + prefetch_degree + SIZE <= count) { for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) sw_prefetch::PrefetchW(dst + offset + prefetch_distance + diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 4834968c0b99f38..98f559bca875a3a 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = [[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + // Prefetch one cacheline sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); + // Prefetch the next cacheline sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); // Aligned loop generic::Memset<uint256_t>::block(dst, value); @@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = if (count <= 192) { return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); } else { + generic::Memset<uint512_t>::block(dst, value); + generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value); return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>( dst, value, count); } >From 01be692503dc4e913b00b5d074b2cbb4c63347f6 Mon Sep 17 00:00:00 2001 From: doshimili <milido...@google.com> Date: Tue, 31 Oct 2023 17:05:57 -0400 Subject: [PATCH 13/17] Add software prefetch instructions to memset * Add software prefetching to memset * Add software prefetching to memset * Fix formatting * Fix build errors * Fix build errors * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Add warmup to memset --- libc/src/string/memory_utils/op_generic.h | 3 +-- libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 4063de1d5f5832a..2844501a7459044 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -174,8 +174,7 @@ template <typename T> struct Memset { template <size_t prefetch_distance, size_t prefetch_degree> LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, size_t count) { - size_t offset = 0; - + size_t offset = 96; while (offset + prefetch_degree + SIZE <= count) { for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) sw_prefetch::PrefetchW(dst + offset + prefetch_distance + diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 4834968c0b99f38..98f559bca875a3a 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = [[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + // Prefetch one cacheline sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); + // Prefetch the next cacheline sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); // Aligned loop generic::Memset<uint256_t>::block(dst, value); @@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = if (count <= 192) { return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); } else { + generic::Memset<uint512_t>::block(dst, value); + generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value); return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>( dst, value, count); } >From 2f3f80163438cd663eed98b63fd0b704a38315b8 Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Tue, 31 Oct 2023 21:10:17 +0000 Subject: [PATCH 14/17] SW Prefetching in Memset --- libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 98f559bca875a3a..e4eadf614adc6bf 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -46,7 +46,7 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); - // Prefetch the next cacheline + // Prefetch the second cacheline sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); // Aligned loop generic::Memset<uint256_t>::block(dst, value); >From 24467d08dd39a286629e6fb4bcc3c8d0fede2a41 Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Tue, 7 Nov 2023 16:25:15 +0000 Subject: [PATCH 15/17] Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes --- libc/src/string/memory_utils/op_generic.h | 43 ++---- libc/src/string/memory_utils/utils.h | 6 + .../memory_utils/x86_64/inline_memcpy.h | 5 - .../memory_utils/x86_64/inline_memset.h | 131 ++++++++++-------- 4 files changed, 92 insertions(+), 93 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 2844501a7459044..2ee1a650ba71879 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -48,13 +48,6 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32))); using generic_v512 = uint8_t __attribute__((__vector_size__(64))); } // namespace LIBC_NAMESPACE -namespace LIBC_NAMESPACE::sw_prefetch { -// Size of a cacheline for software prefetching -static constexpr size_t kCachelineSize = 64; -// prefetch for write -static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } -} // namespace LIBC_NAMESPACE::sw_prefetch - namespace LIBC_NAMESPACE::generic { // We accept three types of values as elements for generic operations: @@ -141,19 +134,23 @@ template <typename T> struct Memset { static_assert(is_element_type_v<T>); static constexpr size_t SIZE = sizeof(T); - LIBC_INLINE static void block(Ptr dst, uint8_t value) { + LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) { if constexpr (is_scalar_v<T> || is_vector_v<T>) { - store<T>(dst, splat<T>(value)); + store<T>(dst + offset, splat<T>(value)); } else if constexpr (is_array_v<T>) { using value_type = typename T::value_type; const auto Splat = splat<value_type>(value); for (size_t I = 0; I < array_size_v<T>; ++I) - store<value_type>(dst + (I * sizeof(value_type)), Splat); + store<value_type>(dst + offset + (I * sizeof(value_type)), Splat); } } + LIBC_INLINE static void block(Ptr dst, uint8_t value) { + block_offset(dst, value, 0); + } + LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) { - block(dst + count - SIZE, value); + block_offset(dst, value, count - SIZE); } LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) { @@ -161,33 +158,19 @@ template <typename T> struct Memset { tail(dst, value, count); } - LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value, size_t count, size_t offset) { static_assert(SIZE > 1, "a loop of size 1 does not need tail"); - size_t offset = 0; do { - block(dst + offset, value); + block_offset(dst, value, offset); offset += SIZE; } while (offset < count - SIZE); tail(dst, value, count); } - template <size_t prefetch_distance, size_t prefetch_degree> - LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, - size_t count) { - size_t offset = 96; - while (offset + prefetch_degree + SIZE <= count) { - for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) - sw_prefetch::PrefetchW(dst + offset + prefetch_distance + - sw_prefetch::kCachelineSize * i); - for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) - block(dst + offset, value); - } - while (offset + SIZE < count) { - block(dst + offset, value); - offset += SIZE; + LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + return loop_and_tail_offset(dst, value, count, 0); } - tail(dst, value, count); - } + }; template <typename T, typename... TS> struct MemsetSequence { diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 85677e51fad0e09..62b3b7a0d728bd5 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -374,6 +374,12 @@ template <size_t SIZE> struct AlignHelper { uintptr_t offset_; }; +LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); } + +LIBC_INLINE void prefetch_to_local_cache(CPtr dst) { + __builtin_prefetch(dst, 0, 3); +} + } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h index f43230ffd8ad125..f851bcec09650d3 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h +++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h @@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold = } // namespace x86 -// TODO: Move to a shared header when appropriate. -[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) { - __builtin_prefetch(addr, 0, 3); -} - [[maybe_unused]] LIBC_INLINE void inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src, size_t count) { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index e4eadf614adc6bf..2f132b45789b5c9 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -12,83 +12,98 @@ #include "src/string/memory_utils/op_generic.h" #include "src/string/memory_utils/op_x86.h" #include "src/string/memory_utils/utils.h" // Ptr, CPtr +#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h" #include <stddef.h> // size_t namespace LIBC_NAMESPACE { namespace x86 { +// Size of one cache line for software prefetching +LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64; +LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2; +LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5; + LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); } // namespace x86 #if defined(__AVX512F__) - using uint128_t = generic_v128; - using uint256_t = generic_v256; - using uint512_t = generic_v512; +using uint128_t = generic_v128; +using uint256_t = generic_v256; +using uint512_t = generic_v512; #elif defined(__AVX__) - using uint128_t = generic_v128; - using uint256_t = generic_v256; - using uint512_t = cpp::array<generic_v256, 2>; +using uint128_t = generic_v128; +using uint256_t = generic_v256; +using uint512_t = cpp::array<generic_v256, 2>; #elif defined(__SSE2__) - using uint128_t = generic_v128; - using uint256_t = cpp::array<generic_v128, 2>; - using uint512_t = cpp::array<generic_v128, 4>; +using uint128_t = generic_v128; +using uint256_t = cpp::array<generic_v128, 2>; +using uint512_t = cpp::array<generic_v128, 4>; #else - using uint128_t = cpp::array<uint64_t, 2>; - using uint256_t = cpp::array<uint64_t, 4>; - using uint512_t = cpp::array<uint64_t, 8>; +using uint128_t = cpp::array<uint64_t, 2>; +using uint256_t = cpp::array<uint64_t, 4>; +using uint512_t = cpp::array<uint64_t, 8>; #endif - [[maybe_unused]] LIBC_INLINE static void - inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { - // Prefetch one cacheline - sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); - if (count <= 128) - return generic::Memset<uint512_t>::head_tail(dst, value, count); - // Prefetch the second cacheline - sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); - // Aligned loop - generic::Memset<uint256_t>::block(dst, value); - align_to_next_boundary<32>(dst, count); - if (count <= 192) { - return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); - } else { - generic::Memset<uint512_t>::block(dst, value); - generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value); - return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>( - dst, value, count); +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + size_t prefetch_distance = x86::kFiveCachelinesSize; + size_t prefetch_degree = x86::kTwoCachelinesSize; + size_t SIZE = sizeof(uint256_t); + // Prefetch one cache line + prefetch_for_write(dst + x86::kOneCachelineSize); + if (count <= 128) + return generic::Memset<uint512_t>::head_tail(dst, value, count); + // Prefetch the second cache line + prefetch_for_write(dst + x86::kTwoCachelinesSize); + // Aligned loop + generic::Memset<uint256_t>::block(dst, value); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); + } else { + generic::Memset<uint512_t>::block(dst, value); + generic::Memset<uint256_t>::block_offset(dst, value, SIZE); + size_t offset = 96; + while (offset + prefetch_degree + SIZE <= count) { + for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i) + prefetch_for_write(dst + offset + prefetch_distance + + x86::kOneCachelineSize * i); + for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) + generic::Memset<uint256_t>::block_offset(dst, value, offset); } + generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset); } +} - [[maybe_unused]] LIBC_INLINE static void - inline_memset_x86(Ptr dst, uint8_t value, size_t count) { - if (count == 0) - return; - if (count == 1) - return generic::Memset<uint8_t>::block(dst, value); - if (count == 2) - return generic::Memset<uint16_t>::block(dst, value); - if (count == 3) - return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value); - if (count <= 8) - return generic::Memset<uint32_t>::head_tail(dst, value, count); - if (count <= 16) - return generic::Memset<uint64_t>::head_tail(dst, value, count); - if (count <= 32) - return generic::Memset<uint128_t>::head_tail(dst, value, count); - if (count <= 64) - return generic::Memset<uint256_t>::head_tail(dst, value, count); - if constexpr (x86::kUseSoftwarePrefetchingMemset) { - return inline_memset_x86_sw_prefetching(dst, value, count); - } - if (count <= 128) - return generic::Memset<uint512_t>::head_tail(dst, value, count); - // Aligned loop - generic::Memset<uint256_t>::block(dst, value); - align_to_next_boundary<32>(dst, count); - return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); - } +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86(Ptr dst, uint8_t value, size_t count) { + if (count == 0) + return; + if (count == 1) + return generic::Memset<uint8_t>::block(dst, value); + if (count == 2) + return generic::Memset<uint16_t>::block(dst, value); + if (count == 3) + return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value); + if (count <= 8) + return generic::Memset<uint32_t>::head_tail(dst, value, count); + if (count <= 16) + return generic::Memset<uint64_t>::head_tail(dst, value, count); + if (count <= 32) + return generic::Memset<uint128_t>::head_tail(dst, value, count); + if (count <= 64) + return generic::Memset<uint256_t>::head_tail(dst, value, count); + if constexpr (x86::kUseSoftwarePrefetchingMemset) + return inline_memset_x86_gt64_sw_prefetching(dst, value, count); + if (count <= 128) + return generic::Memset<uint512_t>::head_tail(dst, value, count); + // Aligned loop + generic::Memset<uint256_t>::block(dst, value); + align_to_next_boundary<32>(dst, count); + return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); +} } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H >From e86bcb7440c6a157907169dcc6fe25f0b322ef89 Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Tue, 7 Nov 2023 16:27:51 +0000 Subject: [PATCH 16/17] Fix formatting --- libc/src/string/memory_utils/op_generic.h | 10 +++++----- libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 2ee1a650ba71879..b508aca6e846bb4 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -158,7 +158,8 @@ template <typename T> struct Memset { tail(dst, value, count); } - LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value, size_t count, size_t offset) { + LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value, + size_t count, size_t offset) { static_assert(SIZE > 1, "a loop of size 1 does not need tail"); do { block_offset(dst, value, offset); @@ -167,10 +168,9 @@ template <typename T> struct Memset { tail(dst, value, count); } - LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { - return loop_and_tail_offset(dst, value, count, 0); - } - + LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + return loop_and_tail_offset(dst, value, count, 0); + } }; template <typename T, typename... TS> struct MemsetSequence { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 2f132b45789b5c9..b745b8a6b7b1bcb 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -69,7 +69,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { while (offset + prefetch_degree + SIZE <= count) { for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i) prefetch_for_write(dst + offset + prefetch_distance + - x86::kOneCachelineSize * i); + x86::kOneCachelineSize * i); for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) generic::Memset<uint256_t>::block_offset(dst, value, offset); } >From dab663125e59267eea8e1f70b74c0f29c60e56cf Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Tue, 7 Nov 2023 17:43:24 +0000 Subject: [PATCH 17/17] Remove wrong include --- libc/src/string/memory_utils/x86_64/inline_memset.h | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index b6d3d5a0b65cbb9..9b92cd130bc60b4 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -12,7 +12,6 @@ #include "src/string/memory_utils/op_generic.h" #include "src/string/memory_utils/op_x86.h" #include "src/string/memory_utils/utils.h" // Ptr, CPtr -#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h" #include <stddef.h> // size_t _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits