https://github.com/doshimili created https://github.com/llvm/llvm-project/pull/70493
Software prefetching helps recover performance when hardware prefetching is disabled. The 'LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING' compile-time option lets users enable the software-prefetching memset path added by this patch. >From 6c313955185c0d59564f6535b6f1580dca168bea Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Tue, 24 Oct 2023 21:15:23 +0000 Subject: [PATCH 1/3] Add software prefetching to memset --- libc/src/string/memory_utils/op_generic.h | 19 +++++++++++++++++++ .../memory_utils/x86_64/inline_memset.h | 12 +++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index fd71ca30e24b936..54af7ea10e25e46 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -163,6 +163,25 @@ template <typename T> struct Memset { } while (offset < count - SIZE); tail(dst, value, count); } + + template <size_t prefetch_distance, size_t prefetch_degree> + LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, + size_t count) { + Memset<uint512_t>::block(dst, value); + Memset<uint256_t>::block(dst + 64, value); + size_t offset = 96; + while (offset + prefetch_degree + kSize <= count) { + for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i) + PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i); + for (size_t i = 0; i < prefetch_degree; i += kSize, offset += kSize) + block(dst + offset, value); + } + while (offset + kSize < count) { + block(dst + offset, value); + offset += kSize; + } + tail(dst, value, count); + } }; template <typename T, typename... 
TS> struct MemsetSequence { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 6436594856b0eaf..da463bc0029f9aa 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -17,6 +17,11 @@ namespace LIBC_NAMESPACE { +static constexpr size_t kCachelineSize = 64; + +// prefetch for write +static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } + [[maybe_unused]] LIBC_INLINE static void inline_memset_x86(Ptr dst, uint8_t value, size_t count) { #if defined(__AVX512F__) @@ -53,12 +58,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { return generic::Memset<uint128_t>::head_tail(dst, value, count); if (count <= 64) return generic::Memset<uint256_t>::head_tail(dst, value, count); + PrefetchW(dst + kCachelineSize); if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); + PrefetchW(dst + kCachelineSize * 2); // Aligned loop generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); - return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); + if (count <= 192) { + return Memset<uint256_t>::loop_and_tail(dst, value, count); + } + return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count); } } // namespace LIBC_NAMESPACE >From 15cbd0a0c851fa3ac5315e796bb69c1bf791e956 Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Tue, 24 Oct 2023 21:15:23 +0000 Subject: [PATCH 2/3] Add software prefetching to memset --- libc/src/string/CMakeLists.txt | 1 + .../memory_utils/x86_64/inline_memset.h | 32 ++++++++++++++++--- .../llvm-project-overlay/libc/BUILD.bazel | 1 + 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 67675b682081c67..aa69bff7a8cfada 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -656,6 
+656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2) add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2) add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F) + add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING) add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memset(memset) elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index da463bc0029f9aa..f3ad04930c52c64 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -16,12 +16,34 @@ #include <stddef.h> // size_t namespace LIBC_NAMESPACE { +namespace x86 { static constexpr size_t kCachelineSize = 64; +LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching = + LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); + +} // namespace x86 + // prefetch for write static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } +[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + PrefetchW(dst + kCachelineSize); + if (count <= 128) + return generic::Memset<uint512_t>::head_tail(dst, value, count); + PrefetchW(dst + kCachelineSize * 2); + // Aligned loop + generic::Memset<uint256_t>::block(dst, value); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return Memset<uint256_t>::loop_and_tail(dst, value, count); + } + else { + return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count); + } +} + [[maybe_unused]] LIBC_INLINE static void inline_memset_x86(Ptr dst, uint8_t value, size_t count) { #if defined(__AVX512F__) @@ -58,17 +80,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { return 
generic::Memset<uint128_t>::head_tail(dst, value, count); if (count <= 64) return generic::Memset<uint256_t>::head_tail(dst, value, count); - PrefetchW(dst + kCachelineSize); + if constexpr (x86::kUseSoftwarePrefetching) { + return inline_memset_x86_sw_prefetching(dst, value, count); + } if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); - PrefetchW(dst + kCachelineSize * 2); // Aligned loop generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); - if (count <= 192) { - return Memset<uint256_t>::loop_and_tail(dst, value, count); + else { + return Memset<uint256_t>::loop_and_tail(dst, value, count); } - return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count); } } // namespace LIBC_NAMESPACE diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 3ae68193dccd2b2..dea21fd77182605 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -33,6 +33,7 @@ PRINTF_COPTS = [ MEMORY_COPTS = [ # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0", # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING", + # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING", ] # A flag to pick which `mpfr` to use for math tests. 
>From abb9debc49b7e171eae14a98320b9a49779c808c Mon Sep 17 00:00:00 2001 From: Your Name <milido...@google.com> Date: Fri, 27 Oct 2023 17:55:47 +0000 Subject: [PATCH 3/3] Fix formatting --- libc/src/string/memory_utils/x86_64/inline_memset.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index f3ad04930c52c64..e82b600bf66ab96 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -28,7 +28,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching = // prefetch for write static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } -[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { PrefetchW(dst + kCachelineSize); if (count <= 128) return generic::Memset<uint512_t>::head_tail(dst, value, count); @@ -38,9 +39,9 @@ static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } align_to_next_boundary<32>(dst, count); if (count <= 192) { return Memset<uint256_t>::loop_and_tail(dst, value, count); - } - else { - return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count); + } else { + return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, + count); } } @@ -89,7 +90,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { generic::Memset<uint256_t>::block(dst, value); align_to_next_boundary<32>(dst, count); else { - return Memset<uint256_t>::loop_and_tail(dst, value, count); + return Memset<uint256_t>::loop_and_tail(dst, value, count); } } } // namespace LIBC_NAMESPACE _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits