Calls to memcpy_flushcache() with a small compile-time-constant size currently fall back to __memcpy_flushcache() unless the size is exactly 4, 8, or 16 bytes.
Factor the existing inline movnti sequences into small helpers and extend the fixed-size fastpath coverage to every multiple of 8 bytes from 24 to 96. This keeps common struct-page-sized copies on the inline path for the upcoming memcpy_streaming() user, while still falling back to __memcpy_flushcache() for every other size. Signed-off-by: Li Zhe <[email protected]> --- arch/x86/include/asm/string_64.h | 87 +++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 15504b844f1e..94dc92f287f3 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -82,22 +82,81 @@ int strcmp(const char *cs, const char *ct); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 void __memcpy_flushcache(void *dst, const void *src, size_t cnt); -static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) + +static __always_inline void memcpy_flushcache_4(void *dst, const void *src) +{ + asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src)); +} + +static __always_inline void memcpy_flushcache_8(void *dst, const void *src) +{ + asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src)); +} + +static __always_inline void memcpy_flushcache_16(void *dst, const void *src) +{ + memcpy_flushcache_8(dst, src); + memcpy_flushcache_8(dst + 8, src + 8); +} + +/* + * Keep common fixed-size copies on the inline movnti path instead of + * dropping into the generic helper. 
+ */ +static __always_inline int memcpy_flushcache_small(void *dst, const void *src, + size_t cnt) { - if (__builtin_constant_p(cnt)) { - switch (cnt) { - case 4: - asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src)); - return; - case 8: - asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src)); - return; - case 16: - asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src)); - asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8))); - return; - } + switch (cnt) { + case 96: + memcpy_flushcache_16(dst + 80, src + 80); + fallthrough; + case 80: + memcpy_flushcache_16(dst + 64, src + 64); + fallthrough; + case 64: + memcpy_flushcache_16(dst + 48, src + 48); + fallthrough; + case 48: + memcpy_flushcache_16(dst + 32, src + 32); + fallthrough; + case 32: + memcpy_flushcache_16(dst + 16, src + 16); + fallthrough; + case 16: + memcpy_flushcache_16(dst, src); + return 1; + + case 88: + memcpy_flushcache_16(dst + 72, src + 72); + fallthrough; + case 72: + memcpy_flushcache_16(dst + 56, src + 56); + fallthrough; + case 56: + memcpy_flushcache_16(dst + 40, src + 40); + fallthrough; + case 40: + memcpy_flushcache_16(dst + 24, src + 24); + fallthrough; + case 24: + memcpy_flushcache_16(dst + 8, src + 8); + fallthrough; + case 8: + memcpy_flushcache_8(dst, src); + return 1; + + case 4: + memcpy_flushcache_4(dst, src); + return 1; } + + return 0; +} + +static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + if (__builtin_constant_p(cnt) && memcpy_flushcache_small(dst, src, cnt)) + return; __memcpy_flushcache(dst, src, cnt); } -- 2.20.1

