Introduce a generic memcpy_streaming() interface for write-once copy
sites. It falls back to memcpy() when no architecture-specific
optimization is available, or when an architecture-specific backend
cannot safely handle a given transfer.

Add memcpy_streaming_drain() alongside it so callers can separate the
copy primitive from any required ordering point. On x86,
memcpy_streaming() uses memcpy_flushcache() only for aligned transfers
that can stay entirely on the non-temporal store path; otherwise it
falls back to memcpy() so the generic API does not expose flushcache
semantics on cached head/tail fragments. The x86
memcpy_streaming_drain() issues an sfence.

Callers are responsible for invoking memcpy_streaming_drain() before
later normal stores that must be ordered after the streaming copy.

Signed-off-by: Li Zhe <[email protected]>
---
 arch/x86/include/asm/string_64.h | 40 ++++++++++++++++++++++++++++++++
 include/linux/string.h           | 20 ++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 4635616863f5..0b57e9e6f3db 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -100,6 +100,46 @@ static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t
        }
        __memcpy_flushcache(dst, src, cnt);
 }
+
+/*
+ * Return nonzero only when memcpy_flushcache() can stay entirely on its
+ * non-temporal store path for this transfer. Zero-length and unaligned
+ * copies return 0 so memcpy_streaming() falls back to memcpy() and does
+ * not expose flushcache semantics on cached head/tail fragments.
+ */
+static __always_inline int memcpy_flushcache_nt_safe(const void *dst,
+                                                    const void *src,
+                                                    size_t cnt)
+{
+       unsigned long d = (unsigned long)dst;
+       unsigned long s = (unsigned long)src;
+
+       if (!cnt)
+               return 0;       /* nothing to stream */
+       /* 8 bytes or more: dst and src 8-byte aligned, cnt a multiple of 8. */
+       if (cnt >= 8)
+               return !(d & 7) && !(s & 7) && !(cnt & 7);
+       /* Shorter copies: accept only a single 4-byte aligned word. */
+       return cnt == 4 && !(d & 3) && !(s & 3);
+}
+
+#define __HAVE_ARCH_MEMCPY_STREAMING 1 /* overrides <linux/string.h> fallback */
+static __always_inline void memcpy_streaming(void *dst, const void *src,
+                                            size_t cnt)
+{
+       if (!cnt)
+               return;         /* nothing to copy */
+       /* Use the flushcache copy only when NT-safe; otherwise plain memcpy(). */
+       if (memcpy_flushcache_nt_safe(dst, src, cnt))
+               memcpy_flushcache(dst, src, cnt);
+       else
+               memcpy(dst, src, cnt);
+}
+
+static __always_inline void memcpy_streaming_drain(void)
+{
+       asm volatile("sfence" : : : "memory"); /* ordering point after streaming copies */
+}
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/string.h b/include/linux/string.h
index b850bd91b3d8..a4c2d4347f58 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -281,6 +281,26 @@ static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
 }
 #endif
 
+#ifndef __HAVE_ARCH_MEMCPY_STREAMING
+/*
+ * memcpy_streaming() is for write-once copy sites that may use
+ * non-temporal stores on some architectures. Callers must follow it
+ * with memcpy_streaming_drain() before later normal stores that need to
+ * be ordered after the streaming copy. Implementations may fall back to
+ * memcpy() when a specialized backend cannot safely handle the given
+ * transfer, and backends that use regular cached stores can make the
+ * drain a no-op.
+ */
+static inline void memcpy_streaming(void *dst, const void *src, size_t cnt)
+{
+       memcpy(dst, src, cnt); /* generic fallback: plain memcpy() */
+}
+
+static inline void memcpy_streaming_drain(void)
+{      /* no-op: the generic memcpy() fallback has nothing to drain */
+}
+#endif
+
 void *memchr_inv(const void *s, int c, size_t n);
 char *strreplace(char *str, char old, char new);
 
-- 
2.20.1

Reply via email to