From: Vineet Gupta <vgu...@kernel.org>

Signed-off-by: Vineet Gupta <vgu...@kernel.org>
---
 arch/arc/lib/memset-archs.S | 112 ++++++++++++++----------------------
 1 file changed, 43 insertions(+), 69 deletions(-)
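
[Editor's note, not part of the patch: a rough C model of the length split done by
the new loops #a/#b/#c below, only to illustrate why loop #a is entered with at
least 128 bytes of work before it may PREALLOC the *next* cache line. The names
(memset_model, pat, lines, rest) are illustrative, and it assumes ST64 stores 8
bytes per use and PREALLOC operates on 64-byte lines.]

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	static void memset_model(unsigned char *dst, unsigned char c, size_t len)
	{
		/* byte pattern splatted to 64 bits, as the asl/or sequence does for STD */
		uint64_t pat = 0x0101010101010101ULL * c;

		size_t lines = 0;	/* iterations of loop #a (64 bytes each)  */
		size_t rest  = len;	/* bytes left over for loops #b and #c    */

		if (len > 64) {
			/*
			 * Loop #a covers whole cache lines but never the last 64
			 * bytes, so PREALLOC of "cursor + 64" stays in the buffer.
			 */
			lines = (len - 64) / 64;
			rest  = (len & 63) + 64;	/* skipped line + trailing 63 */
		}

		for (size_t i = 0; i < lines; i++) {
			/* PREALLOC of dst + 64 would be issued here */
			for (int j = 0; j < 8; j++, dst += 8)
				memcpy(dst, &pat, 8);	/* models ST64.ab */
		}

		/* Loop #b: 32-byte chunks of whatever remains */
		for (size_t i = 0; i < rest / 32; i++)
			for (int j = 0; j < 4; j++, dst += 8)
				memcpy(dst, &pat, 8);

		/* Loop #c: straggler 0..31 bytes */
		for (size_t i = 0; i < (rest & 0x1F); i++)
			*dst++ = c;
	}

For example, len = 130 gives one loop #a iteration (64 bytes, with the single
PREALLOC touching bytes 64..127, still inside the buffer), two loop #b iterations
(64 bytes) and 2 straggler bytes in loop #c.
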
diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S
index 330e22f7cf3c..a9a0ccef761d 100644
--- a/arch/arc/lib/memset-archs.S
+++ b/arch/arc/lib/memset-archs.S
@@ -5,6 +5,7 @@
 
 #include <linux/linkage.h>
 #include <asm/cache.h>
+#include <asm/assembler.h>
 
 /*
  * The memset implementation below is optimized to use prefetchw and prealloc
@@ -55,7 +56,7 @@ ENTRY_CFI(memset)
 1:
 #endif
 
-;;; Destination is aligned
+	; promote memset pattern from char to int (double actually for STD)
 	and	r1, r1, 0xFF
 	asl	r4, r1, 8
 	or	r4, r4, r1
@@ -63,75 +64,48 @@ ENTRY_CFI(memset)
 	or	r5, r5, r4
 	mov	r4, r5
 
-	sub3	lp_count, r2, 8
-	cmp	r2, 64
-	bmsk.hi	r2, r2, 5
-	mov.ls	lp_count, 0
-	add3.hi	r2, r2, 8
-
-;;; Convert len to Dwords, unfold x8
-	lsr.f	lp_count, lp_count, 6
-
-	lpnz	@.Lset64bytes
-	;; LOOP START
-	PREALLOC_INSTR	r3, 64	; alloc next line w/o fetching
-
-#ifdef CONFIG_ARC_HAS_LL64
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-#else
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-#endif
-.Lset64bytes:
-
-	lsr.f	lp_count, r2, 5 ;Last remaining max 124 bytes
-	lpnz	.Lset32bytes
-	;; LOOP START
-#ifdef CONFIG_ARC_HAS_LL64
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-	std.ab	r4, [r3, 8]
-#else
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-	st.ab	r4, [r3, 4]
-#endif
-.Lset32bytes:
-
-	and.f	lp_count, r2, 0x1F ;Last remaining 31 bytes
-.Lsmallchunk:
-	lpnz	.Lcopy3bytes
-	;; LOOP START
+	; Loop #a:
+	;  - Updates 1 cache line worth data (64 bytes) per iteration
+	;  - PREALLOC the next line.
+	;
+	;  = Only entered if at least 2 lines worth of work (i.e. >= 128 bytes),
+	;    else PREALLOC for next can "bleed" past end of buffer, causing data
+	;    corruption issue if that line is owned by some other core.
+	;  = Last 64 bytes (even for min 128 bytes work) are NOT done here to
+	;    avoid PREALLOC issue
+
+	sub	r6, r2, 64
+	cmp	r2, 64
+	bmsk.hi	r2, r2, 5		; trailing 63 bytes
+	mov.ls	r6, 0
+	add.hi	r2, r2, 64		; line skipped in loop below
+
+	lsr.f	lp_count, r6, 6
+	lpnz	2f
+	PREALLOCR	r3, 64
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+2:
+	; Loop #b: Remaining 32 / 64 bytes
+	lsr.f	lp_count, r2, 5
+	lpnz	.Lbyteloop
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+	ST64.ab	r4, r3, 8
+
+.Lbyteloop:
+	; Loop #c: straggler 31 bytes
+	and.f	lp_count, r2, 0x1F
+	lpnz	4f
 	stb.ab	r1, [r3, 1]
-.Lcopy3bytes:
-
+4:
 	j	[blink]
 
 END_CFI(memset)
-- 
2.25.1

_______________________________________________
linux-snps-arc mailing list
linux-snps-arc@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-snps-arc