On Mon, Aug 22, 2016 at 4:52 PM, Alexey Dobriyan <[email protected]> wrote: > Apply alternatives at the call site instead of function body. > Save branch per clean page. > > Bonus: tell gcc to not flush whole shebang of registers, > just RDI, RAX, RCX. > > Signed-off-by: Alexey Dobriyan <[email protected]> > --- > > arch/x86/include/asm/page_64.h | 16 +++++++++++++++- > arch/x86/lib/clear_page_64.S | 18 ++++++------------ > 2 files changed, 21 insertions(+), 13 deletions(-) > > --- a/arch/x86/include/asm/page_64.h > +++ b/arch/x86/include/asm/page_64.h > @@ -4,6 +4,7 @@ > #include <asm/page_64_types.h> > > #ifndef __ASSEMBLY__ > +#include <asm/alternative.h> > > /* duplicated to the one in bootmem.h */ > extern unsigned long max_pfn; > @@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long); > #define pfn_valid(pfn) ((pfn) < max_pfn) > #endif > > -void clear_page(void *page); > +void clear_page_mov(void *page); > +void clear_page_rep_stosq(void *page); > +void clear_page_rep_stosb(void *page); > +static __always_inline void clear_page(void *page) > +{ > + alternative_call_2( > + clear_page_mov, > + clear_page_rep_stosq, X86_FEATURE_REP_GOOD, > + clear_page_rep_stosb, X86_FEATURE_ERMS, > + "=D" (page), > + "0" (page) > + : "rax", "rcx", "memory" > + ); > +} > void copy_page(void *to, void *from); > > #endif /* !__ASSEMBLY__ */ > --- a/arch/x86/lib/clear_page_64.S > +++ b/arch/x86/lib/clear_page_64.S > @@ -1,6 +1,4 @@ > #include <linux/linkage.h> > -#include <asm/cpufeatures.h> > -#include <asm/alternative-asm.h> > > /* > * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is > @@ -13,18 +11,14 @@ > * Zero a page. 
> * %rdi - page > */ > -ENTRY(clear_page) > - > - ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ > - "jmp clear_page_c_e", X86_FEATURE_ERMS > - > +ENTRY(clear_page_rep_stosq) > movl $4096/8,%ecx > xorl %eax,%eax > rep stosq > ret > -ENDPROC(clear_page) > +ENDPROC(clear_page_rep_stosq) > > -ENTRY(clear_page_orig) > +ENTRY(clear_page_mov) > > xorl %eax,%eax > movl $4096/64,%ecx > @@ -44,11 +38,11 @@ ENTRY(clear_page_orig) > jnz .Lloop > nop > ret > -ENDPROC(clear_page_orig) > +ENDPROC(clear_page_mov) > > -ENTRY(clear_page_c_e) > +ENTRY(clear_page_rep_stosb) > movl $4096,%ecx > xorl %eax,%eax > rep stosb > ret > -ENDPROC(clear_page_c_e) > +ENDPROC(clear_page_rep_stosb)
I like this idea, but does it make sense to take it a step further and inline the string instruction alternatives to avoid a call altogether?

Also, 32-bit should be converted to do the same thing as 64-bit.

--
Brian Gerst

