This diff moves the buffer cache to only be allocated out of dma'able memory, along with a few pieces (flags) that I will need for the next step of allowing it to touch high memory.
Appears to behave well for me under load and builds survive it with the buffer cache cranked up. I would like to get this in as a first step as it allows me to progress, and will allow others to fix some of the other problems with drivers without the buffer cache getting in the way with bigmem turned on. -Bob ------8<---- Index: sys/kern/vfs_bio.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_bio.c,v retrieving revision 1.127 diff -u -r1.127 vfs_bio.c --- sys/kern/vfs_bio.c 13 Nov 2010 17:45:44 -0000 1.127 +++ sys/kern/vfs_bio.c 1 Apr 2011 08:56:34 -0000 @@ -191,19 +191,43 @@ void bufinit(void) { + long low, high, dmapages, highpages; struct bqueues *dp; /* XXX - for now */ bufhighpages = buflowpages = bufpages = bufcachepercent = bufkvm = 0; /* + * First off, figure out how much memory we can use. + * + * XXX for now we only use dma-able memory + * + * XXX - this isn't completely accurate, because there may + * be holes in the physical memory. This needs to be replaced + * with a uvm_pmemrange function to tell us how many pages + * are within a constraint range - but this is accurate enough + * for now. + */ + + low = atop(dma_constraint.ucr_low); + high = atop(dma_constraint.ucr_high); + if (high >= physmem) { + high = physmem; + highpages = 0; + } + else + highpages = physmem - high; + /* XXX highpages not used yet but will be very soon. */ + dmapages = high - low; + + /* * If MD code doesn't say otherwise, use 10% of kvm for mappings and - * 10% physmem for pages. + * 10% of dmaable pages for cache pages. 
*/ if (bufcachepercent == 0) bufcachepercent = 10; if (bufpages == 0) - bufpages = physmem * bufcachepercent / 100; + bufpages = dmapages * bufcachepercent / 100; bufhighpages = bufpages; @@ -212,7 +236,7 @@ * we will not allow uvm to steal back more than this number of * pages */ - buflowpages = physmem * 10 / 100; + buflowpages = dmapages * 10 / 100; /* * set bufbackpages to 100 pages, or 10 percent of the low water mark Index: sys/kern/vfs_biomem.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_biomem.c,v retrieving revision 1.14 diff -u -r1.14 vfs_biomem.c --- sys/kern/vfs_biomem.c 30 Apr 2010 21:56:39 -0000 1.14 +++ sys/kern/vfs_biomem.c 1 Apr 2011 05:32:21 -0000 @@ -29,10 +29,19 @@ vaddr_t buf_kva_start, buf_kva_end; int buf_needva; TAILQ_HEAD(,buf) buf_valist; +void buf_realloc_pages(struct buf *, struct uvm_constraint_range *); int buf_nkvmsleep; +#if 0 +extern int needda; +#endif +extern void uvm_pagealloc_multi(struct uvm_object *, voff_t, + vsize_t, int); +extern void uvm_pagerealloc_multi(struct uvm_object *, voff_t, + vsize_t, int, struct uvm_constraint_range *); extern struct bcachestats bcstats; +extern int needbuffer; /* * Pages are allocated from a uvm object (we only use it for page storage, @@ -99,6 +108,10 @@ s = splbio(); SET(bp->b_flags, B_BUSY|B_NOTMAPPED); + if (bp->b_data != NULL) { + TAILQ_REMOVE(&buf_valist, bp, b_valist); + bcstats.busymapped++; + } splx(s); } @@ -170,6 +183,26 @@ } } CLR(bp->b_flags, B_BUSY|B_NOTMAPPED); +#if 0 + if (ISSET(bp->b_flags, B_DAQ) && needda) { + wakeup(&needda); + } +#endif + /* Wake up any processes waiting for any buffer to become free. */ + if (needbuffer) { + needbuffer--; + wakeup(&needbuffer); + } + + /* + * Wake up any processes waiting for _this_ buffer to become + * free. 
+ */ + + if (ISSET(bp->b_flags, B_WANTED)) { + CLR(bp->b_flags, B_WANTED); + wakeup(bp); + } splx(s); } @@ -259,11 +292,11 @@ return (va); } +/* Always allocates in Device Accessible Memory */ void buf_alloc_pages(struct buf *bp, vsize_t size) { - struct vm_page *pg; - voff_t offs, i; + voff_t offs; int s; KASSERT(size == round_page(size)); @@ -277,25 +310,51 @@ KASSERT(buf_page_offset > 0); - for (i = 0; i < atop(size); i++) { -#if defined(DEBUG) || 1 - if ((pg = uvm_pagelookup(buf_object, offs + ptoa(i)))) - panic("buf_alloc_pages: overlap buf: %p page: %p", - bp, pg); -#endif - - while ((pg = uvm_pagealloc(buf_object, offs + ptoa(i), - NULL, 0)) == NULL) { - uvm_wait("buf_alloc_pages"); - } - pg->wire_count = 1; - atomic_clearbits_int(&pg->pg_flags, PG_BUSY); - bcstats.numbufpages++; - } - + uvm_pagealloc_multi(buf_object, offs, size, UVM_PLA_WAITOK); + bcstats.numbufpages += atop(size); + SET(bp->b_flags, B_DMA); bp->b_pobj = buf_object; bp->b_poffs = offs; bp->b_bufsize = size; + splx(s); +} + +/* reallocate into a particular location specified by "where" */ +void +buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where) +{ + vaddr_t va; + int dma = 1; + int s, i; + + s = splbio(); + KASSERT(ISSET(bp->b_flags, B_BUSY)); + + /* if the original buf is mapped, unmap it */ + if (bp->b_data != NULL) { + va = (vaddr_t)bp->b_data; + pmap_kremove(va, bp->b_bufsize); + pmap_update(pmap_kernel()); + } + uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs, bp->b_bufsize, + UVM_PLA_WAITOK, where); + /* if the original buf was mapped, re-map it */ + for (i = 0; i < atop(bp->b_bufsize); i++) { + struct vm_page *pg = uvm_pagelookup(bp->b_pobj, + bp->b_poffs + ptoa(i)); + KASSERT(pg != NULL); + if (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg))) + dma = 0; + if (bp->b_data != NULL) { + pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg), + VM_PROT_READ|VM_PROT_WRITE); + pmap_update(pmap_kernel()); + } + } + if (dma) + SET(bp->b_flags, B_DMA); + else + CLR(bp->b_flags, 
B_DMA); splx(s); } Index: sys/sys/buf.h =================================================================== RCS file: /cvs/src/sys/sys/buf.h,v retrieving revision 1.74 diff -u -r1.74 buf.h --- sys/sys/buf.h 22 Sep 2010 01:18:57 -0000 1.74 +++ sys/sys/buf.h 1 Apr 2011 05:32:21 -0000 @@ -144,6 +144,7 @@ LIST_ENTRY(buf) b_list; /* All allocated buffers. */ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ + TAILQ_ENTRY(buf) b_qda; /* Device Accessible queue position */ time_t b_synctime; /* Time this buffer should be flushed */ struct proc *b_proc; /* Associated proc; NULL if kernel. */ volatile long b_flags; /* B_* flags. */ @@ -189,6 +190,7 @@ /* * These flags are kept in b_flags. */ +#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Needs committing to stable storage */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ @@ -197,29 +199,31 @@ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ +#define B_PRIV 0x00000100 /* Privately allocated buffer */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ -#define B_INVAL 0x00002000 /* Does not contain valid info. */ -#define B_NOCACHE 0x00008000 /* Do not cache block after use. */ -#define B_PHYS 0x00040000 /* I/O to user memory. */ -#define B_RAW 0x00080000 /* Set by physio for raw transfers. */ -#define B_READ 0x00100000 /* Read buffer. */ -#define B_WANTED 0x00800000 /* Process wants this buffer. */ -#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ -#define B_WRITEINPROG 0x01000000 /* Write in progress. */ -#define B_XXX 0x02000000 /* Debugging flag. 
*/ -#define B_DEFERRED 0x04000000 /* Skipped over for cleaning */ -#define B_SCANNED 0x08000000 /* Block already pushed during sync */ -#define B_PDAEMON 0x10000000 /* I/O started by pagedaemon */ -#define B_RELEASED 0x20000000 /* free this buffer after its kvm */ -#define B_NOTMAPPED 0x40000000 /* BUSY, but not necessarily mapped */ - -#define B_BITS "\010\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY\006CACHE" \ - "\007CALL\010DELWRI\012DONE\013EINTR\014ERROR" \ - "\016INVAL\020NOCACHE\023PHYS\024RAW\025READ" \ - "\030WANTED\031WRITEINPROG\032XXX\033DEFERRED" \ - "\034SCANNED\035PDAEMON" +#define B_INVAL 0x00001000 /* Does not contain valid info. */ +#define B_NOCACHE 0x00002000 /* Do not cache block after use. */ +#define B_PHYS 0x00004000 /* I/O to user memory. */ +#define B_RAW 0x00008000 /* Set by physio for raw transfers. */ +#define B_READ 0x00010000 /* Read buffer. */ +#define B_WANTED 0x00020000 /* Process wants this buffer. */ +#define B_WRITEINPROG 0x00040000 /* Write in progress. */ +#define B_XXX 0x00080000 /* Debugging flag. */ +#define B_DEFERRED 0x00100000 /* Skipped over for cleaning */ +#define B_SCANNED 0x00200000 /* Block already pushed during sync */ +#define B_PDAEMON 0x00400000 /* I/O started by pagedaemon */ +#define B_RELEASED 0x00800000 /* free this buffer after its kvm */ +#define B_DMA 0x01000000 /* buffer is DMA reachable */ +#define B_DAQ 0x02000000 /* buffer is on the DA queue */ +#define B_NOTMAPPED 0x04000000 /* BUSY, but not necessarily mapped */ +
+#define B_BITS "\20\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY" \ + "\006CACHE\007CALL\010DELWRI\011PRIV\012DONE\013EINTR\014ERROR" \ + "\015INVAL\016NOCACHE\017PHYS\020RAW\021READ" \ + "\022WANTED\023WRITEINPROG\024XXX(FORMAT)\025DEFERRED" \ + "\026SCANNED\027PDAEMON\030RELEASED\031DMA\032DAQ\033NOTMAPPED" /* * This structure describes a clustered I/O. 
It is stored in the b_saveaddr Index: sys/uvm/uvm_page.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_page.c,v retrieving revision 1.102 diff -u -r1.102 uvm_page.c --- sys/uvm/uvm_page.c 7 Aug 2010 03:50:02 -0000 1.102 +++ sys/uvm/uvm_page.c 1 Apr 2011 05:32:21 -0000 @@ -81,6 +81,11 @@ #include <uvm/uvm.h> +void uvm_pagealloc_multi(struct uvm_object *, voff_t, + vsize_t, int); +void uvm_pagerealloc_multi(struct uvm_object *, voff_t, + vsize_t, int, struct uvm_constraint_range *); + /* * for object trees */ @@ -801,6 +806,69 @@ pg->owner_tag = NULL; #endif UVM_PAGE_OWN(pg, "new alloc"); +} + +/* + * interface used by the buffer cache to allocate a buffer at a time. + * The pages are allocated wired in DMA accessible memory + */ +void +uvm_pagealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size, int flags) +{ + struct pglist plist; + struct vm_page *pg; + int i, error; + + + TAILQ_INIT(&plist); + error = uvm_pglistalloc(size, dma_constraint.ucr_low, + dma_constraint.ucr_high, 0, 0, &plist, atop(round_page(size)), + UVM_PLA_WAITOK); + if (error) + panic("wtf - uvm_pglistalloc returned %x", error); + i = 0; + while((pg = TAILQ_FIRST(&plist)) != NULL) { + pg->wire_count = 1; + atomic_setbits_int(&pg->pg_flags, PG_CLEAN | PG_FAKE); + KASSERT((pg->pg_flags & PG_DEV) == 0); + TAILQ_REMOVE(&plist, pg, pageq); + uvm_pagealloc_pg(pg, obj, off + ptoa(i++), NULL); + } +} + +/* + * interface used by the buffer cache to reallocate a buffer at a time. + * The pages are reallocated wired outside the DMA accessible region. 
+ * + */ +void +uvm_pagerealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size, int flags, struct uvm_constraint_range *where) +{ + struct pglist plist; + struct vm_page *pg, *tpg; + int i, error; + voff_t offset; + + + TAILQ_INIT(&plist); + if (size == 0) + panic("size 0 uvm_pagerealloc"); + error = uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0, + 0, &plist, atop(round_page(size)), UVM_PLA_WAITOK); + if (error) + panic("wtf - uvm_pglistalloc returned %x", error); + i = 0; + while((pg = TAILQ_FIRST(&plist)) != NULL) { + offset = off + ptoa(i++); + tpg = uvm_pagelookup(obj, offset); + pg->wire_count = 1; + atomic_setbits_int(&pg->pg_flags, PG_CLEAN | PG_FAKE); + KASSERT((pg->pg_flags & PG_DEV) == 0); + TAILQ_REMOVE(&plist, pg, pageq); + uvm_pagecopy(tpg, pg); + uvm_pagefree(tpg); + uvm_pagealloc_pg(pg, obj, offset, NULL); + } } /* Index: sys/uvm/uvm_pdaemon.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v retrieving revision 1.56 diff -u -r1.56 uvm_pdaemon.c --- sys/uvm/uvm_pdaemon.c 26 Sep 2010 12:53:27 -0000 1.56 +++ sys/uvm/uvm_pdaemon.c 1 Apr 2011 08:12:34 -0000 @@ -241,9 +241,9 @@ /* * get pages from the buffer cache, or scan if needed */ - if (((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) || - ((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) { - if (bufbackoff() == -1) + if ((bufbackoff() == -1) + && (((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) || + ((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))) { uvmpd_scan(); } Index: sys/uvm/uvm_swap.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_swap.c,v retrieving revision 1.100 diff -u -r1.100 uvm_swap.c --- sys/uvm/uvm_swap.c 21 Dec 2010 20:14:44 -0000 1.100 +++ sys/uvm/uvm_swap.c 1 Apr 2011 05:32:21 -0000 @@ -1952,7 +1952,8 @@ * fill in the bp. we currently route our i/o through * /dev/drum's vnode [swapdev_vp]. 
*/ - bp->b_flags = B_BUSY | B_NOCACHE | B_RAW | (flags & (B_READ|B_ASYNC)); + bp->b_flags = B_PRIV | B_BUSY | B_NOCACHE | B_RAW | + (flags & (B_READ|B_ASYNC)); bp->b_proc = &proc0; /* XXX */ bp->b_vnbufs.le_next = NOLIST; if (bounce)