(Apologies for the earlier oops.) Want more buffer cache? Please give this a try.
This diff breaks the buffer cache into the dma'able region of memory and the region above it. Buffers are always allocated in the dma'able region, and as they age they are moved above the dma'able region if such memory exists. An I/O operation on a buffer in high memory flips it back into dma'able memory first. With this diff you can have huge tracts of buffer cache on amd64, but it also needs testing on all archs. A rough userland sketch of the sizing math and of the aging policy follows the diff. Index: kern/kern_sysctl.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sysctl.c,v retrieving revision 1.206 diff -u -p -r1.206 kern_sysctl.c --- kern/kern_sysctl.c 5 Jul 2011 04:48:02 -0000 1.206 +++ kern/kern_sysctl.c 7 Jul 2011 21:09:33 -0000 @@ -112,6 +112,7 @@ extern struct disklist_head disklist; extern fixpt_t ccpu; extern long numvnodes; extern u_int mcllivelocks; +extern psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages; extern void nmbclust_update(void); @@ -566,8 +567,8 @@ kern_sysctl(int *name, u_int namelen, vo return (sysctl_int(oldp, oldlenp, newp, newlen, &rthreads_enabled)); case KERN_CACHEPCT: { - u_int64_t dmapages; - int opct, pgs; + psize_t pgs; + int opct; opct = bufcachepercent; error = sysctl_int(oldp, oldlenp, newp, newlen, &bufcachepercent); @@ -577,11 +578,13 @@ kern_sysctl(int *name, u_int namelen, vo bufcachepercent = opct; return (EINVAL); } - dmapages = uvm_pagecount(&dma_constraint); if (bufcachepercent != opct) { - pgs = bufcachepercent * dmapages / 100; + pgs = (b_highpages_total + b_dmapages_total) + * bufcachepercent / 100; + b_dmamaxpages = b_dmapages_total * bufcachepercent + / 100; bufadjust(pgs); /* adjust bufpages */ - bufhighpages = bufpages; /* set high water mark */ + bufhighpages = bufpages; } return(0); } Index: kern/spec_vnops.c =================================================================== RCS file: /cvs/src/sys/kern/spec_vnops.c,v retrieving revision 1.67 diff -u -p -r1.67 spec_vnops.c --- kern/spec_vnops.c 5 Jul 2011 05:37:07 -0000 1.67 +++ kern/spec_vnops.c 6 Jul 2011 22:44:00 -0000 @@ -457,7 +457,9 @@ spec_strategy(void *v) struct vop_strategy_args *ap = v; struct buf *bp = ap->a_bp; int maj = major(bp->b_dev); - + + if (!ISSET(bp->b_flags, B_DAQ) && ISSET(bp->b_flags, B_BC)) + panic("bogus buf passed to spec_strategy"); if (LIST_FIRST(&bp->b_dep) != NULL) buf_start(bp); Index: kern/vfs_bio.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_bio.c,v retrieving revision 1.133 diff -u -p -r1.133 vfs_bio.c --- kern/vfs_bio.c 6 Jul 2011 20:50:05 -0000 1.133 +++ kern/vfs_bio.c 7 Jul 2011 21:34:52 -0000 @@ -68,9 +68,13 @@ #define BQ_DIRTY 0 /* LRU queue with dirty buffers */ #define BQ_CLEAN 1 /* LRU queue with clean buffers */ -TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; -int needbuffer; +struct uvm_constraint_range high_constraint; struct bio_ops bioops; +TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; +TAILQ_HEAD(bqda, buf) bufqueue_da; +psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages; +int needbuffer,needda; +int needda; /* * Buffer pool for I/O buffers. @@ -87,12 +91,13 @@ void buf_put(struct buf *); struct buf *bio_doread(struct vnode *, daddr64_t, int, int); struct buf *buf_get(struct vnode *, daddr64_t, size_t); +struct buf *buf_remove_from_freelist(struct buf *); void bread_cluster_callback(struct buf *); /* * We keep a few counters to monitor the utilization of the buffer cache * - * numbufpages - number of pages totally allocated. + * numbufpages - number of pages totally allocated. 
* numdirtypages - number of pages on BQ_DIRTY queue. * lodirtypages - low water mark for buffer cleaning daemon. * hidirtypages - high water mark for buffer cleaning daemon. @@ -110,14 +115,112 @@ long hicleanpages; long maxcleanpages; long backoffpages; /* backoff counter for page allocations */ long buflowpages; /* bufpages low water mark */ -long bufhighpages; /* bufpages high water mark */ -long bufbackpages; /* number of pages we back off when asked to shrink */ +long bufhighpages; /* bufpages high water mark */ +long bufbackpages; /* number of pages we back off when asked to shrink */ + +/* XXX - should be defined here but we have md issues */ +extern int bufcachepercent; vsize_t bufkvm; struct proc *cleanerproc; int bd_req; /* Sleep point for cleaner daemon. */ +/* nuke a buf off it's freelist - returns next buf. skips busy buffers */ +struct buf * +buf_remove_from_freelist(struct buf * bp) +{ + struct buf * nbp; + nbp = TAILQ_NEXT(bp, b_freelist); + /* skip busy buffers */ + if (!ISSET(bp->b_flags, B_BUSY)) { + bremfree(bp); + if (bp->b_vp) { + RB_REMOVE(buf_rb_bufs, + &bp->b_vp->v_bufs_tree, bp); + brelvp(bp); + } + buf_put(bp); + } + return(nbp); +} +/* + * Add buf to the head of the dma reachable queue + * and ensure that it is dma reachable. + */ +void +buf_daq_add(struct buf *buf) +{ + struct buf *b; + int s; + +start: + KASSERT(ISSET(buf->b_flags, B_BC)); + KASSERT(ISSET(buf->b_flags, B_BUSY)); + KASSERT(buf->b_pobj != NULL); + s = splbio(); + /* + * if we are adding to the queue, ensure we free down below the + * max + */ + while (b_highpages_total && + (!ISSET(buf->b_flags, B_DAQ)) && (!ISSET(buf->b_flags, B_DMA)) && + (bcstats.dmapages > (b_dmamaxpages - atop(buf->b_bufsize)))) { + b = TAILQ_FIRST(&bufqueue_da); + /* find first non-busy buffer */ + while (b && ISSET(b->b_flags, B_BUSY)) + b = TAILQ_NEXT(b, b_qda); + if (b == NULL) { + /* no non-busy buffers. */ + needda++; + tsleep(&needda, PRIBIO, "needda", 0); + needda--; + splx(s); + goto start; + } else { + if (b_highpages_total) { + buf_acquire_unmapped(b); + /* move buffer to above dma reachable memory */ + TAILQ_REMOVE(&bufqueue_da, b, b_qda); + buf_realloc_pages(b, &high_constraint); + if (ISSET(b->b_flags, B_DMA)) + panic("B_DMA after high flip %p", b); + CLR(b->b_flags, B_DAQ); + buf_release(b); + splx(s); + goto start; + } else { + /* no high pages to flip to. 
*/ + needda++; + tsleep(&needda, PRIBIO, "needda", 0); + needda--; + splx(s); + goto start; + } + } + } + /* don't copy it if it's already in dma reachable memory */ + if (ISSET(buf->b_flags, B_DMA)) { + /* buf already there, just move it to the end */ + if (ISSET(buf->b_flags, B_DAQ)) + TAILQ_REMOVE(&bufqueue_da, buf, b_qda); + TAILQ_INSERT_TAIL(&bufqueue_da, buf, b_qda); + SET(buf->b_flags, B_DAQ); + } else { + if (ISSET(buf->b_flags, B_DAQ)) + panic("non-dma buffer on dma queue %p\n", buf); + /* move buf to dma reachable memory */ + buf_realloc_pages(buf, &dma_constraint); + if (!ISSET(buf->b_flags, B_DMA)) + panic("non-dma buffer after dma move %p\n", buf); + TAILQ_INSERT_TAIL(&bufqueue_da, buf, b_qda); + SET(buf->b_flags, B_DAQ); + } + splx(s); + return; + +} + void bremfree(struct buf *bp) { @@ -139,11 +242,10 @@ bremfree(struct buf *bp) if (dp == &bufqueues[BQUEUES]) panic("bremfree: lost tail"); } - if (!ISSET(bp->b_flags, B_DELWRI)) { + if (!ISSET(bp->b_flags, B_DELWRI)) bcstats.numcleanpages -= atop(bp->b_bufsize); - } else { + else bcstats.numdirtypages -= atop(bp->b_bufsize); - } TAILQ_REMOVE(dp, bp, b_freelist); bcstats.freebufs--; } @@ -175,7 +277,10 @@ buf_put(struct buf *bp) if (backoffpages < 0) backoffpages = 0; } - + if (ISSET(bp->b_flags, B_DAQ)) { + TAILQ_REMOVE(&bufqueue_da, bp, b_qda); + CLR(bp->b_flags, B_DAQ); + } if (buf_dealloc_mem(bp) != 0) return; pool_put(&bufpool, bp); @@ -187,10 +292,22 @@ buf_put(struct buf *bp) void bufinit(void) { - u_int64_t dmapages; struct bqueues *dp; - dmapages = uvm_pagecount(&dma_constraint); + bufhighpages = buflowpages = bufpages = bufcachepercent = bufkvm = 0; + /* + * XXX note this really is "high" - i.e. *above* dma_constraint + */ + high_constraint.ucr_low = dma_constraint.ucr_high; + high_constraint.ucr_high = no_constraint.ucr_high; + + /* do we have memory above dma_constraint, or not? */ + if (high_constraint.ucr_low != high_constraint.ucr_high) { + high_constraint.ucr_low++; + b_highpages_total = uvm_pagecount(&high_constraint); + } else + b_highpages_total = 0; + b_dmapages_total = uvm_pagecount(&dma_constraint); /* * If MD code doesn't say otherwise, use 10% of kvm for mappings and @@ -199,25 +316,31 @@ bufinit(void) if (bufcachepercent == 0) bufcachepercent = 10; if (bufpages == 0) - bufpages = dmapages * bufcachepercent / 100; + bufpages = (b_highpages_total + b_dmapages_total) + * bufcachepercent / 100; bufhighpages = bufpages; + b_dmamaxpages = b_dmapages_total * bufcachepercent / 100; + + printf("buffer cache from %d dma pages and %d high pages\n", + b_dmapages_total, b_highpages_total); /* * set the base backoff level for the buffer cache to bufpages. * we will not allow uvm to steal back more than this number of * pages */ - buflowpages = dmapages * 10 / 100; + buflowpages = b_dmapages_total * 10 / 100; /* - * set bufbackpages to 100 pages, or 10 percent of the low water mark - * if we don't have that many pages. + * set bufbackpages to 1 MB worth or pages, or 10 percent of + * the low water mark if we don't have that many pages. 
*/ bufbackpages = buflowpages * 10 / 100; - if (bufbackpages > 100) - bufbackpages = 100; + + if (bufbackpages > (1048576 / PAGE_SIZE)) + bufbackpages = (1048576 / PAGE_SIZE); if (bufkvm == 0) bufkvm = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 10; @@ -238,15 +361,16 @@ bufinit(void) pool_setipl(&bufpool, IPL_BIO); for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) TAILQ_INIT(dp); + TAILQ_INIT(&bufqueue_da); /* * hmm - bufkvm is an argument because it's static, while * bufpages is global because it can change while running. - */ + */ buf_mem_init(bufkvm); - hidirtypages = (bufpages / 4) * 3; - lodirtypages = bufpages / 2; + hidirtypages = (b_dmamaxpages / 4) * 3; + lodirtypages = b_dmamaxpages / 2; /* * When we hit 95% of pages being clean, we bring them down to @@ -259,6 +383,39 @@ bufinit(void) } /* + * Flip some dma reachable cache pages high + */ +void +bufhigh(int delta) +{ + psize_t newdmapages; + struct buf *b; + int s; + + if (!b_highpages_total) + return; + s = splbio(); + newdmapages = bcstats.dmapages - delta; + while ((bcstats.dmapages > newdmapages) && + (b = TAILQ_FIRST(&bufqueue_da))) { + while (ISSET(b->b_flags, B_BUSY)) + b = TAILQ_NEXT(b, b_qda); + if (b != NULL) { + buf_acquire_unmapped(b); + /* move buffer to above dma reachable memory */ + buf_realloc_pages(b, &high_constraint); + if (ISSET(b->b_flags, B_DMA)) + panic("DMA flagged buffer after high flip %p", b); + TAILQ_REMOVE(&bufqueue_da, b, b_qda); + CLR(b->b_flags, B_DAQ); + buf_release(b); + } + } + wakeup(&needda); + splx(s); +} + +/* * Change cachepct */ void @@ -272,10 +429,19 @@ bufadjust(int newbufpages) int s; s = splbio(); + /* XXX for hibernate - throw away everything we can.*/ + if (newbufpages == 0) { + bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]); + while (bp) + bp = buf_remove_from_freelist(bp); + splx(s); + return; + } + bufpages = newbufpages; - hidirtypages = (bufpages / 4) * 3; - lodirtypages = bufpages / 2; + hidirtypages = (b_dmamaxpages / 4) * 3; + lodirtypages = b_dmamaxpages / 2; /* * When we hit 95% of pages being clean, we bring them down to @@ -291,16 +457,9 @@ bufadjust(int newbufpages) * free them up to get back down. this may possibly consume * all our clean pages... */ - while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && - (bcstats.numbufpages > bufpages)) { - bremfree(bp); - if (bp->b_vp) { - RB_REMOVE(buf_rb_bufs, - &bp->b_vp->v_bufs_tree, bp); - brelvp(bp); - } - buf_put(bp); - } + bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]); + while (bp && (bcstats.numbufpages > bufpages)) + bp = buf_remove_from_freelist(bp); /* * Wake up cleaner if we're getting low on pages. We might @@ -336,23 +495,39 @@ bufbackoff(struct uvm_constraint_range * * On success, it frees N pages from the buffer cache, and sets * a flag so that the next N allocations from buf_get will recycle * a buffer rather than allocate a new one. It then returns 0 to the - * caller. + * caller. * * on failure, it could free no pages from the buffer cache, does - * nothing and returns -1 to the caller. + * nothing and returns -1 to the caller. + */ + + psize_t d, s; + + /* + * back of by at least bufbackpages, or bufbackpages + what + * the pagedaemon needs if it happens to know when it calls us */ - long d; + s = (size > 0) ? 
bufbackpages + size : bufbackpages; - if (bufpages <= buflowpages) + if (bufpages <= buflowpages) return(-1); - if (bufpages - bufbackpages >= buflowpages) - d = bufbackpages; + if (bufpages - s >= buflowpages) + d = s; else d = bufpages - buflowpages; - backoffpages = bufbackpages; - bufadjust(bufpages - d); - backoffpages = bufbackpages; + + if (b_highpages_total + && (range->ucr_high <= dma_constraint.ucr_high)) { + if (bcstats.dmapages - s > b_dmamaxpages) + s += (bcstats.dmapages - b_dmamaxpages); + bufhigh(s); + } + else { + backoffpages = bufbackpages; + bufadjust(bufpages - d); + backoffpages = bufbackpages; + } return(0); } @@ -534,12 +709,18 @@ bread_cluster(struct vnode *vp, daddr64_ for (i = 1; i < howmany; i++) { bcstats.pendingreads++; bcstats.numreads++; - SET(xbpp[i]->b_flags, B_READ | B_ASYNC); + /* + * We set B_DMA here because bp above should be + * and we are playing buffer slice-n-dice games + * from the memory allocated in bp. + */ + SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC); xbpp[i]->b_blkno = sblkno + (i * inc); xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size; xbpp[i]->b_data = NULL; xbpp[i]->b_pobj = bp->b_pobj; xbpp[i]->b_poffs = bp->b_poffs + (i * size); + buf_daq_add(xbpp[i]); } KASSERT(bp->b_lblkno == blkno + 1); @@ -618,7 +799,7 @@ bwrite(struct buf *bp) reassignbuf(bp); } else curproc->p_stats->p_ru.ru_oublock++; - + /* Initiate disk write. Make sure the appropriate party is charged. */ bp->b_vp->v_numoutput++; @@ -793,6 +974,8 @@ brelse(struct buf *bp) CLR(bp->b_flags, B_WANTED); wakeup(bp); } + if (ISSET(bp->b_flags, B_DMA) && needda) + wakeup(&needda); if (bp->b_vp != NULL) RB_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp); @@ -833,19 +1016,6 @@ brelse(struct buf *bp) bcstats.freebufs++; CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED)); buf_release(bp); - - /* Wake up any processes waiting for any buffer to become free. */ - if (needbuffer) { - needbuffer--; - wakeup(&needbuffer); - } - - /* Wake up any processes waiting for _this_ buffer to become free. */ - if (ISSET(bp->b_flags, B_WANTED)) { - CLR(bp->b_flags, B_WANTED); - wakeup(bp); - } - splx(s); } @@ -981,16 +1151,9 @@ buf_get(struct vnode *vp, daddr64_t blkn * free down to the low water mark. */ if (bcstats.numcleanpages > hicleanpages) { - while (bcstats.numcleanpages > locleanpages) { - bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]); - bremfree(bp); - if (bp->b_vp) { - RB_REMOVE(buf_rb_bufs, - &bp->b_vp->v_bufs_tree, bp); - brelvp(bp); - } - buf_put(bp); - } + bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]); + while (bp && (bcstats.numcleanpages > locleanpages)) + bp = buf_remove_from_freelist(bp); } npages = atop(round_page(size)); @@ -1002,15 +1165,9 @@ buf_get(struct vnode *vp, daddr64_t blkn || backoffpages) { int freemax = 5; int i = freemax; - while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && i--) { - bremfree(bp); - if (bp->b_vp) { - RB_REMOVE(buf_rb_bufs, - &bp->b_vp->v_bufs_tree, bp); - brelvp(bp); - } - buf_put(bp); - } + bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]); + while (bp && i--) + bp = buf_remove_from_freelist(bp); if (freemax == i && (bcstats.numbufpages + npages > bufpages)) { needbuffer++; @@ -1027,6 +1184,8 @@ buf_get(struct vnode *vp, daddr64_t blkn splx(s); return (NULL); } + /* Mark buffer as the cache's */ + SET(bp->b_flags, B_BC); bp->b_freelist.tqe_next = NOLIST; bp->b_synctime = time_uptime + 300; @@ -1041,7 +1200,7 @@ buf_get(struct vnode *vp, daddr64_t blkn * We insert the buffer into the hash with B_BUSY set * while we allocate pages for it. 
This way any getblk * that happens while we allocate pages will wait for - * this buffer instead of starting its own guf_get. + * this buffer instead of starting its own buf_get. * * But first, we check if someone beat us to it. */ @@ -1067,10 +1226,9 @@ buf_get(struct vnode *vp, daddr64_t blkn if (size) { buf_alloc_pages(bp, round_page(size)); buf_map(bp); + buf_daq_add(bp); } - splx(s); - return (bp); } @@ -1082,23 +1240,46 @@ buf_daemon(struct proc *p) { struct timeval starttime, timediff; struct buf *bp; - int s; + int s, nb, error; cleanerproc = curproc; s = splbio(); for (;;) { + struct buf *nbp; if (bcstats.numdirtypages < hidirtypages) tsleep(&bd_req, PRIBIO - 7, "cleaner", 0); getmicrouptime(&starttime); - +start: + nb = 0; while ((bp = TAILQ_FIRST(&bufqueues[BQ_DIRTY]))) { struct timeval tv; + nbp = TAILQ_NEXT(bp, b_freelist); if (bcstats.numdirtypages < lodirtypages) break; + /* + * If we haven't found any other buffers to + * process and this last one is busy, wait for + * it and restart. otherwise, continue and + * process the rest of them.. + * + */ + if ((nb == 0) && (nbp == NULL) && + ISSET(bp->b_flags, B_BUSY)) { + SET(bp->b_flags, B_WANTED); + error = tsleep(bp, PRIBIO + 1, "getblk", 0); + splx(s); + if (error) + return; + s = splbio(); + goto start; + } else { + continue; + } + nb++; bremfree(bp); buf_acquire(bp); splx(s); @@ -1132,7 +1313,6 @@ buf_daemon(struct proc *p) s = splbio(); if (timediff.tv_sec) break; - } } } Index: kern/vfs_biomem.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_biomem.c,v retrieving revision 1.17 diff -u -p -r1.17 vfs_biomem.c --- kern/vfs_biomem.c 7 Apr 2011 19:07:42 -0000 1.17 +++ kern/vfs_biomem.c 7 Jul 2011 21:17:09 -0000 @@ -33,6 +33,8 @@ TAILQ_HEAD(,buf) buf_valist; int buf_nkvmsleep; extern struct bcachestats bcstats; +extern int needbuffer; +extern int needda; /* * Pages are allocated from a uvm object (we only use it for page storage, @@ -99,6 +101,11 @@ buf_acquire_unmapped(struct buf *bp) s = splbio(); SET(bp->b_flags, B_BUSY|B_NOTMAPPED); + /* XXX */ + if (bp->b_data != NULL) { + TAILQ_REMOVE(&buf_valist, bp, b_valist); + bcstats.busymapped++; + } splx(s); } @@ -170,6 +177,24 @@ buf_release(struct buf *bp) } } CLR(bp->b_flags, B_BUSY|B_NOTMAPPED); + if (ISSET(bp->b_flags, B_DMA) && needda) { + wakeup(&needda); + } + /* Wake up any processes waiting for any buffer to become free. */ + if (needbuffer) { + needbuffer--; + wakeup(&needbuffer); + } + + /* + * Wake up any processes waiting for _this_ buffer to become + * free. + */ + + if (ISSET(bp->b_flags, B_WANTED)) { + CLR(bp->b_flags, B_WANTED); + wakeup(bp); + } splx(s); } @@ -286,6 +311,8 @@ buf_alloc_pages(struct buf *bp, vsize_t uvm_pagealloc_multi(buf_object, offs, size, UVM_PLA_WAITOK); bcstats.numbufpages += atop(size); + bcstats.dmapages += atop(size); + SET(bp->b_flags, B_DMA); bp->b_pobj = buf_object; bp->b_poffs = offs; bp->b_bufsize = size; @@ -302,6 +329,7 @@ buf_free_pages(struct buf *bp) KASSERT(bp->b_data == NULL); KASSERT(uobj != NULL); + KASSERT(!ISSET(bp->b_flags, B_DAQ)); s = splbio(); @@ -316,11 +344,57 @@ buf_free_pages(struct buf *bp) pg->wire_count = 0; uvm_pagefree(pg); bcstats.numbufpages--; + if (ISSET(bp->b_flags, B_DMA)) + bcstats.dmapages--; } + CLR(bp->b_flags, B_DMA); splx(s); } -/* - * XXX - it might make sense to make a buf_realloc_pages to avoid - * bouncing through the free list all the time. 
- */ +/* Reallocate a buf into a particular location specified by "where" */ +void +buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where) +{ + vaddr_t va; + int dma; + int s, i; + + s = splbio(); + KASSERT(ISSET(bp->b_flags, B_BUSY)); + dma = ISSET(bp->b_flags, B_DMA); + + /* if the original buf is mapped, unmap it */ + if (bp->b_data != NULL) { + va = (vaddr_t)bp->b_data; + pmap_kremove(va, bp->b_bufsize); + pmap_update(pmap_kernel()); + } + uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs, bp->b_bufsize, + UVM_PLA_WAITOK, where); + /* + * do this now, and put it back later when we know where we are + */ + if (dma) + bcstats.dmapages -= atop(bp->b_bufsize); + + dma = 1; + /* if the original buf was mapped, re-map it */ + for (i = 0; i < atop(bp->b_bufsize); i++) { + struct vm_page *pg = uvm_pagelookup(bp->b_pobj, + bp->b_poffs + ptoa(i)); + KASSERT(pg != NULL); + if (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg))) + dma = 0; + if (bp->b_data != NULL) { + pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg), + VM_PROT_READ|VM_PROT_WRITE); + pmap_update(pmap_kernel()); + } + } + if (dma) { + SET(bp->b_flags, B_DMA); + bcstats.dmapages += atop(bp->b_bufsize); + } else + CLR(bp->b_flags, B_DMA); + splx(s); +} Index: kern/vfs_vops.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_vops.c,v retrieving revision 1.4 diff -u -p -r1.4 vfs_vops.c --- kern/vfs_vops.c 2 Jul 2011 15:52:25 -0000 1.4 +++ kern/vfs_vops.c 6 Jul 2011 22:39:28 -0000 @@ -614,6 +614,17 @@ VOP_STRATEGY(struct buf *bp) if (bp->b_vp->v_op->vop_strategy == NULL) return (EOPNOTSUPP); + /* + * Flip buffer to dma reachable memory if + * necessary. + * + * XXX if you're making your own buffers and not + * having the buffer cache manage them then it's your + * problem to ensure they can be dma'ed to and from. + */ + if (ISSET(bp->b_flags, B_BC)) + buf_daq_add(bp); + return ((bp->b_vp->v_op->vop_strategy)(&a)); } Index: sys/buf.h =================================================================== RCS file: /cvs/src/sys/sys/buf.h,v retrieving revision 1.78 diff -u -p -r1.78 buf.h --- sys/buf.h 4 Jul 2011 04:30:41 -0000 1.78 +++ sys/buf.h 6 Jul 2011 22:41:29 -0000 @@ -144,6 +144,7 @@ struct buf { LIST_ENTRY(buf) b_list; /* All allocated buffers. */ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ + TAILQ_ENTRY(buf) b_qda; /* dma reachable queue position */ time_t b_synctime; /* Time this buffer should be flushed */ struct proc *b_proc; /* Associated proc; NULL if kernel. */ volatile long b_flags; /* B_* flags. */ @@ -214,12 +215,15 @@ struct buf { #define B_PDAEMON 0x00200000 /* I/O started by pagedaemon */ #define B_RELEASED 0x00400000 /* free this buffer after its kvm */ #define B_NOTMAPPED 0x00800000 /* BUSY, but not necessarily mapped */ +#define B_DMA 0x01000000 /* DMA reachable */ +#define B_DAQ 0x02000000 /* On the DMA reachable queue */ +#define B_BC 0x04000000 /* Belongs to the Buffer Cache */ #define B_BITS "\20\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY" \ "\006CACHE\007CALL\010DELWRI\011DONE\012EINTR\013ERROR" \ "\014INVAL\015NOCACHE\016PHYS\017RAW\020READ" \ "\021WANTED\022WRITEINPROG\023XXX(FORMAT)\024DEFERRED" \ - "\025SCANNED\026DAEMON\027RELEASED\030NOTMAPPED" + "\025SCANNED\026DAEMON\027RELEASED\030NOTMAPPED\031DMA\032DAQ\033BC" /* * This structure describes a clustered I/O. 
It is stored in the b_saveaddr @@ -276,6 +280,7 @@ void bremfree(struct buf *); void bufinit(void); void buf_dirty(struct buf *); void buf_undirty(struct buf *); +void buf_daq_add (struct buf *); int bwrite(struct buf *); struct buf *getblk(struct vnode *, daddr64_t, int, int, int); struct buf *geteblk(int); @@ -298,7 +303,8 @@ int buf_dealloc_mem(struct buf *); void buf_fix_mapping(struct buf *, vsize_t); void buf_alloc_pages(struct buf *, vsize_t); void buf_free_pages(struct buf *); - +struct uvm_constraint_range; +void buf_realloc_pages(struct buf *, struct uvm_constraint_range *); void minphys(struct buf *bp); int physio(void (*strategy)(struct buf *), dev_t dev, int flags, Index: sys/mount.h =================================================================== RCS file: /cvs/src/sys/sys/mount.h,v retrieving revision 1.105 diff -u -p -r1.105 mount.h --- sys/mount.h 6 Jul 2011 20:50:05 -0000 1.105 +++ sys/mount.h 6 Jul 2011 20:58:36 -0000 @@ -505,6 +505,7 @@ extern long buflowpages, bufhighpages, b #define BUFPAGES_INACT (((bcstats.numcleanpages - buflowpages) < 0) ? 0 \ : bcstats.numcleanpages - buflowpages) extern int bufcachepercent; +extern void bufhigh(int); extern void bufadjust(int); struct uvm_constraint_range; extern int bufbackoff(struct uvm_constraint_range*, long);
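To make the sizing change concrete, here is a small standalone userland sketch (not kernel code) of the arithmetic now done in bufinit() and the KERN_CACHEPCT sysctl: bufcachepercent is applied to dma'able plus high pages for the overall cache size, and the dma'able share is capped separately at the same percentage of the dma'able pages. The page counts and the 20% setting below are invented, assuming 4k pages, purely to show the numbers.

/* sizing.c -- invented page counts, 4k pages assumed; cc -o sizing sizing.c */
#include <stdio.h>

int
main(void)
{
	unsigned long b_dmapages_total = 1048576;	/* 4GB of dma'able pages */
	unsigned long b_highpages_total = 3145728;	/* 12GB above the dma range */
	int bufcachepercent = 20;			/* e.g. bumped via sysctl */

	/* the overall cache size now draws on both regions... */
	unsigned long bufpages = (b_highpages_total + b_dmapages_total) *
	    bufcachepercent / 100;
	/* ...but only this much of it may sit in dma'able memory */
	unsigned long b_dmamaxpages = b_dmapages_total * bufcachepercent / 100;

	printf("bufpages %lu (%lu MB), dma'able cap %lu (%lu MB)\n",
	    bufpages, bufpages * 4 / 1024,
	    b_dmamaxpages, b_dmamaxpages * 4 / 1024);
	return (0);
}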
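And here is a rough userland toy of the aging policy itself, again not kernel code: the dma reachable queue is modelled as a plain LRU list with a small hard cap standing in for b_dmamaxpages, and "flipping a buffer high" just drops it off that list. The struct, the cap and the helper names are made up; the real buf_daq_add()/buf_realloc_pages() in the diff additionally skip B_BUSY buffers, sleep on needda when nothing can be evicted, and track the B_DMA/B_DAQ flags and bcstats.dmapages.

/* toy_daq.c -- cc -o toy_daq toy_daq.c */
#include <sys/queue.h>
#include <stdio.h>

#define DMA_MAX_BUFS	4	/* stand-in for b_dmamaxpages */

struct tbuf {
	TAILQ_ENTRY(tbuf)	t_qda;	/* stand-in for b_qda */
	int			t_id;
	int			t_dma;	/* stand-in for B_DMA */
};

TAILQ_HEAD(, tbuf) dma_q = TAILQ_HEAD_INITIALIZER(dma_q);
int dma_bufs;

/* "flip" the least recently used dma buffer above the dma range */
void
flip_high(void)
{
	struct tbuf *b = TAILQ_FIRST(&dma_q);

	TAILQ_REMOVE(&dma_q, b, t_qda);
	b->t_dma = 0;
	dma_bufs--;
	printf("buf %d flipped above the dma range\n", b->t_id);
}

/* rough analogue of buf_daq_add(): make b dma reachable and most recent */
void
daq_add(struct tbuf *b)
{
	if (b->t_dma) {
		/* already dma reachable: just move it to the tail (MRU) */
		TAILQ_REMOVE(&dma_q, b, t_qda);
	} else {
		/* evict old buffers until we are back under the cap */
		while (dma_bufs >= DMA_MAX_BUFS)
			flip_high();
		b->t_dma = 1;
		dma_bufs++;
		printf("buf %d flipped back into the dma range\n", b->t_id);
	}
	TAILQ_INSERT_TAIL(&dma_q, b, t_qda);
}

int
main(void)
{
	struct tbuf bufs[8];
	int i;

	for (i = 0; i < 8; i++) {
		bufs[i].t_id = i;
		bufs[i].t_dma = 0;
		daq_add(&bufs[i]);	/* "I/O" touches each buffer once */
	}
	daq_add(&bufs[0]);	/* touching 0 again evicts the oldest buffer */
	return (0);
}

Running it shows bufs 0-3 being flipped high one at a time as 4-7 are touched, and buf 4 going when 0 comes back: the least recently used dma buffers are the ones pushed above the dma range, which is also the front-of-queue order bufhigh() evicts in when bufbackoff() asks for dma'able pages back.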