(apologies for the earlier oops.)

  Want more buffer cache? Please give this a try.

This diff splits the buffer cache into two regions: the dma'able region,
and the region of memory above it.  Buffers are always allocated in the
dma'able region, and as they age they are moved above the dma'able
region if such memory exists.  I/O operations on buffers in high memory
flip the buffer back into dma'able memory first.

With this diff you can have huge tracts of buffer cache on amd64, but it
also needs testing on all archs.
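
For testing, the cache size can still be tuned at runtime through the
existing knob, e.g. as root:

	sysctl kern.bufcachepercent=90

and the "buffer cache from N dma pages and N high pages" line printed
by bufinit() at boot shows how much memory of each kind was found.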

Index: kern/kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.206
diff -u -p -r1.206 kern_sysctl.c
--- kern/kern_sysctl.c  5 Jul 2011 04:48:02 -0000       1.206
+++ kern/kern_sysctl.c  7 Jul 2011 21:09:33 -0000
@@ -112,6 +112,7 @@ extern struct disklist_head disklist;
 extern fixpt_t ccpu;
 extern  long numvnodes;
 extern u_int mcllivelocks;
+extern psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
 
 extern void nmbclust_update(void);
 
@@ -566,8 +567,8 @@ kern_sysctl(int *name, u_int namelen, vo
                return (sysctl_int(oldp, oldlenp, newp, newlen,
                    &rthreads_enabled));
        case KERN_CACHEPCT: {
-               u_int64_t dmapages;
-               int opct, pgs;
+               psize_t pgs;
+               int opct;
                opct = bufcachepercent;
                error = sysctl_int(oldp, oldlenp, newp, newlen,
                    &bufcachepercent);
@@ -577,11 +578,13 @@ kern_sysctl(int *name, u_int namelen, vo
                        bufcachepercent = opct;
                        return (EINVAL);
                }
-               dmapages = uvm_pagecount(&dma_constraint);
                if (bufcachepercent != opct) {
-                       pgs = bufcachepercent * dmapages / 100;
+                       pgs = (b_highpages_total + b_dmapages_total)
+                           * bufcachepercent / 100;
+                       b_dmamaxpages = b_dmapages_total * bufcachepercent
+                           / 100;
                        bufadjust(pgs); /* adjust bufpages */
-                       bufhighpages = bufpages; /* set high water mark */
+                       bufhighpages = bufpages;
                }
                return(0);
        }
Index: kern/spec_vnops.c
===================================================================
RCS file: /cvs/src/sys/kern/spec_vnops.c,v
retrieving revision 1.67
diff -u -p -r1.67 spec_vnops.c
--- kern/spec_vnops.c   5 Jul 2011 05:37:07 -0000       1.67
+++ kern/spec_vnops.c   6 Jul 2011 22:44:00 -0000
@@ -457,7 +457,9 @@ spec_strategy(void *v)
        struct vop_strategy_args *ap = v;
        struct buf *bp = ap->a_bp;
        int maj = major(bp->b_dev);
-       
+
+       if (!ISSET(bp->b_flags, B_DAQ) && ISSET(bp->b_flags, B_BC))
+               panic("bogus buf passed to spec_strategy");
        if (LIST_FIRST(&bp->b_dep) != NULL)
                buf_start(bp);
 
Index: kern/vfs_bio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.133
diff -u -p -r1.133 vfs_bio.c
--- kern/vfs_bio.c      6 Jul 2011 20:50:05 -0000       1.133
+++ kern/vfs_bio.c      7 Jul 2011 21:34:52 -0000
@@ -68,9 +68,13 @@
 #define        BQ_DIRTY        0               /* LRU queue with dirty buffers */
 #define        BQ_CLEAN        1               /* LRU queue with clean buffers */
 
-TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
-int needbuffer;
+struct uvm_constraint_range high_constraint;
 struct bio_ops bioops;
+TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
+TAILQ_HEAD(bqda, buf) bufqueue_da;
+psize_t b_dmapages_total, b_highpages_total, b_dmamaxpages;
+int needbuffer;
+int needda;
 
 /*
  * Buffer pool for I/O buffers.
@@ -87,12 +91,13 @@ void buf_put(struct buf *);
 
 struct buf *bio_doread(struct vnode *, daddr64_t, int, int);
 struct buf *buf_get(struct vnode *, daddr64_t, size_t);
+struct buf *buf_remove_from_freelist(struct buf *);
 void bread_cluster_callback(struct buf *);
 
 /*
  * We keep a few counters to monitor the utilization of the buffer cache
  *
- *  numbufpages   - number of pages totally allocated.
+ *  numbufpages          - number of pages totally allocated.
  *  numdirtypages - number of pages on BQ_DIRTY queue.
  *  lodirtypages  - low water mark for buffer cleaning daemon.
  *  hidirtypages  - high water mark for buffer cleaning daemon.
@@ -110,14 +115,112 @@ long hicleanpages;
 long maxcleanpages;
 long backoffpages;     /* backoff counter for page allocations */
 long buflowpages;      /* bufpages low water mark */
-long bufhighpages;     /* bufpages high water mark */
-long bufbackpages;     /* number of pages we back off when asked to shrink */
+long bufhighpages;     /* bufpages high water mark */
+long bufbackpages;     /* number of pages we back off when asked to shrink */
+
+/* XXX - should be defined here but we have md issues */
+extern int bufcachepercent;
 
 vsize_t bufkvm;
 
 struct proc *cleanerproc;
 int bd_req;                    /* Sleep point for cleaner daemon. */
 
+/* Nuke a buf off its freelist - returns next buf. Skips busy buffers. */
+struct buf *
+buf_remove_from_freelist(struct buf *bp)
+{
+       struct buf *nbp;
+       nbp = TAILQ_NEXT(bp, b_freelist);
+       /* skip busy buffers */
+       if (!ISSET(bp->b_flags, B_BUSY)) {
+               bremfree(bp);
+               if (bp->b_vp) {
+                       RB_REMOVE(buf_rb_bufs,
+                           &bp->b_vp->v_bufs_tree, bp);
+                       brelvp(bp);
+               }
+               buf_put(bp);
+       }
+       return(nbp);
+}
+/*
+ * Add buf to the head of the dma reachable queue
+ * and ensure that it is dma reachable.
+ */
+void
+buf_daq_add(struct buf *buf)
+{
+       struct buf *b;
+       int s;
+
+start:
+       KASSERT(ISSET(buf->b_flags, B_BC));
+       KASSERT(ISSET(buf->b_flags, B_BUSY));
+       KASSERT(buf->b_pobj != NULL);
+       s = splbio();
+       /*
+        * if we are adding to the queue, ensure we free down below the
+        * max
+        */
+       while (b_highpages_total && 
+           (!ISSET(buf->b_flags, B_DAQ)) && (!ISSET(buf->b_flags, B_DMA)) &&
+           (bcstats.dmapages > (b_dmamaxpages - atop(buf->b_bufsize)))) {
+               b = TAILQ_FIRST(&bufqueue_da);
+               /* find first non-busy buffer */
+               while (b && ISSET(b->b_flags, B_BUSY))
+                       b = TAILQ_NEXT(b, b_qda);
+               if (b == NULL) {
+                       /* no non-busy buffers. */
+                       needda++;
+                       tsleep(&needda, PRIBIO, "needda", 0);
+                       needda--;
+                       splx(s);
+                       goto start;
+               } else {
+                       if (b_highpages_total) {
+                               buf_acquire_unmapped(b);
+                               /* move buffer to above dma reachable memory */
+                               TAILQ_REMOVE(&bufqueue_da, b, b_qda);
+                               buf_realloc_pages(b, &high_constraint);
+                               if (ISSET(b->b_flags, B_DMA))
+                                       panic("B_DMA after high flip %p", b);
+                               CLR(b->b_flags, B_DAQ);
+                               buf_release(b);
+                               splx(s);
+                               goto start;
+                       } else {
+                               /* no high pages to flip to. */
+                               needda++;
+                               tsleep(&needda, PRIBIO, "needda", 0);
+                               needda--;
+                               splx(s);
+                               goto start;
+                       }
+               }
+       }
+       /* don't copy it if it's already in dma reachable memory */
+       if (ISSET(buf->b_flags, B_DMA)) {
+               /* buf already there, just move it to the end */
+               if (ISSET(buf->b_flags, B_DAQ))
+                       TAILQ_REMOVE(&bufqueue_da, buf, b_qda);
+               TAILQ_INSERT_TAIL(&bufqueue_da, buf, b_qda);
+               SET(buf->b_flags, B_DAQ);
+       } else {
+               if (ISSET(buf->b_flags, B_DAQ))
+                       panic("non-dma buffer on dma queue %p", buf);
+               /* move buf to dma reachable memory */
+               buf_realloc_pages(buf, &dma_constraint);
+               if (!ISSET(buf->b_flags, B_DMA))
+                       panic("non-dma buffer after dma move %p", buf);
+               TAILQ_INSERT_TAIL(&bufqueue_da, buf, b_qda);
+               SET(buf->b_flags, B_DAQ);
+       }
+       splx(s);
+}
+
 void
 bremfree(struct buf *bp)
 {
@@ -139,11 +242,10 @@ bremfree(struct buf *bp)
                if (dp == &bufqueues[BQUEUES])
                        panic("bremfree: lost tail");
        }
-       if (!ISSET(bp->b_flags, B_DELWRI)) {
+       if (!ISSET(bp->b_flags, B_DELWRI))
                bcstats.numcleanpages -= atop(bp->b_bufsize);
-       } else {
+       else
                bcstats.numdirtypages -= atop(bp->b_bufsize);
-       }
        TAILQ_REMOVE(dp, bp, b_freelist);
        bcstats.freebufs--;
 }
@@ -175,7 +277,10 @@ buf_put(struct buf *bp)
                if (backoffpages < 0)
                        backoffpages = 0;
        }
-
+       if (ISSET(bp->b_flags, B_DAQ)) {
+               TAILQ_REMOVE(&bufqueue_da, bp, b_qda);
+               CLR(bp->b_flags, B_DAQ);
+       }
        if (buf_dealloc_mem(bp) != 0)
                return;
        pool_put(&bufpool, bp);
@@ -187,10 +292,22 @@ buf_put(struct buf *bp)
 void
 bufinit(void)
 {
-       u_int64_t dmapages;
        struct bqueues *dp;
 
-       dmapages = uvm_pagecount(&dma_constraint);
+       bufhighpages = buflowpages = bufpages = bufcachepercent = bufkvm = 0;
+       /*
+        * XXX note this really is "high" - i.e. *above* dma_constraint
+        */
+       high_constraint.ucr_low = dma_constraint.ucr_high;
+       high_constraint.ucr_high = no_constraint.ucr_high;
+
+       /* do we have memory above dma_constraint, or not? */
+       if (high_constraint.ucr_low != high_constraint.ucr_high) {
+               high_constraint.ucr_low++;
+               b_highpages_total = uvm_pagecount(&high_constraint);
+       } else 
+               b_highpages_total = 0;
+       b_dmapages_total = uvm_pagecount(&dma_constraint);
 
        /*
         * If MD code doesn't say otherwise, use 10% of kvm for mappings and
@@ -199,25 +316,31 @@ bufinit(void)
        if (bufcachepercent == 0)
                bufcachepercent = 10;
        if (bufpages == 0)
-               bufpages = dmapages * bufcachepercent / 100;
+               bufpages = (b_highpages_total + b_dmapages_total)
+                   * bufcachepercent / 100;
 
        bufhighpages = bufpages;
+       b_dmamaxpages = b_dmapages_total * bufcachepercent / 100;
+
+       printf("buffer cache from %lu dma pages and %lu high pages\n",
+           (unsigned long)b_dmapages_total, (unsigned long)b_highpages_total);
 
        /*
         * set the base backoff level for the buffer cache to bufpages.
         * we will not allow uvm to steal back more than this number of
         * pages
         */
-       buflowpages = dmapages * 10 / 100;
+       buflowpages = b_dmapages_total * 10 / 100;
 
        /*
-        * set bufbackpages to 100 pages, or 10 percent of the low water mark
-        * if we don't have that many pages.
+        * set bufbackpages to 1 MB worth of pages, or 10 percent of
+        * the low water mark if we don't have that many pages.
         */
 
        bufbackpages = buflowpages * 10 / 100;
-       if (bufbackpages > 100)
-               bufbackpages = 100;
+
+       if (bufbackpages > (1048576 / PAGE_SIZE))
+               bufbackpages = (1048576 / PAGE_SIZE);
 
        if (bufkvm == 0)
                bufkvm = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 10;
@@ -238,15 +361,16 @@ bufinit(void)
        pool_setipl(&bufpool, IPL_BIO);
        for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
                TAILQ_INIT(dp);
+       TAILQ_INIT(&bufqueue_da);
 
        /*
         * hmm - bufkvm is an argument because it's static, while
         * bufpages is global because it can change while running.
-        */
+        */
        buf_mem_init(bufkvm);
 
-       hidirtypages = (bufpages / 4) * 3;
-       lodirtypages = bufpages / 2;
+       hidirtypages = (b_dmamaxpages / 4) * 3;
+       lodirtypages = b_dmamaxpages / 2;
 
        /*
         * When we hit 95% of pages being clean, we bring them down to
@@ -259,6 +383,39 @@ bufinit(void)
 }
 
 /*
+ * Flip some dma reachable cache pages high
+ */
+void
+bufhigh(int delta)
+{
+       psize_t newdmapages;
+       struct buf *b;
+       int s;
+
+       if (!b_highpages_total)
+               return;
+       s = splbio();
+       newdmapages = bcstats.dmapages - delta;
+       while ((bcstats.dmapages > newdmapages) &&
+           (b = TAILQ_FIRST(&bufqueue_da))) {
+               /* skip busy buffers */
+               while (b && ISSET(b->b_flags, B_BUSY))
+                       b = TAILQ_NEXT(b, b_qda);
+               if (b == NULL)
+                       break;
+               buf_acquire_unmapped(b);
+               /* move buffer to above dma reachable memory */
+               buf_realloc_pages(b, &high_constraint);
+               if (ISSET(b->b_flags, B_DMA))
+                       panic("DMA flagged buffer after high flip %p", b);
+               TAILQ_REMOVE(&bufqueue_da, b, b_qda);
+               CLR(b->b_flags, B_DAQ);
+               buf_release(b);
+       }
+       wakeup(&needda);
+       splx(s);
+}
+
+/*
  * Change cachepct
  */
 void
@@ -272,10 +429,19 @@ bufadjust(int newbufpages)
        int s;
 
        s = splbio();
+       /* XXX for hibernate - throw away everything we can. */
+       if (newbufpages == 0) {
+               bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+               while (bp)
+                       bp = buf_remove_from_freelist(bp);
+               splx(s);
+               return;
+       }
+
        bufpages = newbufpages;
 
-       hidirtypages = (bufpages / 4) * 3;
-       lodirtypages = bufpages / 2;
+       hidirtypages = (b_dmamaxpages / 4) * 3;
+       lodirtypages = b_dmamaxpages / 2;
 
        /*
         * When we hit 95% of pages being clean, we bring them down to
@@ -291,16 +457,9 @@ bufadjust(int newbufpages)
         * free them up to get back down. this may possibly consume
         * all our clean pages...
         */
-       while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) &&
-           (bcstats.numbufpages > bufpages)) {
-               bremfree(bp);
-               if (bp->b_vp) {
-                       RB_REMOVE(buf_rb_bufs,
-                           &bp->b_vp->v_bufs_tree, bp);
-                       brelvp(bp);
-               }
-               buf_put(bp);
-       }
+       bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+       while (bp && (bcstats.numbufpages > bufpages))
+               bp = buf_remove_from_freelist(bp);
 
        /*
         * Wake up cleaner if we're getting low on pages. We might
@@ -336,23 +495,39 @@ bufbackoff(struct uvm_constraint_range *
         * On success, it frees N pages from the buffer cache, and sets
         * a flag so that the next N allocations from buf_get will recycle
         * a buffer rather than allocate a new one. It then returns 0 to the
-        * caller. 
+        * caller.
         *
         * on failure, it could free no pages from the buffer cache, does
-        * nothing and returns -1 to the caller. 
+        * nothing and returns -1 to the caller.
+        */
+
+       psize_t d, s;
+
+       /*
+        * back off by at least bufbackpages, or bufbackpages + what
+        * the pagedaemon asked for if it told us when it called us
         */
-       long d;
+       s = (size > 0) ? bufbackpages + size : bufbackpages;
 
-       if (bufpages <= buflowpages) 
+       if (bufpages <= buflowpages)
                return(-1);
 
-       if (bufpages - bufbackpages >= buflowpages)
-               d = bufbackpages;
+       if (bufpages - s >= buflowpages)
+               d = s;
        else
                d = bufpages - buflowpages;
-       backoffpages = bufbackpages;
-       bufadjust(bufpages - d);
-       backoffpages = bufbackpages;
+
+       if (b_highpages_total
+           && (range->ucr_high <= dma_constraint.ucr_high)) {
+               if (bcstats.dmapages - s > b_dmamaxpages)
+                       s += (bcstats.dmapages - b_dmamaxpages);
+               bufhigh(s);
+               bufhigh(s);
+       } else {
+               backoffpages = bufbackpages;
+               bufadjust(bufpages - d);
+               backoffpages = bufbackpages;
+       }
        return(0);
 }
 
@@ -534,12 +709,18 @@ bread_cluster(struct vnode *vp, daddr64_
        for (i = 1; i < howmany; i++) {
                bcstats.pendingreads++;
                bcstats.numreads++;
-               SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
+               /*
+                * We set B_DMA here because bp above should already be
+                * dma reachable, and we are playing buffer slice-n-dice
+                * games with the memory allocated in bp.
+                */
+               SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
                xbpp[i]->b_blkno = sblkno + (i * inc);
                xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
                xbpp[i]->b_data = NULL;
                xbpp[i]->b_pobj = bp->b_pobj;
                xbpp[i]->b_poffs = bp->b_poffs + (i * size);
+               buf_daq_add(xbpp[i]);
        }
 
        KASSERT(bp->b_lblkno == blkno + 1);
@@ -618,7 +799,7 @@ bwrite(struct buf *bp)
                reassignbuf(bp);
        } else
                curproc->p_stats->p_ru.ru_oublock++;
-       
+
 
        /* Initiate disk write.  Make sure the appropriate party is charged. */
        bp->b_vp->v_numoutput++;
@@ -793,6 +974,8 @@ brelse(struct buf *bp)
                                CLR(bp->b_flags, B_WANTED);
                                wakeup(bp);
                        }
+                       if (ISSET(bp->b_flags, B_DMA) && needda)
+                               wakeup(&needda);
                        if (bp->b_vp != NULL)
                                RB_REMOVE(buf_rb_bufs,
                                    &bp->b_vp->v_bufs_tree, bp);
@@ -833,19 +1016,6 @@ brelse(struct buf *bp)
        bcstats.freebufs++;
        CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
        buf_release(bp);
-
-       /* Wake up any processes waiting for any buffer to become free. */
-       if (needbuffer) {
-               needbuffer--;
-               wakeup(&needbuffer);
-       }
-
-       /* Wake up any processes waiting for _this_ buffer to become free. */
-       if (ISSET(bp->b_flags, B_WANTED)) {
-               CLR(bp->b_flags, B_WANTED);
-               wakeup(bp);
-       }
-
        splx(s);
 }
 
@@ -981,16 +1151,9 @@ buf_get(struct vnode *vp, daddr64_t blkn
                 * free down to the low water mark.
                 */
                if (bcstats.numcleanpages > hicleanpages) {
-                       while (bcstats.numcleanpages > locleanpages) {
-                               bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
-                               bremfree(bp);
-                               if (bp->b_vp) {
-                                       RB_REMOVE(buf_rb_bufs,
-                                           &bp->b_vp->v_bufs_tree, bp);
-                                       brelvp(bp);
-                               }
-                               buf_put(bp);
-                       }
+                       bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+                       while (bp && (bcstats.numcleanpages > locleanpages))
+                               bp = buf_remove_from_freelist(bp);
                }
 
                npages = atop(round_page(size));
@@ -1002,15 +1165,9 @@ buf_get(struct vnode *vp, daddr64_t blkn
                    || backoffpages) {
                        int freemax = 5;
                        int i = freemax;
-                       while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && i--) {
-                               bremfree(bp);
-                               if (bp->b_vp) {
-                                       RB_REMOVE(buf_rb_bufs,
-                                           &bp->b_vp->v_bufs_tree, bp);
-                                       brelvp(bp);
-                               }
-                               buf_put(bp);
-                       }
+                       bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+                       while (bp && i--)
+                               bp = buf_remove_from_freelist(bp);
                        if (freemax == i &&
                            (bcstats.numbufpages + npages > bufpages)) {
                                needbuffer++;
@@ -1027,6 +1184,8 @@ buf_get(struct vnode *vp, daddr64_t blkn
                splx(s);
                return (NULL);
        }
+       /* Mark buffer as the cache's */
+       SET(bp->b_flags, B_BC);
 
        bp->b_freelist.tqe_next = NOLIST;
        bp->b_synctime = time_uptime + 300;
@@ -1041,7 +1200,7 @@ buf_get(struct vnode *vp, daddr64_t blkn
                 * We insert the buffer into the hash with B_BUSY set
                 * while we allocate pages for it. This way any getblk
                 * that happens while we allocate pages will wait for
-                * this buffer instead of starting its own guf_get.
+                * this buffer instead of starting its own buf_get.
                 *
                 * But first, we check if someone beat us to it.
                 */
@@ -1067,10 +1226,9 @@ buf_get(struct vnode *vp, daddr64_t blkn
        if (size) {
                buf_alloc_pages(bp, round_page(size));
                buf_map(bp);
+               buf_daq_add(bp);
        }
-
        splx(s);
-
        return (bp);
 }
 
@@ -1082,23 +1240,46 @@ buf_daemon(struct proc *p)
 {
        struct timeval starttime, timediff;
        struct buf *bp;
-       int s;
+       int s, nb, error;
 
        cleanerproc = curproc;
 
        s = splbio();
        for (;;) {
+               struct buf *nbp;
                if (bcstats.numdirtypages < hidirtypages)
                        tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);
 
                getmicrouptime(&starttime);
-
+start:
+               nb = 0;
                while ((bp = TAILQ_FIRST(&bufqueues[BQ_DIRTY]))) {
                        struct timeval tv;
+                       nbp = TAILQ_NEXT(bp, b_freelist);
 
                        if (bcstats.numdirtypages < lodirtypages)
                                break;
 
+                       /*
+                        * If this buffer is busy and it is the only
+                        * one left to process, wait for it and
+                        * restart. Otherwise skip busy buffers and
+                        * process the rest of them.
+                        */
+                       if (ISSET(bp->b_flags, B_BUSY)) {
+                               if ((nb == 0) && (nbp == NULL)) {
+                                       SET(bp->b_flags, B_WANTED);
+                                       error = tsleep(bp, PRIBIO + 1,
+                                           "getblk", 0);
+                                       splx(s);
+                                       if (error)
+                                               return;
+                                       s = splbio();
+                                       goto start;
+                               }
+                               continue;
+                       }
+                       nb++;
                        bremfree(bp);
                        buf_acquire(bp);
                        splx(s);
@@ -1132,7 +1313,6 @@ buf_daemon(struct proc *p)
                        s = splbio();
                        if (timediff.tv_sec)
                                break;
-
                }
        }
 }
Index: kern/vfs_biomem.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_biomem.c,v
retrieving revision 1.17
diff -u -p -r1.17 vfs_biomem.c
--- kern/vfs_biomem.c   7 Apr 2011 19:07:42 -0000       1.17
+++ kern/vfs_biomem.c   7 Jul 2011 21:17:09 -0000
@@ -33,6 +33,8 @@ TAILQ_HEAD(,buf) buf_valist;
 int buf_nkvmsleep;
 
 extern struct bcachestats bcstats;
+extern int needbuffer;
+extern int needda;
 
 /*
  * Pages are allocated from a uvm object (we only use it for page storage,
@@ -99,6 +101,11 @@ buf_acquire_unmapped(struct buf *bp)
 
        s = splbio();
        SET(bp->b_flags, B_BUSY|B_NOTMAPPED);
+       /* XXX - if mapped, take it off the valist as buf_acquire() does */
+       if (bp->b_data != NULL) {
+               TAILQ_REMOVE(&buf_valist, bp, b_valist);
+               bcstats.busymapped++;
+       }
        splx(s);
 }
 
@@ -170,6 +177,24 @@ buf_release(struct buf *bp)
                }
        }
        CLR(bp->b_flags, B_BUSY|B_NOTMAPPED);
+       if (ISSET(bp->b_flags, B_DMA) && needda)
+               wakeup(&needda);
+       /* Wake up any processes waiting for any buffer to become free. */
+       if (needbuffer) {
+               needbuffer--;
+               wakeup(&needbuffer);
+       }
+
+       /*
+        * Wake up any processes waiting for _this_ buffer to become
+        * free.
+        */
+       if (ISSET(bp->b_flags, B_WANTED)) {
+               CLR(bp->b_flags, B_WANTED);
+               wakeup(bp);
+       }
        splx(s);
 }
 
@@ -286,6 +311,8 @@ buf_alloc_pages(struct buf *bp, vsize_t 
 
        uvm_pagealloc_multi(buf_object, offs, size, UVM_PLA_WAITOK);
        bcstats.numbufpages += atop(size);
+       bcstats.dmapages += atop(size);
+       SET(bp->b_flags, B_DMA);
        bp->b_pobj = buf_object;
        bp->b_poffs = offs;
        bp->b_bufsize = size;
@@ -302,6 +329,7 @@ buf_free_pages(struct buf *bp)
 
        KASSERT(bp->b_data == NULL);
        KASSERT(uobj != NULL);
+       KASSERT(!ISSET(bp->b_flags, B_DAQ));
 
        s = splbio();
 
@@ -316,11 +344,57 @@ buf_free_pages(struct buf *bp)
                pg->wire_count = 0;
                uvm_pagefree(pg);
                bcstats.numbufpages--;
+               if (ISSET(bp->b_flags, B_DMA))
+                       bcstats.dmapages--;
        }
+       CLR(bp->b_flags, B_DMA);
        splx(s);
 }
 
-/*
- * XXX - it might make sense to make a buf_realloc_pages to avoid
- *       bouncing through the free list all the time.
- */
+/* Reallocate a buf into a particular location specified by "where" */
+void
+buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where)
+{
+       vaddr_t va;
+       int dma;
+       int s, i;
+
+       s = splbio();
+       KASSERT(ISSET(bp->b_flags, B_BUSY));
+       dma = ISSET(bp->b_flags, B_DMA);
+
+       /* if the original buf is mapped, unmap it */
+       if (bp->b_data != NULL) {
+               va = (vaddr_t)bp->b_data;
+               pmap_kremove(va, bp->b_bufsize);
+               pmap_update(pmap_kernel());
+       }
+       uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs, bp->b_bufsize,
+           UVM_PLA_WAITOK, where);
+       /*
+        * do this now, and put it back later when we know where we are
+        */
+       if (dma)
+               bcstats.dmapages -= atop(bp->b_bufsize);
+
+       dma = 1;
+       /* check where the pages landed, and re-map them if needed */
+       for (i = 0; i < atop(bp->b_bufsize); i++) {
+               struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
+                   bp->b_poffs + ptoa(i));
+               KASSERT(pg != NULL);
+               if (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg)))
+                       dma = 0;
+               if (bp->b_data != NULL) {
+                       pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
+                           VM_PROT_READ|VM_PROT_WRITE);
+                       pmap_update(pmap_kernel());
+               }
+       }
+       if (dma) {
+               SET(bp->b_flags, B_DMA);
+               bcstats.dmapages += atop(bp->b_bufsize);
+       } else
+               CLR(bp->b_flags, B_DMA);
+       splx(s);
+}
Index: kern/vfs_vops.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_vops.c,v
retrieving revision 1.4
diff -u -p -r1.4 vfs_vops.c
--- kern/vfs_vops.c     2 Jul 2011 15:52:25 -0000       1.4
+++ kern/vfs_vops.c     6 Jul 2011 22:39:28 -0000
@@ -614,6 +614,17 @@ VOP_STRATEGY(struct buf *bp)
        if (bp->b_vp->v_op->vop_strategy == NULL)
                return (EOPNOTSUPP);
 
+       /*
+        * Flip buffer to dma reachable memory if
+        * necessary.
+        *
+        * XXX if you're making your own buffers and not
+        * having the buffer cache manage them then it's your
+        * problem to ensure they can be dma'ed to and from.
+        */
+       if (ISSET(bp->b_flags, B_BC))
+               buf_daq_add(bp);
+
        return ((bp->b_vp->v_op->vop_strategy)(&a));
 }
 
Index: sys/buf.h
===================================================================
RCS file: /cvs/src/sys/sys/buf.h,v
retrieving revision 1.78
diff -u -p -r1.78 buf.h
--- sys/buf.h   4 Jul 2011 04:30:41 -0000       1.78
+++ sys/buf.h   6 Jul 2011 22:41:29 -0000
@@ -144,6 +144,7 @@ struct buf {
        LIST_ENTRY(buf) b_list;         /* All allocated buffers. */
        LIST_ENTRY(buf) b_vnbufs;       /* Buffer's associated vnode. */
        TAILQ_ENTRY(buf) b_freelist;    /* Free list position if not active. */
+       TAILQ_ENTRY(buf) b_qda;         /* dma reachable queue position */
        time_t  b_synctime;             /* Time this buffer should be flushed */
        struct  proc *b_proc;           /* Associated proc; NULL if kernel. */
        volatile long   b_flags;        /* B_* flags. */
@@ -214,12 +215,15 @@ struct buf {
 #define        B_PDAEMON       0x00200000      /* I/O started by pagedaemon */
 #define        B_RELEASED      0x00400000      /* free this buffer after its kvm */
 #define        B_NOTMAPPED     0x00800000      /* BUSY, but not necessarily mapped */
+#define        B_DMA           0x01000000      /* DMA reachable */
+#define        B_DAQ           0x02000000      /* On the DMA reachable queue */
+#define        B_BC            0x04000000      /* Belongs to the Buffer Cache */
 
 #define        B_BITS  "\20\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY" \
     "\006CACHE\007CALL\010DELWRI\011DONE\012EINTR\013ERROR" \
     "\014INVAL\015NOCACHE\016PHYS\017RAW\020READ" \
     "\021WANTED\022WRITEINPROG\023XXX(FORMAT)\024DEFERRED" \
-    "\025SCANNED\026DAEMON\027RELEASED\030NOTMAPPED"
+    "\025SCANNED\026DAEMON\027RELEASED\030NOTMAPPED\031DMA\032DAQ\033BC"
 
 /*
  * This structure describes a clustered I/O.  It is stored in the b_saveaddr
@@ -276,6 +280,7 @@ void        bremfree(struct buf *);
 void   bufinit(void);
 void   buf_dirty(struct buf *);
 void    buf_undirty(struct buf *);
+void   buf_daq_add(struct buf *);
 int    bwrite(struct buf *);
 struct buf *getblk(struct vnode *, daddr64_t, int, int, int);
 struct buf *geteblk(int);
@@ -298,7 +303,8 @@ int buf_dealloc_mem(struct buf *);
 void   buf_fix_mapping(struct buf *, vsize_t);
 void   buf_alloc_pages(struct buf *, vsize_t);
 void   buf_free_pages(struct buf *);
-
+struct uvm_constraint_range;
+void   buf_realloc_pages(struct buf *, struct uvm_constraint_range *);
 
 void   minphys(struct buf *bp);
 int    physio(void (*strategy)(struct buf *), dev_t dev, int flags,
Index: sys/mount.h
===================================================================
RCS file: /cvs/src/sys/sys/mount.h,v
retrieving revision 1.105
diff -u -p -r1.105 mount.h
--- sys/mount.h 6 Jul 2011 20:50:05 -0000       1.105
+++ sys/mount.h 6 Jul 2011 20:58:36 -0000
@@ -505,6 +505,7 @@ extern long buflowpages, bufhighpages, b
 #define BUFPAGES_INACT (((bcstats.numcleanpages - buflowpages) < 0) ? 0 \
     : bcstats.numcleanpages - buflowpages)
 extern int bufcachepercent;
+extern void bufhigh(int);
 extern void bufadjust(int);
 struct uvm_constraint_range;
 extern int bufbackoff(struct uvm_constraint_range*, long);
