On Fri, Apr 01, 2011 at 07:25:40AM -0600, Bob Beck wrote:
> 
> This diff makes the buffer cache allocate only out of dma'able
>  memory, along with a few pieces (flags) that I will need for the next step
>  of allowing it to touch high memory.
> 
>  Appears to behave well for me under load and builds survive it with
>  the buffer cache cranked up.
> 
>  I would like to get this in as a first step, since it allows me to progress,
>  and will allow others to fix some of the other problems with drivers without
> the buffer cache getting in the way with bigmem turned on.
> 
>  -Bob
> 
> ------8<----
> Index: sys/kern/vfs_bio.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/vfs_bio.c,v
> retrieving revision 1.127
> diff -u -r1.127 vfs_bio.c
> --- sys/kern/vfs_bio.c        13 Nov 2010 17:45:44 -0000      1.127
> +++ sys/kern/vfs_bio.c        1 Apr 2011 08:56:34 -0000
> @@ -191,19 +191,43 @@
>  void
>  bufinit(void)
>  {
> +     long low, high, dmapages, highpages;
>       struct bqueues *dp;
>  
>       /* XXX - for now */
>       bufhighpages = buflowpages = bufpages = bufcachepercent = bufkvm = 0;
>  
>       /*
> +      * First off, figure out how much of memory we can use. 
> +      *
> +      * XXX for now we only use dma-able memory
> +      *
> +      * XXX - this isn't completely accurate, because there may
> +      * be holes in the physical memory. This needs to be replaced
> +      * with a uvm_pmemrange function to tell us how many pages
> +      * are within a constraint range - but this is accurate enough
> +      * for now. 
> +      */
> +
> +     low = atop(dma_constraint.ucr_low);
> +     high = atop(dma_constraint.ucr_high);
> +     if (high >= physmem) {
> +             high = physmem;
> +             highpages = 0;
> +     } else
> +             highpages = physmem - high;
> +     /* XXX highpages not used yet but will be very soon. */

``not completely accurate'' is an understatement. Due to how memory
segments can work this could be off by a shittonne. Specifically, the
guess would likely be high, which means we'd try to overallocate buffer
cache pages.

I'm about to get on a train; I'll see if I can write that pmemrange
function for you.
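
To be concrete, the shape I have in mind is roughly this -- a sketch
only, the uvm_pagecount() name is made up and the physseg clipping is
from memory, so don't hold me to the details:

psize_t
uvm_pagecount(struct uvm_constraint_range *constraint)
{
	paddr_t low, high, seg_low, seg_high;
	psize_t count = 0;
	int lcv;

	low = atop(constraint->ucr_low);
	high = atop(constraint->ucr_high);

	/* walk the physical segments, clipping each one to the range */
	for (lcv = 0; lcv < vm_nphysseg; lcv++) {
		seg_low = MAX(low, vm_physmem[lcv].avail_start);
		seg_high = MIN(high, vm_physmem[lcv].avail_end);
		if (seg_low < seg_high)
			count += seg_high - seg_low;
	}
	return (count);
}

bufinit() could then do "dmapages = uvm_pagecount(&dma_constraint);"
and stop caring about holes.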

> +     dmapages = high - low;
> +
> +     /*
>        * If MD code doesn't say otherwise, use 10% of kvm for mappings and
> -      * 10% physmem for pages.
> +      * 10% of dmaable pages for cache pages.
>        */
>       if (bufcachepercent == 0)
>               bufcachepercent = 10;
>       if (bufpages == 0)
> -             bufpages = physmem * bufcachepercent / 100;
> +             bufpages = dmapages * bufcachepercent / 100;
>  
>       bufhighpages = bufpages;
>  
> @@ -212,7 +236,7 @@
>        * we will not allow uvm to steal back more than this number of
>        * pages
>        */
> -     buflowpages = physmem * 10 / 100;
> +     buflowpages = dmapages * 10 / 100;
>  
>       /*
>        * set bufbackpages to 100 pages, or 10 percent of the low water mark
> Index: sys/kern/vfs_biomem.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/vfs_biomem.c,v
> retrieving revision 1.14
> diff -u -r1.14 vfs_biomem.c
> --- sys/kern/vfs_biomem.c     30 Apr 2010 21:56:39 -0000      1.14
> +++ sys/kern/vfs_biomem.c     1 Apr 2011 05:32:21 -0000
> @@ -29,10 +29,19 @@
>  vaddr_t buf_kva_start, buf_kva_end;
>  int buf_needva;
>  TAILQ_HEAD(,buf) buf_valist;
> +void buf_realloc_pages(struct buf *, struct uvm_constraint_range *);
>  
>  int buf_nkvmsleep;
> +#if 0
> +extern int needda;
> +#endif

More da droppings -- do they really have to be in here?
> +extern void                  uvm_pagealloc_multi(struct uvm_object *, voff_t,
> +                         vsize_t, int);
> +extern void                  uvm_pagerealloc_multi(struct uvm_object *, voff_t,
> +                         vsize_t, int, struct uvm_constraint_range *);
These should go in one of the uvm headers.
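
i.e. something like this next to the other prototypes in uvm_extern.h
(placement is a suggestion, not gospel):

void	uvm_pagealloc_multi(struct uvm_object *, voff_t, vsize_t, int);
void	uvm_pagerealloc_multi(struct uvm_object *, voff_t, vsize_t, int,
	    struct uvm_constraint_range *);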
>  
>  extern struct bcachestats bcstats;
> +extern int needbuffer;
>  
>  /*
>   * Pages are allocated from a uvm object (we only use it for page storage,
> @@ -99,6 +108,10 @@
>  
>       s = splbio();
>       SET(bp->b_flags, B_BUSY|B_NOTMAPPED);
> +     if (bp->b_data != NULL) {
> +             TAILQ_REMOVE(&buf_valist, bp, b_valist);
> +             bcstats.busymapped++;
> +     }
>       splx(s);
>  }
>  
> @@ -170,6 +183,26 @@
>               }
>       }
>       CLR(bp->b_flags, B_BUSY|B_NOTMAPPED);
> +#if 0
> +     if (ISSET(bp->b_flags, B_DAQ) && needda) {
> +             wakeup(&needda);
> +     }
> +#endif
> +     /* Wake up any processes waiting for any buffer to become free. */
> +     if (needbuffer) {
> +             needbuffer--;
> +             wakeup(&needbuffer);
> +     }
> +
> +     /*
> +      * Wake up any processes waiting for _this_ buffer to become
> +      * free.
> +      */
> +
> +     if (ISSET(bp->b_flags, B_WANTED)) {
> +             CLR(bp->b_flags, B_WANTED);
> +             wakeup(bp);
> +     }

Is this a bug in the current code? Can I please have a full explanation
of why this change is being made (this and the previous chunk)? It looks
like it should be a separate commit.
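
For reference, the waiting side of this handshake looks roughly like
the following (paraphrased from memory of the buf_get()/getnewbuf()
path, so treat it as a sketch):

	/* a process that can't get a buffer records itself and sleeps */
	s = splbio();
	needbuffer++;
	tsleep(&needbuffer, PRIBIO, "needbuffer", 0);
	splx(s);

If buffers released through buf_release() previously never did the
matching wakeup, waiters could be left sleeping, which would make this
a genuine bug fix -- all the more reason for a separate, documented
commit.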


>       splx(s);
>  }
>  
> @@ -259,11 +292,11 @@
>       return (va);
>  }
>  
> +/* Always allocates in Device Accessible Memory */
>  void
>  buf_alloc_pages(struct buf *bp, vsize_t size)
>  {
> -     struct vm_page *pg;
> -     voff_t offs, i;
> +     voff_t offs;
>       int s;
>  
>       KASSERT(size == round_page(size));
> @@ -277,25 +310,51 @@
>  
>       KASSERT(buf_page_offset > 0);
>  
> -     for (i = 0; i < atop(size); i++) {
> -#if defined(DEBUG) || 1
> -             if ((pg = uvm_pagelookup(buf_object, offs + ptoa(i))))
> -                     panic("buf_alloc_pages: overlap buf: %p page: %p",
> -                         bp, pg);
> -#endif
> -
> -             while ((pg = uvm_pagealloc(buf_object, offs + ptoa(i),
> -                         NULL, 0)) == NULL) {
> -                     uvm_wait("buf_alloc_pages");
> -             }
> -             pg->wire_count = 1;
> -             atomic_clearbits_int(&pg->pg_flags, PG_BUSY);
> -             bcstats.numbufpages++;
> -     }
> -
> +     uvm_pagealloc_multi(buf_object, offs, size, UVM_PLA_WAITOK);
> +     bcstats.numbufpages += atop(size);
> +     SET(bp->b_flags, B_DMA);

I'm still curious whether just this step makes anything faster: it
prevents $bufsize calls to the alloc functions and replaces them with
one call to pmemrange. I discussed that with ariane ages ago; maybe
worth testing if anyone has a moment.

>       bp->b_pobj = buf_object;
>       bp->b_poffs = offs;
>       bp->b_bufsize = size;
> +     splx(s);
> +}
> +
> +/* reallocate into a particular location specified by "where" */
> +void
> +buf_realloc_pages(struct buf *bp, struct uvm_constraint_range *where)
> +{
> +     vaddr_t va;
> +     int dma = 1;
> +     int s, i;
> +
> +     s = splbio();
> +     KASSERT(ISSET(bp->b_flags, B_BUSY));
> +
> +     /* if the original buf is mapped, unmap it */
> +     if (bp->b_data != NULL) {
> +             va = (vaddr_t)bp->b_data;
> +             pmap_kremove(va, bp->b_bufsize);
> +             pmap_update(pmap_kernel());
> +     }
> +     uvm_pagerealloc_multi(bp->b_pobj, bp->b_poffs, bp->b_bufsize,
> +         UVM_PLA_WAITOK, where);
> +     /* if the original buf was mapped, re-map it */
> +     for (i = 0; i < atop(bp->b_bufsize); i++) {
> +             struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
> +                 bp->b_poffs + ptoa(i));
> +             KASSERT(pg != NULL);
> +             if (!PADDR_IS_DMA_REACHABLE(VM_PAGE_TO_PHYS(pg)))
> +                     dma = 0;
> +             if (bp->b_data != NULL) {
> +                     pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
> +                         VM_PROT_READ|VM_PROT_WRITE);
> +                     pmap_update(pmap_kernel());
> +             }
> +     }
> +     if (dma)
> +             SET(bp->b_flags, B_DMA);
> +     else
> +             CLR(bp->b_flags, B_DMA);

I worry that this function will do unnecessary work in one case (with
the da queue stuff). If we have all memory as B_DMAable, then every time
we realloc to !dmaable memory we cycle the pages for no real use
(reallocate, mark it dmaable, but as soon as the buffer drops we do it
again, for no use!). Sparc64, for example, will hurt with that.

For this diff's functionality, on the other hand, that's not much of a
worry, but it is a concern for later.
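
One way to avoid that churn would be an early bail-out at the top of
buf_realloc_pages() when the pages already satisfy the constraint.
A sketch against this diff's own helpers -- untested, and it would
still need to keep B_DMA in sync on the early return:

	int ok = 1;

	/* if every page already lands in the target range, don't cycle it */
	for (i = 0; i < atop(bp->b_bufsize); i++) {
		struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
		    bp->b_poffs + ptoa(i));
		paddr_t pa = VM_PAGE_TO_PHYS(pg);

		if (pa < where->ucr_low || pa > where->ucr_high) {
			ok = 0;
			break;
		}
	}
	if (ok) {
		splx(s);
		return;
	}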
>       splx(s);
>  }
>  
> Index: sys/sys/buf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/buf.h,v
> retrieving revision 1.74
> diff -u -r1.74 buf.h
> --- sys/sys/buf.h     22 Sep 2010 01:18:57 -0000      1.74
> +++ sys/sys/buf.h     1 Apr 2011 05:32:21 -0000
> @@ -144,6 +144,7 @@
>       LIST_ENTRY(buf) b_list;         /* All allocated buffers. */
>       LIST_ENTRY(buf) b_vnbufs;       /* Buffer's associated vnode. */
>       TAILQ_ENTRY(buf) b_freelist;    /* Free list position if not active. */
> +     TAILQ_ENTRY(buf) b_qda;         /* Device Accessible queue position */
>       time_t  b_synctime;             /* Time this buffer should be flushed */
>       struct  proc *b_proc;           /* Associated proc; NULL if kernel. */
>       volatile long   b_flags;        /* B_* flags. */

Does the da queue stuff really have to be in here? It doesn't fit with
the rest.

> @@ -189,6 +190,7 @@
>  /*
>   * These flags are kept in b_flags.
>   */
> +#define      B_WRITE         0x00000000      /* Write buffer (pseudo flag). */
>  #define      B_AGE           0x00000001      /* Move to age queue when I/O done. */
>  #define      B_NEEDCOMMIT    0x00000002      /* Needs committing to stable storage */
>  #define      B_ASYNC         0x00000004      /* Start I/O, do not wait. */
> @@ -197,29 +199,31 @@
>  #define      B_CACHE         0x00000020      /* Bread found us in the cache. */
>  #define      B_CALL          0x00000040      /* Call b_iodone from biodone. */
>  #define      B_DELWRI        0x00000080      /* Delay I/O until buffer reused. */
> +#define      B_PRIV          0x00000100      /* Privately allocated buffer */
>  #define      B_DONE          0x00000200      /* I/O completed. */
>  #define      B_EINTR         0x00000400      /* I/O was interrupted */
>  #define      B_ERROR         0x00000800      /* I/O error occurred. */
> -#define      B_INVAL         0x00002000      /* Does not contain valid info. */
> -#define      B_NOCACHE       0x00008000      /* Do not cache block after use. */
> -#define      B_PHYS          0x00040000      /* I/O to user memory. */
> -#define      B_RAW           0x00080000      /* Set by physio for raw transfers. */
> -#define      B_READ          0x00100000      /* Read buffer. */
> -#define      B_WANTED        0x00800000      /* Process wants this buffer. */
> -#define      B_WRITE         0x00000000      /* Write buffer (pseudo flag). */
> -#define      B_WRITEINPROG   0x01000000      /* Write in progress. */
> -#define      B_XXX           0x02000000      /* Debugging flag. */
> -#define      B_DEFERRED      0x04000000      /* Skipped over for cleaning */
> -#define      B_SCANNED       0x08000000      /* Block already pushed during sync */
> -#define      B_PDAEMON       0x10000000      /* I/O started by pagedaemon */
> -#define B_RELEASED   0x20000000      /* free this buffer after its kvm */
> -#define B_NOTMAPPED  0x40000000      /* BUSY, but not necessarily mapped */
> -
> -#define      B_BITS  "\010\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY\006CACHE" \
> -    "\007CALL\010DELWRI\012DONE\013EINTR\014ERROR" \
> -    "\016INVAL\020NOCACHE\023PHYS\024RAW\025READ" \
> -    "\030WANTED\031WRITEINPROG\032XXX\033DEFERRED" \
> -    "\034SCANNED\035PDAEMON"
> +#define      B_INVAL         0x00001000      /* Does not contain valid info. */
> +#define      B_NOCACHE       0x00002000      /* Do not cache block after use. */
> +#define      B_PHYS          0x00004000      /* I/O to user memory. */
> +#define      B_RAW           0x00008000      /* Set by physio for raw transfers. */
> +#define      B_READ          0x00010000      /* Read buffer. */
> +#define      B_WANTED        0x00020000      /* Process wants this buffer. */
> +#define      B_WRITEINPROG   0x00040000      /* Write in progress. */
> +#define      B_XXX           0x00080000      /* Debugging flag. */
> +#define      B_DEFERRED      0x00100000      /* Skipped over for cleaning */
> +#define      B_SCANNED       0x00200000      /* Block already pushed during sync */
> +#define      B_PDAEMON       0x00400000      /* I/O started by pagedaemon */
> +#define B_RELEASED   0x00800000      /* free this buffer after its kvm */
> +#define B_DMA                0x01000000      /* buffer is DMA reachable */
> +#define B_DAQ                0x02000000      /* buffer is on the DA queue */
> +#define B_NOTMAPPED  0x04000000      /* BUSY, but not necessarily mapped */
> +
> +#define      B_BITS  "\20\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY" \
> +    "\006CACHE\007CALL\010DELWRI\011PRIV\012DONE\013EINTR\014ERROR" \
> +    "\015INVAL\016NOCACHE\017PHYS\020RAW\021READ" \
> +    "\022WANTED\023WRITEINPROG\024XXX(FORMAT)\025DEFERRED" \
> +    "\026SCANNED\027PDAEMON\030RELEASED\031DMA\032DAQ\033NOTMAPPED"

I'd really like to see this flags fixing be in a separate commit.

IIRC this just adds B_PRIV and kills some dead ones, as well as
rearranging everything (that is OK by me, despite B_XXX).

>  
>  /*
>   * This structure describes a clustered I/O.  It is stored in the b_saveaddr
> Index: sys/uvm/uvm_page.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_page.c,v
> retrieving revision 1.102
> diff -u -r1.102 uvm_page.c
> --- sys/uvm/uvm_page.c        7 Aug 2010 03:50:02 -0000       1.102
> +++ sys/uvm/uvm_page.c        1 Apr 2011 05:32:21 -0000
> @@ -81,6 +81,11 @@
>  
>  #include <uvm/uvm.h>
>  
> +void                 uvm_pagealloc_multi(struct uvm_object *, voff_t,
> +                         vsize_t, int);
> +void                 uvm_pagerealloc_multi(struct uvm_object *, voff_t,
> +                         vsize_t, int, struct uvm_constraint_range *);
> +
>  /*
>   * for object trees
>   */
> @@ -801,6 +806,69 @@
>       pg->owner_tag = NULL;
>  #endif
>       UVM_PAGE_OWN(pg, "new alloc");
> +}
> +
> +/*
> + * interface used by the buffer cache to allocate a buffer at a time.
> + * The pages are allocated wired in DMA accessible memory
> + */
> +void
> +uvm_pagealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size, int flags)
> +{
> +     struct pglist    plist;
> +     struct vm_page  *pg;
> +     int              i, error;
> +
> +     TAILQ_INIT(&plist);
> +     error = uvm_pglistalloc(size, dma_constraint.ucr_low,
> +         dma_constraint.ucr_high, 0, 0, &plist, atop(round_page(size)),
> +         UVM_PLA_WAITOK);
> +     if (error)
> +             panic("wtf - uvm_pglistalloc returned %x", error);
> +     i = 0;
> +     while ((pg = TAILQ_FIRST(&plist)) != NULL) {
> +             pg->wire_count = 1;
> +             atomic_setbits_int(&pg->pg_flags, PG_CLEAN | PG_FAKE);
> +             KASSERT((pg->pg_flags & PG_DEV) == 0);
> +             TAILQ_REMOVE(&plist, pg, pageq);
> +             uvm_pagealloc_pg(pg, obj, off + ptoa(i++), NULL);
> +     }
> +}
> +
> +/*
> + * interface used by the buffer cache to reallocate a buffer at a time.
> + * The pages are reallocated wired outside the DMA accessible region.
> + *
> + */
> +void
> +uvm_pagerealloc_multi(struct uvm_object *obj, voff_t off, vsize_t size, int flags, struct uvm_constraint_range *where)
> +{
> +     struct pglist    plist;
> +     struct vm_page  *pg, *tpg;
> +     int              i, error;
> +     voff_t          offset;
> +
> +     TAILQ_INIT(&plist);
> +     if (size == 0)
> +             panic("size 0 uvm_pagerealloc");
> +     error = uvm_pglistalloc(size, where->ucr_low, where->ucr_high, 0,
> +         0, &plist, atop(round_page(size)), UVM_PLA_WAITOK);
> +     if (error)
> +             panic("wtf - uvm_pglistalloc returned %x", error);
> +     i = 0;
> +     while ((pg = TAILQ_FIRST(&plist)) != NULL) {
> +             offset = off + ptoa(i++);
> +             tpg = uvm_pagelookup(obj, offset);
> +             pg->wire_count = 1;
> +             atomic_setbits_int(&pg->pg_flags, PG_CLEAN | PG_FAKE);
> +             KASSERT((pg->pg_flags & PG_DEV) == 0);
> +             TAILQ_REMOVE(&plist, pg, pageq);
> +             uvm_pagecopy(tpg, pg);
> +             uvm_pagefree(tpg);
> +             uvm_pagealloc_pg(pg, obj, offset, NULL);
> +     }

Despite it being me who named these functions I still really hate the
names. Then again, that's a minor issue.

>  }
>  
>  /*
> Index: sys/uvm/uvm_pdaemon.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> retrieving revision 1.56
> diff -u -r1.56 uvm_pdaemon.c
> --- sys/uvm/uvm_pdaemon.c     26 Sep 2010 12:53:27 -0000      1.56
> +++ sys/uvm/uvm_pdaemon.c     1 Apr 2011 08:12:34 -0000
> @@ -241,9 +241,9 @@
>               /*
>                * get pages from the buffer cache, or scan if needed
>                */
> -             if (((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) ||
> -                 ((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) {
> -                     if (bufbackoff() == -1)
> +             if ((bufbackoff() == -1)
> +                 && (((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) ||
> +                 ((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))) {
>                               uvmpd_scan();
>               }

I can't recall why this was changed over. This means that if the
pagedaemon wakes then we will always back off the buffer cache even if
we have plenty of pages. Why is this?
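
To make the question concrete: I'd have expected the pre-diff nesting,
i.e. something shaped like this, where the cache is only backed off
once we know we're actually short:

	if (((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg) ||
	    ((uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) {
		if (bufbackoff() == -1)
			uvmpd_scan();
	}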
>  
> Index: sys/uvm/uvm_swap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
> retrieving revision 1.100
> diff -u -r1.100 uvm_swap.c
> --- sys/uvm/uvm_swap.c        21 Dec 2010 20:14:44 -0000      1.100
> +++ sys/uvm/uvm_swap.c        1 Apr 2011 05:32:21 -0000
> @@ -1952,7 +1952,8 @@
>        * fill in the bp.   we currently route our i/o through
>        * /dev/drum's vnode [swapdev_vp].
>        */
> -     bp->b_flags = B_BUSY | B_NOCACHE | B_RAW | (flags & (B_READ|B_ASYNC));
> +     bp->b_flags = B_PRIV | B_BUSY | B_NOCACHE | B_RAW | 
> +       (flags & (B_READ|B_ASYNC));

OK for this as a separate commit, just to mark it as none of the
bufcache's concern.

>       bp->b_proc = &proc0;    /* XXX */
>       bp->b_vnbufs.le_next = NOLIST;
>       if (bounce)

-- 
There is hopeful symbolism in the fact that flags do not wave in a
vacuum.
                -- Arthur C. Clarke
