Ok, this *used* to work, but I have some test reports now that it will likely panic. so please hold off.
Sorry about that -Bob * Bob Beck <b...@openbsd.org> [2009-08-09 03:41]: > Ok let's try this again: > > > > > I could use some assistance in testing this, particularly on some of > > the more odd archetectures. > > > > This diff makes a bunch of changes to the vfs name cache: > > > > 1) it gets rid of the global hash table and reverse hash table for namecahe > > entries. namecache entries > > are now allocated and freed in two global LRU's - one for regular, and > > one for negative entries. > > > > 2) Entries are no longer searched in the global lists, instead they are > > kept track of in the relevant > > vnodes. Since each vnode can be either the parent (directory) vnode of a > > namecache entry, or the target of > > the entry, we keep track of it in both ways in the vnode. We now use a rb > > tree to search the namecache from > > a directory vnode, and keep a list of which entries that we are the target > > vnode. > > > > 3) (most importantly) namecache entries can now be allocated and freed. > > > > 4) cache_purge now actually does something rather than depending on vnode > > horror to work. when recycling a vnode > > cache_purge will now correctly clear the name cache entries associated with > > the vnode. (before it basically > > didn't do anything, and depended on us noticing the vnodes were being > > recycled underneath us) > > > > This has been beat on a bunch, and appears not to slow anything down. I > > do require some testing > > and reports on other arch's particularly the likes of sparc, vax, and > > strange things if possible > > > > Thanks, > > -Bob > > I had this ready to go after c2k9, but with the instability in the > tree I never committed it. I'd like some people to beat on it again please. > > This is a first step toward making vnodes something more generic and > sane where hopefully we can allocate and deallocate them without major > pain. I really need testing of this. - now updated to apply again. > > -Bob > > Index: kern/vfs_cache.c > =================================================================== > RCS file: /cvs/src/sys/kern/vfs_cache.c,v > retrieving revision 1.30 > diff -u -r1.30 vfs_cache.c > --- kern/vfs_cache.c 9 Jul 2009 22:29:56 -0000 1.30 > +++ kern/vfs_cache.c 9 Aug 2009 18:30:12 -0000 > @@ -68,26 +68,59 @@ > /* > * Structures associated with name caching. > */ > -LIST_HEAD(nchashhead, namecache) *nchashtbl; > u_long nchash; /* size of hash table - 1 */ > -long numcache; /* number of cache entries allocated */ > -TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ > -struct nchstats nchstats; /* cache effectiveness > statistics */ > - > -LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl; > -u_long ncvhash; /* size of hash table - 1 */ > - > -#define NCHASH(dvp, cnp) \ > - hash32_buf(&(dvp)->v_id, sizeof((dvp)->v_id), (cnp)->cn_hash) & nchash > +long numcache; /* total number of cache entries > allocated */ > +long numneg; /* number of negative cache entries */ > > -#define NCVHASH(vp) (vp)->v_id & ncvhash > +TAILQ_HEAD(, namecache) nclruhead; /* Regular Entry LRU chain */ > +TAILQ_HEAD(, namecache) nclruneghead; /* Negative Entry LRU > chain */ > +struct nchstats nchstats; /* cache effectiveness > statistics */ > > int doingcache = 1; /* 1 => enable the cache */ > > struct pool nch_pool; > > +void cache_zap(struct namecache *); > u_long nextvnodeid; > > +static int > +namecache_compare(struct namecache *n1, struct namecache *n2) > +{ > + if (n1->nc_nlen == n2->nc_nlen) > + return (memcmp(n1->nc_name, n2->nc_name, n1->nc_nlen)); > + else > + return (n1->nc_nlen - n2->nc_nlen); > +} > + > +RB_GENERATE(namecache_rb_cache, namecache, n_rbcache, namecache_compare); > + > +/* > + * blow away a namecache entry > + */ > +void > +cache_zap(struct namecache *ncp) > +{ > + struct vnode *dvp = NULL; > + > + if (ncp->nc_vp != NULL) { > + TAILQ_REMOVE(&nclruhead, ncp, nc_lru); > + numcache--; > + } else { > + TAILQ_REMOVE(&nclruneghead, ncp, nc_neg); > + numneg--; > + } > + if (ncp->nc_dvp) { > + RB_REMOVE(namecache_rb_cache, &ncp->nc_dvp->v_nc_tree, ncp); > + if (RB_EMPTY(&ncp->nc_dvp->v_nc_tree)) > + dvp = ncp->nc_dvp; > + } > + if (ncp->nc_vp) > + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_me); > + pool_put(&nch_pool, ncp); > + if (dvp) > + vdrop(dvp); > +} > + > /* > * Look for a name in the cache. We don't do this if the segment name is > * long, simply so the cache can avoid holding long names (which would > @@ -107,7 +140,7 @@ > cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname > *cnp) > { > struct namecache *ncp; > - struct nchashhead *ncpp; > + struct namecache n; > struct vnode *vp; > struct proc *p = curproc; > u_long vpid; > @@ -125,14 +158,11 @@ > return (-1); > } > > - ncpp = &nchashtbl[NCHASH(dvp, cnp)]; > - LIST_FOREACH(ncp, ncpp, nc_hash) { > - if (ncp->nc_dvp == dvp && > - ncp->nc_dvpid == dvp->v_id && > - ncp->nc_nlen == cnp->cn_namelen && > - !memcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen)) > - break; > - } > + /* lookup in directory vnode's redblack tree */ > + n.nc_nlen = cnp->cn_namelen; > + memcpy(n.nc_name, cnp->cn_nameptr, n.nc_nlen); > + ncp = RB_FIND(namecache_rb_cache, &dvp->v_nc_tree, &n); > + > if (ncp == NULL) { > nchstats.ncs_miss++; > return (-1); > @@ -145,13 +175,10 @@ > (cnp->cn_flags & ISLASTCN) == 0) { > nchstats.ncs_neghits++; > /* > - * Move this slot to end of LRU chain, > - * if not already there. > + * Move this slot to end of the negative LRU chain, > */ > - if (TAILQ_NEXT(ncp, nc_lru) != NULL) { > - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); > - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); > - } > + TAILQ_REMOVE(&nclruneghead, ncp, nc_neg); > + TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg); > return (ENOENT); > } else { > nchstats.ncs_badhits++; > @@ -220,12 +247,10 @@ > > nchstats.ncs_goodhits++; > /* > - * Move this slot to end of LRU chain, if not already there. > + * Move this slot to end of the regular LRU chain. > */ > - if (TAILQ_NEXT(ncp, nc_lru) != NULL) { > - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); > - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); > - } > + TAILQ_REMOVE(&nclruhead, ncp, nc_lru); > + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); > *vpp = vp; > return (0); > > @@ -235,16 +260,7 @@ > * the cache entry is invalid, or otherwise don't > * want cache entry to exist. > */ > - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); > - LIST_REMOVE(ncp, nc_hash); > - ncp->nc_hash.le_prev = NULL; > - > - if (ncp->nc_vhash.le_prev != NULL) { > - LIST_REMOVE(ncp, nc_vhash); > - ncp->nc_vhash.le_prev = NULL; > - } > - > - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); > + cache_zap(ncp); > return (-1); > } > > @@ -267,62 +283,51 @@ > cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char > *bufp) > { > struct namecache *ncp; > - struct vnode *dvp; > - struct ncvhashhead *nvcpp; > + struct vnode *dvp = NULL; > char *bp; > > if (!doingcache) > goto out; > - > - nvcpp = &ncvhashtbl[NCVHASH(vp)]; > - > - LIST_FOREACH(ncp, nvcpp, nc_vhash) { > - if (ncp->nc_vp == vp && > - ncp->nc_vpid == vp->v_id && > - (dvp = ncp->nc_dvp) != NULL && > - /* avoid pesky '.' entries.. */ > - dvp != vp && ncp->nc_dvpid == dvp->v_id) { > - > + TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_me) { > + dvp = ncp->nc_dvp; > + if (dvp && dvp != vp) > + goto found; > + } > + goto miss; > +found: > #ifdef DIAGNOSTIC > - if (ncp->nc_nlen == 1 && > - ncp->nc_name[0] == '.') > - panic("cache_revlookup: found entry for ."); > - > - if (ncp->nc_nlen == 2 && > - ncp->nc_name[0] == '.' && > - ncp->nc_name[1] == '.') > - panic("cache_revlookup: found entry for .."); > + if (ncp->nc_nlen == 1 && > + ncp->nc_name[0] == '.') > + panic("cache_revlookup: found entry for ."); > + if (ncp->nc_nlen == 2 && > + ncp->nc_name[0] == '.' && > + ncp->nc_name[1] == '.') > + panic("cache_revlookup: found entry for .."); > #endif > - nchstats.ncs_revhits++; > - > - if (bufp != NULL) { > - bp = *bpp; > - bp -= ncp->nc_nlen; > - if (bp <= bufp) { > - *dvpp = NULL; > - return (ERANGE); > - } > - memcpy(bp, ncp->nc_name, ncp->nc_nlen); > - *bpp = bp; > - } > + nchstats.ncs_revhits++; > > - *dvpp = dvp; > - > - /* > - * XXX: Should we vget() here to have more > - * consistent semantics with cache_lookup()? > - * > - * For MP safety it might be necessary to do > - * this here, while also protecting hash > - * tables themselves to provide some sort of > - * sane inter locking. > - */ > - return (0); > + if (bufp != NULL) { > + bp = *bpp; > + bp -= ncp->nc_nlen; > + if (bp <= bufp) { > + *dvpp = NULL; > + return (ERANGE); > } > + memcpy(bp, ncp->nc_name, ncp->nc_nlen); > + *bpp = bp; > } > - nchstats.ncs_revmiss++; > > - out: > + *dvpp = dvp; > + > + /* > + * XXX: Should we vget() here to have more > + * consistent semantics with cache_lookup()? > + */ > + return (0); > + > +miss: > + nchstats.ncs_revmiss++; > +out: > *dvpp = NULL; > return (-1); > } > @@ -334,57 +339,49 @@ > cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) > { > struct namecache *ncp; > - struct nchashhead *ncpp; > - struct ncvhashhead *nvcpp; > > if (!doingcache || cnp->cn_namelen > NCHNAMLEN) > return; > > /* > - * Free the cache slot at head of lru chain. > + * allocate, or recycle (free and allocate) an ncp. > */ > - if (numcache < desiredvnodes) { > - ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO); > - numcache++; > - } else if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) { > - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); > - if (ncp->nc_hash.le_prev != NULL) { > - LIST_REMOVE(ncp, nc_hash); > - ncp->nc_hash.le_prev = NULL; > - } > - if (ncp->nc_vhash.le_prev != NULL) { > - LIST_REMOVE(ncp, nc_vhash); > - ncp->nc_vhash.le_prev = NULL; > - } > - } else > - return; > + if (numcache >= desiredvnodes) { > + if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) > + cache_zap(ncp); > + else if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL) > + cache_zap(ncp); > + else > + panic("wtf? leak?"); > + } > + ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO); > + > /* grab the vnode we just found */ > ncp->nc_vp = vp; > if (vp) > ncp->nc_vpid = vp->v_id; > + > /* fill in cache info */ > ncp->nc_dvp = dvp; > ncp->nc_dvpid = dvp->v_id; > ncp->nc_nlen = cnp->cn_namelen; > bcopy(cnp->cn_nameptr, ncp->nc_name, (unsigned)ncp->nc_nlen); > - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); > - ncpp = &nchashtbl[NCHASH(dvp, cnp)]; > - LIST_INSERT_HEAD(ncpp, ncp, nc_hash); > - > - /* > - * Create reverse-cache entries (used in getcwd) for > - * directories. > - */ > - > - ncp->nc_vhash.le_prev = NULL; > - ncp->nc_vhash.le_next = NULL; > - > - if (vp && vp != dvp && vp->v_type == VDIR && > - (ncp->nc_nlen > 2 || > - (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') || > - (ncp->nc_nlen > 0 && ncp->nc_name[0] != '.'))) { > - nvcpp = &ncvhashtbl[NCVHASH(vp)]; > - LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash); > + if (RB_EMPTY(&dvp->v_nc_tree)) { > + vhold(dvp); > + } > + if (RB_INSERT(namecache_rb_cache, &dvp->v_nc_tree, ncp) != NULL) > + panic("Ich habe eine Rotweinflasche in meinem Arsch\n"); > + if (vp) { > + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); > + numcache++; > + TAILQ_INSERT_TAIL(&vp->v_cache_dst, ncp, nc_me); > + } else { > + TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg); > + numneg++; > + } > + if (numneg > desiredvnodes) { > + if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL) > + cache_zap(ncp); > } > } > > @@ -394,10 +391,8 @@ > void > nchinit() > { > - > TAILQ_INIT(&nclruhead); > - nchashtbl = hashinit(desiredvnodes, M_CACHE, M_WAITOK, &nchash); > - ncvhashtbl = hashinit(desiredvnodes/8, M_CACHE, M_WAITOK, &ncvhash); > + TAILQ_INIT(&nclruneghead); > pool_init(&nch_pool, sizeof(struct namecache), 0, 0, 0, "nchpl", > &pool_allocator_nointr); > } > @@ -410,18 +405,16 @@ > cache_purge(struct vnode *vp) > { > struct namecache *ncp; > - struct nchashhead *ncpp; > > + while ((ncp = TAILQ_FIRST(&vp->v_cache_dst))) > + cache_zap(ncp); > + while ((ncp = RB_ROOT(&vp->v_nc_tree))) > + cache_zap(ncp); > + > + /* XXX this blows goats */ > vp->v_id = ++nextvnodeid; > - if (nextvnodeid != 0) > - return; > - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { > - LIST_FOREACH(ncp, ncpp, nc_hash) { > - ncp->nc_vpid = 0; > - ncp->nc_dvpid = 0; > - } > - } > - vp->v_id = ++nextvnodeid; > + if (vp->v_id == 0) > + vp->v_id = ++nextvnodeid; > } > > /* > @@ -437,6 +430,7 @@ > { > struct namecache *ncp, *nxtcp; > > + /* whack the regular entries */ > for (ncp = TAILQ_FIRST(&nclruhead); ncp != TAILQ_END(&nclruhead); > ncp = nxtcp) { > if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) { > @@ -444,19 +438,20 @@ > continue; > } > /* free the resources we had */ > - ncp->nc_vp = NULL; > - ncp->nc_dvp = NULL; > - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); > - if (ncp->nc_hash.le_prev != NULL) { > - LIST_REMOVE(ncp, nc_hash); > - ncp->nc_hash.le_prev = NULL; > - } > - if (ncp->nc_vhash.le_prev != NULL) { > - LIST_REMOVE(ncp, nc_vhash); > - ncp->nc_vhash.le_prev = NULL; > - } > + cache_zap(ncp); > /* cause rescan of list, it may have altered */ > nxtcp = TAILQ_FIRST(&nclruhead); > - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); > + } > + /* whack the negative entries */ > + for (ncp = TAILQ_FIRST(&nclruneghead); ncp != TAILQ_END(&nclruneghead); > + ncp = nxtcp) { > + if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) { > + nxtcp = TAILQ_NEXT(ncp, nc_neg); > + continue; > + } > + /* free the resources we had */ > + cache_zap(ncp); > + /* cause rescan of list, it may have altered */ > + nxtcp = TAILQ_FIRST(&nclruneghead); > } > } > Index: kern/vfs_subr.c > =================================================================== > RCS file: /cvs/src/sys/kern/vfs_subr.c,v > retrieving revision 1.180 > diff -u -r1.180 vfs_subr.c > --- kern/vfs_subr.c 2 Aug 2009 16:28:40 -0000 1.180 > +++ kern/vfs_subr.c 9 Aug 2009 18:30:12 -0000 > @@ -360,6 +360,8 @@ > splx(s); > vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO); > RB_INIT(&vp->v_bufs_tree); > + RB_INIT(&vp->v_nc_tree); > + TAILQ_INIT(&vp->v_cache_dst); > numvnodes++; > } else { > for (vp = TAILQ_FIRST(listhd); vp != NULLVP; > Index: sys/namei.h > =================================================================== > RCS file: /cvs/src/sys/sys/namei.h,v > retrieving revision 1.22 > diff -u -r1.22 namei.h > --- sys/namei.h 29 Aug 2008 08:57:28 -0000 1.22 > +++ sys/namei.h 9 Aug 2009 18:30:12 -0000 > @@ -36,6 +36,11 @@ > #define _SYS_NAMEI_H_ > > #include <sys/queue.h> > +#include <sys/tree.h> > + > +struct namecache; > +struct namecache_rb_cache; > +RB_PROTOTYPE(namecache_rb_cache, namecache, n_rbcache, namecache_compare); > > /* > * Encapsulation of namei parameters. > @@ -156,9 +161,10 @@ > #define NCHNAMLEN 31 /* maximum name segment length we > bother with */ > > struct namecache { > - LIST_ENTRY(namecache) nc_hash; /* hash chain */ > - LIST_ENTRY(namecache) nc_vhash; /* (reverse) directory hash chain */ > - TAILQ_ENTRY(namecache) nc_lru; /* LRU chain */ > + TAILQ_ENTRY(namecache) nc_lru; /* Regular Entry LRU chain */ > + TAILQ_ENTRY(namecache) nc_neg; /* Negative Entry LRU chain */ > + RB_ENTRY(namecache) n_rbcache; /* Namecache rb tree from vnode */ > + TAILQ_ENTRY(namecache) nc_me; /* ncp's referring to me */ > struct vnode *nc_dvp; /* vnode of parent of name */ > u_long nc_dvpid; /* capability number of nc_dvp */ > struct vnode *nc_vp; /* vnode the name refers to */ > Index: sys/vnode.h > =================================================================== > RCS file: /cvs/src/sys/sys/vnode.h,v > retrieving revision 1.102 > diff -u -r1.102 vnode.h > --- sys/vnode.h 2 Aug 2009 16:28:40 -0000 1.102 > +++ sys/vnode.h 9 Aug 2009 18:30:12 -0000 > @@ -36,6 +36,7 @@ > #include <sys/types.h> > #include <sys/queue.h> > #include <sys/lock.h> > +#include <sys/namei.h> > #include <sys/selinfo.h> > #include <sys/tree.h> > > @@ -82,6 +83,7 @@ > LIST_HEAD(buflists, buf); > > RB_HEAD(buf_rb_bufs, buf); > +RB_HEAD(namecache_rb_cache, namecache); > > struct vnode { > struct uvm_vnode v_uvm; /* uvm data */ > @@ -110,6 +112,10 @@ > struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ > } v_un; > > + /* VFS namecache */ > + struct namecache_rb_cache v_nc_tree; > + TAILQ_HEAD(, namecache) v_cache_dst; /* cache entries to us */ > + > enum vtagtype v_tag; /* type of underlying data */ > void *v_data; /* private data for fs */ > struct selinfo v_selectinfo; /* identity of poller(s) */ > @@ -247,8 +253,9 @@ > extern int maxvnodes; /* XXX number of vnodes to > allocate */ > extern time_t syncdelay; /* time to delay syncing vnodes > */ > extern int rushjob; /* # of slots syncer should run > ASAP */ > +extern void vhold(struct vnode *); > +extern void vdrop(struct vnode *); > #endif /* _KERNEL */ > - > > /* > * Mods for exensibility. > -- #!/usr/bin/perl if ((not 0 && not 1) != (! 0 && ! 1)) { print "Larry and Tom must smoke some really primo stuff...\n"; }