Ok let's try this again: > > I could use some assistance in testing this, particularly on some of > the more odd architectures. > > This diff makes a bunch of changes to the vfs name cache: > > 1) it gets rid of the global hash table and reverse hash table for namecache > entries. namecache entries > are now allocated and freed in two global LRU's - one for regular, and one > for negative entries. > > 2) Entries are no longer searched in the global lists, instead they are kept > track of in the relevant > vnodes. Since each vnode can be either the parent (directory) vnode of a > namecache entry, or the target of > the entry, we keep track of it in both ways in the vnode. We now use an rb > tree to search the namecache from > a directory vnode, and keep a list of which entries we are the target > vnode for. > > 3) (most importantly) namecache entries can now be allocated and freed. > > 4) cache_purge now actually does something rather than depending on vnode > horror to work. When recycling a vnode > cache_purge will now correctly clear the name cache entries associated with > the vnode. (before it basically > didn't do anything, and depended on us noticing the vnodes were being > recycled underneath us) > > This has been beat on a bunch, and appears not to slow anything down. I > do require some testing > and reports on other arch's, particularly the likes of sparc, vax, and strange > things if possible > > Thanks, > -Bob
I had this ready to go after c2k9, but with the instability in the tree I never committed it. I'd like some people to beat on it again please. This is a first step toward making vnodes something more generic and sane where hopefully we can allocate and deallocate them without major pain. I really need testing of this. - now updated to apply again. -Bob Index: kern/vfs_cache.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_cache.c,v retrieving revision 1.30 diff -u -r1.30 vfs_cache.c --- kern/vfs_cache.c 9 Jul 2009 22:29:56 -0000 1.30 +++ kern/vfs_cache.c 9 Aug 2009 18:30:12 -0000 @@ -68,26 +68,59 @@ /* * Structures associated with name caching. */ -LIST_HEAD(nchashhead, namecache) *nchashtbl; u_long nchash; /* size of hash table - 1 */ -long numcache; /* number of cache entries allocated */ -TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ -struct nchstats nchstats; /* cache effectiveness statistics */ - -LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl; -u_long ncvhash; /* size of hash table - 1 */ - -#define NCHASH(dvp, cnp) \ - hash32_buf(&(dvp)->v_id, sizeof((dvp)->v_id), (cnp)->cn_hash) & nchash +long numcache; /* total number of cache entries allocated */ +long numneg; /* number of negative cache entries */ -#define NCVHASH(vp) (vp)->v_id & ncvhash +TAILQ_HEAD(, namecache) nclruhead; /* Regular Entry LRU chain */ +TAILQ_HEAD(, namecache) nclruneghead; /* Negative Entry LRU chain */ +struct nchstats nchstats; /* cache effectiveness statistics */ int doingcache = 1; /* 1 => enable the cache */ struct pool nch_pool; +void cache_zap(struct namecache *); u_long nextvnodeid; +static int +namecache_compare(struct namecache *n1, struct namecache *n2) +{ + if (n1->nc_nlen == n2->nc_nlen) + return (memcmp(n1->nc_name, n2->nc_name, n1->nc_nlen)); + else + return (n1->nc_nlen - n2->nc_nlen); +} + +RB_GENERATE(namecache_rb_cache, namecache, n_rbcache, namecache_compare); + +/* + * blow away a namecache entry + */ +void 
+cache_zap(struct namecache *ncp) +{ + struct vnode *dvp = NULL; + + if (ncp->nc_vp != NULL) { + TAILQ_REMOVE(&nclruhead, ncp, nc_lru); + numcache--; + } else { + TAILQ_REMOVE(&nclruneghead, ncp, nc_neg); + numneg--; + } + if (ncp->nc_dvp) { + RB_REMOVE(namecache_rb_cache, &ncp->nc_dvp->v_nc_tree, ncp); + if (RB_EMPTY(&ncp->nc_dvp->v_nc_tree)) + dvp = ncp->nc_dvp; + } + if (ncp->nc_vp) + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_me); + pool_put(&nch_pool, ncp); + if (dvp) + vdrop(dvp); +} + /* * Look for a name in the cache. We don't do this if the segment name is * long, simply so the cache can avoid holding long names (which would @@ -107,7 +140,7 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct namecache *ncp; - struct nchashhead *ncpp; + struct namecache n; struct vnode *vp; struct proc *p = curproc; u_long vpid; @@ -125,14 +158,11 @@ return (-1); } - ncpp = &nchashtbl[NCHASH(dvp, cnp)]; - LIST_FOREACH(ncp, ncpp, nc_hash) { - if (ncp->nc_dvp == dvp && - ncp->nc_dvpid == dvp->v_id && - ncp->nc_nlen == cnp->cn_namelen && - !memcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen)) - break; - } + /* lookup in directory vnode's redblack tree */ + n.nc_nlen = cnp->cn_namelen; + memcpy(n.nc_name, cnp->cn_nameptr, n.nc_nlen); + ncp = RB_FIND(namecache_rb_cache, &dvp->v_nc_tree, &n); + if (ncp == NULL) { nchstats.ncs_miss++; return (-1); @@ -145,13 +175,10 @@ (cnp->cn_flags & ISLASTCN) == 0) { nchstats.ncs_neghits++; /* - * Move this slot to end of LRU chain, - * if not already there. + * Move this slot to end of the negative LRU chain, */ - if (TAILQ_NEXT(ncp, nc_lru) != NULL) { - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - } + TAILQ_REMOVE(&nclruneghead, ncp, nc_neg); + TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg); return (ENOENT); } else { nchstats.ncs_badhits++; @@ -220,12 +247,10 @@ nchstats.ncs_goodhits++; /* - * Move this slot to end of LRU chain, if not already there. 
+ * Move this slot to end of the regular LRU chain. */ - if (TAILQ_NEXT(ncp, nc_lru) != NULL) { - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - } + TAILQ_REMOVE(&nclruhead, ncp, nc_lru); + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); *vpp = vp; return (0); @@ -235,16 +260,7 @@ * the cache entry is invalid, or otherwise don't * want cache entry to exist. */ - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - - if (ncp->nc_vhash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_vhash); - ncp->nc_vhash.le_prev = NULL; - } - - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); + cache_zap(ncp); return (-1); } @@ -267,62 +283,51 @@ cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp) { struct namecache *ncp; - struct vnode *dvp; - struct ncvhashhead *nvcpp; + struct vnode *dvp = NULL; char *bp; if (!doingcache) goto out; - - nvcpp = &ncvhashtbl[NCVHASH(vp)]; - - LIST_FOREACH(ncp, nvcpp, nc_vhash) { - if (ncp->nc_vp == vp && - ncp->nc_vpid == vp->v_id && - (dvp = ncp->nc_dvp) != NULL && - /* avoid pesky '.' entries.. */ - dvp != vp && ncp->nc_dvpid == dvp->v_id) { - + TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_me) { + dvp = ncp->nc_dvp; + if (dvp && dvp != vp) + goto found; + } + goto miss; +found: #ifdef DIAGNOSTIC - if (ncp->nc_nlen == 1 && - ncp->nc_name[0] == '.') - panic("cache_revlookup: found entry for ."); - - if (ncp->nc_nlen == 2 && - ncp->nc_name[0] == '.' && - ncp->nc_name[1] == '.') - panic("cache_revlookup: found entry for .."); + if (ncp->nc_nlen == 1 && + ncp->nc_name[0] == '.') + panic("cache_revlookup: found entry for ."); + if (ncp->nc_nlen == 2 && + ncp->nc_name[0] == '.' 
&& + ncp->nc_name[1] == '.') + panic("cache_revlookup: found entry for .."); #endif - nchstats.ncs_revhits++; - - if (bufp != NULL) { - bp = *bpp; - bp -= ncp->nc_nlen; - if (bp <= bufp) { - *dvpp = NULL; - return (ERANGE); - } - memcpy(bp, ncp->nc_name, ncp->nc_nlen); - *bpp = bp; - } + nchstats.ncs_revhits++; - *dvpp = dvp; - - /* - * XXX: Should we vget() here to have more - * consistent semantics with cache_lookup()? - * - * For MP safety it might be necessary to do - * this here, while also protecting hash - * tables themselves to provide some sort of - * sane inter locking. - */ - return (0); + if (bufp != NULL) { + bp = *bpp; + bp -= ncp->nc_nlen; + if (bp <= bufp) { + *dvpp = NULL; + return (ERANGE); } + memcpy(bp, ncp->nc_name, ncp->nc_nlen); + *bpp = bp; } - nchstats.ncs_revmiss++; - out: + *dvpp = dvp; + + /* + * XXX: Should we vget() here to have more + * consistent semantics with cache_lookup()? + */ + return (0); + +miss: + nchstats.ncs_revmiss++; +out: *dvpp = NULL; return (-1); } @@ -334,57 +339,49 @@ cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct namecache *ncp; - struct nchashhead *ncpp; - struct ncvhashhead *nvcpp; if (!doingcache || cnp->cn_namelen > NCHNAMLEN) return; /* - * Free the cache slot at head of lru chain. + * allocate, or recycle (free and allocate) an ncp. */ - if (numcache < desiredvnodes) { - ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO); - numcache++; - } else if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) { - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - if (ncp->nc_hash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - } - if (ncp->nc_vhash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_vhash); - ncp->nc_vhash.le_prev = NULL; - } - } else - return; + if (numcache >= desiredvnodes) { + if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) + cache_zap(ncp); + else if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL) + cache_zap(ncp); + else + panic("wtf? 
leak?"); + } + ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO); + /* grab the vnode we just found */ ncp->nc_vp = vp; if (vp) ncp->nc_vpid = vp->v_id; + /* fill in cache info */ ncp->nc_dvp = dvp; ncp->nc_dvpid = dvp->v_id; ncp->nc_nlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, ncp->nc_name, (unsigned)ncp->nc_nlen); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - ncpp = &nchashtbl[NCHASH(dvp, cnp)]; - LIST_INSERT_HEAD(ncpp, ncp, nc_hash); - - /* - * Create reverse-cache entries (used in getcwd) for - * directories. - */ - - ncp->nc_vhash.le_prev = NULL; - ncp->nc_vhash.le_next = NULL; - - if (vp && vp != dvp && vp->v_type == VDIR && - (ncp->nc_nlen > 2 || - (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') || - (ncp->nc_nlen > 0 && ncp->nc_name[0] != '.'))) { - nvcpp = &ncvhashtbl[NCVHASH(vp)]; - LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash); + if (RB_EMPTY(&dvp->v_nc_tree)) { + vhold(dvp); + } + if (RB_INSERT(namecache_rb_cache, &dvp->v_nc_tree, ncp) != NULL) + panic("Ich habe eine Rotweinflasche in meinem Arsch\n"); + if (vp) { + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); + numcache++; + TAILQ_INSERT_TAIL(&vp->v_cache_dst, ncp, nc_me); + } else { + TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg); + numneg++; + } + if (numneg > desiredvnodes) { + if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL) + cache_zap(ncp); } } @@ -394,10 +391,8 @@ void nchinit() { - TAILQ_INIT(&nclruhead); - nchashtbl = hashinit(desiredvnodes, M_CACHE, M_WAITOK, &nchash); - ncvhashtbl = hashinit(desiredvnodes/8, M_CACHE, M_WAITOK, &ncvhash); + TAILQ_INIT(&nclruneghead); pool_init(&nch_pool, sizeof(struct namecache), 0, 0, 0, "nchpl", &pool_allocator_nointr); } @@ -410,18 +405,16 @@ cache_purge(struct vnode *vp) { struct namecache *ncp; - struct nchashhead *ncpp; + while ((ncp = TAILQ_FIRST(&vp->v_cache_dst))) + cache_zap(ncp); + while ((ncp = RB_ROOT(&vp->v_nc_tree))) + cache_zap(ncp); + + /* XXX this blows goats */ vp->v_id = ++nextvnodeid; - if (nextvnodeid != 0) - return; - for (ncpp = &nchashtbl[nchash]; 
ncpp >= nchashtbl; ncpp--) { - LIST_FOREACH(ncp, ncpp, nc_hash) { - ncp->nc_vpid = 0; - ncp->nc_dvpid = 0; - } - } - vp->v_id = ++nextvnodeid; + if (vp->v_id == 0) + vp->v_id = ++nextvnodeid; } /* @@ -437,6 +430,7 @@ { struct namecache *ncp, *nxtcp; + /* whack the regular entries */ for (ncp = TAILQ_FIRST(&nclruhead); ncp != TAILQ_END(&nclruhead); ncp = nxtcp) { if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) { @@ -444,19 +438,20 @@ continue; } /* free the resources we had */ - ncp->nc_vp = NULL; - ncp->nc_dvp = NULL; - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - if (ncp->nc_hash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - } - if (ncp->nc_vhash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_vhash); - ncp->nc_vhash.le_prev = NULL; - } + cache_zap(ncp); /* cause rescan of list, it may have altered */ nxtcp = TAILQ_FIRST(&nclruhead); - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); + } + /* whack the negative entries */ + for (ncp = TAILQ_FIRST(&nclruneghead); ncp != TAILQ_END(&nclruneghead); + ncp = nxtcp) { + if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) { + nxtcp = TAILQ_NEXT(ncp, nc_neg); + continue; + } + /* free the resources we had */ + cache_zap(ncp); + /* cause rescan of list, it may have altered */ + nxtcp = TAILQ_FIRST(&nclruneghead); } } Index: kern/vfs_subr.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_subr.c,v retrieving revision 1.180 diff -u -r1.180 vfs_subr.c --- kern/vfs_subr.c 2 Aug 2009 16:28:40 -0000 1.180 +++ kern/vfs_subr.c 9 Aug 2009 18:30:12 -0000 @@ -360,6 +360,8 @@ splx(s); vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO); RB_INIT(&vp->v_bufs_tree); + RB_INIT(&vp->v_nc_tree); + TAILQ_INIT(&vp->v_cache_dst); numvnodes++; } else { for (vp = TAILQ_FIRST(listhd); vp != NULLVP; Index: sys/namei.h =================================================================== RCS file: /cvs/src/sys/sys/namei.h,v retrieving revision 1.22 diff -u -r1.22 namei.h --- 
sys/namei.h 29 Aug 2008 08:57:28 -0000 1.22 +++ sys/namei.h 9 Aug 2009 18:30:12 -0000 @@ -36,6 +36,11 @@ #define _SYS_NAMEI_H_ #include <sys/queue.h> +#include <sys/tree.h> + +struct namecache; +struct namecache_rb_cache; +RB_PROTOTYPE(namecache_rb_cache, namecache, n_rbcache, namecache_compare); /* * Encapsulation of namei parameters. @@ -156,9 +161,10 @@ #define NCHNAMLEN 31 /* maximum name segment length we bother with */ struct namecache { - LIST_ENTRY(namecache) nc_hash; /* hash chain */ - LIST_ENTRY(namecache) nc_vhash; /* (reverse) directory hash chain */ - TAILQ_ENTRY(namecache) nc_lru; /* LRU chain */ + TAILQ_ENTRY(namecache) nc_lru; /* Regular Entry LRU chain */ + TAILQ_ENTRY(namecache) nc_neg; /* Negative Entry LRU chain */ + RB_ENTRY(namecache) n_rbcache; /* Namecache rb tree from vnode */ + TAILQ_ENTRY(namecache) nc_me; /* ncp's referring to me */ struct vnode *nc_dvp; /* vnode of parent of name */ u_long nc_dvpid; /* capability number of nc_dvp */ struct vnode *nc_vp; /* vnode the name refers to */ Index: sys/vnode.h =================================================================== RCS file: /cvs/src/sys/sys/vnode.h,v retrieving revision 1.102 diff -u -r1.102 vnode.h --- sys/vnode.h 2 Aug 2009 16:28:40 -0000 1.102 +++ sys/vnode.h 9 Aug 2009 18:30:12 -0000 @@ -36,6 +36,7 @@ #include <sys/types.h> #include <sys/queue.h> #include <sys/lock.h> +#include <sys/namei.h> #include <sys/selinfo.h> #include <sys/tree.h> @@ -82,6 +83,7 @@ LIST_HEAD(buflists, buf); RB_HEAD(buf_rb_bufs, buf); +RB_HEAD(namecache_rb_cache, namecache); struct vnode { struct uvm_vnode v_uvm; /* uvm data */ @@ -110,6 +112,10 @@ struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ } v_un; + /* VFS namecache */ + struct namecache_rb_cache v_nc_tree; + TAILQ_HEAD(, namecache) v_cache_dst; /* cache entries to us */ + enum vtagtype v_tag; /* type of underlying data */ void *v_data; /* private data for fs */ struct selinfo v_selectinfo; /* identity of poller(s) */ @@ -247,8 +253,9 @@ extern int 
maxvnodes; /* XXX number of vnodes to allocate */ extern time_t syncdelay; /* time to delay syncing vnodes */ extern int rushjob; /* # of slots syncer should run ASAP */ +extern void vhold(struct vnode *); +extern void vdrop(struct vnode *); #endif /* _KERNEL */ - /* * Mods for exensibility.