I could use some assistance in testing this, particularly on some of the more odd architectures.
This diff makes a bunch of changes to the vfs name cache: 1) it gets rid of the global hash table and reverse hash table for namecahe entries. namecache entries are now allocated and freed in two global LRU's - one for regular, and one for negative entries. 2) Entries are no longer searched in the global lists, instead they are kept track of in the relevant vnodes. Since each vnode can be either the parent (directory) vnode of a namecache entry, or the target of the entry, we keep track of it in both ways in the vnode. We now use a rb tree to search the namecache from a directory vnode, and keep a list of which entries that we are the target vnode. 3) (most importantly) namecache entries can now be allocated and freed. 4) cache_purge now actually does something rather than depending on vnode horror to work. when recycling a vnode cache_purge will now correctly clear the name cache entries associated with the vnode. (before it basically didn't do anything, and depended on us noticing the vnodes were being recycled underneath us) This has been beat on a bunch, and appears not to slow anything down. I do require some testing and reports on other arch's particularly the likes of sparc, vax, and strange things if possible Thanks, -Bob Index: kern/vfs_cache.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_cache.c,v retrieving revision 1.29 diff -u -p kern/vfs_cache.c --- kern/vfs_cache.c 24 Oct 2008 00:22:57 -0000 1.29 +++ kern/vfs_cache.c 11 Jun 2009 21:32:21 -0000 @@ -68,27 +68,60 @@ /* * Structures associated with name caching. 
*/ -LIST_HEAD(nchashhead, namecache) *nchashtbl; u_long nchash; /* size of hash table - 1 */ -long numcache; /* number of cache entries allocated */ -TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ +long numcache; /* total number of cache entries allocated */ +long numneg; /* number of negative cache entries */ + +TAILQ_HEAD(, namecache) nclruhead; /* Regular Entry LRU chain */ +TAILQ_HEAD(, namecache) nclruneghead; /* Negative Entry LRU chain */ struct nchstats nchstats; /* cache effectiveness statistics */ -LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl; -u_long ncvhash; /* size of hash table - 1 */ - -#define NCHASH(dvp, cnp) \ - hash32_buf(&(dvp)->v_id, sizeof((dvp)->v_id), (cnp)->cn_hash) & nchash - -#define NCVHASH(vp) (vp)->v_id & ncvhash - int doingcache = 1; /* 1 => enable the cache */ struct pool nch_pool; +void cache_zap(struct namecache *); u_long nextvnodeid; +static int +namecache_compare(struct namecache *n1, struct namecache *n2) +{ + if (n1->nc_nlen == n2->nc_nlen) + return (memcmp(n1->nc_name, n2->nc_name, n1->nc_nlen)); + else + return (n1->nc_nlen - n2->nc_nlen); +} + +RB_GENERATE(namecache_rb_cache, namecache, n_rbcache, namecache_compare); + /* + * blow away a namecache entry + */ +void +cache_zap(struct namecache *ncp) +{ + struct vnode *dvp = NULL; + + if (ncp->nc_vp != NULL) { + TAILQ_REMOVE(&nclruhead, ncp, nc_lru); + numcache--; + } else { + TAILQ_REMOVE(&nclruneghead, ncp, nc_neg); + numneg--; + } + if (ncp->nc_dvp) { + RB_REMOVE(namecache_rb_cache, &ncp->nc_dvp->v_nc_tree, ncp); + if (RB_EMPTY(&ncp->nc_dvp->v_nc_tree)) + dvp = ncp->nc_dvp; + } + if (ncp->nc_vp) + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_me); + pool_put(&nch_pool, ncp); + if (dvp) + vdrop(dvp); +} + +/* * Look for a name in the cache. We don't do this if the segment name is * long, simply so the cache can avoid holding long names (which would * either waste space, or add greatly to the complexity). 
@@ -107,7 +140,7 @@ int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct namecache *ncp; - struct nchashhead *ncpp; + struct namecache n; struct vnode *vp; struct proc *p = curproc; u_long vpid; @@ -125,14 +158,11 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, st return (-1); } - ncpp = &nchashtbl[NCHASH(dvp, cnp)]; - LIST_FOREACH(ncp, ncpp, nc_hash) { - if (ncp->nc_dvp == dvp && - ncp->nc_dvpid == dvp->v_id && - ncp->nc_nlen == cnp->cn_namelen && - !memcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen)) - break; - } + /* lookup in directory vnode's redblack tree */ + n.nc_nlen = cnp->cn_namelen; + memcpy(n.nc_name, cnp->cn_nameptr, n.nc_nlen); + ncp = RB_FIND(namecache_rb_cache, &dvp->v_nc_tree, &n); + if (ncp == NULL) { nchstats.ncs_miss++; return (-1); @@ -145,13 +175,10 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, st (cnp->cn_flags & ISLASTCN) == 0) { nchstats.ncs_neghits++; /* - * Move this slot to end of LRU chain, - * if not already there. + * Move this slot to end of the negative LRU chain, */ - if (TAILQ_NEXT(ncp, nc_lru) != NULL) { - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - } + TAILQ_REMOVE(&nclruneghead, ncp, nc_neg); + TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg); return (ENOENT); } else { nchstats.ncs_badhits++; @@ -220,12 +247,10 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, st nchstats.ncs_goodhits++; /* - * Move this slot to end of LRU chain, if not already there. + * Move this slot to end of the regular LRU chain. */ - if (TAILQ_NEXT(ncp, nc_lru) != NULL) { - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - } + TAILQ_REMOVE(&nclruhead, ncp, nc_lru); + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); *vpp = vp; return (0); @@ -235,16 +260,7 @@ remove: * the cache entry is invalid, or otherwise don't * want cache entry to exist. 
*/ - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - - if (ncp->nc_vhash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_vhash); - ncp->nc_vhash.le_prev = NULL; - } - - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); + cache_zap(ncp); return (-1); } @@ -267,62 +283,51 @@ int cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp) { struct namecache *ncp; - struct vnode *dvp; - struct ncvhashhead *nvcpp; + struct vnode *dvp = NULL; char *bp; if (!doingcache) goto out; - - nvcpp = &ncvhashtbl[NCVHASH(vp)]; - - LIST_FOREACH(ncp, nvcpp, nc_vhash) { - if (ncp->nc_vp == vp && - ncp->nc_vpid == vp->v_id && - (dvp = ncp->nc_dvp) != NULL && - /* avoid pesky '.' entries.. */ - dvp != vp && ncp->nc_dvpid == dvp->v_id) { - + TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_me) { + dvp = ncp->nc_dvp; + if (dvp && dvp != vp) + goto found; + } + goto miss; +found: #ifdef DIAGNOSTIC - if (ncp->nc_nlen == 1 && - ncp->nc_name[0] == '.') - panic("cache_revlookup: found entry for ."); - - if (ncp->nc_nlen == 2 && - ncp->nc_name[0] == '.' && - ncp->nc_name[1] == '.') - panic("cache_revlookup: found entry for .."); + if (ncp->nc_nlen == 1 && + ncp->nc_name[0] == '.') + panic("cache_revlookup: found entry for ."); + if (ncp->nc_nlen == 2 && + ncp->nc_name[0] == '.' && + ncp->nc_name[1] == '.') + panic("cache_revlookup: found entry for .."); #endif - nchstats.ncs_revhits++; + nchstats.ncs_revhits++; - if (bufp != NULL) { - bp = *bpp; - bp -= ncp->nc_nlen; - if (bp <= bufp) { - *dvpp = NULL; - return (ERANGE); - } - memcpy(bp, ncp->nc_name, ncp->nc_nlen); - *bpp = bp; - } - - *dvpp = dvp; - - /* - * XXX: Should we vget() here to have more - * consistent semantics with cache_lookup()? - * - * For MP safety it might be necessary to do - * this here, while also protecting hash - * tables themselves to provide some sort of - * sane inter locking. 
- */ - return (0); + if (bufp != NULL) { + bp = *bpp; + bp -= ncp->nc_nlen; + if (bp <= bufp) { + *dvpp = NULL; + return (ERANGE); } + memcpy(bp, ncp->nc_name, ncp->nc_nlen); + *bpp = bp; } - nchstats.ncs_revmiss++; - out: + *dvpp = dvp; + + /* + * XXX: Should we vget() here to have more + * consistent semantics with cache_lookup()? + */ + return (0); + +miss: + nchstats.ncs_revmiss++; +out: *dvpp = NULL; return (-1); } @@ -334,58 +339,50 @@ void cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct namecache *ncp; - struct nchashhead *ncpp; - struct ncvhashhead *nvcpp; if (!doingcache || cnp->cn_namelen > NCHNAMLEN) return; /* - * Free the cache slot at head of lru chain. + * allocate, or recycle (free and allocate) an ncp. */ - if (numcache < desiredvnodes) { - ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO); - numcache++; - } else if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) { - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - if (ncp->nc_hash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - } - if (ncp->nc_vhash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_vhash); - ncp->nc_vhash.le_prev = NULL; - } - } else - return; + if (numcache >= desiredvnodes) { + if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) + cache_zap(ncp); + else if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL) + cache_zap(ncp); + else + panic("wtf? leak?"); + } + ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO); + /* grab the vnode we just found */ ncp->nc_vp = vp; if (vp) ncp->nc_vpid = vp->v_id; + /* fill in cache info */ ncp->nc_dvp = dvp; ncp->nc_dvpid = dvp->v_id; ncp->nc_nlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, ncp->nc_name, (unsigned)ncp->nc_nlen); - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); - ncpp = &nchashtbl[NCHASH(dvp, cnp)]; - LIST_INSERT_HEAD(ncpp, ncp, nc_hash); - - /* - * Create reverse-cache entries (used in getcwd) for - * directories. 
- */ - - ncp->nc_vhash.le_prev = NULL; - ncp->nc_vhash.le_next = NULL; - - if (vp && vp != dvp && vp->v_type == VDIR && - (ncp->nc_nlen > 2 || - (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') || - (ncp->nc_nlen > 0 && ncp->nc_name[0] != '.'))) { - nvcpp = &ncvhashtbl[NCVHASH(vp)]; - LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash); + if (RB_EMPTY(&dvp->v_nc_tree)) { + vhold(dvp); } + if (RB_INSERT(namecache_rb_cache, &dvp->v_nc_tree, ncp) != NULL) + panic("Ich habe eine Rotweinflasche in meinem Arsch\n"); + if (vp) { + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); + numcache++; + TAILQ_INSERT_TAIL(&vp->v_cache_dst, ncp, nc_me); + } else { + TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg); + numneg++; + } + if (numneg > desiredvnodes) { + if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL) + cache_zap(ncp); + } } /* @@ -394,10 +391,8 @@ cache_enter(struct vnode *dvp, struct vnode *vp, struc void nchinit() { - TAILQ_INIT(&nclruhead); - nchashtbl = hashinit(desiredvnodes, M_CACHE, M_WAITOK, &nchash); - ncvhashtbl = hashinit(desiredvnodes/8, M_CACHE, M_WAITOK, &ncvhash); + TAILQ_INIT(&nclruneghead); pool_init(&nch_pool, sizeof(struct namecache), 0, 0, 0, "nchpl", &pool_allocator_nointr); } @@ -410,18 +405,16 @@ void cache_purge(struct vnode *vp) { struct namecache *ncp; - struct nchashhead *ncpp; + while ((ncp = TAILQ_FIRST(&vp->v_cache_dst))) + cache_zap(ncp); + while ((ncp = RB_ROOT(&vp->v_nc_tree))) + cache_zap(ncp); + + /* XXX this blows goats */ vp->v_id = ++nextvnodeid; - if (nextvnodeid != 0) - return; - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { - LIST_FOREACH(ncp, ncpp, nc_hash) { - ncp->nc_vpid = 0; - ncp->nc_dvpid = 0; - } - } - vp->v_id = ++nextvnodeid; + if (vp->v_id == 0) + vp->v_id = ++nextvnodeid; } /* @@ -437,6 +430,7 @@ cache_purgevfs(struct mount *mp) { struct namecache *ncp, *nxtcp; + /* whack the regular entries */ for (ncp = TAILQ_FIRST(&nclruhead); ncp != TAILQ_END(&nclruhead); ncp = nxtcp) { if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) { 
@@ -444,19 +438,20 @@ cache_purgevfs(struct mount *mp) continue; } /* free the resources we had */ - ncp->nc_vp = NULL; - ncp->nc_dvp = NULL; - TAILQ_REMOVE(&nclruhead, ncp, nc_lru); - if (ncp->nc_hash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_hash); - ncp->nc_hash.le_prev = NULL; - } - if (ncp->nc_vhash.le_prev != NULL) { - LIST_REMOVE(ncp, nc_vhash); - ncp->nc_vhash.le_prev = NULL; - } + cache_zap(ncp); /* cause rescan of list, it may have altered */ nxtcp = TAILQ_FIRST(&nclruhead); - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); + } + /* whack the negative entries */ + for (ncp = TAILQ_FIRST(&nclruneghead); ncp != TAILQ_END(&nclruneghead); + ncp = nxtcp) { + if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) { + nxtcp = TAILQ_NEXT(ncp, nc_neg); + continue; + } + /* free the resources we had */ + cache_zap(ncp); + /* cause rescan of list, it may have altered */ + nxtcp = TAILQ_FIRST(&nclruneghead); } } Index: kern/vfs_subr.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_subr.c,v retrieving revision 1.177 diff -u -p kern/vfs_subr.c --- kern/vfs_subr.c 6 Jun 2009 18:06:22 -0000 1.177 +++ kern/vfs_subr.c 11 Jun 2009 21:32:21 -0000 @@ -360,6 +360,8 @@ getnewvnode(enum vtagtype tag, struct mount *mp, int ( splx(s); vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO); RB_INIT(&vp->v_bufs_tree); + RB_INIT(&vp->v_nc_tree); + TAILQ_INIT(&vp->v_cache_dst); numvnodes++; } else { for (vp = TAILQ_FIRST(listhd); vp != NULLVP; Index: sys/namei.h =================================================================== RCS file: /cvs/src/sys/sys/namei.h,v retrieving revision 1.22 diff -u -p sys/namei.h --- sys/namei.h 29 Aug 2008 08:57:28 -0000 1.22 +++ sys/namei.h 11 Jun 2009 21:32:21 -0000 @@ -36,7 +36,12 @@ #define _SYS_NAMEI_H_ #include <sys/queue.h> +#include <sys/tree.h> +struct namecache; +struct namecache_rb_cache; +RB_PROTOTYPE(namecache_rb_cache, namecache, n_rbcache, namecache_compare); + /* * Encapsulation of namei 
parameters. */ @@ -156,9 +161,10 @@ struct nameidata { #define NCHNAMLEN 31 /* maximum name segment length we bother with */ struct namecache { - LIST_ENTRY(namecache) nc_hash; /* hash chain */ - LIST_ENTRY(namecache) nc_vhash; /* (reverse) directory hash chain */ - TAILQ_ENTRY(namecache) nc_lru; /* LRU chain */ + TAILQ_ENTRY(namecache) nc_lru; /* Regular Entry LRU chain */ + TAILQ_ENTRY(namecache) nc_neg; /* Negative Entry LRU chain */ + RB_ENTRY(namecache) n_rbcache; /* Namecache rb tree from vnode */ + TAILQ_ENTRY(namecache) nc_me; /* ncp's referring to me */ struct vnode *nc_dvp; /* vnode of parent of name */ u_long nc_dvpid; /* capability number of nc_dvp */ struct vnode *nc_vp; /* vnode the name refers to */ Index: sys/vnode.h =================================================================== RCS file: /cvs/src/sys/sys/vnode.h,v retrieving revision 1.99 diff -u -p sys/vnode.h --- sys/vnode.h 3 Jun 2009 14:45:55 -0000 1.99 +++ sys/vnode.h 11 Jun 2009 21:32:21 -0000 @@ -36,6 +36,7 @@ #include <sys/types.h> #include <sys/queue.h> #include <sys/lock.h> +#include <sys/namei.h> #include <sys/selinfo.h> #include <sys/tree.h> @@ -82,6 +83,7 @@ enum vtagtype { LIST_HEAD(buflists, buf); RB_HEAD(buf_rb_bufs, buf); +RB_HEAD(namecache_rb_cache, namecache); struct vnode { struct uvm_vnode v_uvm; /* uvm data */ @@ -110,6 +112,10 @@ struct vnode { struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ } v_un; + /* VFS namecache */ + struct namecache_rb_cache v_nc_tree; + TAILQ_HEAD(, namecache) v_cache_dst; /* cache entries to us */ + enum vtagtype v_tag; /* type of underlying data */ void *v_data; /* private data for fs */ struct selinfo v_selectinfo; /* identity of poller(s) */ @@ -248,8 +254,9 @@ extern int desiredvnodes; /* XXX number of vnodes des extern int maxvnodes; /* XXX number of vnodes to allocate */ extern time_t syncdelay; /* time to delay syncing vnodes */ extern int rushjob; /* # of slots syncer should run ASAP */ +extern void vhold(struct vnode *); +extern void 
vdrop(struct vnode *); #endif /* _KERNEL */ - /* * Mods for exensibility.