On Tue, Nov 08, 2016 at 10:57:41AM +1000, David Gwynne wrote: > this turns the pa_pagesz member of a pool allocator into a bitfield. > > pool pages can be many different sizes, they arent restricted to > just the size provided by the hardware. to support this without > introducing a page allocator per page size, a single page allocator > can be used to allocate many different sizes. however, right now > there is no way to pass a custom page allocator to pool_init and > tell it that an allocator can do different page sizes. currently > pool_init only uses the multi page allocators when it's allowed to > choose one itself. > > so, as i said above, pool allocators can now indicate what sizes > they can provide. the low bit in pa_pagesz indicates whether the > allocator can align its allocation to the requested size. this is > necessary if you want to know if you can store the pool page headers > inside the allocation. > > the rest of the bits map to supported page sizes. pools only support > page sizes that are powers of two. with that in mind, each power > of two is represented as a single bit which we can or together to > indicate the ranges of pages an allocator can provide. > > eg, the multi page pools on sparc64 would have 0xffffe001 as > pa_pagesz. the low bit says they can align their pages, and you can > test which sizes they support by oring sizes. eg, ISSET(0xffffe001, > 8192) is true, as is ISSET(0xffffe001, 65536). > > in the future i want to use this to provide an allocator for all > the mbufs and clusters so we can configure how much memory we want > packets to consume rather than how many packets we want to provide. > on my box here kern.maxclusters is 16384, which means we can have > 16384 clusters allocated from any of the backend pools. 16384 64k > clusters is a gigabyte of ram, which is probably not what we want. 
> instead we should say we want all packets to be allocated from a > few meg of ram and let any of the clusters come out of that > pool. this diff is a step toward that. > > another benefit of that would be to then enable the per cpu caches for > mbufs and clusters, which will be necessary to scale performance > when the stack is unlocked further.
i hacked the mbuf change up and this is what it looks like. the mbuf layer creates its own page allocator has all the mbuf pools use it. the page allocator limits the amount of memory all mbuf pools together can use rather than have limits on each pool. Index: dev/pci/if_myx.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_myx.c,v retrieving revision 1.99 diff -u -p -r1.99 if_myx.c --- dev/pci/if_myx.c 31 Oct 2016 01:38:57 -0000 1.99 +++ dev/pci/if_myx.c 9 Nov 2016 04:31:47 -0000 @@ -294,8 +294,6 @@ myx_attach(struct device *parent, struct /* this is sort of racy */ if (myx_mcl_pool == NULL) { - extern struct kmem_pa_mode kp_dma_contig; - myx_mcl_pool = malloc(sizeof(*myx_mcl_pool), M_DEVBUF, M_WAITOK); if (myx_mcl_pool == NULL) { @@ -303,9 +301,9 @@ myx_attach(struct device *parent, struct DEVNAME(sc)); goto unmap; } - pool_init(myx_mcl_pool, MYX_RXBIG_SIZE, MYX_BOUNDARY, IPL_NET, - 0, "myxmcl", NULL); - pool_set_constraints(myx_mcl_pool, &kp_dma_contig); + + m_pool_init(myx_mcl_pool, MYX_RXBIG_SIZE, MYX_BOUNDARY, + "myxmcl"); } if (myx_pcie_dc(sc, pa) != 0) Index: kern/uipc_mbuf.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v retrieving revision 1.237 diff -u -p -r1.237 uipc_mbuf.c --- kern/uipc_mbuf.c 27 Oct 2016 03:29:55 -0000 1.237 +++ kern/uipc_mbuf.c 9 Nov 2016 04:31:47 -0000 @@ -133,6 +133,19 @@ void m_extfree(struct mbuf *); void nmbclust_update(void); void m_zero(struct mbuf *); +struct mutex m_pool_mtx = MUTEX_INITIALIZER(IPL_NET); +unsigned int mbuf_mem_limit; /* how much memory can we allocated */ +unsigned int mbuf_mem_alloc; /* how much memory has been allocated */ + +void *m_pool_alloc(struct pool *, int, int *); +void m_pool_free(struct pool *, void *); + +struct pool_allocator m_pool_allocator = { + m_pool_alloc, + m_pool_free, + 0 /* will be copied from pool_allocator_multi */ +}; + static void (*mextfree_fns[4])(caddr_t, u_int, 
void *); static u_int num_extfree_fns; @@ -148,6 +161,11 @@ mbinit(void) int i; unsigned int lowbits; + m_pool_allocator.pa_pagesz = pool_allocator_multi.pa_pagesz; + + nmbclust_update(); + mbuf_mem_alloc = 0; + #if DIAGNOSTIC if (mclsizes[0] != MCLBYTES) panic("mbinit: the smallest cluster size != MCLBYTES"); @@ -155,9 +173,7 @@ mbinit(void) panic("mbinit: the largest cluster size != MAXMCLBYTES"); #endif - pool_init(&mbpool, MSIZE, 0, IPL_NET, 0, "mbufpl", NULL); - pool_set_constraints(&mbpool, &kp_dma_contig); - pool_setlowat(&mbpool, mblowat); + m_pool_init(&mbpool, MSIZE, 64, "mbufpl"); pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0, IPL_NET, 0, "mtagpl", NULL); @@ -171,47 +187,32 @@ mbinit(void) snprintf(mclnames[i], sizeof(mclnames[0]), "mcl%dk", mclsizes[i] >> 10); } - pool_init(&mclpools[i], mclsizes[i], 64, IPL_NET, 0, - mclnames[i], NULL); - pool_set_constraints(&mclpools[i], &kp_dma_contig); - pool_setlowat(&mclpools[i], mcllowat); + + m_pool_init(&mclpools[i], mclsizes[i], 64, mclnames[i]); } (void)mextfree_register(m_extfree_pool); KASSERT(num_extfree_fns == 1); - - nmbclust_update(); } void mbcpuinit() { + int i; + mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT, M_DEVBUF); + + pool_cache_init(&mbpool); + pool_cache_init(&mtagpool); + + for (i = 0; i < nitems(mclsizes); i++) + pool_cache_init(&mclpools[i]); } void nmbclust_update(void) { - unsigned int i, n; - - /* - * Set the hard limit on the mclpools to the number of - * mbuf clusters the kernel is to support. Log the limit - * reached message max once a minute. - */ - for (i = 0; i < nitems(mclsizes); i++) { - n = (unsigned long long)nmbclust * MCLBYTES / mclsizes[i]; - (void)pool_sethardlimit(&mclpools[i], n, mclpool_warnmsg, 60); - /* - * XXX this needs to be reconsidered. 
- * Setting the high water mark to nmbclust is too high - * but we need to have enough spare buffers around so that - * allocations in interrupt context don't fail or mclgeti() - * drivers may end up with empty rings. - */ - pool_sethiwat(&mclpools[i], n); - } - pool_sethiwat(&mbpool, nmbclust); + mbuf_mem_limit = nmbclust * MCLBYTES; } /* @@ -1377,6 +1378,49 @@ m_dup_pkt(struct mbuf *m0, unsigned int fail: m_freem(m); return (NULL); +} + +void * +m_pool_alloc(struct pool *pp, int flags, int *slowdown) +{ + void *v; + int avail = 1; + + if (pp->pr_pgsize + mbuf_mem_alloc > mbuf_mem_limit) + return (NULL); + + mtx_enter(&m_pool_mtx); + if (pp->pr_pgsize + mbuf_mem_alloc > mbuf_mem_limit) + avail = 0; + mbuf_mem_alloc += pp->pr_pgsize; + mtx_leave(&m_pool_mtx); + + v = (*pool_allocator_multi.pa_alloc)(pp, flags, slowdown); + + if (v == NULL) { + mtx_enter(&m_pool_mtx); + mbuf_mem_alloc -= pp->pr_pgsize; + mtx_leave(&m_pool_mtx); + } + + return (v); +} + +void +m_pool_free(struct pool *pp, void *v) +{ + mtx_enter(&m_pool_mtx); + mbuf_mem_alloc -= pp->pr_pgsize; + mtx_leave(&m_pool_mtx); + + (*pool_allocator_multi.pa_free)(pp, v); +} + +void +m_pool_init(struct pool *pp, u_int size, u_int align, const char *wmesg) +{ + pool_init(pp, size, align, IPL_NET, 0, wmesg, &m_pool_allocator); + pool_set_constraints(pp, &kp_dma_contig); } #ifdef DDB Index: sys/mbuf.h =================================================================== RCS file: /cvs/src/sys/sys/mbuf.h,v retrieving revision 1.222 diff -u -p -r1.222 mbuf.h --- sys/mbuf.h 24 Oct 2016 04:38:44 -0000 1.222 +++ sys/mbuf.h 9 Nov 2016 04:31:47 -0000 @@ -416,6 +416,7 @@ struct mbuf_queue { }; #ifdef _KERNEL +struct pool; extern int nmbclust; /* limit on the # of clusters */ extern int mblowat; /* mbuf low water mark */ @@ -444,6 +445,7 @@ int m_leadingspace(struct mbuf *); int m_trailingspace(struct mbuf *); struct mbuf *m_clget(struct mbuf *, int, u_int); void m_extref(struct mbuf *, struct mbuf *); +void 
m_pool_init(struct pool *, u_int, u_int, const char *); void m_extfree_pool(caddr_t, u_int, void *); void m_adj(struct mbuf *, int); int m_copyback(struct mbuf *, int, int, const void *, int); Index: sys/pool.h =================================================================== RCS file: /cvs/src/sys/sys/pool.h,v retrieving revision 1.67 diff -u -p -r1.67 pool.h --- sys/pool.h 7 Nov 2016 23:45:27 -0000 1.67 +++ sys/pool.h 9 Nov 2016 04:31:47 -0000 @@ -179,6 +205,7 @@ struct pool { #ifdef _KERNEL extern struct pool_allocator pool_allocator_single; +extern struct pool_allocator pool_allocator_multi; struct pool_request { TAILQ_ENTRY(pool_request) pr_entry;