Hi, This diff implements a more flexible approach for the number of pools malloc uses in the multi-threaded case. At the momemt I do not intend to commit this as-is, I first need this to get some feedback on what the proper default should be.
Currently the number of pools is fixed at 4. More pools mean less contention for allocations, but free becomes more expensive since a thread might need to check other pools increasing contention. I'd like to know how this diff behaves using your favorite mutli-threaded application. Often this will be a web-browser I guess. Test instructions: 0. Make sure you are running current. 1. Do a baseline test of your application. 2. Apply diff, build and install userland. 3. Run your test application with MALLOC_OPTIONS=value, where value is: "", +, -, ++, -- and +++. e.g. MALLOC_OPTIONS=++ chrome Note performance. Do multiple tests to get better statistics. If you're not able to do full tests, at least general observations are welcome. Tell a bit about the system you tested on (e.g. number of cores). Note that due to randomization, different runs might show different performance numbers since the pools shared by subsets of threads can turn out differently. Thanks, -Otto Index: include/thread_private.h =================================================================== RCS file: /cvs/src/lib/libc/include/thread_private.h,v retrieving revision 1.33 diff -u -p -r1.33 thread_private.h --- include/thread_private.h 5 Dec 2017 13:45:31 -0000 1.33 +++ include/thread_private.h 19 Dec 2018 06:52:07 -0000 @@ -7,7 +7,7 @@ #include <stdio.h> /* for FILE and __isthreaded */ -#define _MALLOC_MUTEXES 4 +#define _MALLOC_MUTEXES 32 void _malloc_init(int); #ifdef __LIBC__ PROTO_NORMAL(_malloc_init); Index: stdlib/malloc.c =================================================================== RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v retrieving revision 1.257 diff -u -p -r1.257 malloc.c --- stdlib/malloc.c 10 Dec 2018 07:57:49 -0000 1.257 +++ stdlib/malloc.c 19 Dec 2018 06:52:07 -0000 @@ -143,6 +143,8 @@ struct dir_info { size_t cheap_reallocs; size_t malloc_used; /* bytes allocated */ size_t malloc_guarded; /* bytes used for guards */ + size_t pool_searches; /* searches for pool */ + size_t other_pool; /* searches in other pool */ #define STATS_ADD(x,y) ((x) += (y)) #define STATS_SUB(x,y) ((x) -= (y)) #define STATS_INC(x) ((x)++) @@ -179,7 +181,9 @@ struct chunk_info { }; struct malloc_readonly { - struct dir_info *malloc_pool[_MALLOC_MUTEXES]; /* Main bookkeeping information */ + /* Main bookkeeping information */ + struct dir_info *malloc_pool[_MALLOC_MUTEXES]; + u_int malloc_mutexes; /* how much in actual use? */ int malloc_mt; /* multi-threaded mode? */ int malloc_freecheck; /* Extensive double free check */ int malloc_freeunmap; /* mprotect free pages PROT_NONE? */ @@ -267,7 +271,7 @@ getpool(void) return mopts.malloc_pool[0]; else return mopts.malloc_pool[TIB_GET()->tib_tid & - (_MALLOC_MUTEXES - 1)]; + (mopts.malloc_mutexes - 1)]; } static __dead void @@ -316,6 +320,16 @@ static void omalloc_parseopt(char opt) { switch (opt) { + case '+': + mopts.malloc_mutexes <<= 1; + if (mopts.malloc_mutexes > _MALLOC_MUTEXES) + mopts.malloc_mutexes = _MALLOC_MUTEXES; + break; + case '-': + mopts.malloc_mutexes >>= 1; + if (mopts.malloc_mutexes < 1) + mopts.malloc_mutexes = 1; + break; case '>': mopts.malloc_cache <<= 1; if (mopts.malloc_cache > MALLOC_MAXCACHE) @@ -395,6 +409,7 @@ omalloc_init(void) /* * Default options */ + mopts.malloc_mutexes = 4; mopts.malloc_junk = 1; mopts.malloc_cache = MALLOC_DEFAULT_CACHE; @@ -485,7 +500,7 @@ omalloc_poolinit(struct dir_info **dp) for (j = 0; j < MALLOC_CHUNK_LISTS; j++) LIST_INIT(&d->chunk_dir[i][j]); } - STATS_ADD(d->malloc_used, regioninfo_size); + STATS_ADD(d->malloc_used, regioninfo_size + 3 * MALLOC_PAGESIZE); d->canary1 = mopts.malloc_canary ^ (u_int32_t)(uintptr_t)d; d->canary2 = ~d->canary1; @@ -1196,7 +1211,7 @@ _malloc_init(int from_rthreads) if (!mopts.malloc_canary) omalloc_init(); - max = from_rthreads ? _MALLOC_MUTEXES : 1; + max = from_rthreads ? mopts.malloc_mutexes : 1; if (((uintptr_t)&malloc_readonly & MALLOC_PAGEMASK) == 0) mprotect(&malloc_readonly, sizeof(malloc_readonly), PROT_READ | PROT_WRITE); @@ -1281,16 +1296,19 @@ findpool(void *p, struct dir_info *argpo struct dir_info *pool = argpool; struct region_info *r = find(pool, p); + STATS_INC(pool->pool_searches); if (r == NULL) { if (mopts.malloc_mt) { int i; - for (i = 0; i < _MALLOC_MUTEXES; i++) { - if (i == argpool->mutex) - continue; + STATS_INC(pool->other_pool); + for (i = 1; i < mopts.malloc_mutexes; i++) { + int j = (argpool->mutex + i) & + (mopts.malloc_mutexes - 1); + pool->active--; _MALLOC_UNLOCK(pool->mutex); - pool = mopts.malloc_pool[i]; + pool = mopts.malloc_pool[j]; _MALLOC_LOCK(pool->mutex); pool->active++; r = find(pool, p); @@ -2220,14 +2238,13 @@ malloc_dump1(int fd, int poolno, struct return; dprintf(fd, "Region slots free %zu/%zu\n", d->regions_free, d->regions_total); - dprintf(fd, "Finds %zu/%zu\n", d->finds, - d->find_collisions); - dprintf(fd, "Inserts %zu/%zu\n", d->inserts, - d->insert_collisions); - dprintf(fd, "Deletes %zu/%zu\n", d->deletes, - d->delete_moves); + dprintf(fd, "Finds %zu/%zu\n", d->finds, d->find_collisions); + dprintf(fd, "Inserts %zu/%zu\n", d->inserts, d->insert_collisions); + dprintf(fd, "Deletes %zu/%zu\n", d->deletes, d->delete_moves); dprintf(fd, "Cheap reallocs %zu/%zu\n", d->cheap_reallocs, d->cheap_realloc_tries); + dprintf(fd, "Other pool searches %zu/%zu\n", + d->other_pool, d->pool_searches); dprintf(fd, "In use %zu\n", d->malloc_used); dprintf(fd, "Guarded %zu\n", d->malloc_guarded); dump_free_chunk_info(fd, d); @@ -2289,7 +2306,7 @@ malloc_gdump(int fd) int i; int saved_errno = errno; - for (i = 0; i < _MALLOC_MUTEXES; i++) + for (i = 0; i < mopts.malloc_mutexes; i++) malloc_dump(fd, i, mopts.malloc_pool[i]); errno = saved_errno; @@ -2305,15 +2322,15 @@ malloc_exit(void) if (fd != -1) { dprintf(fd, "******** Start dump %s *******\n", __progname); dprintf(fd, - "MT=%d I=%d F=%d U=%d J=%d R=%d X=%d C=%d cache=%u G=%zu\n", - mopts.malloc_mt, mopts.internal_funcs, - mopts.malloc_freecheck, + "MT=%d M=%u I=%d F=%d U=%d J=%d R=%d X=%d C=%d cache=%u G=%zu\n", + mopts.malloc_mt, mopts.mallloc_mutexes, + mopts.internal_funcs, mopts.malloc_freecheck, mopts.malloc_freeunmap, mopts.malloc_junk, mopts.malloc_realloc, mopts.malloc_xmalloc, mopts.chunk_canaries, mopts.malloc_cache, mopts.malloc_guard); - for (i = 0; i < _MALLOC_MUTEXES; i++) + for (i = 0; i < mopts.malloc_mutexes; i++) malloc_dump(fd, i, mopts.malloc_pool[i]); dprintf(fd, "******** End dump %s *******\n", __progname); close(fd); Index: thread/rthread_libc.c =================================================================== RCS file: /cvs/src/lib/libc/thread/rthread_libc.c,v retrieving revision 1.2 diff -u -p -r1.2 rthread_libc.c --- thread/rthread_libc.c 5 Sep 2017 02:40:54 -0000 1.2 +++ thread/rthread_libc.c 19 Dec 2018 06:52:07 -0000 @@ -169,14 +169,69 @@ static struct pthread_mutex malloc_lock[ MALLOC_LOCK_INITIALIZER(0), MALLOC_LOCK_INITIALIZER(1), MALLOC_LOCK_INITIALIZER(2), - MALLOC_LOCK_INITIALIZER(3) + MALLOC_LOCK_INITIALIZER(4), + MALLOC_LOCK_INITIALIZER(5), + MALLOC_LOCK_INITIALIZER(6), + MALLOC_LOCK_INITIALIZER(7), + MALLOC_LOCK_INITIALIZER(8), + MALLOC_LOCK_INITIALIZER(9), + MALLOC_LOCK_INITIALIZER(10), + MALLOC_LOCK_INITIALIZER(11), + MALLOC_LOCK_INITIALIZER(12), + MALLOC_LOCK_INITIALIZER(13), + MALLOC_LOCK_INITIALIZER(14), + MALLOC_LOCK_INITIALIZER(15), + MALLOC_LOCK_INITIALIZER(16), + MALLOC_LOCK_INITIALIZER(17), + MALLOC_LOCK_INITIALIZER(18), + MALLOC_LOCK_INITIALIZER(19), + MALLOC_LOCK_INITIALIZER(20), + MALLOC_LOCK_INITIALIZER(21), + MALLOC_LOCK_INITIALIZER(22), + MALLOC_LOCK_INITIALIZER(23), + MALLOC_LOCK_INITIALIZER(24), + MALLOC_LOCK_INITIALIZER(25), + MALLOC_LOCK_INITIALIZER(26), + MALLOC_LOCK_INITIALIZER(27), + MALLOC_LOCK_INITIALIZER(28), + MALLOC_LOCK_INITIALIZER(29), + MALLOC_LOCK_INITIALIZER(30), + MALLOC_LOCK_INITIALIZER(31) }; static pthread_mutex_t malloc_mutex[_MALLOC_MUTEXES] = { &malloc_lock[0], &malloc_lock[1], &malloc_lock[2], - &malloc_lock[3] + &malloc_lock[3], + &malloc_lock[4], + &malloc_lock[5], + &malloc_lock[6], + &malloc_lock[7], + &malloc_lock[8], + &malloc_lock[9], + &malloc_lock[10], + &malloc_lock[11], + &malloc_lock[12], + &malloc_lock[13], + &malloc_lock[14], + &malloc_lock[15], + &malloc_lock[16], + &malloc_lock[17], + &malloc_lock[18], + &malloc_lock[19], + &malloc_lock[20], + &malloc_lock[21], + &malloc_lock[22], + &malloc_lock[23], + &malloc_lock[24], + &malloc_lock[25], + &malloc_lock[26], + &malloc_lock[27], + &malloc_lock[28], + &malloc_lock[29], + &malloc_lock[30], + &malloc_lock[31] }; void