On Wed, Dec 19, 2018 at 10:52:03AM +0100, Otto Moerbeek wrote: > Hi, > > This diff implements a more flexible approach for the number of pools > malloc uses in the multi-threaded case. At the momemt I do not intend > to commit this as-is, I first need this to get some feedback on what > the proper default should be. > > Currently the number of pools is fixed at 4. More pools mean less > contention for allocations, but free becomes more expensive since a > thread might need to check other pools increasing contention. > > I'd like to know how this diff behaves using your favorite > mutli-threaded application. Often this will be a web-browser I guess. > > Test instructions: > > 0. Make sure you are running current. > > 1. Do a baseline test of your application. > > 2. Apply diff, build and install userland. > > 3. Run your test application with MALLOC_OPTIONS=value, where value is: > "", +, -, ++, -- and +++. > > e.g. > > MALLOC_OPTIONS=++ chrome > > Note performance. Do multiple tests to get better statistics. > > If you're not able to do full tests, at least general observations are > welcome. Tell a bit about the system you tested on (e.g. number of > cores). Note that due to randomization, different runs might show > different performance numbers since the pools shared by subsets of > threads can turn out differently. > > Thanks, > > -Otto
New diff with problem noted by Janne Johansson fixed. Index: include/thread_private.h =================================================================== RCS file: /cvs/src/lib/libc/include/thread_private.h,v retrieving revision 1.33 diff -u -p -r1.33 thread_private.h --- include/thread_private.h 5 Dec 2017 13:45:31 -0000 1.33 +++ include/thread_private.h 19 Dec 2018 10:18:38 -0000 @@ -7,7 +7,7 @@ #include <stdio.h> /* for FILE and __isthreaded */ -#define _MALLOC_MUTEXES 4 +#define _MALLOC_MUTEXES 32 void _malloc_init(int); #ifdef __LIBC__ PROTO_NORMAL(_malloc_init); Index: stdlib/malloc.c =================================================================== RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v retrieving revision 1.257 diff -u -p -r1.257 malloc.c --- stdlib/malloc.c 10 Dec 2018 07:57:49 -0000 1.257 +++ stdlib/malloc.c 19 Dec 2018 10:18:38 -0000 @@ -143,6 +143,8 @@ struct dir_info { size_t cheap_reallocs; size_t malloc_used; /* bytes allocated */ size_t malloc_guarded; /* bytes used for guards */ + size_t pool_searches; /* searches for pool */ + size_t other_pool; /* searches in other pool */ #define STATS_ADD(x,y) ((x) += (y)) #define STATS_SUB(x,y) ((x) -= (y)) #define STATS_INC(x) ((x)++) @@ -179,7 +181,9 @@ struct chunk_info { }; struct malloc_readonly { - struct dir_info *malloc_pool[_MALLOC_MUTEXES]; /* Main bookkeeping information */ + /* Main bookkeeping information */ + struct dir_info *malloc_pool[_MALLOC_MUTEXES]; + u_int malloc_mutexes; /* how much in actual use? */ int malloc_mt; /* multi-threaded mode? */ int malloc_freecheck; /* Extensive double free check */ int malloc_freeunmap; /* mprotect free pages PROT_NONE? */ @@ -267,7 +271,7 @@ getpool(void) return mopts.malloc_pool[0]; else return mopts.malloc_pool[TIB_GET()->tib_tid & - (_MALLOC_MUTEXES - 1)]; + (mopts.malloc_mutexes - 1)]; } static __dead void @@ -316,6 +320,16 @@ static void omalloc_parseopt(char opt) { switch (opt) { + case '+': + mopts.malloc_mutexes <<= 1; + if (mopts.malloc_mutexes > _MALLOC_MUTEXES) + mopts.malloc_mutexes = _MALLOC_MUTEXES; + break; + case '-': + mopts.malloc_mutexes >>= 1; + if (mopts.malloc_mutexes < 1) + mopts.malloc_mutexes = 1; + break; case '>': mopts.malloc_cache <<= 1; if (mopts.malloc_cache > MALLOC_MAXCACHE) @@ -395,6 +409,7 @@ omalloc_init(void) /* * Default options */ + mopts.malloc_mutexes = 4; mopts.malloc_junk = 1; mopts.malloc_cache = MALLOC_DEFAULT_CACHE; @@ -485,7 +500,7 @@ omalloc_poolinit(struct dir_info **dp) for (j = 0; j < MALLOC_CHUNK_LISTS; j++) LIST_INIT(&d->chunk_dir[i][j]); } - STATS_ADD(d->malloc_used, regioninfo_size); + STATS_ADD(d->malloc_used, regioninfo_size + 3 * MALLOC_PAGESIZE); d->canary1 = mopts.malloc_canary ^ (u_int32_t)(uintptr_t)d; d->canary2 = ~d->canary1; @@ -1196,7 +1211,7 @@ _malloc_init(int from_rthreads) if (!mopts.malloc_canary) omalloc_init(); - max = from_rthreads ? _MALLOC_MUTEXES : 1; + max = from_rthreads ? mopts.malloc_mutexes : 1; if (((uintptr_t)&malloc_readonly & MALLOC_PAGEMASK) == 0) mprotect(&malloc_readonly, sizeof(malloc_readonly), PROT_READ | PROT_WRITE); @@ -1281,16 +1296,19 @@ findpool(void *p, struct dir_info *argpo struct dir_info *pool = argpool; struct region_info *r = find(pool, p); + STATS_INC(pool->pool_searches); if (r == NULL) { if (mopts.malloc_mt) { int i; - for (i = 0; i < _MALLOC_MUTEXES; i++) { - if (i == argpool->mutex) - continue; + STATS_INC(pool->other_pool); + for (i = 1; i < mopts.malloc_mutexes; i++) { + int j = (argpool->mutex + i) & + (mopts.malloc_mutexes - 1); + pool->active--; _MALLOC_UNLOCK(pool->mutex); - pool = mopts.malloc_pool[i]; + pool = mopts.malloc_pool[j]; _MALLOC_LOCK(pool->mutex); pool->active++; r = find(pool, p); @@ -2220,14 +2238,13 @@ malloc_dump1(int fd, int poolno, struct return; dprintf(fd, "Region slots free %zu/%zu\n", d->regions_free, d->regions_total); - dprintf(fd, "Finds %zu/%zu\n", d->finds, - d->find_collisions); - dprintf(fd, "Inserts %zu/%zu\n", d->inserts, - d->insert_collisions); - dprintf(fd, "Deletes %zu/%zu\n", d->deletes, - d->delete_moves); + dprintf(fd, "Finds %zu/%zu\n", d->finds, d->find_collisions); + dprintf(fd, "Inserts %zu/%zu\n", d->inserts, d->insert_collisions); + dprintf(fd, "Deletes %zu/%zu\n", d->deletes, d->delete_moves); dprintf(fd, "Cheap reallocs %zu/%zu\n", d->cheap_reallocs, d->cheap_realloc_tries); + dprintf(fd, "Other pool searches %zu/%zu\n", + d->other_pool, d->pool_searches); dprintf(fd, "In use %zu\n", d->malloc_used); dprintf(fd, "Guarded %zu\n", d->malloc_guarded); dump_free_chunk_info(fd, d); @@ -2289,7 +2306,7 @@ malloc_gdump(int fd) int i; int saved_errno = errno; - for (i = 0; i < _MALLOC_MUTEXES; i++) + for (i = 0; i < mopts.malloc_mutexes; i++) malloc_dump(fd, i, mopts.malloc_pool[i]); errno = saved_errno; @@ -2305,15 +2322,15 @@ malloc_exit(void) if (fd != -1) { dprintf(fd, "******** Start dump %s *******\n", __progname); dprintf(fd, - "MT=%d I=%d F=%d U=%d J=%d R=%d X=%d C=%d cache=%u G=%zu\n", - mopts.malloc_mt, mopts.internal_funcs, - mopts.malloc_freecheck, + "MT=%d M=%u I=%d F=%d U=%d J=%d R=%d X=%d C=%d cache=%u G=%zu\n", + mopts.malloc_mt, mopts.mallloc_mutexes, + mopts.internal_funcs, mopts.malloc_freecheck, mopts.malloc_freeunmap, mopts.malloc_junk, mopts.malloc_realloc, mopts.malloc_xmalloc, mopts.chunk_canaries, mopts.malloc_cache, mopts.malloc_guard); - for (i = 0; i < _MALLOC_MUTEXES; i++) + for (i = 0; i < mopts.malloc_mutexes; i++) malloc_dump(fd, i, mopts.malloc_pool[i]); dprintf(fd, "******** End dump %s *******\n", __progname); close(fd); Index: thread/rthread_libc.c =================================================================== RCS file: /cvs/src/lib/libc/thread/rthread_libc.c,v retrieving revision 1.2 diff -u -p -r1.2 rthread_libc.c --- thread/rthread_libc.c 5 Sep 2017 02:40:54 -0000 1.2 +++ thread/rthread_libc.c 19 Dec 2018 10:18:38 -0000 @@ -169,14 +169,70 @@ static struct pthread_mutex malloc_lock[ MALLOC_LOCK_INITIALIZER(0), MALLOC_LOCK_INITIALIZER(1), MALLOC_LOCK_INITIALIZER(2), - MALLOC_LOCK_INITIALIZER(3) + MALLOC_LOCK_INITIALIZER(3), + MALLOC_LOCK_INITIALIZER(4), + MALLOC_LOCK_INITIALIZER(5), + MALLOC_LOCK_INITIALIZER(6), + MALLOC_LOCK_INITIALIZER(7), + MALLOC_LOCK_INITIALIZER(8), + MALLOC_LOCK_INITIALIZER(9), + MALLOC_LOCK_INITIALIZER(10), + MALLOC_LOCK_INITIALIZER(11), + MALLOC_LOCK_INITIALIZER(12), + MALLOC_LOCK_INITIALIZER(13), + MALLOC_LOCK_INITIALIZER(14), + MALLOC_LOCK_INITIALIZER(15), + MALLOC_LOCK_INITIALIZER(16), + MALLOC_LOCK_INITIALIZER(17), + MALLOC_LOCK_INITIALIZER(18), + MALLOC_LOCK_INITIALIZER(19), + MALLOC_LOCK_INITIALIZER(20), + MALLOC_LOCK_INITIALIZER(21), + MALLOC_LOCK_INITIALIZER(22), + MALLOC_LOCK_INITIALIZER(23), + MALLOC_LOCK_INITIALIZER(24), + MALLOC_LOCK_INITIALIZER(25), + MALLOC_LOCK_INITIALIZER(26), + MALLOC_LOCK_INITIALIZER(27), + MALLOC_LOCK_INITIALIZER(28), + MALLOC_LOCK_INITIALIZER(29), + MALLOC_LOCK_INITIALIZER(30), + MALLOC_LOCK_INITIALIZER(31) }; static pthread_mutex_t malloc_mutex[_MALLOC_MUTEXES] = { &malloc_lock[0], &malloc_lock[1], &malloc_lock[2], - &malloc_lock[3] + &malloc_lock[3], + &malloc_lock[4], + &malloc_lock[5], + &malloc_lock[6], + &malloc_lock[7], + &malloc_lock[8], + &malloc_lock[9], + &malloc_lock[10], + &malloc_lock[11], + &malloc_lock[12], + &malloc_lock[13], + &malloc_lock[14], + &malloc_lock[15], + &malloc_lock[16], + &malloc_lock[17], + &malloc_lock[18], + &malloc_lock[19], + &malloc_lock[20], + &malloc_lock[21], + &malloc_lock[22], + &malloc_lock[23], + &malloc_lock[24], + &malloc_lock[25], + &malloc_lock[26], + &malloc_lock[27], + &malloc_lock[28], + &malloc_lock[29], + &malloc_lock[30], + &malloc_lock[31] }; void