On Thu, Nov 7, 2019 at 18:20, Mark Wielaard <m...@klomp.org> wrote:
Hi Jonathan,
On Tue, 2019-10-29 at 22:14 +0100, Mark Wielaard wrote:
Pthread's thread-local variables are highly limited, which makes
it difficult to use many Dwarfs. This replaces that with a
less efficient (or elegant) but more robust method.
Thanks, it looks good (similar to a variant I already reviewed
earlier). Some small comments below.
I haven't benchmarked this version. Did you do any?
diff --git a/lib/atomics.h b/lib/atomics.h
index ffd12f87..e3773759 100644
--- a/lib/atomics.h
+++ b/lib/atomics.h
@@ -31,7 +31,9 @@
#if HAVE_STDATOMIC_H
/* If possible, use the compiler's preferred atomics. */
# include <stdatomic.h>
+# include <threads.h>
#else
/* Otherwise, try to use the builtins provided by this compiler. */
# include "stdatomic-fbsd.h"
+# define thread_local __thread
#endif /* HAVE_STDATOMIC_H */
Do we really need this?
We already use __thread unconditionally in the rest of the code.
The usage of threads.h seems to imply we actually want C11
_Thread_local. Is that what you really want, or can we just use
__thread in libdw_alloc.c for thread_id?
We don't really need it; I just got into the habit of writing
thread_local (and it keeps proper C11 compatibility). __thread is
perfectly fine for thread_id.
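Just to spell out the equivalence for the archives, here is a minimal
standalone sketch (not patch code; the feature-test check is only
illustrative):

  /* C11 <threads.h> defines thread_local as _Thread_local; the fallback
     maps it to GCC/Clang's __thread.  Either spelling works for a
     static TLS counter like thread_id.  */
  #include <stdio.h>

  #if defined __STDC_VERSION__ && __STDC_VERSION__ >= 201112L \
      && !defined __STDC_NO_THREADS__
  # include <threads.h>           /* provides thread_local */
  #else
  # define thread_local __thread  /* GCC/Clang extension */
  #endif

  static thread_local size_t counter = 0;

  int
  main (void)
  {
    printf ("%zu\n", ++counter);
    return 0;
  }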
diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
index 8d137414..eadff13b 100644
--- a/libdw/dwarf_begin_elf.c
+++ b/libdw/dwarf_begin_elf.c
@@ -418,13 +418,14 @@ dwarf_begin_elf (Elf *elf, Dwarf_Cmd cmd, Elf_Scn *scngrp)
actual allocation. */
result->mem_default_size = mem_default_size;
result->oom_handler = __libdw_oom;
- if (pthread_key_create (&result->mem_key, NULL) != 0)
+ if (pthread_rwlock_init(&result->mem_rwl, NULL) != 0)
{
free (result);
- __libdw_seterrno (DWARF_E_NOMEM); /* no memory or max pthread keys. */
+ __libdw_seterrno (DWARF_E_NOMEM); /* no memory. */
return NULL;
}
- atomic_init (&result->mem_tail, (uintptr_t)NULL);
+ result->mem_stacks = 0;
+ result->mem_tails = NULL;
if (cmd == DWARF_C_READ || cmd == DWARF_C_RDWR)
{
OK.
diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
index a2e94436..3fd2836d 100644
--- a/libdw/dwarf_end.c
+++ b/libdw/dwarf_end.c
@@ -95,17 +95,19 @@ dwarf_end (Dwarf *dwarf)
tdestroy (dwarf->split_tree, noop_free);
/* Free the internally allocated memory. */
- struct libdw_memblock *memp;
- memp = (struct libdw_memblock *) (atomic_load_explicit
- (&dwarf->mem_tail,
- memory_order_relaxed));
- while (memp != NULL)
- {
- struct libdw_memblock *prevp = memp->prev;
- free (memp);
- memp = prevp;
- }
- pthread_key_delete (dwarf->mem_key);
+ for (size_t i = 0; i < dwarf->mem_stacks; i++)
+ {
+ struct libdw_memblock *memp = dwarf->mem_tails[i];
+ while (memp != NULL)
+ {
+ struct libdw_memblock *prevp = memp->prev;
+ free (memp);
+ memp = prevp;
+ }
+ }
+ if (dwarf->mem_tails != NULL)
+ free (dwarf->mem_tails);
+ pthread_rwlock_destroy (&dwarf->mem_rwl);
/* Free the pubnames helper structure. */
free (dwarf->pubnames_sets);
OK. Might it be an idea to have some call to reset next_id (see
below)?
Generally not a good idea (see below).
diff --git a/libdw/libdwP.h b/libdw/libdwP.h
index ad2599eb..3e1ef59b 100644
--- a/libdw/libdwP.h
+++ b/libdw/libdwP.h
@@ -149,17 +149,6 @@ enum
#include "dwarf_sig8_hash.h"
-/* Structure for internal memory handling. This is basically a simplified
- reimplementation of obstacks. Unfortunately the standard obstack
- implementation is not usable in libraries. */
-struct libdw_memblock
-{
- size_t size;
- size_t remaining;
- struct libdw_memblock *prev;
- char mem[0];
-};
-
/* This is the structure representing the debugging state. */
struct Dwarf
{
@@ -231,11 +220,22 @@ struct Dwarf
/* Similar for addrx/constx, which will come from .debug_addr section. */
struct Dwarf_CU *fake_addr_cu;
- /* Internal memory handling. Each thread allocates separately and only
- allocates from its own blocks, while all the blocks are pushed atomically
- onto a unified stack for easy deallocation. */
- pthread_key_t mem_key;
- atomic_uintptr_t mem_tail;
+ /* Supporting lock for internal memory handling. Ensures threads that have
+ an entry in the mem_tails array are not disturbed by new threads doing
+ allocations for this Dwarf. */
+ pthread_rwlock_t mem_rwl;
+
+ /* Internal memory handling. This is basically a simplified thread-local
+ reimplementation of obstacks. Unfortunately the standard obstack
+ implementation is not usable in libraries. */
+ size_t mem_stacks;
+ struct libdw_memblock
+ {
+ size_t size;
+ size_t remaining;
+ struct libdw_memblock *prev;
+ char mem[0];
+ } **mem_tails;
/* Default size of allocated memory blocks. */
size_t mem_default_size;
OK.
@@ -578,34 +578,31 @@ libdw_valid_user_form (int form)
extern void __libdw_seterrno (int value) internal_function;
-/* Memory handling, the easy parts. This macro does not do nor need to do any
- locking for proper concurrent operation. */
+/* Memory handling, the easy parts. */
#define libdw_alloc(dbg, type, tsize, cnt) \
- ({ struct libdw_memblock *_tail = pthread_getspecific (dbg->mem_key); \
- size_t _req = (tsize) * (cnt); \
- type *_result; \
- if (unlikely (_tail == NULL)) \
- _result = (type *) __libdw_allocate (dbg, _req, __alignof (type)); \
+ ({ struct libdw_memblock *_tail = __libdw_alloc_tail(dbg); \
+ size_t _required = (tsize) * (cnt); \
+ type *_result = (type *) (_tail->mem + (_tail->size - _tail->remaining));\
+ size_t _padding = ((__alignof (type) \
+ - ((uintptr_t) _result & (__alignof (type) - 1))) \
+ & (__alignof (type) - 1)); \
+ if (unlikely (_tail->remaining < _required + _padding)) \
+ _result = (type *) __libdw_allocate (dbg, _required, __alignof (type));\
else \
{ \
- _result = (type *) (_tail->mem + (_tail->size - _tail->remaining)); \
- size_t _padding = ((__alignof (type) \
- - ((uintptr_t) _result & (__alignof (type) - 1))) \
- & (__alignof (type) - 1)); \
- if (unlikely (_tail->remaining < _req + _padding)) \
- _result = (type *) __libdw_allocate (dbg, _req, __alignof (type)); \
- else \
- { \
- _req += _padding; \
- _result = (type *) ((char *) _result + _padding); \
- _tail->remaining -= _req; \
- } \
+ _required += _padding; \
+ _result = (type *) ((char *) _result + _padding); \
+ _tail->remaining -= _required; \
} \
_result; })
OK. Maybe add a comment that __libdw_alloc_tail makes sure that we get
a thread local libdw_memblock to work with.
#define libdw_typed_alloc(dbg, type) \
libdw_alloc (dbg, type, sizeof (type), 1)
+/* Callback to choose a thread-local memory allocation stack. */
+extern struct libdw_memblock *__libdw_alloc_tail (Dwarf* dbg)
+ __nonnull_attribute__ (1);
+
/* Callback to allocate more. */
extern void *__libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
__attribute__ ((__malloc__)) __nonnull_attribute__ (1);
O. There is the comment already :)
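For anyone reading along, a typical call site looks roughly like the
sketch below (illustrative only: dbg is assumed to be an open Dwarf,
and the struct and cnt are made up):

  /* One object, then an array of cnt objects, both carved out of dbg's
     per-thread memory block and all freed at once by dwarf_end.  */
  struct my_entry { int kind; void *data; };   /* made-up type */

  struct my_entry *one = libdw_typed_alloc (dbg, struct my_entry);
  struct my_entry *many = libdw_alloc (dbg, struct my_entry,
                                       sizeof (struct my_entry), cnt);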
diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
index f2e74d18..86ca7032 100644
--- a/libdw/libdw_alloc.c
+++ b/libdw/libdw_alloc.c
@@ -35,7 +35,68 @@
#include <stdlib.h>
#include "libdwP.h"
#include "system.h"
+#include "atomics.h"
+#if USE_VG_ANNOTATIONS == 1
+#include <helgrind.h>
+#include <drd.h>
I think if you include helgrind.h you won't get the drd.h
ANNOTATE_HAPPENS_BEFORE/AFTER. So do you also need to include drd.h?
Not really, just another habit. Since this file only needs HAPPENS_*,
helgrind.h is sufficient.
+#else
+#define ANNOTATE_HAPPENS_BEFORE(X)
+#define ANNOTATE_HAPPENS_AFTER(X)
+#endif
Could you explain the usage of the happens_before/after annotations in
this code? I must admit that I don't fully understand why/how it works
in this case, specifically since realloc might change the address that
mem_tails points to.
Reader-writer locks ensure no "readers" are present whenever a "writer"
is around. In this case we use the "write" side for resizing mem_tails
and the "read" side whenever mem_tails needs to stay stable, which is
why most of the time we hold a read lock and only promote to a write
lock when we need to reallocate.
The annotations are there to clean up a minor deficiency in Helgrind:
for whatever reason, if you do writes under a read lock it reports
races between those writes and the writes done under the write lock
(in this case, between __libdw_allocate and the realloc). I haven't dug
deep enough to know exactly why that happens, just that it does, and
adding this happens-before arc seems to fix the issue.
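Abstracted away from libdw, the pattern is essentially the sketch
below (self-contained, but table/table_len/ensure_index are made-up
names; the annotation macros fall back to no-ops when the Valgrind
headers aren't in use):

  #include <pthread.h>
  #include <stdlib.h>

  #ifndef ANNOTATE_HAPPENS_BEFORE
  # define ANNOTATE_HAPPENS_BEFORE(x)
  # define ANNOTATE_HAPPENS_AFTER(x)
  #endif

  static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
  static void **table;
  static size_t table_len;

  static void *
  ensure_index (size_t i)
  {
    pthread_rwlock_rdlock (&lock);
    if (i >= table_len)
      {
        /* Promote: drop the read lock, take the write lock, then
           re-check, since another thread may have grown the table in
           between.  */
        pthread_rwlock_unlock (&lock);
        pthread_rwlock_wrlock (&lock);
        if (i >= table_len)
          {
            void **p = realloc (table, (i + 1) * sizeof *p);
            if (p == NULL)
              abort ();
            table = p;
            for (size_t j = table_len; j <= i; j++)
              table[j] = NULL;
            table_len = i + 1;
            /* Tell Helgrind the grow "happens before" later accesses
               done under the read lock.  */
            ANNOTATE_HAPPENS_BEFORE (&table);
          }
        pthread_rwlock_unlock (&lock);
        pthread_rwlock_rdlock (&lock);
      }
    /* From here on the array itself is stable; per-slot accesses under
       the read lock are what Helgrind would otherwise flag.  */
    ANNOTATE_HAPPENS_AFTER (&table);
    void *slot = table[i];
    pthread_rwlock_unlock (&lock);
    return slot;
  }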
+#define THREAD_ID_UNSET ((size_t) -1)
+static thread_local size_t thread_id = THREAD_ID_UNSET;
+static atomic_size_t next_id = ATOMIC_VAR_INIT(0);
OK, but maybe use static __thread size_t thread_id as explained above?
Fine by me.
+struct libdw_memblock *
+__libdw_alloc_tail (Dwarf *dbg)
+{
+ if (thread_id == THREAD_ID_UNSET)
+ thread_id = atomic_fetch_add (&next_id, 1);
+
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ if (thread_id >= dbg->mem_stacks)
+ {
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ pthread_rwlock_wrlock (&dbg->mem_rwl);
+
+ /* Another thread may have already reallocated. In theory using an
+ atomic would be faster, but given that this only happens once per
+ thread per Dwarf, some minor slowdown should be fine. */
+ if (thread_id >= dbg->mem_stacks)
+ {
+ dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)
+ * sizeof (struct libdw_memblock *));
+ if (dbg->mem_tails == NULL)
+ {
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ dbg->oom_handler();
+ }
+ for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
+ dbg->mem_tails[i] = NULL;
+ dbg->mem_stacks = thread_id + 1;
+ ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
+ }
+
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ }
OK. So next_id only increases, which means it might leak a bit when
threads are not reused (i.e. without thread pools).
Might it make sense to have some __libdw_destroy_tail () function that
could be called from dwarf_end that checks if this was the last Dwarf
in use so we could reset next_id? It would only work in some cases;
if there are multiple Dwarfs in use it is probably useless. I guess it is
too much trickery for an odd corner case?
O, and I now think you would then also need something for dwarf_begin
to reset any set thread_ids... bleah. So probably way too complicated.
So let's not, unless you think this is actually simple.
Which is why I didn't want to do that.
The other option was to have a sort of free list for ids, but in that
case the cleanup isn't great (sometime after all threads have
completed... if you consider detached threads things get hairy). Plus
it requires a fully concurrent stack or queue, which is a complicated
data structure itself.
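For the record, the bookkeeping side of that idea would look something
like the sketch below (mutex-based purely to keep it short, and
acquire_id/release_id are made-up names). The awkward part is exactly
the release side: there is no good place to call it from, especially
with detached threads.

  #include <pthread.h>
  #include <stdlib.h>

  static pthread_mutex_t id_lock = PTHREAD_MUTEX_INITIALIZER;
  static size_t *free_ids;        /* stack of reusable ids */
  static size_t n_free;
  static size_t next_fresh_id;

  static size_t
  acquire_id (void)
  {
    pthread_mutex_lock (&id_lock);
    size_t id = n_free > 0 ? free_ids[--n_free] : next_fresh_id++;
    pthread_mutex_unlock (&id_lock);
    return id;
  }

  static void
  release_id (size_t id)
  {
    /* Who calls this, and when?  That is the hairy part.  */
    pthread_mutex_lock (&id_lock);
    size_t *p = realloc (free_ids, (n_free + 1) * sizeof *p);
    if (p != NULL)
      {
        free_ids = p;
        free_ids[n_free++] = id;
      }
    pthread_mutex_unlock (&id_lock);
  }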
+ /* At this point, we have an entry in the tail array. */
+ ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
+ struct libdw_memblock *result = dbg->mem_tails[thread_id];
+ if (result == NULL)
+ {
+ result = malloc (dbg->mem_default_size);
+ result->size = dbg->mem_default_size
+ - offsetof (struct libdw_memblock, mem);
+ result->remaining = result->size;
+ result->prev = NULL;
+ dbg->mem_tails[thread_id] = result;
+ }
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ return result;
+}
OK.
void *
__libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
@@ -52,10 +113,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize,
size_t align)
newp->size = size - offsetof (struct libdw_memblock, mem);
newp->remaining = (uintptr_t) newp + size - (result + minsize);
- newp->prev = (struct libdw_memblock*)atomic_exchange_explicit(
- &dbg->mem_tail, (uintptr_t)newp, memory_order_relaxed);
- if (pthread_setspecific (dbg->mem_key, newp) != 0)
- dbg->oom_handler ();
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ newp->prev = dbg->mem_tails[thread_id];
+ dbg->mem_tails[thread_id] = newp;
+ pthread_rwlock_unlock (&dbg->mem_rwl);
return (void *) result;
}
OK. Since this is only called after __libdw_alloc_tail you know that
thread_id will be set.
Exactly.
Thanks,
Mark