Hi Mark, As per your suggestion, I have placed the lock around the entire __libdw_find_split_unit function call.
On Wed, Oct 11, 2023 at 12:17 PM Mark Wielaard <m...@klomp.org> wrote: > Hi Heather, > > On Tue, Oct 10, 2023 at 03:42:56PM +0200, Mark Wielaard wrote: > > From: Heather McIntyre <h...@rice.edu> > > > > * (try_split_file): Use eu_tsearch. > > Add lock for cu->split. > > > > Signed-off-by: Heather S. McIntyre <h...@rice.edu> > > Signed-off-by: Mark Wielaard <m...@klomp.org> > > --- > > libdw/libdw_find_split_unit.c | 43 +++++++++++++++++++++++++++++------ > > 1 file changed, 36 insertions(+), 7 deletions(-) > > > > diff --git a/libdw/libdw_find_split_unit.c > b/libdw/libdw_find_split_unit.c > > index 533f8072..a288e9bd 100644 > > --- a/libdw/libdw_find_split_unit.c > > +++ b/libdw/libdw_find_split_unit.c > > @@ -34,13 +34,19 @@ > > #include "libelfP.h" > > > > #include <limits.h> > > -#include <search.h> > > +#include <eu-search.h> > > #include <stdlib.h> > > #include <string.h> > > #include <sys/types.h> > > #include <sys/stat.h> > > #include <fcntl.h> > > > > +/* __libdw_link_skel_split() modifies "skel->split = split" > > + and "split->split = skel". > > + "cu->split" is read at multiple locations and conditionally updated. > > + Mutual exclusion is enforced to prevent a race. */ > > +rwlock_define(static, cu_split_lock); > > __libdw_link_skel_split is defined in libdw/libdwP.h and is called in > try_split_file. (It is also called in src/readelf.c but that is a > terrible hack, so ignore that for now.) > > A Dwarf_CU is created (see libdw/libdw_findcu.c) with split set to > (Dwarf_CU *) -1 to indicate the split dwarf is not yet set. If > cu->split is NULL it means the split dwarf was searched for, but not > found. > > > static void > > try_split_file (Dwarf_CU *cu, const char *dwo_path) > > { > > @@ -57,7 +63,7 @@ try_split_file (Dwarf_CU *cu, const char *dwo_path) > > if (split->unit_type == DW_UT_split_compile > > && cu->unit_id8 == split->unit_id8) > > { > > - if (tsearch (split->dbg, &cu->dbg->split_tree, > > + if (eu_tsearch (split->dbg, &cu->dbg->split_tree, > > __libdw_finddbg_cb) == NULL) > > { > > /* Something went wrong. Don't link. */ > > @@ -65,9 +71,13 @@ try_split_file (Dwarf_CU *cu, const char *dwo_path) > > break; > > } > > > > + rwlock_wrlock(cu_split_lock); > > + > > /* Link skeleton and split compile units. */ > > __libdw_link_skel_split (cu, split); > > > > + rwlock_unlock(cu_split_lock); > > + > > Is this locking wide enough? It seems like multiple threads can race > to this code and set different Dwarfs (which means they'll leak). See > below. > > > /* We have everything we need from this ELF > > file. And we are going to close the fd to > > not run out of file descriptors. */ > > @@ -75,8 +85,13 @@ try_split_file (Dwarf_CU *cu, const char *dwo_path) > > break; > > } > > } > > + > > + rwlock_rdlock(cu_split_lock); > > + > > if (cu->split == (Dwarf_CU *) -1) > > dwarf_end (split_dwarf); > > + > > + rwlock_unlock(cu_split_lock); > > This means __libdw_link_skel_split wasn't called above (so the > split_dwarf created by dwarf_begin didn't contain the expected CU). > > > } > > /* Always close, because we don't want to run out of file > > descriptors. See also the elf_fcntl ELF_C_FDDONE call > > @@ -89,9 +104,13 @@ Dwarf_CU * > > internal_function > > __libdw_find_split_unit (Dwarf_CU *cu) > > { > > + rwlock_rdlock(cu_split_lock); > > + Dwarf_CU *cu_split_local = cu->split; > > + rwlock_unlock(cu_split_lock); > > + > > /* Only try once. */ > > - if (cu->split != (Dwarf_CU *) -1) > > - return cu->split; > > + if (cu_split_local != (Dwarf_CU *) -1) > > + return cu_split_local; > > > > /* We need a skeleton unit with a comp_dir and [GNU_]dwo_name > attributes. > > The split unit will be the first in the dwo file and should have > the > > @@ -116,7 +135,11 @@ __libdw_find_split_unit (Dwarf_CU *cu) > > free (dwo_path); > > } > > > > - if (cu->split == (Dwarf_CU *) -1) > > + rwlock_rdlock(cu_split_lock); > > + cu_split_local = cu->split; > > + rwlock_unlock(cu_split_lock); > > + > > + if (cu_split_local == (Dwarf_CU *) -1) > > { > > /* Try compdir plus dwo_name. */ > > Dwarf_Attribute compdir; > > It looks like multiple threads can end up here after having seen > cu->split/cu_split_local == (Dwarf_CU *) -1) Which means multiple > threads will enter try_split_file and might all call > __libdw_link_skel_split as mentioned above. > > > @@ -138,9 +161,15 @@ __libdw_find_split_unit (Dwarf_CU *cu) > > } > > } > > > > + rwlock_wrlock(cu_split_lock); > > + > > /* If we found nothing, make sure we don't try again. */ > > if (cu->split == (Dwarf_CU *) -1) > > - cu->split = NULL; > > + { > > + cu->split = NULL; > > + cu_split_local = cu->split; > > + } > > > > - return cu->split; > > + rwlock_unlock(cu_split_lock); > > + return cu_split_local; > > } > > I think it would be better to keep the lock during the whole > __libdw_find_split_unit call. > > Cheers, > > Mark >