[PATCH] handle unwind tables that are embedded within unwinding code, [PR111731]

2024-03-15 Thread Thomas Neumann

Original bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111731
Given that this is a regression, is this okay for gcc 13 and mainline?

The unwinding mechanism registers both the code range and the unwind
table itself within a b-tree lookup structure. That data structure
assumes that it consists of non-overlapping intervals. This
becomes a problem if the unwinding table is embedded within the
code itself, as now the intervals do overlap.

To fix this problem we now keep the unwind tables in a separate
b-tree, which prevents the overlap.
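
For illustration, a minimal standalone sketch of the failure mode (the
addresses are made up; this is not code from the patch): the table is
registered as a one-byte interval at its own address, and when the table
is embedded in the code, that interval lands inside the code's PC range.

/* Hypothetical example demonstrating the overlap.  */
#include <stdio.h>

int
main (void)
{
  /* PC range covered by the object's FDEs.  */
  unsigned long code_begin = 0x1000, code_end = 0x5000;
  /* The unwind table itself, keyed as the interval [table, table + 1).  */
  unsigned long table = 0x2000;

  /* Keeping both intervals in one b-tree would violate its
     non-overlap assumption.  */
  if (table >= code_begin && table < code_end)
    printf ("overlap: table %#lx inside code [%#lx, %#lx)\n",
            table, code_begin, code_end);
  return 0;
}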

libgcc/ChangeLog:
PR libgcc/111731
* unwind-dw2-fde.c: Split unwind ranges if they contain the
unwind table.
---
 libgcc/unwind-dw2-fde.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 61a578d097e..9d503545677 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -48,6 +48,7 @@ typedef __UINTPTR_TYPE__ uintptr_type;
 #include "unwind-dw2-btree.h"
 
 static struct btree registered_frames;
+static struct btree registered_objects;
 static bool in_shutdown;
 
 static void
@@ -58,6 +59,7 @@ release_registered_frames (void)
   /* Release the b-tree and all frames. Frame releases that happen later are
   * silently ignored */
  btree_destroy (&registered_frames);
+  btree_destroy (&registered_objects);
   in_shutdown = true;
 }
 
@@ -103,6 +105,21 @@ static __gthread_mutex_t object_mutex;
 #endif
 #endif
 
+#ifdef ATOMIC_FDE_FAST_PATH
+// Register the pc range for a given object in the lookup structure.
+static void
+register_pc_range_for_object (uintptr_type begin, struct object *ob)
+{
+  // Register the object itself to know the base pointer on deregistration.
+  btree_insert (&registered_objects, begin, 1, ob);
+
+  // Register the frame in the b-tree
+  uintptr_type range[2];
+  get_pc_range (ob, range);
+  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+}
+#endif
+
 /* Called from crtbegin.o to register the unwind info for an object.  */
 
 void
@@ -124,13 +141,7 @@ __register_frame_info_bases (const void *begin, struct object *ob,
 #endif
 
 #ifdef ATOMIC_FDE_FAST_PATH
-  // Register the object itself to know the base pointer on deregistration.
-  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
-
-  // Register the frame in the b-tree
-  uintptr_type range[2];
-  get_pc_range (ob, range);
-  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+  register_pc_range_for_object ((uintptr_type) begin, ob);
 #else
   init_object_mutex_once ();
   __gthread_mutex_lock (&object_mutex);
@@ -178,13 +189,7 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
   ob->s.b.encoding = DW_EH_PE_omit;
 
 #ifdef ATOMIC_FDE_FAST_PATH
-  // Register the object itself to know the base pointer on deregistration.
-  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
-
-  // Register the frame in the b-tree
-  uintptr_type range[2];
-  get_pc_range (ob, range);
-  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+  register_pc_range_for_object ((uintptr_type) begin, ob);
 #else
   init_object_mutex_once ();
   __gthread_mutex_lock (&object_mutex);
@@ -232,7 +237,7 @@ __deregister_frame_info_bases (const void *begin)
 
 #ifdef ATOMIC_FDE_FAST_PATH
   // Find the originally registered object to get the base pointer.
-  ob = btree_remove (&registered_frames, (uintptr_type) begin);
+  ob = btree_remove (&registered_objects, (uintptr_type) begin);
 
   // Remove the corresponding PC range.
   if (ob)
@@ -240,7 +245,7 @@ __deregister_frame_info_bases (const void *begin)
   uintptr_type range[2];
   get_pc_range (ob, range);
   if (range[0] != range[1])
-btree_remove (&registered_frames, range[0]);
+   btree_remove (&registered_frames, range[0]);
 }
 
   // Deallocate the sort array if any.

--
2.43.0



Re: [PATCH] handle unwind tables that are embedded within unwinding code, [PR111731]

2024-03-22 Thread Thomas Neumann

libgcc/ChangeLog:
 PR libgcc/111731
 * unwind-dw2-fde.c: Split unwind ranges if they contain the
 unwind table.
And what I'd suggest is committing to the trunk now, then waiting a week 
or two before backporting to gcc-13.


I will do that, thanks for looking at the patch.

Best

Thomas



Re: [PATCH] libgcc: Fix up unwind-dw2-btree.h [PR119151]

2025-03-10 Thread Thomas Neumann

Hi Jakub,


What differs from the textbook implementations is mostly that the leaf nodes
don't include just address as a key, but address range, address + size
(where we don't insert any ranges with zero size) and the lookups can be
performed for any address in the [address, address + size) range.  The keys
on inner nodes are still just address-1, so the child covers all nodes
where addr <= key unless it is covered already in children to the left.
The user (static executables or JIT) should always ensure there is no
overlap in between any of the ranges.
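
(A condensed sketch of that leaf-level contract, with simplified names
rather than the actual unwind-dw2-btree.h code: an entry (address, size)
matches any target in [address, address + size), and at most one entry
can match because the ranges never overlap.)

struct object;

struct leaf_entry
{
  __UINTPTR_TYPE__ base;  /* start address of the range */
  __UINTPTR_TYPE__ size;  /* never zero for inserted entries */
  struct object *ob;      /* the registered unwind object */
};

/* Linear stand-in for the b-tree leaf search.  */
static struct object *
leaf_lookup (const struct leaf_entry *e, unsigned count,
	     __UINTPTR_TYPE__ target)
{
  for (unsigned i = 0; i < count; i++)
    if (target >= e[i].base && target - e[i].base < e[i].size)
      return e[i].ob;
  return 0;
}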


thanks for tracking this down (and sorry for me messing it up in the 
first place). I checked your logic and your patch, and both look fine to me.


Best

Thomas



PING^2: [PATCH] release the sorted FDE array when deregistering a frame [PR109685]

2023-06-02 Thread Thomas Neumann via Gcc-patches
Summary: The old linear scan logic called free while searching the list 
of frames. The atomic fast path finds the frame quickly, but forgot the 
free call. This patch adds the missing free. Bugzilla #109685.


See:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619026.html

Best

Thomas


PING: [PATCH] fix radix sort on 32bit platforms [PR109670]

2023-06-02 Thread Thomas Neumann via Gcc-patches
Summary: The radix sort did not handle the uppermost byte correctly, 
which sometimes broke win32 exceptions. Bugzilla #109670. The reporter 
confirmed that the patch fixes the bug.


See:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618000.html

Best

Thomas


PING: [PATCH v3] eliminate mutex in fast path of __register_frame

2022-07-27 Thread Thomas Neumann via Gcc-patches
Summary: __register_frame and the corresponding _Unwind_Find_FDE use a 
global mutex for frame registration and unwinding. This can lead to very 
poor performance on machines with high core counts. This patch organizes 
the frames in a b-tree with read-optimized synchronization instead, 
which allows for fully parallel unwinding.


See:
https://gcc.gnu.org/pipermail/gcc-patches/2022-June/597256.html

Best

Thomas


[PATCH] preserve base pointer for __deregister_frame [PR110956]

2023-08-10 Thread Thomas Neumann via Gcc-patches

Original bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110956
Rainer Orth successfully tested the patch on Solaris with a full bootstrap.



Some uncommon unwinding table encodings need to access the base pointer
for address computations. We do not have that information in calls to
__deregister_frame_info_bases, and previously simply used nullptr as
base pointer. That is usually fine, but for some Solaris i386 shared
libraries that results in wrong address computations.

To fix this problem we now associate the unwinding object with
the table pointer itself, which is always known, in addition to
the PC range. When deregistering a frame, we first locate the object
using the table pointer, and then use the base pointer stored within
the object to compute the PC range.

libgcc/ChangeLog:
PR libgcc/110956
* unwind-dw2-fde.c: Associate object with address of unwinding
table.
---
 libgcc/unwind-dw2-fde.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index d7c4a467754..ae4530179f3 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -124,6 +124,9 @@ __register_frame_info_bases (const void *begin, struct object *ob,
 #endif
 
 #ifdef ATOMIC_FDE_FAST_PATH
+  // Register the object itself to know the base pointer on deregistration.
+  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
+
   // Register the frame in the b-tree
   uintptr_type range[2];
   get_pc_range (ob, range);
@@ -175,6 +178,9 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
   ob->s.b.encoding = DW_EH_PE_omit;
 
 #ifdef ATOMIC_FDE_FAST_PATH
+  // Register the object itself to know the base pointer on deregistration.
+  btree_insert (&registered_frames, (uintptr_type) begin, 1, ob);
+
   // Register the frame in the b-tree
   uintptr_type range[2];
   get_pc_range (ob, range);
@@ -225,22 +231,17 @@ __deregister_frame_info_bases (const void *begin)
 return ob;
 
 #ifdef ATOMIC_FDE_FAST_PATH
-  // Find the corresponding PC range
-  struct object lookupob;
-  lookupob.tbase = 0;
-  lookupob.dbase = 0;
-  lookupob.u.single = begin;
-  lookupob.s.i = 0;
-  lookupob.s.b.encoding = DW_EH_PE_omit;
-#ifdef DWARF2_OBJECT_END_PTR_EXTENSION
-  lookupob.fde_end = NULL;
-#endif
-  uintptr_type range[2];
-  get_pc_range (&lookupob, range);
+  // Find the originally registered object to get the base pointer.
+  ob = btree_remove (&registered_frames, (uintptr_type) begin);
 
-  // And remove
-  ob = btree_remove (&registered_frames, range[0]);
-  bool empty_table = (range[1] - range[0]) == 0;
+  // Remove the corresponding PC range.
+  if (ob)
+{
+  uintptr_type range[2];
+  get_pc_range (ob, range);
+  if (range[0] != range[1])
+   btree_remove (&registered_frames, range[0]);
+}
 
   // Deallocate the sort array if any.
   if (ob && ob->s.b.sorted)
@@ -283,12 +284,11 @@ __deregister_frame_info_bases (const void *begin)
 
  out:
   __gthread_mutex_unlock (&object_mutex);
-  const int empty_table = 0; // The non-atomic path stores all tables.
 #endif
 
   // If we didn't find anything in the lookup data structures then they
   // were either already destroyed or we tried to remove an empty range.
-  gcc_assert (in_shutdown || (empty_table || ob));
+  gcc_assert (in_shutdown || ob);
   return (void *) ob;
 }
 
--

2.39.2



[PATCH] fix radix sort on 32bit platforms [PR109670]

2023-05-10 Thread Thomas Neumann via Gcc-patches

The radix sort uses two buffers, a1 for input and a2 for output.
After every digit the role of the two buffers is swapped.
When terminating the sort early the code made sure the output
was in a2.  However, when we run out of bits, as can happen on
32bit platforms, the sorted result was in a1, as we had just
swapped a1 and a2.
This patch fixes the problem by unconditionally having a1 as
output after every loop iteration.

This bug manifested itself only on 32bit platforms and even then
only in some circumstances, as it needs frames where a swap
is required due to differences in the top-most byte, which is
affected by ASLR. The new logic was validated by exhaustive
search over 32bit input values.
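
A minimal sketch of the invariant being established (illustrative only;
the real fde_radixsort sorts fde pointers by their extracted PC values,
while this version sorts plain 32-bit integers):

#include <string.h>

/* Byte-wise LSD radix sort.  After every pass the buffers are swapped
   unconditionally, so the output is always in a1 and an early break at
   any digit leaves the data sorted in a1.  */
static void
radixsort_sketch (unsigned *array, unsigned *scratch, unsigned n)
{
  unsigned *a1 = array, *a2 = scratch;
  for (unsigned shift = 0; shift < 32; shift += 8)
    {
      unsigned count[256] = { 0 };
      for (unsigned i = 0; i < n; i++)
	count[(a1[i] >> shift) & 0xff]++;
      unsigned pos[256];
      pos[0] = 0;
      for (unsigned b = 1; b < 256; b++)
	pos[b] = pos[b - 1] + count[b - 1];
      for (unsigned i = 0; i < n; i++)
	a2[pos[(a1[i] >> shift) & 0xff]++] = a1[i];
      unsigned *tmp = a1;  /* unconditional swap: output is now in a1 */
      a1 = a2;
      a2 = tmp;
      /* An "already sorted" early break would be safe here.  */
    }
  if (a1 != array)
    memcpy (array, a1, n * sizeof (unsigned));
}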

libgcc/ChangeLog:
* unwind-dw2-fde.c: Fix radix sort buffer management.
---
 libgcc/unwind-dw2-fde.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 7b74c391ced..31a3834156b 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -624,8 +624,6 @@ fde_radixsort (struct object *ob, fde_extractor_t fde_extractor,

   // Stop if we are already sorted.
   if (!violations)
{
- // The sorted data is in a1 now.
- a2 = a1;
  break;
}

@@ -660,9 +658,9 @@ fde_radixsort (struct object *ob, fde_extractor_t fde_extractor,

 #undef FANOUT
 #undef FANOUTBITS

-  // The data is in a2 now, move in place if needed.
-  if (a2 != v1->array)
-memcpy (v1->array, a2, sizeof (const fde *) * n);
+  // The data is in a1 now, move in place if needed.
+  if (a1 != v1->array)
+memcpy (v1->array, a1, sizeof (const fde *) * n);
 }

 static inline void
--
2.39.2



PING: [PATCH] release the sorted FDE array when deregistering a frame [PR109685]

2023-05-12 Thread Thomas Neumann via Gcc-patches
Summary: The old linear scan logic called free while searching the list 
of frames. The atomic fast path finds the frame quickly, but forgot the 
free call. This patch adds the missing free. Bugzilla #109685.


See:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617245.html

Best

Thomas


Re: [PATCH] Fix assertion for unwind-dw2-fde.c btree changes

2023-05-14 Thread Thomas Neumann via Gcc-patches

Dear Sören,


we ran into a regression introduced by these changes. The regression
manifests itself in a failing assertion in __deregister_frame_info_bases.
The assertion failure was observed while using Chromium's `flatc` build
system tool. The failing assertion is:

unwind-dw2-fde.c:281: gcc_assert (in_shutdown || ob);
[snip]
However, I believe there is one more edge case that isn't being accounted
for presently: If the inserted entry has a size of 0 (i.e. if range[1] -
range[0] == 0) then the btree_insert call in __register_frame_info_bases
will not insert anything. This is not accounted for in

> [snip]


Would be cool if this could be fixed on the GCC trunk.


thanks for the detailed analysis and the patch, it looks obviously 
correct to me. I can apply it to trunk, but we need approval from a gcc 
maintainer first.


But independent of your patch, do you have the test case available in 
some easily accessible form, for example a docker image or an automated 
build script? I ask because something odd is happening here, somebody 
registered a non-empty EH that does not contain a single unwind range. I 
am puzzled why anybody would do that, I would like to double check that 
this is indeed the intended behavior and not a bug somewhere else. Or if 
you have the test case at hand, it would be great if you could do a 
quick step through get_pc_range for the affected frame to double-check 
that the table is indeed empty and we don't miss an entry for some 
strange reason.


Best

Thomas





Re: [PATCH] Fix assertion for unwind-dw2-fde.c btree changes

2023-05-14 Thread Thomas Neumann via Gcc-patches

even building the flatbuffers repo externally using cmake at the same revision
didn't reproduce it.

that said, i have attached a dockerfile that shows you what /does/ fail, under
the specific conditions. but there is no unpatched libgcc_s, so you'll have
to do that part yourself, and build a libgcc_s.so.1 without the patch in this
thread.


thanks for the dockerfile, I could reproduce the problem. For some 
strange reason the program really tried to register a frame table 
without entries. Perhaps that is caused by musl, I could not find the 
root cause for that. But nevertheless, fixing the assert is the right 
thing to do, we must ignore empty tables.


Best

Thomas



Re: [PATCH] Fix assertion for unwind-dw2-fde.c btree changes

2023-05-15 Thread Thomas Neumann via Gcc-patches

Hello, this patch breaks the build on targets where range is not declared, i.e. 
where the #ifdef ATOMIC_FDE_FAST_PATH path is not taken.


argh, I did not realize I tested the patch only on atomic fast path 
platforms. The patch below fixes that by moving the check inside the #ifdef.


I will check that everything works on atomic and non-atomic platforms 
and commit the trivial move then. Sorry for the breakage.


Best

Thomas



From 550dc27f547a067e96137adeb85148d8a84c81a0 Mon Sep 17 00:00:00 2001
From: Thomas Neumann 
Date: Mon, 15 May 2023 14:59:22 +0200
Subject: [PATCH] fix assert in non-atomic path

The non-atomic path does not have range information,
we have to adjust the assert to handle that case, too.

libgcc/ChangeLog:
* unwind-dw2-fde.c: Fix assert in non-atomic path.
---
 libgcc/unwind-dw2-fde.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 8683a65aa02..df461a1527d 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -240,6 +240,7 @@ __deregister_frame_info_bases (const void *begin)

   // And remove
   ob = btree_remove (&registered_frames, range[0]);
+  bool empty_table = (range[1] - range[0]) == 0;
 #else
   init_object_mutex_once ();
   __gthread_mutex_lock (&object_mutex);
@@ -276,11 +277,12 @@ __deregister_frame_info_bases (const void *begin)

  out:
   __gthread_mutex_unlock (&object_mutex);
+  bool empty_table = false;
 #endif

   // If we didn't find anything in the lookup data structures then they
   // were either already destroyed or we tried to remove an empty range.
-  gcc_assert (in_shutdown || ((range[1] - range[0]) == 0 || ob));
+  gcc_assert (in_shutdown || (empty_table || ob));
   return (void *) ob;
 }

--
2.39.2




[PATCH v2] release the sorted FDE array when deregistering a frame [PR109685]

2023-05-19 Thread Thomas Neumann via Gcc-patches

Am 19.05.23 um 19:26 schrieb Jeff Law:

See:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617245.html

I think this needs an update given the other changes in this space.

jeff


I have included the updated patch below.



The atomic fastpath bypasses the code that releases the sort
array which was lazily allocated during unwinding. We now
check after deregistering if there is an array to free.

libgcc/ChangeLog:
* unwind-dw2-fde.c: Free sort array in atomic fast path.
---
 libgcc/unwind-dw2-fde.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index a5786bf729c..32b9e64a1c8 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -241,6 +241,12 @@ __deregister_frame_info_bases (const void *begin)
   // And remove
   ob = btree_remove (&registered_frames, range[0]);
   bool empty_table = (range[1] - range[0]) == 0;
+
+  // Deallocate the sort array if any.
+  if (ob && ob->s.b.sorted)
+{
+  free (ob->u.sort);
+}
 #else
   init_object_mutex_once ();
   __gthread_mutex_lock (&object_mutex);
--
2.39.2




[PATCH v4] eliminate mutex in fast path of __register_frame

2022-09-16 Thread Thomas Neumann via Gcc-patches

The __register_frame/__deregister_frame functions are used to register
unwinding frames from JITed code in a sorted list. That list itself
is protected by object_mutex, which leads to terrible performance
in multi-threaded code and is somewhat expensive even if single-threaded.
There was already a fast-path that avoided taking the mutex if no
frame was registered at all.

This commit eliminates both the mutex and the sorted list from
the atomic fast path, and replaces it with a btree that uses
optimistic lock coupling during lookup. This allows for fully parallel
unwinding and is essential to scale exception handling to large
core counts.

Changes since v3:
- Avoid code duplication by adding query mode to classify_object_over_fdes
- Adjust all comments as requested

libgcc/ChangeLog:

* unwind-dw2-fde.c (release_registered_frames): Cleanup at shutdown.
(__register_frame_info_table_bases): Use btree in atomic fast path.
(__deregister_frame_info_bases): Likewise.
(_Unwind_Find_FDE): Likewise.
(base_from_object): Make parameter const.
(classify_object_over_fdes): Add query-only mode.
(get_pc_range): Compute PC range for lookup.
* unwind-dw2-fde.h (last_fde): Make parameter const.
* unwind-dw2-btree.h: New file.
---
 libgcc/unwind-dw2-btree.h | 953 ++
 libgcc/unwind-dw2-fde.c   | 195 ++--
 libgcc/unwind-dw2-fde.h   |   2 +-
 3 files changed, 1098 insertions(+), 52 deletions(-)
 create mode 100644 libgcc/unwind-dw2-btree.h

diff --git a/libgcc/unwind-dw2-btree.h b/libgcc/unwind-dw2-btree.h
new file mode 100644
index 000..8853f0eab48
--- /dev/null
+++ b/libgcc/unwind-dw2-btree.h
@@ -0,0 +1,953 @@
+/* Lock-free btree for manually registered unwind frames.  */
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   Contributed by Thomas Neumann
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_UNWIND_DW2_BTREE_H
+#define GCC_UNWIND_DW2_BTREE_H
+
+#include <stdbool.h>
+
+// Common logic for version locks.
+struct version_lock
+{
+  // The lock itself. The lowest bit indicates an exclusive lock,
+  // the second bit indicates waiting threads. All other bits are
+  // used as counter to recognize changes.
+  // Overflows are okay here, we must only prevent overflow to the
+  // same value within one lock_optimistic/validate
+  // range. Even on 32 bit platforms that would require 1 billion
+  // frame registrations within the time span of a few assembler
+  // instructions.
+  uintptr_t version_lock;
+};
+
+#ifdef __GTHREAD_HAS_COND
+// We should never get contention within the tree as it rarely changes.
+// But if we ever do get contention we use these for waiting.
+static __gthread_mutex_t version_lock_mutex = __GTHREAD_MUTEX_INIT;
+static __gthread_cond_t version_lock_cond = __GTHREAD_COND_INIT;
+#endif
+
+// Initialize in locked state.
+static inline void
+version_lock_initialize_locked_exclusive (struct version_lock *vl)
+{
+  vl->version_lock = 1;
+}
+
+// Try to lock the node exclusive.
+static inline bool
+version_lock_try_lock_exclusive (struct version_lock *vl)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (state & 1)
+return false;
+  return __atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+ false, __ATOMIC_SEQ_CST,
+ __ATOMIC_SEQ_CST);
+}
+
+// Lock the node exclusive, blocking as needed.
+static void
+version_lock_lock_exclusive (struct version_lock *vl)
+{
+#ifndef __GTHREAD_HAS_COND
+restart:
+#endif
+
+  // We should virtually never get contention here, as frame
+  // changes are rare.
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (!(state & 1))
+{
+  if (__atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+  false, __ATOMIC_SEQ_CST,
+  __ATOMIC_SEQ_CST))
+   return;
+}
+
+// We did get 

Re: [PATCH v4] eliminate mutex in fast path of __register_frame

2022-09-18 Thread Thomas Neumann via Gcc-patches

Hi Dimitar,


This patch broke avr and pru-elf cross builds:
   gcc/libgcc/unwind-dw2-fde.c:680:28: error: unknown type name ‘uintptr_t’
   680 |uintptr_t *range)

Should uintptr_t be replaced with __UINTPTR_TYPE__? Such change fixes the
above broken builds for me. But I'm not sure how valid it is for that
part of libgcc.


thanks for notifying me, I was not aware that uintptr_t is not available 
on all platforms. After looking at the existing code I think 
__UINTPTR_TYPE__ is the correct substitute. I will do some more testing 
and commit an s/uintptr_t/__UINTPTR_TYPE__/ patch soon. (I will probably 
use a typedef, as seen in generic-morestack.c, to avoid uglifying the code).
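
For reference, the typedef that ended up at the top of unwind-dw2-fde.c
(visible in the later diffs in this thread):

typedef __UINTPTR_TYPE__ uintptr_type;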


Best

Thomas


Re: [PATCH v4] eliminate mutex in fast path of __register_frame

2022-09-18 Thread Thomas Neumann via Gcc-patches

Hi Dimitar,


This patch broke avr and pru-elf cross builds:
   gcc/libgcc/unwind-dw2-fde.c:680:28: error: unknown type name ‘uintptr_t’
   680 |uintptr_t *range)

Should uintptr_t be replaced with __UINTPTR_TYPE__? Such change fixes the
above broken builds for me. But I'm not sure how valid it is for that
part of libgcc.


I have fixed that by using a typedef for __UINTPTR_TYPE__.

Best

Thomas


Re: [PATCH v4] eliminate mutex in fast path of __register_frame

2022-09-19 Thread Thomas Neumann via Gcc-patches
I haven't debugged this in any way, nor checked whether it only impacts 
exactly my below scenario, but noticed the following:


At least when building LibreOffice with Clang (16 trunk) with ASan and 
UBsan enabled against libstdc++ (with --gcc-toolchain and 
LD_LIBRARY_PATH to pick up a libstdc++ trunk build including this change 
at build and run-time), at least one of the LibreOffice tests executed 
during the build started to fail with


Apparently a registered frame is not found when deregistering, which 
triggers an assert. I will debug this. Could you send me a script or a 
description on how to reproduce the issue?


Best

Thomas


Re: [PATCH v4] eliminate mutex in fast path of __register_frame

2022-09-19 Thread Thomas Neumann via Gcc-patches



At least when building LibreOffice with Clang (16 trunk) with ASan and 
UBsan enabled against libstdc++ (with --gcc-toolchain and 
LD_LIBRARY_PATH to pick up a libstdc++ trunk build including this change 
at build and run-time), at least one of the LibreOffice tests executed 
during the build started to fail with


I could not reproduce the issue when building LibreOffice with gcc, but 
after reading the compiler-rt version of crtbegin.c I think the problem 
is the destruction order in compiler-rt. It calls 
__deregister_frame_info_bases after our lookup structure has already 
been destroyed.


Can you try if the patch below fixes the problem? It keeps the data 
structures alive at shutdown, though, which will probably make some leak 
detectors unhappy.


Alternatively we could simply remove the gcc_assert (ob) in line 285 of 
that file. As far as I can see in crtbegin nothing bad happens if we 
return nullptr at shutdown.


Best

Thomas


diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 919abfe0664..d427318280c 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -49,16 +49,6 @@ typedef __UINTPTR_TYPE__ uintptr_type;

 static struct btree registered_frames;

-static void
-release_registered_frames (void) __attribute__ ((destructor (110)));
-static void
-release_registered_frames (void)
-{
-  /* Release the b-tree and all frames. Frame releases that happen later are
-   * silently ignored */
-  btree_destroy (&registered_frames);
-}
-
 static void
 get_pc_range (const struct object *ob, uintptr_type *range);
 static void




[PATCH] Avoid depending on destructor order

2022-09-19 Thread Thomas Neumann via Gcc-patches

In some scenarios (e.g., when mixing gcc and clang code), it can
happen that frames are deregistered after the lookup structure
has already been destroyed. That in itself would be fine, but
it triggers an assert in __deregister_frame_info_bases that
expects to find the frame.

To avoid that, we now remember that the btree has already been
destroyed and disable the assert in that case.

libgcc/ChangeLog:

* unwind-dw2-fde.c (release_registered_frames): Remember
when the btree has been destroyed.
(__deregister_frame_info_bases): Disable the assert when
shutting down.
---
 libgcc/unwind-dw2-fde.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 919abfe0664..d237179f4ea 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -48,6 +48,7 @@ typedef __UINTPTR_TYPE__ uintptr_type;
 #include "unwind-dw2-btree.h"

 static struct btree registered_frames;
+static bool in_shutdown;

 static void
 release_registered_frames (void) __attribute__ ((destructor (110)));
@@ -57,6 +58,7 @@ release_registered_frames (void)
   /* Release the b-tree and all frames. Frame releases that happen later are
    * silently ignored */
   btree_destroy (&registered_frames);
+  in_shutdown = true;
 }

 static void
@@ -282,7 +284,7 @@ __deregister_frame_info_bases (const void *begin)
   __gthread_mutex_unlock (&object_mutex);
 #endif

-  gcc_assert (ob);
+  gcc_assert (in_shutdown || ob);
   return (void *) ob;
 }

--
2.34.1



Re: [PATCH] Avoid depending on destructor order

2022-09-23 Thread Thomas Neumann via Gcc-patches

This patch broke bootstrap on AIX and probably other targets.

#ifdef ATOMIC_FDE_FAST_PATH
#include "unwind-dw2-btree.h"

static struct btree registered_frames;
static bool in_shutdown;
...
#else

in_shutdown only is defined for ATOMIC_FDE_FAST_PATH but used in code / 
asserts not protected by that macro.


   gcc_assert (in_shutdown || ob);
   return (void *) ob;
}


I am sorry for that, I did not consider that my test machines all use 
the fast path.


I think the problem can be fixed by the trivial patch below, I will 
commit that after I have tested builds both with and without fast path.


Best

Thomas


diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index d237179f4ea..d6e347c5481 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -67,6 +67,8 @@ static void
 init_object (struct object *ob);

 #else
+/* Without fast path frame lookup must always succeed */
+static const bool in_shutdown = false;

 /* The unseen_objects list contains objects that have been registered
but not yet categorized in any way.  The seen_objects list has had


Re: [PATCH] Avoid depending on destructor order

2022-09-23 Thread Thomas Neumann via Gcc-patches


+static const bool in_shutdown = false;

I'll let Jason or others decide if this is the right solution.  It seems 
that in_shutdown also could be declared outside the #ifdef and 
initialized as "false".


sure, either is fine. Moving it outside the #ifdef wastes one byte in 
the executable (while the compiler can eliminate the const), but it does 
not really matter.


I have verified that the patch below fixes builds for both fast-path and 
non-fast-path builds. But if you prefer I will move the in_shutdown 
definition instead.


Best

Thomas

PS: in_shutdown is an int here instead of a bool because non-fast-path 
builds do not include stdbool. Not a good reason, of course, but I 
wanted to keep the patch minimal and it makes no difference in practice.



When using the atomic fast path deregistering can fail during
program shutdown if the lookup structures are already destroyed.
The assert in __deregister_frame_info_bases takes that into
account. The non-fast-path case, however, is not aware of
program shutdown, which caused a compiler error on such platforms.
We fix that by introducing a constant for in_shutdown in
non-fast-path builds.

libgcc/ChangeLog:
* unwind-dw2-fde.c: Introduce a constant for in_shutdown
for the non-fast-path case.

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index d237179f4ea..0bcd5061d76 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -67,6 +67,8 @@ static void
 init_object (struct object *ob);

 #else
+/* Without fast path frame deregistration must always succeed.  */
+static const int in_shutdown = 0;

 /* The unseen_objects list contains objects that have been registered
but not yet categorized in any way.  The seen_objects list has had


Re: [PATCH] Avoid depending on destructor order

2022-09-26 Thread Thomas Neumann via Gcc-patches

Hi Claudiu,


This change prohibits compiling of ARC backend:


+  gcc_assert (in_shutdown || ob);


in_shutdown is only defined when ATOMIC_FDE_FAST_PATH is defined,
while gcc_assert is outside of any ifdef. Please can you revisit this
line and change it accordingly.


I have a patch ready, I am waiting for someone to approve my patch:

https://gcc.gnu.org/pipermail/gcc-patches/2022-September/602130.html

Best

Thomas


Re: [PATCH] Avoid depending on destructor order

2022-09-26 Thread Thomas Neumann via Gcc-patches

Hi Iain,


You might also want to include Rainer’s patch,

AFAIR patches to fix bootstrap are allowed to proceed as an exception to
the usual rules,


I was not aware of that. I have pushed the patch below now (including 
Rainer's change), I will update the code if requested.


Best

Thomas

fix assert in __deregister_frame_info_bases

When using the atomic fast path deregistering can fail during
program shutdown if the lookup structures are already destroyed.
The assert in __deregister_frame_info_bases takes that into
account. The non-fast-path case, however, is not aware of
program shutdown, which caused a compiler error on such platforms.
We fix that by introducing a constant for in_shutdown in
non-fast-path builds.
We also drop the destructor priority, as it is not supported on
all platforms and we no longer rely upon the priority anyway.

libgcc/ChangeLog:
* unwind-dw2-fde.c: Introduce a constant for in_shutdown
for the non-fast-path case. Drop destructor priority.

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index d237179f4ea..3c0cc654ec0 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -51,7 +51,7 @@ static struct btree registered_frames;
 static bool in_shutdown;

 static void
-release_registered_frames (void) __attribute__ ((destructor (110)));
+release_registered_frames (void) __attribute__ ((destructor));
 static void
 release_registered_frames (void)
 {
@@ -67,6 +67,8 @@ static void
 init_object (struct object *ob);

 #else
+/* Without fast path frame deregistration must always succeed.  */
+static const int in_shutdown = 0;

 /* The unseen_objects list contains objects that have been registered
but not yet categorized in any way.  The seen_objects list has had


[PATCH] eliminate mutex in fast path of __register_frame

2022-03-01 Thread Thomas Neumann via Gcc-patches

The __register_frame/__deregister_frame functions are used to register
unwinding frames from JITed code in a sorted list. That list itself
is protected by object_mutex, which leads to terrible performance
in multi-threaded code and is somewhat expensive even if single-threaded.
There was already a fast-path that avoided taking the mutex if no
frame was registered at all.

This commit eliminates both the mutex and the sorted list from
the atomic fast path, and replaces it with a btree that uses
optimistic lock coupling during lookup. This allows for fully parallel
unwinding and is essential to scale exception handling to large
core counts.

libgcc/ChangeLog:

* unwind-dw2-fde.c (release_registered_frames): Cleanup at shutdown.
(__register_frame_info_table_bases): Use btree in atomic fast path.
(__deregister_frame_info_bases): Likewise.
(_Unwind_Find_FDE): Likewise.
(base_from_object): Make parameter const.
(get_pc_range_from_fdes): Compute PC range for lookup.
(get_pc_range): Likewise.
* unwind-dw2-fde.h (last_fde): Make parameter const.
* unwind-dw2-btree.h: New file.
---
 libgcc/unwind-dw2-btree.h | 870 ++
 libgcc/unwind-dw2-fde.c   | 187 ++--
 libgcc/unwind-dw2-fde.h   |   2 +-
 3 files changed, 1023 insertions(+), 36 deletions(-)
 create mode 100644 libgcc/unwind-dw2-btree.h

diff --git a/libgcc/unwind-dw2-btree.h b/libgcc/unwind-dw2-btree.h
new file mode 100644
index 000..b16bb9994f8
--- /dev/null
+++ b/libgcc/unwind-dw2-btree.h
@@ -0,0 +1,870 @@
+/* Lock-free btree for manually registered unwind frames  */
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   Contributed by Thomas Neumann
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_UNWIND_DW2_BTREE_H
+#define GCC_UNWIND_DW2_BTREE_H
+
+#include <stdbool.h>
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility push(default)
+#endif
+
+// Common logic for version locks
+struct version_lock
+{
+  // The lock itself
+  uintptr_t version_lock;
+};
+
+// Initialize in locked state
+static inline void
+version_lock_initialize_locked_exclusive (struct version_lock *vl)
+{
+  vl->version_lock = 1;
+}
+
+// Try to lock the node exclusive
+static inline bool
+version_lock_try_lock_exclusive (struct version_lock *vl)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (state & 1)
+return false;
+  return __atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+ false, __ATOMIC_SEQ_CST,
+ __ATOMIC_SEQ_CST);
+}
+
+// Lock the node exclusive, blocking as needed
+static void
+version_lock_lock_exclusive (struct version_lock *vl)
+{
+  // We should virtually never get contention here, as frame
+  // changes are rare. Thus we use a simple spinlock
+  while (true)
+{
+      uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (state & 1)
+   continue;
+  if (__atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+  false, __ATOMIC_SEQ_CST,
+  __ATOMIC_SEQ_CST))
+   return;
+}
+}
+
+// Release a locked node and increase the version lock
+static inline void
+version_lock_unlock_exclusive (struct version_lock *vl)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  __atomic_store_n (&(vl->version_lock), (state + 2) & (~((uintptr_t) 1)),
+   __ATOMIC_SEQ_CST);
+}
+
+// Acquire an optimistic "lock". Note that this does not lock at all, it
+// only allows for validation later
+static inline bool
+version_lock_lock_optimistic (const struct version_lock *vl, uintptr_t *lock)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  *lock = state;
+
+  // Acquire the lock fails when there is currently an exclusive lock
+  return !(state & 1);
+}
+
+// Validate a previously acqui

Re: [PATCH] eliminate mutex in fast path of __register_frame

2022-03-02 Thread Thomas Neumann via Gcc-patches




+// Common logic for version locks
+struct version_lock
+{
+  // The lock itself
+  uintptr_t version_lock;
+};


version_lock must not overflow, right?  This means we need a wider
counter on 32-bit, too.  glibc contains a 62-bit counter that it uses
for its own data structure.


an overflow is not a problem per se, it is only problematic if we hit 
exactly the same value again between lock_optimistic and validate. Note 
that these ranges are usually a handful of assembler instructions, and 
we would have to see 4 billion frame registrations in that short time 
span. I don't think that is a problem in practice. But I can switch to 
64bit, of course, if you think the risk is too high.





+// Lock the node exclusive, blocking as needed
+static void
+version_lock_lock_exclusive (struct version_lock *vl)
+{
+  // We should virtually never get contention here, as frame
+  // changes are rare. Thus we use a simple spinlock


Isn't this problematic if there are multiple compiler threads that race
to register their output with the unwinder?

If updates are rare, is per-node locking really needed?

What can we do to make this async-signal-safe, so that a call into the
unwinder from a signal handler does not spin endlessly if the receiving
thread is currently registering or deregistering a frame?  Is simply
calling sigprocmask around the register and unregister operations
enough?  (These operations don't need to be async-signal-safe, only
lookup should be.)  This can be a future change if we feel confident
that it's possible to rectify this later if needed.


we need the locking bit because a concurrent unwinder must recognize 
that the node is currently being modified. But we could eliminate the 
spinlock aspect by using a global mutex, which would guarantee us that 
nothing is locked exclusively and thus no spinning is required. That 
would also allow us to fix the async-signal-safety at the same time if 
needed by blocking signals globally during updates.


Note that the current code is not async-signal-safe either, it simply 
grabs a mutex. If a signal happens while calling __register_frame, and 
that handler tries to unwind, the current code will deadlock, too.





+// Validate a previously acquired lock
+static inline bool
+version_lock_validate (const struct version_lock *vl, uintptr_t lock)
+{
+  // Check that the node is still in the same state
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  return (state == lock);
+}


I think an acquire fence is missing before the __atomic_load_n.  We
learned this the hard way in the glibc implementation.  The reference
that Szabolcs found is:

  Hans Boehm, Can Seqlocks Get Along with Programming Language
  Memory Models?, Section 4.


thanks for the pointer, I will look at this more carefully. When I read 
the text correctly we need a __atomic_thread_fence(__ATOMIC_ACQUIRE) 
before the load, but I will double check that.
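
Sketched out, the fix under discussion would look roughly like this (an
assumption based on the seqlock pattern from the cited paper, not
necessarily the final committed code):

// Validate a previously acquired optimistic lock.  The acquire fence
// orders the reads done inside the optimistic section before the
// validating load (Boehm, Section 4).
static inline bool
version_lock_validate (const struct version_lock *vl, uintptr_t lock)
{
  __atomic_thread_fence (__ATOMIC_ACQUIRE);
  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
  return state == lock;
}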




+static void
+btree_release_tree_recursively (struct btree *t, struct btree_node *n);
+
+// Destroy a tree and release all nodes. Not used currently, but could be 
called
+// at shutdown to destroy the frame lookup
+static void
+btree_destroy (struct btree *t)
+{


The comment seems to be incorrect, it is used?  Otherwise there should
be a compiler warning.



sorry for that, I initially wanted to keep the tree alive forever, but 
then decided to destroy it at shutdown. I will update the comment.



+// Allocate a node. This node will be returned in locked exclusive state
+static struct btree_node *
+btree_allocate_node (struct btree *t, bool inner)
+{



+  // No free page available, allocate a new one
+  struct btree_node *new_node
+   = (struct btree_node *) (malloc (sizeof (struct btree_node)));
+  version_lock_initialize_locked_exclusive (
+   &(new_node->version_lock)); // initialize the node in locked state
+  new_node->entry_count = 0;
+  new_node->type = inner ? btree_node_inner : btree_node_leaf;
+  return new_node;
+}
+}


This needs some form of error checking for malloc.  But I see the
existing code does not have that, either. 8-(


and I do not see how we can really handle a malloc failure here. What 
should we do except die? And, as you pointed out, that is fundamental in 
the surrounding code, too, it allocates an struct object via malloc, and 
will crash if that fails.




+// Find the corresponding entry for the given address
+static struct object *
+btree_lookup (const struct btree *t, uintptr_t target_addr)
+{



+  if (type == btree_node_inner)
+   {
+ // We cannot call find_inner_slot here because we can only trust our
+ // validated entries
+ unsigned slot = 0;
+ while (((slot + 1) < entry_count)
+&& (iter->content.children[slot].separator < target_addr))
+   ++slot;
+ struct btree_node *child = iter->content.children[slot].child;
+ if (!btree_node_validate (iter, lock))

Re: [PATCH] eliminate mutex in fast path of __register_frame

2022-03-03 Thread Thomas Neumann via Gcc-patches

We may have to add a new interface.  In some other cases, I've seen
errno being used for error reporting, but that requires changes in
applications to recognize the error.  It's probably better to crash here
than to fail mysteriously later.

Out of curiosity, how many times do you call the registration functions
from your application?  Would a batch registration interface help?  It
could address error reporting as well.


we are compiling SQL queries into machine code, which means we call 
__(de)register_frame once per query. There is usually no way to batch 
that, as the queries typically come in one after the other.
In practice the system caches generated code, which means that most of 
the time we are executing an already compiled query with different 
parameters. Thus __register_frame is not a big performance problem for 
us, but unwinding is, as queries are executed in parallel using hundreds 
of threads and some of them fail.
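
In other words, the JIT-side usage is roughly the following sketch
(jit_emit_eh_frame is a hypothetical placeholder for the engine's code
generator; __register_frame and __deregister_frame are the real libgcc
entry points):

extern void __register_frame (void *);
extern void __deregister_frame (void *);
extern void *jit_emit_eh_frame (void);	/* hypothetical JIT output */

static void
run_compiled_query (void)
{
  void *eh_frame = jit_emit_eh_frame ();
  __register_frame (eh_frame);	 /* once per freshly compiled query */
  /* ... run the generated code; parallel unwinding happens here ... */
  __deregister_frame (eh_frame); /* when the cached code is evicted */
}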


Thank you for your detailed feedback, I will incorporate your 
suggestions and will send an updated patch in a few days.


Best

Thomas


[PATCH v2] eliminate mutex in fast path of __register_frame

2022-03-04 Thread Thomas Neumann via Gcc-patches

The __register_frame/__deregister_frame functions are used to register
unwinding frames from JITed code in a sorted list. That list itself
is protected by object_mutex, which leads to terrible performance
in multi-threaded code and is somewhat expensive even if single-threaded.
There was already a fast-path that avoided taking the mutex if no
frame was registered at all.

This commit eliminates both the mutex and the sorted list from
the atomic fast path, and replaces it with a btree that uses
optimistic lock coupling during lookup. This allows for fully parallel
unwinding and is essential to scale exception handling to large
core counts.

Changes since v1:
- eliminate all undefined behavior within the optimistic section
- use a mutex instead of spinning in case of conflicts
- addressed all other reviewer comments except for async-signal-safety.
  The old code was not async-signal-safe either, and that should be fixed
  in a separate commit, as that requires touching other parts of the code.

libgcc/ChangeLog:

* unwind-dw2-fde.c (release_registered_frames): Cleanup at shutdown.
(__register_frame_info_table_bases): Use btree in atomic fast path.
(__deregister_frame_info_bases): Likewise.
(_Unwind_Find_FDE): Likewise.
(base_from_object): Make parameter const.
(get_pc_range_from_fdes): Compute PC range for lookup.
(get_pc_range): Likewise.
* unwind-dw2-fde.h (last_fde): Make parameter const.
* unwind-dw2-btree.h: New file.
---
 libgcc/unwind-dw2-btree.h | 952 ++
 libgcc/unwind-dw2-fde.c   | 194 ++--
 libgcc/unwind-dw2-fde.h   |   2 +-
 3 files changed, 1112 insertions(+), 36 deletions(-)
 create mode 100644 libgcc/unwind-dw2-btree.h

diff --git a/libgcc/unwind-dw2-btree.h b/libgcc/unwind-dw2-btree.h
new file mode 100644
index 000..0b4835a439a
--- /dev/null
+++ b/libgcc/unwind-dw2-btree.h
@@ -0,0 +1,952 @@
+/* Lock-free btree for manually registered unwind frames  */
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   Contributed by Thomas Neumann
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_UNWIND_DW2_BTREE_H
+#define GCC_UNWIND_DW2_BTREE_H
+
+#include <stdbool.h>
+
+// Common logic for version locks
+struct version_lock
+{
+  // The lock itself. The lowest bit indicates an exclusive lock,
+  // the second bit indicates waiting threads. All other bits are
+  // used as counter to recognize changes.
+  // Overflows are okay here, we must only prevent overflow to the
+  // same value within one lock_optimistic/validate
+  // range. Even on 32 bit platforms that would require 1 billion
+  // frame registrations within the time span of a few assembler
+  // instructions.
+  uintptr_t version_lock;
+};
+
+#ifdef __GTHREAD_HAS_COND
+// We should never get contention within the tree as it rarely changes.
+// But if we ever do get contention we use these for waiting
+static __gthread_mutex_t version_lock_mutex = __GTHREAD_MUTEX_INIT;
+static __gthread_cond_t version_lock_cond = __GTHREAD_COND_INIT;
+#endif
+
+// Initialize in locked state
+static inline void
+version_lock_initialize_locked_exclusive (struct version_lock *vl)
+{
+  vl->version_lock = 1;
+}
+
+// Try to lock the node exclusive
+static inline bool
+version_lock_try_lock_exclusive (struct version_lock *vl)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (state & 1)
+return false;
+  return __atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+ false, __ATOMIC_SEQ_CST,
+ __ATOMIC_SEQ_CST);
+}
+
+// Lock the node exclusive, blocking as needed
+static void
+version_lock_lock_exclusive (struct version_lock *vl)
+{
+#ifndef __GTHREAD_HAS_COND
+restart:
+#endif
+
+  // We should virtually never get contention here, as frame
+  // changes are rare
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (!(state & 1))
+{
+  if (__atomic_compare_exchange

PING^2: [PATCH v3] eliminate mutex in fast path of __register_frame

2022-08-26 Thread Thomas Neumann via Gcc-patches
Summary: __register_frame and the corresponding _Unwind_Find_FDE use a 
global mutex for frame registration and unwinding. This can lead to very 
poor performance on machines with high core counts. This patch organizes 
the frames in a b-tree with read-optimized synchronization instead, 
which allows for fully parallel unwinding.


See:
https://gcc.gnu.org/pipermail/gcc-patches/2022-June/597256.html

Best

Thomas


Re: [PATCH v3] eliminate mutex in fast path of __register_frame

2022-09-12 Thread Thomas Neumann via Gcc-patches
Thanks for your feedback, I will update the patch in the next few days, 
addressing the comments and reorganizing classify_object_over_fdes. 
Concerning your question:



+
+restart:
+  struct btree_node *iter;
+  uintptr_t lock;
+  {
+    // Accessing the root node requires defending against concurrent pointer
+    // changes. Thus we couple rootLock -> lock on root node -> validate rootLock.


Can you elaborate a bit more in the comment on why it's necessary and 
sufficient to validate the containing node after "locking" the child node?



+    if (!version_lock_lock_optimistic (&(t->root_lock), &lock))
+  goto restart;
+    iter = RLOAD (t->root);
+    if (!version_lock_validate (&(t->root_lock), lock))
+  goto restart;
+    if (!iter)
+  return NULL;
+    uintptr_t child_lock;
+    if ((!btree_node_lock_optimistic (iter, &child_lock))
+    || (!version_lock_validate (&(t->root_lock), lock)))
+  goto restart;
+    lock = child_lock;
+  }
+
+  // Now we can walk down towards the right leaf node


We are reading here without explicit locks, i.e., memory can change 
behind our back at any time. Thus, before we act on a value that we have 
read, we must check if that value is still valid.

The invariants that we rely upon are:
I1) a node pointer will always be either a nullptr or point to a b-tree 
node. The node might have been logically deleted, but the memory will 
always remain valid due to our free list.
I2) The root pointer is only modified by a thread that holds an 
exclusive lock on root_lock.
I3) validate calls fail if a) somebody else currently holds the lock, or 
b) if the version value has changed since locking, which happens when 
somebody releases an exclusive lock.


Using that, the code does the following steps

1. We lock root_lock optimistic. In case of an exclusive lock we restart 
immediately, otherwise we remember the current version
2. We load the root pointer and store it in iter. iter might contain 
arbitrary garbage due to races
3. We validate the root_lock, and restart in case of errors. Now we know 
that a) nobody has locked the root exclusively or modified it since step 
1 (I3), and b) iter indeed points to the node that corresponds to the 
version that we have read in step 1 (due to I2).


At this point we know that iter is indeed the right root pointer. If it 
is a nullptr we stop here. Otherwise we continue with


4. We lock the root node itself (iter), storing the validated version in 
child_lock. We need that lock to detect changes to the content of the 
root, the root_lock only protects the pointer to the root. Loading the 
lock is safe due to I1. As our tree can change at any point in time, we 
need another valide call on root_lock to make sure that we still have 
the right root pointer.



At this point we have both a pointer to the root node and an optimistic 
lock that allows us to detect changes to the content of the root. Thus, 
we can safely navigate down, any changes to the b-tree that could affect 
us, like, e.g., node splits in the root, will be detected when 
validating the lock.



I hope that answered both the necessary and sufficient part. As a 
general rule, you must always use the pair relaxed-atomic-read + validate 
to avoid data races. Acting on any value that you have not validated is 
unsafe, as you could get races.
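
As a compact restatement of that rule, every optimistic read is
bracketed like this (an illustrative fragment in the style of the patch,
not a verbatim excerpt):

  uintptr_t lock;
  if (!version_lock_lock_optimistic (&(node->version_lock), &lock))
    goto restart;	/* currently locked exclusively: retry */
  struct btree_node *child = RLOAD (node->content.children[slot].child);
  /* child may be garbage if a writer intervened; validate before use.  */
  if (!version_lock_validate (&(node->version_lock), lock))
    goto restart;
  /* Only now is it safe to dereference child.  */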


Best

Thomas


PING: [PATCH v2] eliminate mutex in fast path of __register_frame

2022-05-09 Thread Thomas Neumann via Gcc-patches
Summary: __register_frame and the corresponding _Unwind_Find_FDE use a 
global mutex for frame registration and unwinding. This can lead to very 
poor performance on machines with high core counts. This patch organizes 
the frames in a b-tree with read-optimized synchronization instead, 
which allows for fully parallel unwinding.


See:
https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591203.html

Best

Thomas


[PATCH] release the sorted FDE array when deregistering a frame [PR109685]

2023-05-02 Thread Thomas Neumann via Gcc-patches

The atomic fastpath bypasses the code that releases the sort
array which was lazily allocated during unwinding. We now
check after deregistering if there is an array to free.

libgcc/ChangeLog:
* unwind-dw2-fde.c: Free sort array in atomic fast path.
---
 libgcc/unwind-dw2-fde.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 7b74c391ced..4d2737ff9f7 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -240,6 +240,12 @@ __deregister_frame_info_bases (const void *begin)

   // And remove
   ob = btree_remove (&registered_frames, range[0]);
+
+  // Deallocate the sort array if any.
+  if (ob && ob->s.b.sorted)
+{
+  free (ob->u.sort);
+}
 #else
   init_object_mutex_once ();
   __gthread_mutex_lock (&object_mutex);
--
2.39.2



[PATCH v3] eliminate mutex in fast path of __register_frame

2022-06-26 Thread Thomas Neumann via Gcc-patches
NOTE: A stress test program and a detailed walkthrough that breaks this
patch into manageable parts can be found here:
https://databasearchitects.blogspot.com/2022/06/making-unwinding-through-jit-ed-code.html

The __register_frame/__deregister_frame functions are used to register
unwinding frames from JITed code in a sorted list. That list itself
is protected by object_mutex, which leads to terrible performance
in multi-threaded code and is somewhat expensive even if single-threaded.
There was already a fast-path that avoided taking the mutex if no
frame was registered at all.

This commit eliminates both the mutex and the sorted list from
the atomic fast path, and replaces it with a btree that uses
optimistic lock coupling during lookup. This allows for fully parallel
unwinding and is essential to scale exception handling to large
core counts.

Changes since v2:
- fix contention problem during unlocking

libgcc/ChangeLog:

* unwind-dw2-fde.c (release_registered_frames): Cleanup at shutdown.
(__register_frame_info_table_bases): Use btree in atomic fast path.
(__deregister_frame_info_bases): Likewise.
(_Unwind_Find_FDE): Likewise.
(base_from_object): Make parameter const.
(get_pc_range_from_fdes): Compute PC range for lookup.
(get_pc_range): Likewise.
* unwind-dw2-fde.h (last_fde): Make parameter const.
* unwind-dw2-btree.h: New file.
---
 libgcc/unwind-dw2-btree.h | 953 ++
 libgcc/unwind-dw2-fde.c   | 194 ++--
 libgcc/unwind-dw2-fde.h   |   2 +-
 3 files changed, 1113 insertions(+), 36 deletions(-)
 create mode 100644 libgcc/unwind-dw2-btree.h

diff --git a/libgcc/unwind-dw2-btree.h b/libgcc/unwind-dw2-btree.h
new file mode 100644
index 000..3b2b6871b46
--- /dev/null
+++ b/libgcc/unwind-dw2-btree.h
@@ -0,0 +1,953 @@
+/* Lock-free btree for manually registered unwind frames  */
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   Contributed by Thomas Neumann
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_UNWIND_DW2_BTREE_H
+#define GCC_UNWIND_DW2_BTREE_H
+
+#include <stdbool.h>
+
+// Common logic for version locks
+struct version_lock
+{
+  // The lock itself. The lowest bit indicates an exclusive lock,
+  // the second bit indicates waiting threads. All other bits are
+  // used as counter to recognize changes.
+  // Overflows are okay here, we must only prevent overflow to the
+  // same value within one lock_optimistic/validate
+  // range. Even on 32 bit platforms that would require 1 billion
+  // frame registrations within the time span of a few assembler
+  // instructions.
+  uintptr_t version_lock;
+};
+
+#ifdef __GTHREAD_HAS_COND
+// We should never get contention within the tree as it rarely changes.
+// But if we ever do get contention we use these for waiting
+static __gthread_mutex_t version_lock_mutex = __GTHREAD_MUTEX_INIT;
+static __gthread_cond_t version_lock_cond = __GTHREAD_COND_INIT;
+#endif
+
+// Initialize in locked state
+static inline void
+version_lock_initialize_locked_exclusive (struct version_lock *vl)
+{
+  vl->version_lock = 1;
+}
+
+// Try to lock the node exclusive
+static inline bool
+version_lock_try_lock_exclusive (struct version_lock *vl)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (state & 1)
+    return false;
+  return __atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+                                      false, __ATOMIC_SEQ_CST,
+                                      __ATOMIC_SEQ_CST);
+}
+
+// Lock the node exclusive, blocking as needed
+static void
+version_lock_lock_exclusive (struct version_lock *vl)
+{
+#ifndef __GTHREAD_HAS_COND
+restart:
+#endif
+
+  // We should virtually never get contention here, as frame
+  // changes are rare
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (!(state & 1))
+    {
+      if (__atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+

Re: [PATCH v4] eliminate mutex in fast path of __register_frame

2022-11-21 Thread Thomas Neumann via Gcc-patches

Hi,


When dynamically linking, a fast enough machine hides the latency, but when
statically linking or on slower devices this change caused a 5x increase in
instruction count and a 2x increase in cycle count before getting to main.

This has been quite noticeable on smaller devices.  Is there a reason the btree
can't be initialized lazily? It seems a bit harsh to pay the cost of unwinding
at startup even when you don't throw exceptions.


we cannot easily do that lazily, because lazy initialization would
itself need a mutex, which is exactly what we wanted to get rid of.


Having said that, I am surprised that you saw a noticeable difference. 
On most platforms there should not be dynamic frame registration at all, 
as the regular frames are directly read from the ELF data.


Can you please send me a precise description of how to reproduce the 
issue? (Platform, tools, a VM if you have one would be great). I will 
then debug this to improve the startup time.


Best

Thomas



Re: [PATCH v4] eliminate mutex in fast path of __register_frame

2022-11-21 Thread Thomas Neumann via Gcc-patches


It's easy to reproduce on x86 as well.

As a testcase:

#include 

int main(int argc, char** argv) {
 return 0;
}

And just compile with: g++ -O1 hello.cpp -static -o hello.exe.


thanks, I will take a look.

Best

Thomas



Re: [PATCH v4] eliminate mutex in fast path of __register_frame

2022-11-21 Thread Thomas Neumann via Gcc-patches

Hi,


When dynamically linking, a fast enough machine hides the latency, but when
statically linking or on slower devices this change caused a 5x increase in
instruction count and a 2x increase in cycle count before getting to main.


I have looked at ways to fix that. The problem is that with static 
linking unwinding tables are registered dynamically, and with my patch 
that registration triggers an eager sort of the fde lists, while 
previously the lists were sorted only when the first exception was 
thrown. If an application throws at least one exception there is no 
downside to eager sorting, but if the application never throws there 
is overhead.


The obvious way to improve the situation is to make sorting faster. When 
replacing the split+sort+merge logic with a radix sort (which can be 
done without additional memory) we get the following timings for your

#include 
int main() {}
example (with perf stat -r 200):

# pre-patch version, fast
  0,06 msec task-clock
   272.286  cycles
   464.754  instructions
# post-patch version, slow
  0,21 msec task-clock
   972.876  cycles
 3.079.515  instructions
# +radix sort, in the middle
  0,13 msec task-clock
   604.697  cycles
 1.702.930  instructions

The radix sort clearly improves things, but it does not fully eliminate 
the overhead.


The question is, how much do we care about that situation (i.e., static 
linking, exceptions registered but never thrown). I could change the 
code to recognize three states instead of two: no exceptions registered, 
exceptions registered but never thrown, and full exception mode. But that 
would increase code complexity and it would pessimize applications that 
do throw, as we now need more checks to guard against concurrent changes.


It makes the code more complex and a bit slower, which is why I am 
hesitant. But I can implement that if you think we need it. Or 
we just replace the sort, which is probably a good idea anyway.


Best

Thomas


[PATCH] speed up end_fde_sort using radix sort

2022-11-22 Thread Thomas Neumann via Gcc-patches

When registering a dynamic unwinding frame the fde list is sorted.
Previously, we split the list into a sorted and an unsorted part,
sorted the latter using heap sort, and merged both. That can be
quite slow due to the large number of (expensive) comparisons.

This patch replaces that logic with a radix sort instead. The
radix sort uses the same amount of memory as the old logic,
using the second list as auxiliary space, and it includes two
techniques to speed up sorting: First, it computes the pointer
addresses for blocks of values, reducing the decoding overhead.
Second, it recognizes when the data has reached a sorted state,
allowing for early termination. When running out of memory
we fall back to pure heap sort, as before.
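
To make the approach concrete, here is a rough sketch of such a
byte-wise radix sort over raw uintptr_t keys, ping-ponging between the
input array and an equally sized auxiliary buffer. It is illustrative
only: the committed version sorts fde pointers by their decoded
pc_begin values, decodes the keys in blocks, and detects the
already-sorted case for early termination.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
radix_sort (uintptr_t *a, uintptr_t *b, size_t n)
{
  uintptr_t *src = a, *dst = b;
  for (unsigned shift = 0; shift < 8 * sizeof (uintptr_t); shift += 8)
    {
      /* Histogram of the current byte.  */
      size_t count[256] = { 0 };
      for (size_t i = 0; i < n; ++i)
        ++count[(src[i] >> shift) & 0xff];

      /* If every key has the same value in this byte, the pass would
         not move anything; skip it.  */
      if (n && count[(src[0] >> shift) & 0xff] == n)
        continue;

      /* Turn the histogram into start offsets.  */
      size_t pos = 0;
      for (unsigned bucket = 0; bucket < 256; ++bucket)
        {
          size_t c = count[bucket];
          count[bucket] = pos;
          pos += c;
        }

      /* Stable scatter into the other buffer.  */
      for (size_t i = 0; i < n; ++i)
        dst[count[(src[i] >> shift) & 0xff]++] = src[i];

      uintptr_t *tmp = src;
      src = dst;
      dst = tmp;
    }

  /* Make sure the result ends up in the caller's array.  */
  if (src != a)
    memcpy (a, src, n * sizeof (uintptr_t));
}

Each pass is O(n) and stable, so eight byte passes fully sort 64-bit
keys without any comparisons, which is where the instruction-count
savings over the old split+sort+merge logic come from.
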

For this test program

#include 
int main(int argc, char** argv) {
 return 0;
}

compiled with g++ -O -o hello -static hello.c we get with
perf stat -r 200 on a 5950X the following performance numbers:

old logic:

  0,20 msec task-clock
   930.834  cycles
 3.079.765  instructions
0,00030478 +- 0,0237 seconds time elapsed

new logic:

  0,10 msec task-clock
   473.269  cycles
 1.239.077  instructions
0,00021119 +- 0,0168 seconds time elapsed

libgcc/ChangeLog:
* unwind-dw2-fde.c: Use radix sort instead of split+sort+merge.
---
 libgcc/unwind-dw2-fde.c | 234 +++-
 1 file changed, 134 insertions(+), 100 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 3c0cc654ec0..b81540c41a4 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -456,22 +456,52 @@ fde_mixed_encoding_compare (struct object *ob, const fde *x, const fde *y)
 
 typedef int (*fde_compare_t) (struct object *, const fde *, const fde *);
 
+// The extractor functions compute the pointer values for a block of
+// fdes. The block processing hides the call overhead.
 
-/* This is a special mix of insertion sort and heap sort, optimized for
-   the data sets that actually occur. They look like
-   101 102 103 127 128 105 108 110 190 111 115 119 125 160 126 129 130.
-   I.e. a linearly increasing sequence (coming from functions in the text
-   section), with additionally a few unordered elements (coming from functions
-   in gnu_linkonce sections) whose values are higher than the values in the
-   surrounding linear sequence (but not necessarily higher than the values
-   at the end of the linear sequence!).
-   The worst-case total run time is O(N) + O(n log (n)), where N is the
-   total number of FDEs and n is the number of erratic ones.  */
+static void
+fde_unencoded_extract (struct object *ob __attribute__ ((unused)),
+                       _Unwind_Ptr *target, const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+    memcpy (target + index, x[index]->pc_begin, sizeof (_Unwind_Ptr));
+}
+
+static void
+fde_single_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+                             const fde **x, int count)
+{
+  _Unwind_Ptr base;
+
+  base = base_from_object (ob->s.b.encoding, ob);
+  for (int index = 0; index < count; ++index)
+    read_encoded_value_with_base (ob->s.b.encoding, base, x[index]->pc_begin,
+                                  target + index);
+}
+
+static void
+fde_mixed_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+                            const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+    {
+      int encoding = get_fde_encoding (x[index]);
+      read_encoded_value_with_base (encoding, base_from_object (encoding, ob),
+                                    x[index]->pc_begin, target + index);
+    }
+}
+
+typedef void (*fde_extractor_t) (struct object *, _Unwind_Ptr *, const fde **,
+                                 int);
+
+// Data is sorted using radix sort if possible, using a temporary
+// auxiliary data structure of the same size as the input. When running
+// out of memory we fall back to in-place heap sort.
 
 struct fde_accumulator

 {
   struct fde_vector *linear;
-  struct fde_vector *erratic;
+  struct fde_vector *aux;
 };
 
 static inline int

@@ -485,8 +515,8 @@ start_fde_sort (struct fde_accumulator *accu, size_t count)
   if ((accu->linear = malloc (size)))
 {
   accu->linear->count = 0;
-  if ((accu->erratic = malloc (size)))
-   accu->erratic->count = 0;
+  if ((accu->aux = malloc (size)))
+   accu->aux->count = 0;
   return 1;
 }
   else
@@ -500,59 +530,6 @@ fde_insert (struct fde_accumulator *accu, const fde *this_fde)
 accu->linear->array[accu->linear->count++] = this_fde;
 }
 
-/* Split LINEAR into a linear sequence with low values and an erratic
-   sequence with high values, put the linear one (of longest possible
-   length) into LINEAR and the erratic one into ERRATIC. This is O(N).
-
-   Because the longest linear sequence we are trying to locate within the
-   incoming LINEAR array c

Re: [PATCH v4] eliminate mutex in fast path of __register_frame

2022-11-22 Thread Thomas Neumann via Gcc-patches

Would it be possible to trigger lazy registration if the version is read
as a zero?  This would not introduce any additional atomic instructions
on the fast path.


yes, that is possible. The main problem is the transition from lazy to 
non-lazy mode when the first exception is thrown. We must somehow stop 
the world for that without introducing an additional mutex. But I have 
thought about that some more, and that is possible too, by encoding a 
magic value as version during the transition, which causes the other 
threads to block. A bit ugly, but manageable. I will implement that in a 
few days.
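
To sketch that idea (all names here are hypothetical, nothing below
exists in libgcc): a single word encodes the mode, a reserved magic
value marks the transition, and threads that lose the race block
(here: spin) until the transition is complete.

#include <stdint.h>

#define MODE_LAZY       ((uintptr_t) 0)
#define MODE_TRANSITION ((uintptr_t) -1)
#define MODE_EAGER      ((uintptr_t) 1)

static uintptr_t unwind_mode; /* starts out as MODE_LAZY */

/* Called when the first exception is thrown. Exactly one thread wins
   the compare-and-swap and performs the one-time setup; all others
   wait until the mode word leaves the magic transition state.  */
static void
switch_to_eager_mode (void)
{
  uintptr_t expected = MODE_LAZY;
  if (__atomic_compare_exchange_n (&unwind_mode, &expected,
                                   MODE_TRANSITION, false,
                                   __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE))
    {
      /* ... sort the registered fde lists ... */
      __atomic_store_n (&unwind_mode, MODE_EAGER, __ATOMIC_RELEASE);
      return;
    }
  /* Lost the race (or already eager): wait out a pending transition.  */
  while (__atomic_load_n (&unwind_mode, __ATOMIC_ACQUIRE) == MODE_TRANSITION)
    ;
}
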


Independent of that I think we should improve the sort logic, as we 
still have to sort, even in lazy mode, at the latest when the first 
exception is thrown. I have sent a patch that significantly improves 
that step.


Best

Thomas



[PATCH] initialize fde objects lazily

2022-12-09 Thread Thomas Neumann via Gcc-patches

When registering an unwind frame with __register_frame_info_bases
we currently initialize that fde object eagerly. This has the
advantage that it is immutable afterwards and we can safely
access it from multiple threads, but it has the disadvantage
that we pay the initialization cost even if the application
never throws an exception.

This commit changes the logic to initialize the objects lazily.
The objects themselves are inserted into the b-tree when
registering the frame, but the sorted fde_vector is
not constructed yet. Only when an exception first tries to
pass through the registered code is the object initialized.
We detect that with double-checked locking: first a relaxed
load of the sorted bit, and then a re-check under the mutex
if the object was not initialized yet.

Note that the check is implicitly safe with respect to concurrent
frame deregistration, as trying to deregister a frame that is
on the unwinding path of a concurrent exception is inherently racy.

libgcc/ChangeLog:
* unwind-dw2-fde.c: Initialize fde object lazily when
the first exception tries to pass through.
---
 libgcc/unwind-dw2-fde.c | 52 -
 1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 3c0cc654ec0..6f69c20ff4b 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -63,8 +63,6 @@ release_registered_frames (void)
 
 static void

 get_pc_range (const struct object *ob, uintptr_type *range);
-static void
-init_object (struct object *ob);
 
 #else

 /* Without fast path frame deregistration must always succeed.  */
@@ -76,6 +74,7 @@ static const int in_shutdown = 0;
by decreasing value of pc_begin.  */
 static struct object *unseen_objects;
 static struct object *seen_objects;
+#endif
 
 #ifdef __GTHREAD_MUTEX_INIT

 static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
@@ -103,7 +102,6 @@ init_object_mutex_once (void)
 static __gthread_mutex_t object_mutex;
 #endif
 #endif
-#endif
 
 /* Called from crtbegin.o to register the unwind info for an object.  */
 
@@ -126,10 +124,7 @@ __register_frame_info_bases (const void *begin, struct object *ob,

 #endif
 
 #ifdef ATOMIC_FDE_FAST_PATH

-  // Initialize eagerly to avoid locking later
-  init_object (ob);
-
-  // And register the frame
+  // Register the frame in the b-tree
   uintptr_type range[2];
   get_pc_range (ob, range);
   btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
@@ -180,10 +175,7 @@ __register_frame_info_table_bases (void *begin, struct object *ob,
   ob->s.b.encoding = DW_EH_PE_omit;
 
 #ifdef ATOMIC_FDE_FAST_PATH

-  // Initialize eagerly to avoid locking later
-  init_object (ob);
-
-  // And register the frame
+  // Register the frame in the b-tree
   uintptr_type range[2];
   get_pc_range (ob, range);
   btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
@@ -892,7 +884,15 @@ init_object (struct object* ob)
   accu.linear->orig_data = ob->u.single;
   ob->u.sort = accu.linear;
 
+#ifdef ATOMIC_FDE_FAST_PATH

+  // We must update the sorted bit with an atomic operation
+  struct object tmp;
+  tmp.s.b = ob->s.b;
+  tmp.s.b.sorted = 1;
+  __atomic_store (&(ob->s.b), &(tmp.s.b), __ATOMIC_SEQ_CST);
+#else
   ob->s.b.sorted = 1;
+#endif
 }
 
 #ifdef ATOMIC_FDE_FAST_PATH

@@ -1130,6 +1130,21 @@ search_object (struct object* ob, void *pc)
 }
 }
 
+#ifdef ATOMIC_FDE_FAST_PATH

+
+// Check if the object was already initialized
+static inline bool
+is_object_initialized (struct object *ob)
+{
+  // We have to use relaxed atomics for the read, which
+  // is a bit involved as we read from a bitfield
+  struct object tmp;
+  __atomic_load (&(ob->s.b), &(tmp.s.b), __ATOMIC_RELAXED);
+  return tmp.s.b.sorted;
+}
+
+#endif
+
 const fde *
 _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
 {
@@ -1141,6 +1156,21 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
   if (!ob)
 return NULL;
 
+  // Initialize the object lazily

+  if (!is_object_initialized (ob))
+    {
+      // Check again under mutex
+      init_object_mutex_once ();
+      __gthread_mutex_lock (&object_mutex);
+
+      if (!ob->s.b.sorted)
+        {
+          init_object (ob);
+        }
+
+      __gthread_mutex_unlock (&object_mutex);
+    }
+
   f = search_object (ob, pc);
 #else
 
--

2.37.2