[PATCH 1/3] src: Add threadlib library for parallel job execution

2025-03-29 Thread Aaron Merey
Add new internal static library libthread.a that provides infrastructure
for eu-* tools to run functions concurrently using pthreads.

threadlib.c manages per-job threads as well as per-job buffers for stdout
output.  Output for each job is printed to stdout in the order that the
jobs were added to the job queue.  This helps preserve the order of
output when parallelization is added to an eu-* tool.

threadlib.h declares functions add_job and run_jobs. Jobs are added to
a job queue internal to threadlib.c using add_job. run_jobs executes
the queued jobs concurrently.

eu-readelf now links against libthread.a when elfutils is configured
with --enable-thread-safety.

* src/Makefile.am: libthread.a is compiled and linked with
readelf when USE_LOCKS is defined.
* src/threadlib.c: New file. Manages job creation, concurrent
execution and output handling.
* src/threadlib.h: New file. Declares functions add_job and
run_jobs.

Signed-off-by: Aaron Merey 
---
 src/Makefile.am |  17 
 src/threadlib.c | 253 
 src/threadlib.h |  34 +++
 3 files changed, 304 insertions(+)
 create mode 100644 src/threadlib.c
 create mode 100644 src/threadlib.h

diff --git a/src/Makefile.am b/src/Makefile.am
index ed245fc1..44d51393 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -51,6 +51,19 @@ libar.manifest: $(libar_a_OBJECTS)
 MOSTLYCLEANFILES = *.gconv
 CLEANFILES = $(bin_SCRIPTS) $(EXTRA_libar_a_DEPENDENCIES)
 
+if USE_LOCKS
+noinst_LIBRARIES += libthread.a
+
+libthread_a_SOURCES = threadlib.c
+
+EXTRA_DIST += threadlib.h
+
+libthread.manifest: $(libthread_a_OBJECTS)
+   $(AM_V_GEN)echo $^ > $@
+
+CLEANFILES += $(EXTRA_libthread_a_DEPENDENCIES)
+endif
+
 if BUILD_STATIC
 libasm = ../libasm/libasm.a
 libdw = ../libdw/libdw.a -lz $(zip_LIBS) $(libelf) -ldl -lpthread
@@ -90,6 +103,10 @@ ar_no_Wstack_usage = yes
 unstrip_no_Wstack_usage = yes
 
 readelf_LDADD = $(libdw) $(libebl) $(libelf) $(libeu) $(argp_LDADD)
+if USE_LOCKS
+readelf_LDADD += libthread.a
+thread_LDADD = threadlib.a $(libelf)
+endif
 nm_LDADD = $(libdw) $(libebl) $(libelf) $(libeu) $(argp_LDADD) $(obstack_LIBS) 
\
   $(demanglelib)
 size_LDADD = $(libelf) $(libeu) $(argp_LDADD)
diff --git a/src/threadlib.c b/src/threadlib.c
new file mode 100644
index ..493d466d
--- /dev/null
+++ b/src/threadlib.c
@@ -0,0 +1,253 @@
+/* Functions for running jobs concurrently.
+   Copyright (C) 2025 Red Hat, Inc.
+   This file is part of elfutils.
+
+   This file is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   elfutils is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifdef HAVE_CONFIG_H
+# include 
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "threadlib.h"
+
+/* Dynamic buffer for thread output.  */
+typedef struct {
+  /* Location where open_memstream records the size of BUF.  */
+  size_t sizeloc;
+  /* Buffer holding the output.  Set up by open_memstream and freed
+     once its contents have been printed.  */
+  char *buf;
+  /* Memory stream returned by open_memstream for BUF and SIZELOC.  */
+  FILE *file;
+} output_stream_t;
+
+/* Prepare STREAM for use: reset its buffer bookkeeping and attach a
+   fresh in-memory stream to it.  Exits with an error message if the
+   memory stream cannot be created.  */
+static void
+init_thread_output_stream (output_stream_t *stream)
+{
+  stream->sizeloc = 0;
+  stream->buf = NULL;
+
+  FILE *memfile = open_memstream (&stream->buf, &stream->sizeloc);
+  if (memfile == NULL)
+    error (1, 0, _("cannot open thread output stream"));
+
+  stream->file = memfile;
+}
+
+/* Print the contents of STREAM to stdout and deallocate its resources.
+   STREAM must have been set up by init_thread_output_stream.  Exits
+   with an error message if the memory stream cannot be closed.  */
+static void
+print_thread_output_stream (output_stream_t *stream)
+{
+  /* fclose may update stream->buf and stream->sizeloc.  */
+  if (fclose (stream->file) != 0)
+    error (1, 0, _("cannot close thread output stream"));
+
+  /* Write exactly the number of bytes recorded for the buffer rather
+     than treating it as a C string, so output containing NUL bytes is
+     not cut short.  */
+  fwrite (stream->buf, 1, stream->sizeloc, stdout);
+  free (stream->buf);
+}
+
+/* State of the thread belonging to a job (see the state member of
+   struct job_t).  */
+typedef enum {
+  /* pthread_create has not been called.  */
+  NOT_STARTED,
+
+  /* pthread_create has been called.  */
+  STARTED,
+
+  /* The thread has finished running the job but has not been joined.  */
+  DONE,
+
+  /* pthread_join has been called.  */
+  JOINED
+} thread_state_t;
+
+struct job_t {
+  /* A job consists of calling this function then printing any output
+ to stdout.  This function is run from thread_start_job, which also
+ initializes the FILE *.  */
+  void *(*start_routine)(void *, FILE *);
+
+  /* Arg passed to start_routine.  */
+  void *arg;
+
+  /* Thread to run start_routine.  */
+  pthread_t thread;
+
+  /* See thread_state_t.  */
+  _Atomic thread_state_t state;
+
+  /* Dynamic buffer for output generated during start_routine.
+ Contents will get printed to stdout when a job finishes.  */
+  output_stream_t stream;
+
+  /* Next job in the linked list.  

[PATCH 2/3] src/readelf.c: Support concurrency for -w, --debug-dump

2025-03-29 Thread Aaron Merey
Implement concurrent execution of print_debug_* functions during handling
of -w, --debug-dump using libthread.a.

A new `-C, --concurrency=NUM` command line option controls the maximum
number of worker threads used. This value defaults to the number of CPUs.

Job output is buffered and printed in the order that jobs were added to
the queue. This helps preserve the existing order of stdout output. Full
support for output buffering in print_debug_* functions is added in the
next patch in this series.

* src/readelf.c (default_concurrency): Function estimating the
maximum number of threads.
(parse_opt): Handle -C, --concurrency=NUM.
(do_job): Entry point function for worker threads.
(schedule_job): If thread safety is enabled, add job to the
job queue.  Otherwise just run the job from the main thread.
(print_debug): Pass print_debug_* function pointers and
args to schedule_job. Also call run_jobs if thread safety
is enabled.

Signed-off-by: Aaron Merey 
---

Due to significant lock contention, running eu-readelf when configured
with --enable-thread-safety causes a major performance degradation.
Currently, concurrency may double the runtime of eu-readelf.

However I think this work should still be merged. --enable-thread-safety
is still marked as experimental and these patches facilitate testing
future improvements to lock contention.

 src/readelf.c | 152 --
 1 file changed, 148 insertions(+), 4 deletions(-)

diff --git a/src/readelf.c b/src/readelf.c
index e5bc16a5..a7a34468 100644
--- a/src/readelf.c
+++ b/src/readelf.c
@@ -57,6 +57,18 @@
 
 #include "../libdw/known-dwarf.h"
 
+#ifdef USE_LOCKS
+#include "threadlib.h"
+#endif
+
+#ifdef HAVE_SCHED_H
+#include 
+#endif
+
+#ifdef HAVE_SYS_RESOURCE_H
+#include 
+#endif
+
 #ifdef __linux__
 #define CORE_SIGILL  SIGILL
 #define CORE_SIGBUS  SIGBUS
@@ -150,6 +162,10 @@ static const struct argp_option options[] =
 N_("Ignored for compatibility (lines always wide)"), 0 },
   { "decompress", 'z', NULL, 0,
 N_("Show compression information for compressed sections (when used with 
-S); decompress section before dumping data (when used with -p or -x)"), 0 },
+#ifdef USE_LOCKS
+  { "concurrency", 'C', "NUM", 0,
+N_("Set maximum number of threads. Defaults to the number of CPUs."), 0 },
+#endif
   { NULL, 0, NULL, 0, NULL, 0 }
 };
 
@@ -249,6 +265,11 @@ static bool print_decompress = false;
 /* True if we want to show split compile units for debug_info skeletons.  */
 static bool show_split_units = false;
 
+#ifdef USE_LOCKS
+/* Maximum number of worker threads during concurrency.  -1 means no
+   value has been set yet.  */
+static int max_worker_threads = -1;
+#endif
+
 /* Select printing of debugging sections.  */
 static enum section_e
 {
@@ -379,6 +400,43 @@ cleanup_list (struct section_argument *list)
 }
 }
 
+#ifdef USE_LOCKS
+/* Estimate the maximum number of worker threads.  This is normally
+   #CPU - 1, further capped by an estimate of how many threads the
+   open file descriptor limit allows.  Return value is guaranteed to
+   be at least 1.  */
+static int
+default_concurrency (void)
+{
+  /* Number of CPUs in this process's affinity mask; 0 if unknown.  */
+  unsigned aff = 0;
+#ifdef HAVE_SCHED_GETAFFINITY
+  {
+    cpu_set_t mask;
+    CPU_ZERO (&mask);
+    if (sched_getaffinity (0, sizeof (mask), &mask) == 0)
+      aff = CPU_COUNT (&mask);
+  }
+#endif
+
+  /* Thread estimate derived from RLIMIT_NOFILE; 0 if unknown.  */
+  unsigned fn = 0;
+#ifdef HAVE_GETRLIMIT
+  {
+    struct rlimit rlim;
+    if (getrlimit (RLIMIT_NOFILE, &rlim) == 0)
+      {
+        /* Conservatively estimate that at least 2 fds are used by
+           each thread.  The 100 subtracted first presumably reserves
+           fds for the rest of the process.  rlim_t is unsigned, so
+           only subtract when the limit is large enough not to wrap
+           around.  */
+        rlim_t avail = 1;
+        if (rlim.rlim_cur > 100)
+          avail = MAX ((rlim_t) 1, (rlim.rlim_cur - 100) / 2);
+
+        /* Saturate instead of truncating when the value (for example
+           one derived from RLIM_INFINITY) does not fit in an
+           unsigned int.  */
+        fn = avail > (rlim_t) ~0U ? ~0U : (unsigned) avail;
+      }
+  }
+#endif
+
+  /* An unknown (zero) estimate counts as 2, which also guarantees
+     that the result is at least 1 after the decrement.  */
+  unsigned d = MIN (MAX (aff, 2U),
+                    MAX (fn, 2U));
+
+  return --d;
+}
+#endif
+
 int
 main (int argc, char *argv[])
 {
@@ -402,6 +460,12 @@ main (int argc, char *argv[])
   /* Before we start tell the ELF library which version we are using.  */
   elf_version (EV_CURRENT);
 
+#ifdef USE_LOCKS
+  /* If concurrency wasn't set by argp_parse, then set a default value.  */
+  if (max_worker_threads == -1)
+max_worker_threads = default_concurrency ();
+#endif
+
   /* Now process all the files given at the command line.  */
   bool only_one = remaining + 1 == argc;
   do
@@ -526,6 +590,19 @@ parse_opt (int key, char *arg,
 case 'c':
   print_archive_index = true;
   break;
+#if USE_LOCKS
+case 'C':
+  if (arg != NULL)
+   {
+ max_worker_threads = atoi (arg);
+ if (max_worker_threads < 2)
+   error (1, 0, _("-C NUM minimum 2"));
+
+ /* Decrement to account for the main thread.  */
+ --max_worker_threads;
+   }
+  break;
+#endif
 case 'w':
   if (arg == NULL)
{
@@ -5451,7 +5528,7 @@ listptr_base (struct listptr *p)
 }
 
 /* To store the name used in compare_listptr */
-static const char *sort_listptr_name;
+static _Thread_local const char *sort_listptr_name;
 
 static int
 compare_listptr (const void *a, const void *b)
@@ -11909,6 +11986,63 @@ getone_dwflmod (D