[Bug debuginfod/30879] New: intermittent failure (libstdc++ uncaught exception terminate) during testsuite on sparc-gentoo

2023-09-22 Thread fche at redhat dot com via Elfutils-devel
https://sourceware.org/bugzilla/show_bug.cgi?id=30879

Bug ID: 30879
   Summary: intermittent failure (libstdc++ uncaught exception
terminate) during testsuite on sparc-gentoo
   Product: elfutils
   Version: unspecified
Status: NEW
  Severity: normal
  Priority: P2
 Component: debuginfod
  Assignee: unassigned at sourceware dot org
  Reporter: fche at redhat dot com
CC: elfutils-devel at sourceware dot org
  Target Milestone: ---

A particularly high-cpu-count buildbot worker reported intermittent failures in
the debuginfod testsuite.  

e.g. 

https://builder.sourceware.org/buildbot/#/builders/224/builds/67

We should ensure that exceptions that hardly ever occur are nevertheless caught
and reported reliably.

-- 
You are receiving this mail because:
You are on the CC list for the bug.

[PATCH] PR30879: debuginfod intermittent terminate()

2023-09-22 Thread Frank Ch. Eigler via Elfutils-devel


Author: Frank Ch. Eigler 
Date:   Fri Sep 22 15:30:51 2023 -0400

PR30879: intermittent debuginfod crash with unhandled exception

Code inspection identified two places where sqlite_ps objects were
being created/used outside try/catch protection.  This patch wraps or
replaces them.

* configure.ac: Look for glibc backtrace headers.
* debuginfod.cxx (scan): New function wrapped by a try/catch loop.
  (sqlite_checkpoint_pb): Use non-exception-producing sqlite functions.
  (main, my_terminate_handler): New terminate() handler.

diff --git a/configure.ac b/configure.ac
index 4b67c84425fa..29ed32feaee6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -839,6 +839,7 @@ AS_IF([test "x$enable_libdebuginfod" = "xdummy"],
   [AC_DEFINE([DUMMY_LIBDEBUGINFOD], [1], [Build dummy libdebuginfod])])
 AM_CONDITIONAL([LIBDEBUGINFOD],[test "x$enable_libdebuginfod" = "xyes" || test "x$enable_libdebuginfod" = "xdummy"])
 AM_CONDITIONAL([DUMMY_LIBDEBUGINFOD],[test "x$enable_libdebuginfod" = "xdummy"])
+AC_CHECK_HEADERS([execinfo.h])
 
 # Look for libmicrohttpd, libarchive, sqlite for debuginfo server
 # minimum versions as per rhel7.
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index d72d2ad16960..e53228803bb0 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -44,6 +44,12 @@ extern "C" {
 }
 #endif
 
+#ifdef HAVE_EXECINFO_H
+extern "C" {
+#include <execinfo.h>
+}
+#endif
+
 extern "C" {
 #include "printversion.h"
 #include "system.h"
@@ -95,6 +101,7 @@ extern "C" {
 #include <mutex>
 #include <deque>
 #include <condition_variable>
+#include <exception>
 #include <thread>
 // #include <regex> // on rhel7 gcc 4.8, not competent
 #include <regex.h>
@@ -1152,22 +1159,13 @@ struct sqlite_ps
 
 struct sqlite_checkpoint_pb: public periodic_barrier
 {
-  sqlite_ps ckpt;
-
+  // NB: don't use sqlite_ps since it can throw exceptions during ctor etc.
   sqlite_checkpoint_pb(unsigned t, unsigned p):
-periodic_barrier(t, p), ckpt(db, "periodic wal checkpoint",
- "pragma wal_checkpoint(truncate);") {}
+periodic_barrier(t, p) { }
   
   void periodic_barrier_work() noexcept
   {
-try
-  {
-ckpt.reset().step_ok_done();
-  }
-catch (const reportable_exception& e)
-  {
-e.report(clog);
-  }
+(void) sqlite3_exec (db, "pragma wal_checkpoint(truncate);", NULL, NULL, NULL);
   }
 };
   
@@ -3714,11 +3712,9 @@ scan_archive_file (const string& rps, const stat_t& st,
 // The thread that consumes file names off of the scanq.  We hold
 // the persistent sqlite_ps's at this level and delegate file/archive
 // scanning to other functions.
-static void*
-thread_main_scanner (void* arg)
+static void
+scan ()
 {
-  (void) arg;
-
   // all the prepared statements fit to use, the _f_ set:
   sqlite_ps ps_f_upsert_buildids (db, "file-buildids-intern", "insert or ignore into " BUILDIDS "_buildids VALUES (NULL, ?);");
   sqlite_ps ps_f_upsert_fileparts (db, "file-fileparts-intern", "insert or ignore into " BUILDIDS "_fileparts VALUES (NULL, ?);");
@@ -3845,8 +3841,25 @@ thread_main_scanner (void* arg)
   inc_metric("thread_work_total","role","scan");
 }
 
-
   add_metric("thread_busy", "role", "scan", -1);
+}
+
+
+// Use this function as the thread entry point, so it can catch our
+// fleet of exceptions (incl. the sqlite_ps ctors) and report.
+static void*
+thread_main_scanner (void* arg)
+{
+  (void) arg;
+  while (! interrupted)
+try
+  {
+scan();
+  }
+catch (const reportable_exception& e)
+  {
+e.report(cerr);
+  }
   return 0;
 }
 
@@ -4359,6 +4372,20 @@ default_concurrency() // guaranteed >= 1
 }
 
 
+// 30879: Something to help out in case of an uncaught exception.
+void my_terminate_handler()
+{
+#if defined(__GLIBC__)
+  void *array[40];
+  int size = backtrace (array, 40);
+  backtrace_symbols_fd (array, size, STDERR_FILENO);
+#endif
+#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
+  __gnu_cxx::__verbose_terminate_handler();
+#endif
+  abort();
+}
+
 
 int
 main (int argc, char *argv[])
@@ -4367,6 +4394,8 @@ main (int argc, char *argv[])
   (void) bindtextdomain (PACKAGE_TARNAME, LOCALEDIR);
   (void) textdomain (PACKAGE_TARNAME);
 
+  std::set_terminate(& my_terminate_handler);
+
   /* Tell the library which version we are expecting.  */
   elf_version (EV_CURRENT);