labath created this revision.
labath added reviewers: mgorny, DavidSpickett.
labath requested review of this revision.
Herald added a project: LLDB.

The lldb-server code is currently set up in a way that each
NativeProcess instance does its own waitpid handling. This works fine
for BSDs, where the code can do a waitpid(process_id), and get
information for all threads in that process.

The situation is trickier on linux, because waitpid(pid) will only
return information for the main thread of the process (one whose tid ==
pid). For this reason the linux code does a waitpid(-1), to get
information for all threads. This was fine while we were supporting just
a single process, but becomes a problem when we have multiple processes
as they end up stealing each others events.

There are two possible solutions to this problem:

- call waitpid(-1) centrally, and then dispatch the events to the appropriate 
process
- have each process call waitpid(tid) for all the threads it manages

This patch implements the second approach. Besides fitting better into
the existing design, it also has the added benefit of ensuring
predictable ordering for thread/process creation events (which come in
pairs -- one for the parent and one for the child). The first approach
OTOH, would make this ordering even more complicated since we would
have to keep the half-threads hanging in mid-air until we find the
process we should attach them to.

The downside to this approach is an increased number of syscalls (one
waitpid for each thread), but I think we're pretty far from optimizing
things like this, and so the cleanliness of the design is worth it.

The included test reproduces the circumstances which should demonstrate
the bug (which manifests as a hung test), but I have not been able to
get it to fail. The only place I've seen this failure modes are very
rare hangs in the thread sanitizer tests (tsan forks an addr2line
process to produce its error messages).


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D116372

Files:
  lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
  lldb/source/Plugins/Process/Linux/NativeProcessLinux.h
  lldb/test/API/tools/lldb-server/TestGdbRemoteFork.py

Index: lldb/test/API/tools/lldb-server/TestGdbRemoteFork.py
===================================================================
--- lldb/test/API/tools/lldb-server/TestGdbRemoteFork.py
+++ lldb/test/API/tools/lldb-server/TestGdbRemoteFork.py
@@ -6,6 +6,40 @@
 class TestGdbRemoteFork(gdbremote_testcase.GdbRemoteTestCaseBase):
     mydir = TestBase.compute_mydir(__file__)
 
+    @add_test_categories(["fork"])
+    def test_fork_multithreaded(self):
+        self.build()
+        self.prep_debug_monitor_and_inferior(inferior_args=["thread:new"]*2 + ["fork"])
+        self.add_qSupported_packets(["multiprocess+", "fork-events+"])
+        ret = self.expect_gdbremote_sequence()
+        self.assertIn("fork-events+", ret["qSupported_response"])
+        self.reset_test_sequence()
+
+        # continue and expect fork
+        fork_regex = "[$]T.*;fork:p([0-9a-f]+)[.]([0-9a-f]+).*"
+        self.test_sequence.add_log_lines([
+            "read packet: $c#00",
+            {"direction": "send", "regex": fork_regex,
+             "capture": {1: "pid", 2: "tid"}},
+        ], True)
+        ret = self.expect_gdbremote_sequence()
+        pid = int(ret["pid"], 16)
+        self.reset_test_sequence()
+
+        # detach the forked child
+        self.test_sequence.add_log_lines([
+            "read packet: $D;{:x}#00".format(pid),
+            {"direction": "send", "regex": r"[$]OK#.*"},
+        ], True)
+        ret = self.expect_gdbremote_sequence()
+        self.reset_test_sequence()
+
+        # resume the parent
+        self.test_sequence.add_log_lines([
+            "read packet: $k#00",
+        ], True)
+        self.expect_gdbremote_sequence()
+
     def fork_and_detach_test(self, variant):
         self.build()
         self.prep_debug_monitor_and_inferior(inferior_args=[variant])
Index: lldb/source/Plugins/Process/Linux/NativeProcessLinux.h
===================================================================
--- lldb/source/Plugins/Process/Linux/NativeProcessLinux.h
+++ lldb/source/Plugins/Process/Linux/NativeProcessLinux.h
@@ -164,7 +164,7 @@
 
   static Status SetDefaultPtraceOpts(const lldb::pid_t);
 
-  void MonitorCallback(lldb::pid_t pid, WaitStatus status);
+  void MonitorCallback(NativeThreadLinux &thread, WaitStatus status);
 
   void WaitForCloneNotification(::pid_t pid);
 
@@ -180,7 +180,7 @@
 
   bool HasThreadNoLock(lldb::tid_t thread_id);
 
-  bool StopTrackingThread(lldb::tid_t thread_id);
+  void StopTrackingThread(NativeThreadLinux &thread);
 
   /// Create a new thread.
   ///
@@ -243,20 +243,9 @@
   /// Manages Intel PT process and thread traces.
   IntelPTManager m_intel_pt_manager;
 
-  struct CloneInfo {
-    int event;
-    lldb::tid_t parent_tid;
-  };
-
-  // Map of child processes that have been signaled once, and we are
-  // waiting for the second signal.
-  llvm::DenseMap<lldb::pid_t, llvm::Optional<CloneInfo>> m_pending_pid_map;
-
-  // Handle a clone()-like event.  If received by parent, clone_info contains
-  // additional info.  Returns true if the event is handled, or false if it
-  // is pending second notification.
-  bool MonitorClone(lldb::pid_t child_pid,
-                    llvm::Optional<CloneInfo> clone_info);
+  // Handle a clone()-like event.
+  bool MonitorClone(NativeThreadLinux &parent, lldb::pid_t child_pid,
+                    int event);
 };
 
 } // namespace process_linux
Index: lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
===================================================================
--- lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
+++ lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
@@ -426,22 +426,24 @@
 }
 
 // Handles all waitpid events from the inferior process.
-void NativeProcessLinux::MonitorCallback(lldb::pid_t pid, WaitStatus status) {
+void NativeProcessLinux::MonitorCallback(NativeThreadLinux &thread,
+                                         WaitStatus status) {
   Log *log(GetLogIfAnyCategoriesSet(LIBLLDB_LOG_PROCESS));
 
   // Certain activities differ based on whether the pid is the tid of the main
   // thread.
-  const bool is_main_thread = (pid == GetID());
+  const bool is_main_thread = (thread.GetID() == GetID());
 
   // Handle when the thread exits.
   if (status.type == WaitStatus::Exit || status.type == WaitStatus::Signal) {
     LLDB_LOG(log,
              "got exit status({0}) , tid = {1} ({2} main thread), process "
              "state = {3}",
-             status, pid, is_main_thread ? "is" : "is not", GetState());
+             status, thread.GetID(), is_main_thread ? "is" : "is not",
+             GetState());
 
     // This is a thread that exited.  Ensure we're not tracking it anymore.
-    StopTrackingThread(pid);
+    StopTrackingThread(thread);
 
     if (is_main_thread) {
       // The main thread exited.  We're done monitoring.  Report to delegate.
@@ -454,37 +456,15 @@
   }
 
   siginfo_t info;
-  const auto info_err = GetSignalInfo(pid, &info);
-  auto thread_sp = GetThreadByID(pid);
-
-  if (!thread_sp) {
-    // Normally, the only situation when we cannot find the thread is if we
-    // have just received a new thread notification. This is indicated by
-    // GetSignalInfo() returning si_code == SI_USER and si_pid == 0
-    LLDB_LOG(log, "received notification about an unknown tid {0}.", pid);
-
-    if (info_err.Fail()) {
-      LLDB_LOG(log,
-               "(tid {0}) GetSignalInfo failed ({1}). "
-               "Ingoring this notification.",
-               pid, info_err);
-      return;
-    }
-
-    LLDB_LOG(log, "tid {0}, si_code: {1}, si_pid: {2}", pid, info.si_code,
-             info.si_pid);
-
-    MonitorClone(pid, llvm::None);
-    return;
-  }
+  const auto info_err = GetSignalInfo(thread.GetID(), &info);
 
   // Get details on the signal raised.
   if (info_err.Success()) {
     // We have retrieved the signal info.  Dispatch appropriately.
     if (info.si_signo == SIGTRAP)
-      MonitorSIGTRAP(info, *thread_sp);
+      MonitorSIGTRAP(info, thread);
     else
-      MonitorSignal(info, *thread_sp);
+      MonitorSignal(info, thread);
   } else {
     if (info_err.GetError() == EINVAL) {
       // This is a group stop reception for this tid. We can reach here if we
@@ -500,9 +480,8 @@
                "received a group stop for pid {0} tid {1}. Transparent "
                "handling of group stops not supported, resuming the "
                "thread.",
-               GetID(), pid);
-      ResumeThread(*thread_sp, thread_sp->GetState(),
-                   LLDB_INVALID_SIGNAL_NUMBER);
+               GetID(), thread.GetID());
+      ResumeThread(thread, thread.GetState(), LLDB_INVALID_SIGNAL_NUMBER);
     } else {
       // ptrace(GETSIGINFO) failed (but not due to group-stop).
 
@@ -512,12 +491,12 @@
 
       // Stop tracking the metadata for the thread since it's entirely off the
       // system now.
-      const bool thread_found = StopTrackingThread(pid);
+      StopTrackingThread(thread);
 
       LLDB_LOG(log,
                "GetSignalInfo failed: {0}, tid = {1}, status = {2}, "
-               "status = {3}, main_thread = {4}, thread_found: {5}",
-               info_err, pid, status, status, is_main_thread, thread_found);
+               "status = {3}, main_thread = {4}",
+               info_err, thread.GetID(), status, status, is_main_thread);
 
       if (is_main_thread) {
         // Notify the delegate - our process is not available but appears to
@@ -532,7 +511,7 @@
                  "pid {0} tid {1} non-main thread exit occurred, didn't "
                  "tell delegate anything since thread disappeared out "
                  "from underneath us",
-                 GetID(), pid);
+                 GetID(), thread.GetID());
       }
     }
   }
@@ -549,29 +528,14 @@
            pid);
   ::pid_t wait_pid =
       llvm::sys::RetryAfterSignal(-1, ::waitpid, pid, &status, __WALL);
-  // Since we are waiting on a specific pid, this must be the creation event.
-  // But let's do some checks just in case.
-  if (wait_pid != pid) {
-    LLDB_LOG(log,
-             "waiting for pid {0} failed. Assuming the pid has "
-             "disappeared in the meantime",
-             pid);
-    // The only way I know of this could happen is if the whole process was
-    // SIGKILLed in the mean time. In any case, we can't do anything about that
-    // now.
-    return;
-  }
-  if (WIFEXITED(status)) {
-    LLDB_LOG(log,
-             "waiting for pid {0} returned an 'exited' event. Not "
-             "tracking it.",
-             pid);
-    // Also a very improbable event.
-    m_pending_pid_map.erase(pid);
-    return;
-  }
 
-  MonitorClone(pid, llvm::None);
+  // It's theoretically possible to get other events if the entire process was
+  // SIGKILLed before we got a chance to check this. In that case, we'll just
+  // clean everything up when we get the process exit event.
+
+  LLDB_LOG(log,
+           "waitpid({0}, &status, __WALL) => {1} (errno: {2}, status = {3})",
+           pid, wait_pid, errno, WaitStatus::Decode(status));
 }
 
 void NativeProcessLinux::MonitorSIGTRAP(const siginfo_t &info,
@@ -598,8 +562,7 @@
                thread.GetID());
       ResumeThread(thread, thread.GetState(), LLDB_INVALID_SIGNAL_NUMBER);
     } else {
-      if (!MonitorClone(event_message, {{(info.si_code >> 8), thread.GetID()}}))
-        WaitForCloneNotification(event_message);
+      MonitorClone(thread, event_message, info.si_code >> 8);
     }
 
     break;
@@ -886,36 +849,15 @@
   StopRunningThreads(thread.GetID());
 }
 
-bool NativeProcessLinux::MonitorClone(
-    lldb::pid_t child_pid,
-    llvm::Optional<NativeProcessLinux::CloneInfo> clone_info) {
+bool NativeProcessLinux::MonitorClone(NativeThreadLinux &parent,
+                                      lldb::pid_t child_pid, int event) {
   Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS));
-  LLDB_LOG(log, "clone, child_pid={0}, clone info?={1}", child_pid,
-           clone_info.hasValue());
+  LLDB_LOG(log, "parent_tid={0}, child_pid={1}, event={2}", parent.GetID(),
+           child_pid, event);
 
-  auto find_it = m_pending_pid_map.find(child_pid);
-  if (find_it == m_pending_pid_map.end()) {
-    // not in the map, so this is the first signal for the PID
-    m_pending_pid_map.insert({child_pid, clone_info});
-    return false;
-  }
-  m_pending_pid_map.erase(find_it);
-
-  // second signal for the pid
-  assert(clone_info.hasValue() != find_it->second.hasValue());
-  if (!clone_info) {
-    // child signal does not indicate the event, so grab the one stored
-    // earlier
-    clone_info = find_it->second;
-  }
-
-  LLDB_LOG(log, "second signal for child_pid={0}, parent_tid={1}, event={2}",
-           child_pid, clone_info->parent_tid, clone_info->event);
+  WaitForCloneNotification(child_pid);
 
-  auto *parent_thread = GetThreadByID(clone_info->parent_tid);
-  assert(parent_thread);
-
-  switch (clone_info->event) {
+  switch (event) {
   case PTRACE_EVENT_CLONE: {
     // PTRACE_EVENT_CLONE can either mean a new thread or a new process.
     // Try to grab the new process' PGID to figure out which one it is.
@@ -930,15 +872,14 @@
       ThreadWasCreated(child_thread);
 
       // Resume the parent.
-      ResumeThread(*parent_thread, parent_thread->GetState(),
-                   LLDB_INVALID_SIGNAL_NUMBER);
+      ResumeThread(parent, parent.GetState(), LLDB_INVALID_SIGNAL_NUMBER);
       break;
     }
   }
     LLVM_FALLTHROUGH;
   case PTRACE_EVENT_FORK:
   case PTRACE_EVENT_VFORK: {
-    bool is_vfork = clone_info->event == PTRACE_EVENT_VFORK;
+    bool is_vfork = event == PTRACE_EVENT_VFORK;
     std::unique_ptr<NativeProcessLinux> child_process{new NativeProcessLinux(
         static_cast<::pid_t>(child_pid), m_terminal_fd, m_delegate, m_arch,
         m_main_loop, {static_cast<::pid_t>(child_pid)})};
@@ -949,12 +890,11 @@
     if (bool(m_enabled_extensions & expected_ext)) {
       m_delegate.NewSubprocess(this, std::move(child_process));
       // NB: non-vfork clone() is reported as fork
-      parent_thread->SetStoppedByFork(is_vfork, child_pid);
-      StopRunningThreads(parent_thread->GetID());
+      parent.SetStoppedByFork(is_vfork, child_pid);
+      StopRunningThreads(parent.GetID());
     } else {
       child_process->Detach();
-      ResumeThread(*parent_thread, parent_thread->GetState(),
-                   LLDB_INVALID_SIGNAL_NUMBER);
+      ResumeThread(parent, parent.GetState(), LLDB_INVALID_SIGNAL_NUMBER);
     }
     break;
   }
@@ -1729,24 +1669,19 @@
   return false;
 }
 
-bool NativeProcessLinux::StopTrackingThread(lldb::tid_t thread_id) {
+void NativeProcessLinux::StopTrackingThread(NativeThreadLinux &thread) {
   Log *const log = ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_THREAD);
-  LLDB_LOG(log, "tid: {0})", thread_id);
-
-  bool found = false;
-  for (auto it = m_threads.begin(); it != m_threads.end(); ++it) {
-    if (*it && ((*it)->GetID() == thread_id)) {
-      m_threads.erase(it);
-      found = true;
-      break;
-    }
-  }
+  lldb::tid_t thread_id = thread.GetID();
+  LLDB_LOG(log, "tid: {0}", thread_id);
 
-  if (found)
-    NotifyTracersOfThreadDestroyed(thread_id);
+  auto it = llvm::find_if(m_threads, [&](const auto &thread_up) {
+    return thread_up.get() == &thread;
+  });
+  assert(it != m_threads.end());
+  m_threads.erase(it);
 
+  NotifyTracersOfThreadDestroyed(thread_id);
   SignalIfAllThreadsStopped();
-  return found;
 }
 
 Status NativeProcessLinux::NotifyTracersOfNewThread(lldb::tid_t tid) {
@@ -1945,27 +1880,44 @@
 
 void NativeProcessLinux::SigchldHandler() {
   Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS));
-  // Process all pending waitpid notifications.
-  while (true) {
+
+  // Threads can appear or disappear as a result of event processing, so gather
+  // the events upfront.
+  llvm::DenseMap<lldb::tid_t, WaitStatus> tid_events;
+  for (const auto &thread_up : m_threads) {
     int status = -1;
-    ::pid_t wait_pid = llvm::sys::RetryAfterSignal(-1, ::waitpid, -1, &status,
-                                          __WALL | __WNOTHREAD | WNOHANG);
+    ::pid_t wait_pid =
+        llvm::sys::RetryAfterSignal(-1, ::waitpid, thread_up->GetID(), &status,
+                                    __WALL | __WNOTHREAD | WNOHANG);
 
     if (wait_pid == 0)
-      break; // We are done.
+      continue; // Nothing to do for this thread.
 
     if (wait_pid == -1) {
       Status error(errno, eErrorTypePOSIX);
-      LLDB_LOG(log, "waitpid (-1, &status, _) failed: {0}", error);
-      break;
+      LLDB_LOG(log, "waitpid({0}, &status, _) failed: {1}", thread_up->GetID(),
+               error);
+      continue;
     }
 
+    assert(wait_pid == static_cast<::pid_t>(thread_up->GetID()));
+
     WaitStatus wait_status = WaitStatus::Decode(status);
 
-    LLDB_LOG(log, "waitpid (-1, &status, _) => pid = {0}, status = {1}",
-             wait_pid, wait_status);
+    LLDB_LOG(log, "waitpid({0})  got status = {1}", thread_up->GetID(),
+             wait_status);
+    tid_events.try_emplace(thread_up->GetID(), wait_status);
+  }
 
-    MonitorCallback(wait_pid, wait_status);
+  for (auto &KV : tid_events) {
+    LLDB_LOG(log, "processing {0}({1}) ...", KV.first, KV.second);
+    NativeThreadLinux *thread = GetThreadByID(KV.first);
+    if (thread) {
+      MonitorCallback(*thread, KV.second);
+    } else {
+      // This can happen if one of the events is an main thread exit.
+      LLDB_LOG(log, "... but the thread has disappeared");
+    }
   }
 }
 
_______________________________________________
lldb-commits mailing list
lldb-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits

Reply via email to