On Sat, Oct 29, 2016 at 1:48 PM, Brent W. Baccala <[email protected]>
wrote:
> Yes, patch7 without patch8 can reorder messages as they're being resent,
> but patch8 uses the mutex introduced in patch7, so the ordering can't be
> reversed.
>
> They could be combined into a single patch. Do you want me to prepare a
> combined patch and email it in?
>
OK, I'm attaching the combined patch7+patch8.
I've studied it a bit more, and it has a race condition that looks
unavoidable to me.
rpctrace on ext2fs exposes the problem. ext2fs maps a region of memory
that it manages itself (using libpager), then calls vm_copy() on that
region, using its own task port. This causes the kernel to make a
memory_object_data_request back to ext2fs, which makes some more requests
(mach_port_deallocate and vm_allocate) on the task port before replying
with a memory_object_data_supply, which allows the original vm_copy to
return.
So, it's calling vm_copy on its task port, and needs to do some more
operations on the task port before the vm_copy completes. And just like
the problem I described two months ago ("mach_msg blocking on call to
vm_map"), the mach_msg call sending the vm_copy will block even if you're
not waiting for the reply message.
So... in rpctrace, I want to signal the condition variable (allowing the
next message to be processed) after the message has been sent, but since
mach_msg blocks, I can't do that without blocking the entire task port,
which deadlocks the process.
My solution is to signal the condition variable right before the call to
mach_msg, but this creates a race condition where messages can get
reordered.
As you know, I never liked this blocking behavior of mach_msg, but I just
can't see any way around it now. If you can suggest something, let me
know...
agape
brent
From f12c28e9b16d7a3c628586b739eee3f4e004f753 Mon Sep 17 00:00:00 2001
From: Brent Baccala <[email protected]>
Date: Sun, 30 Oct 2016 16:13:34 -1000
Subject: [PATCH] multithread rpctrace to avoid deadlocks in the kernel
---
utils/rpctrace.c | 41 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 38 insertions(+), 3 deletions(-)
diff --git a/utils/rpctrace.c b/utils/rpctrace.c
index c01f8d4..72ca614 100644
--- a/utils/rpctrace.c
+++ b/utils/rpctrace.c
@@ -141,6 +141,8 @@ struct traced_info
mach_msg_type_name_t type;
char *name; /* null or a string describing this */
task_t task; /* The task who has the right. */
+ mach_port_seqno_t seqno; /* next RPC to be processed on this port */
+ pthread_cond_t sequencer; /* used to sequence RPCs when they are processed out-of-order */
};
/* Each traced port has one receiver info and multiple send wrappers.
@@ -250,6 +252,8 @@ struct port_class *other_class;
struct port_bucket *traced_bucket;
FILE *ostream;
+pthread_mutex_t tracelock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+
/* These are the calls made from the tracing engine into
the output formatting code. */
@@ -334,9 +338,13 @@ destroy_receiver_info (struct receiver_info *info)
while (send_wrapper)
{
struct sender_info *next = send_wrapper->next;
+#if 0
+ if (refcounts_hard_references (&TRACED_INFO (send_wrapper)->pi.refcounts) != 1)
+ fprintf(stderr, "refcounts_hard_references (%ld) == %d\n", TRACED_INFO(send_wrapper)->pi.port_right, refcounts_hard_references (&TRACED_INFO (send_wrapper)->pi.refcounts));
assert (
refcounts_hard_references (&TRACED_INFO (send_wrapper)->pi.refcounts)
== 1);
+#endif
/* Reset the receive_right of the send wrapper in advance to avoid
* destroy_receiver_info is called when the port info is destroyed. */
send_wrapper->receive_right = NULL;
@@ -370,6 +378,8 @@ new_send_wrapper (struct receiver_info *receive, task_t task,
receive->forward, TRACED_INFO (info)->pi.port_right, task2pid (task));
TRACED_INFO (info)->type = MACH_MSG_TYPE_MOVE_SEND;
TRACED_INFO (info)->task = task;
+ TRACED_INFO (info)->seqno = 0;
+ pthread_cond_init(& TRACED_INFO(info)->sequencer, NULL);
info->receive_right = receive;
info->next = receive->next;
receive->next = info;
@@ -400,6 +410,8 @@ new_send_once_wrapper (mach_port_t right, mach_port_t *wrapper_right, task_t tas
sizeof *info, &info);
assert_perror (err);
TRACED_INFO (info)->name = 0;
+ TRACED_INFO (info)->seqno = 0;
+ pthread_cond_init(& TRACED_INFO(info)->sequencer, NULL);
}
info->forward = right;
@@ -451,6 +463,8 @@ traced_clean (void *pi)
{
struct sender_info *info = pi;
+ pthread_mutex_lock(&tracelock);
+
assert (TRACED_INFO (info)->type == MACH_MSG_TYPE_MOVE_SEND);
free (TRACED_INFO (info)->name);
@@ -466,6 +480,8 @@ traced_clean (void *pi)
info->receive_right = NULL;
}
+
+ pthread_mutex_unlock(&tracelock);
}
/* Check if the receive right has been seen. */
@@ -1144,6 +1160,8 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
assert (info);
+ pthread_mutex_lock(&tracelock);
+
/* A notification message from the kernel appears to have been sent
with a send-once right, even if there have never really been any. */
if (MACH_MSGH_BITS_LOCAL (inp->msgh_bits) == MACH_MSG_TYPE_MOVE_SEND_ONCE)
@@ -1170,6 +1188,8 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
/* It might be a task port. Remove the dead task from the list. */
remove_task (n->not_port);
+ pthread_mutex_unlock(&tracelock);
+
return 1;
}
else if (inp->msgh_id == MACH_NOTIFY_NO_SENDERS
@@ -1182,6 +1202,7 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
ports_no_senders (info, n->not_count);
ports_port_deref (info);
((mig_reply_header_t *) outp)->RetCode = MIG_NO_REPLY;
+ pthread_mutex_unlock(&tracelock);
return 1;
}
/* Get some unexpected notification for rpctrace itself,
@@ -1190,6 +1211,7 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
{
ports_port_deref (info);
((mig_reply_header_t *) outp)->RetCode = MIG_NO_REPLY;
+ pthread_mutex_unlock(&tracelock);
return 1;
}
}
@@ -1201,6 +1223,11 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
msgid = msgid_info (inp->msgh_id);
+ while (inp->msgh_seqno != TRACED_INFO (info)->seqno)
+ {
+ pthread_cond_wait (& TRACED_INFO (info)->sequencer, &tracelock);
+ }
+
/* Determine message's destination task */
if (INFO_SEND_ONCE(info))
@@ -1364,6 +1391,16 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
}
}
+ /* Advance our sequence number and signal any other thread waiting
+ * to process the next message on this port.
+ */
+ TRACED_INFO (info)->seqno = inp->msgh_seqno + 1;
+ pthread_cond_broadcast (& TRACED_INFO (info)->sequencer);
+
+ /* Unlock prior to resending message to avoid deadlocks in the kernel */
+ ports_port_deref (info);
+ pthread_mutex_unlock(&tracelock);
+
/* Resend the message to the tracee. */
err = mach_msg (inp, MACH_SEND_MSG, inp->msgh_size, 0,
MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
@@ -1377,8 +1414,6 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
else
assert_perror (err);
- ports_port_deref (info);
-
/* We already sent the message, so the server loop shouldn't do it again. */
((mig_reply_header_t *) outp)->RetCode = MIG_NO_REPLY;
@@ -1390,7 +1425,7 @@ static void *
trace_thread_function (void *arg)
{
struct port_bucket *const bucket = arg;
- ports_manage_port_operations_one_thread (bucket, trace_and_forward, 0);
+ ports_manage_port_operations_multithread (bucket, trace_and_forward, 0, 0, NULL);
return 0;
}
--
2.6.4