Hi! As has been mentioned on omp-lang some time ago, what I've implemented for if (0) tasks is wrong for tasks that depend on some earlier tasks. I chose to ignore the if (0) and make them deferred tasks anyway, but that is not allowed. Instead, we need to wait on all the dependencies and when there are no parent task's children that the new task depends on, we can just continue with normal behavior of creating and immediately running if (0) task.
As both taskwait and this new kind of waiting is hopefully going to be rare, I've decided to move away some of the struct gomp_task fields that are only needed for those to a separate structure, and gomp_task just containing pointer to that. The structure is just live on the stack of GOMP_taskwait or the new gomp_task_maybe_wait_for_dependencies function. The new function first searches (using hash table) for all child tasks (running, ready to run or waiting for dependencies) of parent the new task is supposed to depend on, and marks them (parent_depends_on flag). Previously I've been removing all but the last out/inout dependency from the hash table, because all out/inout dependencies against the same address are serialized, but for this I now need all of them, so instead of removing the redundant ones from the chain I'm just moving them to the end of the linked list and ignoring them for the purposes of new deferred task dependcy search. All ready to run tasks with parent_depends_on flag are moved in the parent->children doubly linked list early, so that they are more likely to be scheduled soon, and also when adding in that case new tasks that were previously waiting for dependencies, I'm moving them in the children doubly linked list after the parent_depends_on tasks. So, the invariant should be that in that list we have first parent_depends_on ready to run tasks, then !parent_depends_on ready to run tasks, and finally already running tasks. Bootstrapped/regtested on x86_64-linux and i686-linux. I would welcome any review, because even more tests couldn't ensure I didn't make any mistakes in it. BTW, during testing I found that taskgroup handling (GOMP_taskgroup_end) seems badly broken, the depend-3.c test hangs without or with this patch with OMP_NUM_THREADS=1, because it only tries to schedule tasks from the current taskgroup, but a) it doesn't even check if there are not any tasks in the taskgroup that aren't ready to be scheduled yet due to unsatisfied dependencies b) it only attempts to schedule tasks from the current taskgroup, which is fine if there are any, but if there are none, it should just pick any ready to run task from the current task children, because the tasks in current taskgroup might be waiting for those. Will try to fix that next week. 2014-07-11 Jakub Jelinek <ja...@redhat.com> * libgomp.h (struct gomp_task_depend_entry): Add redundant_out field. (struct gomp_taskwait): New type. (struct gomp_task): Add taskwait and parent_depends_on, remove in_taskwait and taskwait_sem fields. (gomp_finish_task): Don't destroy taskwait_sem. * task.c (gomp_init_task): Don't init in_taskwait, instead init taskwait and parent_depends_on. (GOMP_task): For if (0) tasks with depend clause that depend on earlier tasks don't defer them, instead call gomp_task_maybe_wait_for_dependencies to wait for the dependencies. Initialize redundant_out field, for redundant out entries just move them at the end of linked list instead of removing them completely, and set redundant_out flag instead of redundant. (gomp_task_run_pre): Update last_parent_depends_on if scheduling that task. (gomp_task_run_post_handle_dependers): If parent is in gomp_task_maybe_wait_for_dependencies and newly runnable task is not parent_depends_on, queue it in parent->children linked list after all runnable tasks with parent_depends_on set. Adjust for addition of taskwait indirection. (gomp_task_run_post_remove_parent): If parent is in gomp_task_maybe_wait_for_dependencies and task to be removed is parent_depends_on, decrement n_depend and if needed awake parent. Adjust for addition of taskwait indirection. (GOMP_taskwait): Adjust for addition of taskwait indirection. (gomp_task_maybe_wait_for_dependencies): New function. * testsuite/libgomp.c/depend-5.c: New test. --- libgomp/libgomp.h.jj 2014-01-03 11:41:28.000000000 +0100 +++ libgomp/libgomp.h 2014-07-11 15:25:56.154873737 +0200 @@ -274,6 +274,7 @@ struct gomp_task_depend_entry struct gomp_task *task; bool is_in; bool redundant; + bool redundant_out; }; struct gomp_dependers_vec @@ -283,6 +284,17 @@ struct gomp_dependers_vec struct gomp_task *elem[]; }; +/* Used when in GOMP_taskwait or in gomp_task_maybe_wait_for_dependencies. */ + +struct gomp_taskwait +{ + bool in_taskwait; + bool in_depend_wait; + size_t n_depend; + struct gomp_task *last_parent_depends_on; + gomp_sem_t taskwait_sem; +}; + /* This structure describes a "task" to be run by a thread. */ struct gomp_task @@ -298,17 +310,17 @@ struct gomp_task struct gomp_taskgroup *taskgroup; struct gomp_dependers_vec *dependers; struct htab *depend_hash; + struct gomp_taskwait *taskwait; size_t depend_count; size_t num_dependees; struct gomp_task_icv icv; void (*fn) (void *); void *fn_data; enum gomp_task_kind kind; - bool in_taskwait; bool in_tied_task; bool final_task; bool copy_ctors_done; - gomp_sem_t taskwait_sem; + bool parent_depends_on; struct gomp_task_depend_entry depend[]; }; @@ -582,7 +594,6 @@ gomp_finish_task (struct gomp_task *task { if (__builtin_expect (task->depend_hash != NULL, 0)) free (task->depend_hash); - gomp_sem_destroy (&task->taskwait_sem); } /* team.c */ --- libgomp/task.c.jj 2014-01-03 11:41:28.000000000 +0100 +++ libgomp/task.c 2014-07-11 17:34:30.365960543 +0200 @@ -66,16 +66,16 @@ gomp_init_task (struct gomp_task *task, task->parent = parent_task; task->icv = *prev_icv; task->kind = GOMP_TASK_IMPLICIT; - task->in_taskwait = false; + task->taskwait = NULL; task->in_tied_task = false; task->final_task = false; task->copy_ctors_done = false; + task->parent_depends_on = false; task->children = NULL; task->taskgroup = NULL; task->dependers = NULL; task->depend_hash = NULL; task->depend_count = 0; - gomp_sem_init (&task->taskwait_sem, 0); } /* Clean up a task, after completing it. */ @@ -104,6 +104,8 @@ gomp_clear_parent (struct gomp_task *chi while (task != children); } +static void gomp_task_maybe_wait_for_dependencies (void **depend); + /* Called when encountering an explicit task directive. If IF_CLAUSE is false, then we must not delay in executing the task. If UNTIED is true, then the task may be executed by any member of the team. */ @@ -141,35 +143,12 @@ GOMP_task (void (*fn) (void *), void *da /* If there are depend clauses and earlier deferred sibling tasks with depend clauses, check if there isn't a dependency. If there - is, fall through to the deferred task handling, as we can't - schedule such tasks right away. There is no need to handle + is, we need to wait for them. There is no need to handle depend clauses for non-deferred tasks other than this, because the parent task is suspended until the child task finishes and thus it can't start further child tasks. */ if ((flags & 8) && thr->task && thr->task->depend_hash) - { - struct gomp_task *parent = thr->task; - struct gomp_task_depend_entry elem, *ent = NULL; - size_t ndepend = (uintptr_t) depend[0]; - size_t nout = (uintptr_t) depend[1]; - size_t i; - gomp_mutex_lock (&team->task_lock); - for (i = 0; i < ndepend; i++) - { - elem.addr = depend[i + 2]; - ent = htab_find (parent->depend_hash, &elem); - for (; ent; ent = ent->next) - if (i >= nout && ent->is_in) - continue; - else - break; - if (ent) - break; - } - gomp_mutex_unlock (&team->task_lock); - if (ent) - goto defer; - } + gomp_task_maybe_wait_for_dependencies (depend); gomp_init_task (&task, thr->task, gomp_icv (false)); task.kind = GOMP_TASK_IFFALSE; @@ -209,7 +188,6 @@ GOMP_task (void (*fn) (void *), void *da } else { - defer:; struct gomp_task *task; struct gomp_task *parent = thr->task; struct gomp_taskgroup *taskgroup = parent->taskgroup; @@ -275,11 +253,12 @@ GOMP_task (void (*fn) (void *), void *da task->depend[i].task = task; task->depend[i].is_in = i >= nout; task->depend[i].redundant = false; + task->depend[i].redundant_out = false; hash_entry_type *slot = htab_find_slot (&parent->depend_hash, &task->depend[i], INSERT); - hash_entry_type out = NULL; + hash_entry_type out = NULL, last = NULL; if (*slot) { /* If multiple depends on the same task are the @@ -294,6 +273,11 @@ GOMP_task (void (*fn) (void *), void *da } for (ent = *slot; ent; ent = ent->next) { + if (ent->redundant_out) + break; + + last = ent; + /* depend(in:...) doesn't depend on earlier depend(in:...). */ if (i >= nout && ent->is_in) @@ -341,21 +325,31 @@ GOMP_task (void (*fn) (void *), void *da *slot = &task->depend[i]; /* There is no need to store more than one depend({,in}out:) - task per address in the hash table chain, because each out + task per address in the hash table chain for the purpose + of creation of deferred tasks, because each out depends on all earlier outs, thus it is enough to record just the last depend({,in}out:). For depend(in:), we need to keep all of the previous ones not terminated yet, because a later depend({,in}out:) might need to depend on all of them. So, if the new task's clause is depend({,in}out:), we know there is at most one other depend({,in}out:) clause - in the list (out) and to maintain the invariant we now - need to remove it from the list. */ + in the list (out). For non-deferred tasks we want to see + all outs, so they are moved to the end of the chain, + after first redundant_out entry all following entries + should be redundant_out. */ if (!task->depend[i].is_in && out) { - if (out->next) - out->next->prev = out->prev; - out->prev->next = out->next; - out->redundant = true; + if (out != last) + { + out->next->prev = out->prev; + out->prev->next = out->next; + out->next = last->next; + out->prev = last; + last->next = out; + if (out->next) + out->next->prev = out; + } + out->redundant_out = true; } } if (task->num_dependees) @@ -421,8 +415,20 @@ static inline bool gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent, struct gomp_taskgroup *taskgroup, struct gomp_team *team) { - if (parent && parent->children == child_task) - parent->children = child_task->next_child; + if (parent) + { + if (parent->children == child_task) + parent->children = child_task->next_child; + if (__builtin_expect (child_task->parent_depends_on, 0) + && parent->taskwait->last_parent_depends_on == child_task) + { + if (child_task->prev_child->kind == GOMP_TASK_WAITING + && child_task->prev_child->parent_depends_on) + parent->taskwait->last_parent_depends_on = child_task->prev_child; + else + parent->taskwait->last_parent_depends_on = NULL; + } + } if (taskgroup && taskgroup->children == child_task) taskgroup->children = child_task->next_taskgroup; child_task->prev_queue->next_queue = child_task->next_queue; @@ -489,8 +495,23 @@ gomp_task_run_post_handle_dependers (str { if (parent->children) { - task->next_child = parent->children; - task->prev_child = parent->children->prev_child; + /* If parent is in gomp_task_maybe_wait_for_dependencies + and it doesn't need to wait for this task, put it after + all ready to run tasks it needs to wait for. */ + if (parent->taskwait && parent->taskwait->last_parent_depends_on + && !task->parent_depends_on) + { + struct gomp_task *last_parent_depends_on + = parent->taskwait->last_parent_depends_on; + task->next_child = last_parent_depends_on->next_child; + task->prev_child = last_parent_depends_on; + } + else + { + task->next_child = parent->children; + task->prev_child = parent->children->prev_child; + parent->children = task; + } task->next_child->prev_child = task; task->prev_child->next_child = task; } @@ -498,12 +519,23 @@ gomp_task_run_post_handle_dependers (str { task->next_child = task; task->prev_child = task; + parent->children = task; } - parent->children = task; - if (parent->in_taskwait) + if (parent->taskwait) { - parent->in_taskwait = false; - gomp_sem_post (&parent->taskwait_sem); + if (parent->taskwait->in_taskwait) + { + parent->taskwait->in_taskwait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); + } + else if (parent->taskwait->in_depend_wait) + { + parent->taskwait->in_depend_wait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); + } + if (parent->taskwait->last_parent_depends_on == NULL + && task->parent_depends_on) + parent->taskwait->last_parent_depends_on = task; } } if (taskgroup) @@ -575,6 +607,13 @@ gomp_task_run_post_remove_parent (struct struct gomp_task *parent = child_task->parent; if (parent == NULL) return; + if (__builtin_expect (child_task->parent_depends_on, 0) + && --parent->taskwait->n_depend == 0 + && parent->taskwait->in_depend_wait) + { + parent->taskwait->in_depend_wait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); + } child_task->prev_child->next_child = child_task->next_child; child_task->next_child->prev_child = child_task->prev_child; if (parent->children != child_task) @@ -589,10 +628,10 @@ gomp_task_run_post_remove_parent (struct written by child_task->fn above is flushed before the NULL is written. */ __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE); - if (parent->in_taskwait) + if (parent->taskwait && parent->taskwait->in_taskwait) { - parent->in_taskwait = false; - gomp_sem_post (&parent->taskwait_sem); + parent->taskwait->in_taskwait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); } } } @@ -736,6 +775,7 @@ GOMP_taskwait (void) struct gomp_task *task = thr->task; struct gomp_task *child_task = NULL; struct gomp_task *to_free = NULL; + struct gomp_taskwait taskwait; int do_wake = 0; /* The acquire barrier on load of task->children here synchronizes @@ -748,18 +788,194 @@ GOMP_taskwait (void) || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL) return; + memset (&taskwait, 0, sizeof (taskwait)); gomp_mutex_lock (&team->task_lock); while (1) { bool cancelled = false; if (task->children == NULL) { + bool destroy_taskwait = task->taskwait != NULL; + task->taskwait = NULL; + gomp_mutex_unlock (&team->task_lock); + if (to_free) + { + gomp_finish_task (to_free); + free (to_free); + } + if (destroy_taskwait) + gomp_sem_destroy (&taskwait.taskwait_sem); + return; + } + if (task->children->kind == GOMP_TASK_WAITING) + { + child_task = task->children; + cancelled + = gomp_task_run_pre (child_task, task, child_task->taskgroup, + team); + if (__builtin_expect (cancelled, 0)) + { + if (to_free) + { + gomp_finish_task (to_free); + free (to_free); + to_free = NULL; + } + goto finish_cancelled; + } + } + else + { + /* All tasks we are waiting for are already running + in other threads. Wait for them. */ + if (task->taskwait == NULL) + { + taskwait.in_depend_wait = false; + gomp_sem_init (&taskwait.taskwait_sem, 0); + task->taskwait = &taskwait; + } + taskwait.in_taskwait = true; + } + gomp_mutex_unlock (&team->task_lock); + if (do_wake) + { + gomp_team_barrier_wake (&team->barrier, do_wake); + do_wake = 0; + } + if (to_free) + { + gomp_finish_task (to_free); + free (to_free); + to_free = NULL; + } + if (child_task) + { + thr->task = child_task; + child_task->fn (child_task->fn_data); + thr->task = task; + } + else + gomp_sem_wait (&taskwait.taskwait_sem); + gomp_mutex_lock (&team->task_lock); + if (child_task) + { + finish_cancelled:; + size_t new_tasks + = gomp_task_run_post_handle_depend (child_task, team); + child_task->prev_child->next_child = child_task->next_child; + child_task->next_child->prev_child = child_task->prev_child; + if (task->children == child_task) + { + if (child_task->next_child != child_task) + task->children = child_task->next_child; + else + task->children = NULL; + } + gomp_clear_parent (child_task->children); + gomp_task_run_post_remove_taskgroup (child_task); + to_free = child_task; + child_task = NULL; + team->task_count--; + if (new_tasks > 1) + { + do_wake = team->nthreads - team->task_running_count + - !task->in_tied_task; + if (do_wake > new_tasks) + do_wake = new_tasks; + } + } + } +} + +/* This is like GOMP_taskwait, but we only wait for tasks that the + upcoming task depends on. */ + +static void +gomp_task_maybe_wait_for_dependencies (void **depend) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_task *task = thr->task; + struct gomp_team *team = thr->ts.team; + struct gomp_task_depend_entry elem, *ent = NULL; + struct gomp_taskwait taskwait; + struct gomp_task *last_parent_depends_on = NULL; + size_t ndepend = (uintptr_t) depend[0]; + size_t nout = (uintptr_t) depend[1]; + size_t i; + size_t num_awaited = 0; + struct gomp_task *child_task = NULL; + struct gomp_task *to_free = NULL; + int do_wake = 0; + + gomp_mutex_lock (&team->task_lock); + for (i = 0; i < ndepend; i++) + { + elem.addr = depend[i + 2]; + ent = htab_find (task->depend_hash, &elem); + for (; ent; ent = ent->next) + if (i >= nout && ent->is_in) + continue; + else + { + struct gomp_task *tsk = ent->task; + if (!tsk->parent_depends_on) + { + tsk->parent_depends_on = true; + ++num_awaited; + if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING) + { + /* If a task we need to wait for is not already + running and is ready to be scheduled, move it + to front, so that we run it as soon as possible. */ + if (last_parent_depends_on) + { + tsk->prev_child->next_child = tsk->next_child; + tsk->next_child->prev_child = tsk->prev_child; + tsk->prev_child = last_parent_depends_on; + tsk->next_child = last_parent_depends_on->next_child; + tsk->prev_child->next_child = tsk; + tsk->next_child->prev_child = tsk; + } + else if (tsk != task->children) + { + tsk->prev_child->next_child = tsk->next_child; + tsk->next_child->prev_child = tsk->prev_child; + tsk->prev_child = task->children; + tsk->next_child = task->children->next_child; + task->children = tsk; + tsk->prev_child->next_child = tsk; + tsk->next_child->prev_child = tsk; + } + last_parent_depends_on = tsk; + } + } + } + } + if (num_awaited == 0) + { + gomp_mutex_unlock (&team->task_lock); + return; + } + + memset (&taskwait, 0, sizeof (taskwait)); + taskwait.n_depend = num_awaited; + taskwait.last_parent_depends_on = last_parent_depends_on; + gomp_sem_init (&taskwait.taskwait_sem, 0); + task->taskwait = &taskwait; + + while (1) + { + bool cancelled = false; + if (taskwait.n_depend == 0) + { + task->taskwait = NULL; gomp_mutex_unlock (&team->task_lock); if (to_free) { gomp_finish_task (to_free); free (to_free); } + gomp_sem_destroy (&taskwait.taskwait_sem); return; } if (task->children->kind == GOMP_TASK_WAITING) @@ -782,7 +998,7 @@ GOMP_taskwait (void) else /* All tasks we are waiting for are already running in other threads. Wait for them. */ - task->in_taskwait = true; + taskwait.in_depend_wait = true; gomp_mutex_unlock (&team->task_lock); if (do_wake) { @@ -802,13 +1018,15 @@ GOMP_taskwait (void) thr->task = task; } else - gomp_sem_wait (&task->taskwait_sem); + gomp_sem_wait (&taskwait.taskwait_sem); gomp_mutex_lock (&team->task_lock); if (child_task) { finish_cancelled:; size_t new_tasks = gomp_task_run_post_handle_depend (child_task, team); + if (child_task->parent_depends_on) + --taskwait.n_depend; child_task->prev_child->next_child = child_task->next_child; child_task->next_child->prev_child = child_task->prev_child; if (task->children == child_task) --- libgomp/testsuite/libgomp.c/depend-5.c.jj 2014-07-11 14:21:56.014015121 +0200 +++ libgomp/testsuite/libgomp.c/depend-5.c 2014-07-11 17:54:41.902833385 +0200 @@ -0,0 +1,98 @@ +#include <stdlib.h> + +__attribute__((noinline, noclone)) void +f1 (int ifval) +{ + int x = 1, y = 2, z = 3; + #pragma omp parallel + #pragma omp single + { + #pragma omp task shared (x) depend(out: x) + x = 2; + #pragma omp task shared (x) depend(inout: x) + { + if (x != 2) + abort (); + x = 3; + } + #pragma omp task shared (x) depend(inout: x) + { + if (x != 3) + abort (); + x = 4; + } + #pragma omp task shared (z) depend(in: z) + if (z != 3) + abort (); + #pragma omp task shared (z) depend(in: z) + if (z != 3) + abort (); + #pragma omp task shared (z) depend(in: z) + if (z != 3) + abort (); + #pragma omp task shared (z) depend(in: z) + if (z != 3) + abort (); + #pragma omp task shared (z) depend(in: z) + if (z != 3) + abort (); + #pragma omp task shared (z) depend(in: z) + if (z != 3) + abort (); + #pragma omp task shared (y) depend(in: y) + if (y != 2) + abort (); + #pragma omp task shared (y) depend(in: y) + if (y != 2) + abort (); + #pragma omp task shared (y) depend(in: y) + if (y != 2) + abort (); + #pragma omp task shared (y) depend(in: y) + if (y != 2) + abort (); + #pragma omp task if (ifval) shared (x, y) depend(in: x) depend(inout: y) + { + if (x != 4 || y != 2) + abort (); + y = 3; + } + if (ifval == 0) + { + /* The above if (0) task should have waited till all + the tasks with x and y dependencies finish. */ + if (x != 4 || y != 3) + abort (); + x = 5; + y = 4; + } + #pragma omp task shared (z) depend(inout: z) + { + if (z != 3) + abort (); + z = 4; + } + #pragma omp task shared (z) depend(inout: z) + { + if (z != 4) + abort (); + z = 5; + } + #pragma omp taskwait + if (x != (ifval ? 4 : 5) || y != (ifval ? 3 : 4) || z != 5) + abort (); + #pragma omp task if (ifval) shared (x, y) depend(in: x) depend(inout: y) + { + if (x != (ifval ? 4 : 5) || y != (ifval ? 3 : 4)) + abort (); + } + } +} + +int +main () +{ + f1 (0); + f1 (1); + return 0; +} Jakub