Hello. Following patch fixes up HSA kernel from kernel dispatching mechanism, where we forgot to wait in a loop for the dispatched child kernel. Apart from that, there's a small follow-up which changes naming scheme for HSA modules.
Martin
>From eca686b6495bf7faa32ecb292f94558c6bfdbdce Mon Sep 17 00:00:00 2001 From: marxin <mli...@suse.cz> Date: Tue, 20 Oct 2015 11:32:58 +0200 Subject: [PATCH 1/2] HSA: fix kernel-from-kernel dispatch mechanism libgomp/ChangeLog: 2015-10-20 Martin Liska <mli...@suse.cz> * plugin/plugin-hsa.c (struct agent_info): Add new kernel_dispatch_command_q member. (GOMP_OFFLOAD_init_device): Initialize the queue. (init_single_kernel): Verify that kernel dispatching is not run with a big depth. (create_kernel_dispatch): Rename from create_kernel_dispatch_recursive. (GOMP_OFFLOAD_run): Add signal waiting workaround. (GOMP_OFFLOAD_fini_device): Destroy the newly added queue. gcc/ChangeLog: 2015-10-20 Martin Liska <mli...@suse.cz> * hsa-gen.c (gen_hsa_insns_for_kernel_call): Add waiting for the signal in a loop. (gen_body_from_gimple): Skip HSA basic blocks with already created HBB. --- gcc/hsa-gen.c | 38 ++++++++++++++++++++++++------ libgomp/plugin/plugin-hsa.c | 56 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/gcc/hsa-gen.c b/gcc/hsa-gen.c index ab27eb1..c36c9e0 100644 --- a/gcc/hsa-gen.c +++ b/gcc/hsa-gen.c @@ -3943,22 +3943,41 @@ gen_hsa_insns_for_kernel_call (hsa_bb *hbb, gcall *call) signal->m_memoryscope = BRIG_MEMORY_SCOPE_SYSTEM; hbb->append_insn (signal); + /* Prepare CFG for waiting loop. */ + edge e = split_block (hbb->m_bb, call); + + basic_block dest = split_edge (e); + edge false_e = EDGE_SUCC (dest, 0); + + false_e->flags &= ~EDGE_FALLTHRU; + false_e->flags |= EDGE_FALSE_VALUE; + + make_edge (e->dest, dest, EDGE_TRUE_VALUE); + /* Emit blocking signal waiting instruction. */ + hsa_bb *new_hbb = hsa_init_new_bb (dest); + hbb->append_insn (new hsa_insn_comment ("wait for the signal")); - hsa_op_reg *signal_result_reg = new hsa_op_reg (BRIG_TYPE_U64); + hsa_op_reg *signal_result_reg = new hsa_op_reg (BRIG_TYPE_S64); c = new hsa_op_immed (1, BRIG_TYPE_S64); - hsa_op_immed *c2 = new hsa_op_immed (UINT64_MAX, BRIG_TYPE_U64); - signal = new hsa_insn_signal (4, BRIG_OPCODE_SIGNAL, - BRIG_ATOMIC_WAITTIMEOUT_LT, BRIG_TYPE_S64); + signal = new hsa_insn_signal (3, BRIG_OPCODE_SIGNAL, + BRIG_ATOMIC_WAIT_LT, BRIG_TYPE_S64); signal->m_memoryorder = BRIG_MEMORY_ORDER_SC_ACQUIRE; signal->m_memoryscope = BRIG_MEMORY_SCOPE_SYSTEM; signal->set_op (0, signal_result_reg); signal->set_op (1, signal_reg); signal->set_op (2, c); - signal->set_op (3, c2); - hbb->append_insn (signal); + new_hbb->append_insn (signal); + + hsa_op_reg *ctrl = new hsa_op_reg (BRIG_TYPE_B1); + hsa_insn_cmp *cmp = new hsa_insn_cmp + (BRIG_COMPARE_EQ, ctrl->m_type, ctrl, signal_result_reg, + new hsa_op_immed (1, signal_result_reg->m_type)); + + new_hbb->append_insn (cmp); + new_hbb->append_insn (new hsa_insn_br (ctrl)); hsa_cfun->m_kernel_dispatch_count++; } @@ -4763,7 +4782,11 @@ gen_body_from_gimple (vec <hsa_op_reg_p> *ssa_map) FOR_EACH_BB_FN (bb, cfun) { gimple_stmt_iterator gsi; - hsa_bb *hbb = hsa_init_new_bb (bb); + hsa_bb *hbb = hsa_bb_for_bb (bb); + if (hbb) + continue; + + hbb = hsa_init_new_bb (bb); for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) { @@ -4777,6 +4800,7 @@ gen_body_from_gimple (vec <hsa_op_reg_p> *ssa_map) { gimple_stmt_iterator gsi; hsa_bb *hbb = hsa_bb_for_bb (bb); + gcc_assert (hbb != NULL); for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) if (!virtual_operand_p (gimple_phi_result (gsi_stmt (gsi)))) diff --git a/libgomp/plugin/plugin-hsa.c b/libgomp/plugin/plugin-hsa.c index ed3573f..45dfe9b 100644 --- a/libgomp/plugin/plugin-hsa.c +++ b/libgomp/plugin/plugin-hsa.c @@ -220,6 +220,8 @@ struct agent_info hsa_isa_t isa; /* Command queue of the agent. */ hsa_queue_t* command_q; + /* Kernel from kernel dispatch command queue. */ + hsa_queue_t* kernel_dispatch_command_q; /* The HSA memory region from which to allocate kernel arguments. */ hsa_region_t kernarg_region; @@ -447,6 +449,12 @@ GOMP_OFFLOAD_init_device (int n) if (status != HSA_STATUS_SUCCESS) hsa_fatal ("Error creating command queue", status); + status = hsa_queue_create (agent->id, queue_size, HSA_QUEUE_TYPE_MULTI, + queue_callback, NULL, UINT32_MAX, UINT32_MAX, + &agent->kernel_dispatch_command_q); + if (status != HSA_STATUS_SUCCESS) + hsa_fatal ("Error creating kernel dispatch command queue", status); + agent->kernarg_region.handle = (uint64_t) -1; status = hsa_agent_iterate_regions (agent->id, get_kernarg_memory_region, &agent->kernarg_region); @@ -455,6 +463,8 @@ GOMP_OFFLOAD_init_device (int n) "arguments"); HSA_DEBUG ("HSA agent initialized, queue has id %llu\n", (long long unsigned) agent->command_q->id); + HSA_DEBUG ("HSA agent initialized, kernel dispatch queue has id %llu\n", + (long long unsigned) agent->kernel_dispatch_command_q->id); agent->initialized = true; } @@ -739,7 +749,8 @@ final: /* Create kernel dispatch data structure for given KERNEL. */ static struct hsa_kernel_dispatch * -create_kernel_dispatch (struct kernel_info *kernel, unsigned omp_data_size) +create_single_kernel_dispatch (struct kernel_info *kernel, + unsigned omp_data_size) { struct agent_info *agent = kernel->agent; struct hsa_kernel_dispatch *shadow = GOMP_PLUGIN_malloc_cleared @@ -861,6 +872,12 @@ init_single_kernel (struct kernel_info *kernel, unsigned *max_omp_data_size) goto failure; } + if (dependency->dependencies_count > 0) + { + HSA_DEBUG ("HSA does not allow kernel dispatching code with " + "a depth bigger than one\n") + goto failure; + } init_single_kernel (dependency, max_omp_data_size); } @@ -919,20 +936,23 @@ print_kernel_dispatch (struct hsa_kernel_dispatch *dispatch, unsigned indent) dependencies. */ static struct hsa_kernel_dispatch * -create_kernel_dispatch_recursive (struct kernel_info *kernel, - unsigned omp_data_size) +create_kernel_dispatch (struct kernel_info *kernel, unsigned omp_data_size) { - struct hsa_kernel_dispatch *shadow = create_kernel_dispatch (kernel, - omp_data_size); + struct hsa_kernel_dispatch *shadow = create_single_kernel_dispatch + (kernel, omp_data_size); shadow->omp_num_threads = 64; shadow->debug = 0; + /* Create kernel dispatch data structures. We do not allow to have + a kernel dispatch with depth bigger than one. */ for (unsigned i = 0; i < kernel->dependencies_count; i++) { struct kernel_info *dependency = get_kernel_for_agent (kernel->agent, kernel->dependencies[i]); - shadow->children_dispatches[i] = create_kernel_dispatch_recursive + shadow->children_dispatches[i] = create_single_kernel_dispatch (dependency, omp_data_size); + shadow->children_dispatches[i]->queue = + kernel->agent->kernel_dispatch_command_q; } return shadow; @@ -1069,7 +1089,7 @@ GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars, const void* kern_launch) if (!kernel->initialized) GOMP_PLUGIN_fatal ("Called kernel must be initialized"); - struct hsa_kernel_dispatch *shadow = create_kernel_dispatch_recursive + struct hsa_kernel_dispatch *shadow = create_kernel_dispatch (kernel, kernel->max_omp_data_size); if (debug) @@ -1129,9 +1149,24 @@ GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars, const void* kern_launch) __atomic_store_n ((uint16_t*)(&packet->header), header, __ATOMIC_RELEASE); hsa_signal_store_release (agent->command_q->doorbell_signal, index); + /* TODO: fixup, following workaround is necessary to run kernel from + kernel dispatch mechanism on a Carrizo machine. */ + + for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++) + { + hsa_signal_t child_s; + child_s.handle = shadow->children_dispatches[i]->signal; + + HSA_DEBUG ("Waiting for children completion signal: %lu\n", + shadow->children_dispatches[i]->signal); + while (hsa_signal_wait_acquire + (child_s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0); + } + HSA_DEBUG ("Kernel dispatched, waiting for completion\n"); - hsa_signal_wait_acquire (s, HSA_SIGNAL_CONDITION_LT, 1, - UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + while (hsa_signal_wait_acquire (s, HSA_SIGNAL_CONDITION_LT, 1, + UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0); release_kernel_dispatch (shadow); @@ -1214,6 +1249,9 @@ GOMP_OFFLOAD_fini_device (int n) hsa_status_t status = hsa_queue_destroy (agent->command_q); if (status != HSA_STATUS_SUCCESS) hsa_fatal ("Error destroying command queue", status); + status = hsa_queue_destroy (agent->kernel_dispatch_command_q); + if (status != HSA_STATUS_SUCCESS) + hsa_fatal ("Error destroying kernel dispatch command queue", status); if (pthread_mutex_destroy (&agent->prog_mutex)) GOMP_PLUGIN_fatal ("Failed to destroy an HSA agent program mutex"); if (pthread_rwlock_destroy (&agent->modules_rwlock)) -- 2.6.0
>From f11e79721e71ca7682bff136e300594001d70b8e Mon Sep 17 00:00:00 2001 From: marxin <mli...@suse.cz> Date: Tue, 20 Oct 2015 13:41:23 +0200 Subject: [PATCH 2/2] HSA: prepend unique prefix for names of HSA modules. gcc/ChangeLog: 2015-10-20 Martin Liska <mli...@suse.cz> * hsa-brig.c (brig_init): Prepand '__hsa_module_' prefix to all names of HSA modules. --- gcc/hsa-brig.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/hsa-brig.c b/gcc/hsa-brig.c index c4ec007..be7ef59 100644 --- a/gcc/hsa-brig.c +++ b/gcc/hsa-brig.c @@ -427,7 +427,7 @@ brig_init (void) part = main_input_filename; else part++; - asprintf (&modname, "&%s", part); + asprintf (&modname, "&__hsa_module_%s", part); char* extension = strchr (modname, '.'); if (extension) *extension = '\0'; @@ -451,7 +451,7 @@ brig_init (void) free (modname); } else - moddir.name = brig_emit_string ("unnamed_brig_module", '&'); + moddir.name = brig_emit_string ("__hsa_module_unnamed", '&'); moddir.base.kind = htole16 (BRIG_KIND_DIRECTIVE_MODULE); moddir.hsailMajor = htole32 (BRIG_VERSION_HSAIL_MAJOR) ; moddir.hsailMinor = htole32 (BRIG_VERSION_HSAIL_MINOR); -- 2.6.0