Andrew Stubbs wrote:
On 07/02/2025 09:40, Tobias Burnus wrote:
This patch permits loading generic ISA code objects - by just
trying whether the runtime accepts it.  If not, it fails with
an error. - The error messages should be a bit more helpful in
some cases as before.

...

Also I think all the sentences should finish with '.'.

Thanks for proof reading. Updated patch attached.

I also added the final sentence-end period, for consistency. But I note that this is a plugin-gcn-ism; there is even a GCC warning that 'warning_at'/error_at' diagnostic does not end with a full stop.

OK for mainline?

Tobias

PS: Pending patches:

* mkoffload.cc: switch -march= to generic version if it has a multilib and the specific one hasn't

* amdhsa.version fix

And otherwise to do:

* [Waiting for ROCm update] plugin-gcn: Suggest -march=gfx*-generic besides -march=gfx<specific>.

* Update install.texi – could be done now or once ROCm supports it?

Cf. original patch https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675200.html for the last two.
[GCN] Handle generic ISA names in libgomp's plugin-gcn.c

libgomp/ChangeLog:

	* plugin/plugin-gcn.c (ELFABIVERSION_AMDGPU_HSA_V6,
	EF_AMDGPU_GENERIC_VERSION_V, EF_AMDGPU_GENERIC_VERSION_OFFSET,
	GET_GENERIC_VERSION): New #define.
	(elf_gcn_isa_is_generic): New.
	(isa_matches_agent): Accept all generic code objects on the first
	go; extend the diagnostic and handle runtime-failed case.
	(create_and_finalize_hsa_program): Call it also after loading
	the code failed, pass the status.

 libgomp/plugin/plugin-gcn.c | 118 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 92 insertions(+), 26 deletions(-)

diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 8015a6f80f3..5c65778191a 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -66,6 +66,14 @@
 #define R_AMDGPU_RELATIVE64	13	/* B + A  */
 #endif
 
+#define ELFABIVERSION_AMDGPU_HSA_V6		4
+
+#define EF_AMDGPU_GENERIC_VERSION_V		0xff000000  /* Mask.  */
+#define EF_AMDGPU_GENERIC_VERSION_OFFSET	24
+
+#define GET_GENERIC_VERSION(VAR) ((VAR & EF_AMDGPU_GENERIC_VERSION_V) \
+				  >> EF_AMDGPU_GENERIC_VERSION_OFFSET)
+
 /* GCN specific definitions for asynchronous queues.  */
 
 #define ASYNC_QUEUE_SIZE 64
@@ -242,7 +250,7 @@ struct kernel_dispatch
 };
 
 /* Structure of the kernargs segment, supporting console output.
- 
+
    This needs to match the definitions in Newlib, and the expectations
    in libgomp target code.  */
 
@@ -1668,6 +1676,13 @@ elf_gcn_isa_field (Elf64_Ehdr *image)
   return image->e_flags & EF_AMDGPU_MACH_MASK;
 }
 
+static int
+elf_gcn_isa_is_generic (Elf64_Ehdr *image)
+{
+  return (image->e_ident[8] == ELFABIVERSION_AMDGPU_HSA_V6
+	  && GET_GENERIC_VERSION (image->e_flags));
+}
+
 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
    support the ISA. */
 
@@ -2399,38 +2414,88 @@ init_basic_kernel_info (struct kernel_info *kernel,
   return true;
 }
 
-/* Check that the GCN ISA of the given image matches the ISA of the agent. */
+/* If status is SUCCESS, assume that the code runs if either the ISA of agent
+   and code is the same - or it is generic code.
+   Otherwise, execution failed with the provided status code; try to give
+   some useful diagnostic.  */
 
 static bool
-isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
+isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image,
+		   hsa_status_t status)
 {
+  /* Generic image - assume that it works and only return to here
+     when it fails, i.e. fatal == true.  */
+  if (status == HSA_STATUS_SUCCESS && elf_gcn_isa_is_generic (image))
+    return true;
+
   int isa_field = elf_gcn_isa_field (image);
-  const char* isa_s = isa_name (isa_field);
-  if (!isa_s)
+  if (status == HSA_STATUS_SUCCESS && isa_field == agent->device_isa)
+    return true;
+
+  /* If we get here, either the binary is non-generic and has a mismatch of
+     the ISA - or is generic but not handled by the ROCm (e.g. because ROCm
+     is too old).  */
+
+  char msg[340];
+  char agent_isa_xs[8];
+  char device_isa_xs[8];
+  const char *agent_isa_s = isa_name (agent->device_isa);
+  const char *device_isa_s = isa_name (isa_field);
+  if (agent_isa_s == NULL)
     {
-      hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
-      return false;
+      snprintf (agent_isa_xs, sizeof agent_isa_xs,
+		"0x%X", agent->device_isa);
+      agent_isa_s = agent_isa_xs;
     }
-
-  if (isa_field != agent->device_isa)
+  if (device_isa_s == NULL)
     {
-      char msg[204];
-      const char *agent_isa_s = isa_name (agent->device_isa);
-      assert (agent_isa_s);
-
-      snprintf (msg, sizeof msg,
-		"GCN code object ISA '%s' does not match GPU ISA '%s' "
-		"(device %d).\n"
-		"Try to recompile with '-foffload-options=-march=%s',\n"
-		"or use ROCR_VISIBLE_DEVICES to disable incompatible "
-		"devices.\n",
-		isa_s, agent_isa_s, agent->device_id, agent_isa_s);
-
-      hsa_error (msg, HSA_STATUS_ERROR);
-      return false;
+      snprintf (device_isa_xs, sizeof device_isa_xs, "0x%X", isa_field);
+      device_isa_s = device_isa_xs;
     }
 
-  return true;
+  /* Some error which should be unrelated to the ISA.  */
+  if (status != HSA_STATUS_SUCCESS
+      && status != HSA_STATUS_ERROR_INVALID_CODE_OBJECT
+      && status != HSA_STATUS_ERROR_INVALID_ISA_NAME
+      && status != HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS)
+    snprintf (msg, sizeof msg,
+	      "Could not load GCN code object with ISA %s on GPU with "
+	      "ISA %s (device %d).\n"
+	      "Consider using ROCR_VISIBLE_DEVICES to disable incompatible "
+	      "devices or run with LOADER_ENABLE_LOGGING=1 for more details.",
+	      device_isa_s, agent_isa_s, agent->device_id);
+  else if (status == HSA_STATUS_ERROR_INVALID_ISA_NAME
+	   && elf_gcn_isa_is_generic (image))
+    snprintf (msg, sizeof msg,
+	      "Unsupported generic ISA %s on GPU with ISA %s (device %d).\n"
+	      "%s%s%s"
+	      "Consider using ROCR_VISIBLE_DEVICES to disable incompatible "
+	      "devices, run with LOADER_ENABLE_LOGGING=1 for more details, "
+	      "or try updating to a ROCm that supports this generic ISA.",
+	      device_isa_s, agent_isa_s, agent->device_id,
+	      agent_isa_s[0] != '0'
+	      ? "Try to recompile with '-foffload-options=-march=" : "",
+	      agent_isa_s[0] != '0' ? agent_isa_s : "",
+	      agent_isa_s[0] != '0' ? ".\n" : "");
+  else if (agent_isa_s[0] == '0')
+    snprintf (msg, sizeof msg,
+	      "GCN code object ISA '%s' is incompatible with GPU ISA '%s' "
+	      "(device %d).\n"
+	      "Consider using ROCR_VISIBLE_DEVICES to disable incompatible "
+	      "devices or run with LOADER_ENABLE_LOGGING=1 for more details.",
+	      device_isa_s, agent_isa_s, agent->device_id);
+  else
+    snprintf (msg, sizeof msg,
+	      "GCN code object ISA '%s' is incompatible with GPU ISA '%s' "
+	      "(device %d).\n"
+	      "Try to recompile with '-foffload-options=-march=%s',\n"
+	      "or use ROCR_VISIBLE_DEVICES to disable incompatible "
+	      "devices.\n",
+	      device_isa_s, agent_isa_s, agent->device_id, agent_isa_s);
+
+  hsa_error (msg, status != HSA_STATUS_SUCCESS
+		  ? status : HSA_STATUS_ERROR_INVALID_CODE_OBJECT);
+  return false;
 }
 
 /* Create and finalize the program consisting of all loaded modules.  */
@@ -2464,7 +2529,8 @@ create_and_finalize_hsa_program (struct agent_info *agent)
     {
       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
 
-      if (!isa_matches_agent (agent, image))
+      /* Check the ISA early because older ROCm had unhelpful errors.  */
+      if (!isa_matches_agent (agent, image, HSA_STATUS_SUCCESS))
 	goto fail;
 
       hsa_code_object_t co = { 0 };
@@ -2482,7 +2548,7 @@ create_and_finalize_hsa_program (struct agent_info *agent)
 	(agent->executable, agent->id, co, "");
       if (status != HSA_STATUS_SUCCESS)
 	{
-	  hsa_error ("Could not load GCN code object", status);
+	  isa_matches_agent (agent, image, status);
 	  goto fail;
 	}
 

Reply via email to