commit:     276eb87f41378617e146e0aff60c0476b6838f86
Author:     Sv. Lockal <lockalsash <AT> gmail <DOT> com>
AuthorDate: Mon Oct 27 18:53:51 2025 +0000
Commit:     Sam James <sam <AT> gentoo <DOT> org>
CommitDate: Sun Nov  2 05:16:10 2025 +0000
URL:        https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=276eb87f

rocm.eclass: add graceful rocm_add_sandbox; allow NPU for the sandbox

1) With upcoming firmware, the AMD NPU will be exposed as /dev/accel/accel0.
This device is owned by root:render, similarly to the GPU.
When present, tools like rocminfo try to query device capabilities,
breaking the sandbox. To fix this issue, check_amdgpu now calls addwrite
on this device.

2) There are a bunch of bugs from tinderbox and from users who forgot to
enable KFD in the kernel. Instead of a recommendation to check permissions,
they will now see a clearer message stating that the AMD device is missing.

3) For cases where we just want to addwrite AMD devices, a new function
rocm_add_sandbox (similar to cuda_add_sandbox) was added. No errors are
raised if a device is missing.

Bug: https://bugs.gentoo.org/965198
Signed-off-by: Sv. Lockal <lockalsash <AT> gmail.com>
Part-of: https://github.com/gentoo/gentoo/pull/44355
Signed-off-by: Sam James <sam <AT> gentoo.org>

 eclass/rocm.eclass | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/eclass/rocm.eclass b/eclass/rocm.eclass
index 0fa99a4178e7..c24666e33e8d 100644
--- a/eclass/rocm.eclass
+++ b/eclass/rocm.eclass
@@ -248,13 +248,44 @@ get_amdgpu_flags() {
        echo $(printf "%s;" ${AMDGPU_TARGETS[@]})
 }
 
+# @FUNCTION: rocm_add_sandbox
+# @USAGE: [-w]
+# @DESCRIPTION:
+# Add AMD GPU/NPU dev nodes to the sandbox predict list.
+# With -w, add them to the sandbox write list instead.
+rocm_add_sandbox() {
+       debug-print-function "${FUNCNAME[0]}" "$@"
+
+       local i
+       for i in /dev/kfd /dev/dri/render* /dev/accel/accel*; do
+               if [[ ! -c $i ]]; then
+                       continue
+               elif [[ $1 == '-w' ]]; then
+                       addwrite "$i"
+               else
+                       addpredict "$i"
+               fi
+       done
+}
+
 # @FUNCTION: check_amdgpu
 # @USAGE: check_amdgpu
 # @DESCRIPTION:
-# grant and check read-write permissions on AMDGPU devices, die if not available.
+# Grant and check read-write permissions on AMDGPU and AMDNPU devices.
+# Die if no AMDGPU devices are available.
 check_amdgpu() {
-       for device in /dev/kfd /dev/dri/render*; do
-               addwrite ${device}
+       # Common case: no AMDGPU device or the kernel fusion driver is disabled in the kernel.
+       if [[ ! -c /dev/kfd ]]; then
+               eerror "Device /dev/kfd does not exist!"
+               eerror "To proceed, you need to have an AMD GPU and have CONFIG_HSA_AMD set in your kernel config."
+               die "/dev/kfd is missing"
+       fi
+
+       local device
+       for device in /dev/kfd /dev/dri/render* /dev/accel/accel*; do
+               [[ ! -c ${device} ]] && continue
+
+               addwrite "${device}"
                if [[ ! -r ${device} || ! -w ${device} ]]; then
                        eerror "Cannot read or write ${device}!"
                         eerror "Make sure it is present and check the permission."

Reply via email to