commit: 276eb87f41378617e146e0aff60c0476b6838f86 Author: Sv. Lockal <lockalsash <AT> gmail <DOT> com> AuthorDate: Mon Oct 27 18:53:51 2025 +0000 Commit: Sam James <sam <AT> gentoo <DOT> org> CommitDate: Sun Nov 2 05:16:10 2025 +0000 URL: https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=276eb87f
rocm.eclass: add graceful rocm_add_sandbox; allow NPU for the sandbox 1) With upcoming firmware, the AMD NPU will be exposed as /dev/accel/accel0. This device is owned by root:render, similarly to the GPU. When present, tools like rocminfo try to query device capabilities, breaking the sandbox. To fix this issue, check_amdgpu now calls addwrite for this device. 2) There are a number of bug reports from tinderbox and from users who forgot to enable KFD in the kernel. Instead of a recommendation to check permissions, they will now see a clearer message stating that the AMD device is missing. 3) For cases where we just want to addwrite AMD devices, a new function rocm_add_sandbox (similar to cuda_add_sandbox) was added. No errors are raised if a device is missing. Bug: https://bugs.gentoo.org/965198 Signed-off-by: Sv. Lockal <lockalsash <AT> gmail.com> Part-of: https://github.com/gentoo/gentoo/pull/44355 Signed-off-by: Sam James <sam <AT> gentoo.org> eclass/rocm.eclass | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/eclass/rocm.eclass b/eclass/rocm.eclass index 0fa99a4178e7..c24666e33e8d 100644 --- a/eclass/rocm.eclass +++ b/eclass/rocm.eclass @@ -248,13 +248,44 @@ get_amdgpu_flags() { echo $(printf "%s;" ${AMDGPU_TARGETS[@]}) } +# @FUNCTION: rocm_add_sandbox +# @USAGE: [-w] +# @DESCRIPTION: +# Add AMD GPU/NPU dev nodes to the sandbox predict list. +# with -w, add to the sandbox write list. +rocm_add_sandbox() { + debug-print-function "${FUNCNAME[0]}" "$@" + + local i + for i in /dev/kfd /dev/dri/render* /dev/accel/accel*; do + if [[ ! -c $i ]]; then + continue + elif [[ $1 == '-w' ]]; then + addwrite "$i" + else + addpredict "$i" + fi + done +} + # @FUNCTION: check_amdgpu # @USAGE: check_amdgpu # @DESCRIPTION: -# grant and check read-write permissions on AMDGPU devices, die if not available. +# Grant and check read-write permissions on AMDGPU and AMDNPU devices. +# Die if no AMDGPU devices are available. 
check_amdgpu() { - for device in /dev/kfd /dev/dri/render*; do - addwrite ${device} + # Common case: no AMDGPU device or the kernel fusion driver is disabled in the kernel. + if [[ ! -c /dev/kfd ]]; then + eerror "Device /dev/kfd does not exist!" + eerror "To proceed, you need to have an AMD GPU and have CONFIG_HSA_AMD set in your kernel config." + die "/dev/kfd is missing" + fi + + local device + for device in /dev/kfd /dev/dri/render* /dev/accel/accel*; do + [[ ! -c ${device} ]] && continue + + addwrite "${device}" if [[ ! -r ${device} || ! -w ${device} ]]; then eerror "Cannot read or write ${device}!" eerror "Make sure it is present and check the permission."
