commit: 2909348b11927b54e8f9c9952743f664ff51e51d
Author: Sv. Lockal <lockalsash <AT> gmail <DOT> com>
AuthorDate: Sun Nov 9 12:24:38 2025 +0000
Commit: Alfredo Tupone <tupone <AT> gentoo <DOT> org>
CommitDate: Mon Nov 10 07:37:35 2025 +0000
URL: https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=2909348b
sci-ml/caffe2: pass [rocm?] to sci-ml/gloo, fix build with [distributed] The separate build issues were: * sci-libs/hipBLAS:0/7.0[-rocsolver] caused runtime import error * USE="rocm distributed" caused runtime import error * USE="distributed" failed to build with systemwide tensorpipe Signed-off-by: Sv. Lockal <lockalsash <AT> gmail.com> Part-of: https://github.com/gentoo/gentoo/pull/44550 Closes: https://github.com/gentoo/gentoo/pull/44550 Signed-off-by: Alfredo Tupone <tupone <AT> gentoo.org> ...{caffe2-2.9.0.ebuild => caffe2-2.9.0-r1.ebuild} | 16 +++++++++---- .../files/caffe2-2.9.0-rocm-distributed-link.patch | 28 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/sci-ml/caffe2/caffe2-2.9.0.ebuild b/sci-ml/caffe2/caffe2-2.9.0-r1.ebuild similarity index 94% rename from sci-ml/caffe2/caffe2-2.9.0.ebuild rename to sci-ml/caffe2/caffe2-2.9.0-r1.ebuild index 6c189b4d66f6..5fa8fab9c3ef 100644 --- a/sci-ml/caffe2/caffe2-2.9.0.ebuild +++ b/sci-ml/caffe2/caffe2-2.9.0-r1.ebuild @@ -80,7 +80,7 @@ RDEPEND=" cusparselt? ( dev-libs/cusparselt ) ) fbgemm? ( sci-ml/FBGEMM ) - gloo? ( >=sci-ml/gloo-2025.06.04[cuda?] ) + gloo? ( >=sci-ml/gloo-2025.06.04[cuda?,rocm?] ) mpi? ( virtual/mpi ) nnpack? ( sci-ml/NNPACK @@ -100,7 +100,7 @@ RDEPEND=" nccl? ( >=dev-libs/rccl-6.3:= <dev-libs/rccl-7.1:= ) >=dev-util/hip-6.3:= <dev-util/hip-7.1:= >=dev-util/roctracer-6.3:= <dev-util/roctracer-7.1:= - >=sci-libs/hipBLAS-6.3:= <sci-libs/hipBLAS-7.1:= + || ( sci-libs/hipBLAS:0/6.3 sci-libs/hipBLAS:0/6.4 sci-libs/hipBLAS:0/7.0[rocsolver] ) >=sci-libs/hipBLASLt-6.3:= <sci-libs/hipBLASLt-7.1:= >=sci-libs/hipFFT-6.3:= <sci-libs/hipFFT-7.1:= >=sci-libs/hipRAND-6.3:= <sci-libs/hipRAND-7.1:= @@ -111,6 +111,7 @@ RDEPEND=" >=sci-libs/rocRAND-6.3:= <sci-libs/rocRAND-7.1:= >=sci-libs/rocSOLVER-6.3:= <sci-libs/rocSOLVER-7.1:= memefficient? ( sci-libs/aotriton-bin:0/0.11 ) + distributed? ( >=dev-util/rocm-smi-6.3:= <dev-util/rocm-smi-7.1:= ) ) distributed? ( !rocm? 
( sci-ml/tensorpipe[cuda?] ) @@ -161,6 +162,7 @@ PATCHES=( "${FILESDIR}"/${PN}-2.7.1-aotriton-fixes.patch "${FILESDIR}"/${PN}-2.8.0-rocm-minus-flash.patch "${FILESDIR}"/${P}-cmake.patch + "${FILESDIR}"/${P}-rocm-distributed-link.patch ) src_prepare() { @@ -177,6 +179,9 @@ src_prepare() { torch/CMakeLists.txt \ || die + # tensorpipe is in system, not a build target of caffe2 + sed -e '/target_compile_options_if_supported(tensorpipe/d' -i cmake/Dependencies.cmake || die + # Drop third_party from CMake tree sed -i \ -e '/add_subdirectory.*third_party/d' \ @@ -240,11 +245,14 @@ src_prepare() { # Systemwide gcc (for absl and at::TensorBase) + hipcc (llvm>=18) need abi-compat=17. # But systemwide clang>=18 + hipcc (>=llvm-18) need opposite! # See also: https://github.com/llvm/llvm-project/issues/102443#issuecomment-2329726287 - sed '/-fclang-abi-compat=17/d' -i cmake/Dependencies.cmake || die + sed -e '/-fclang-abi-compat=17/d' -i cmake/Dependencies.cmake || die fi # Workaround for libc++ issue https://github.com/llvm/llvm-project/issues/100802 - sed 's/std::memcpy/memcpy/g' -i torch/headeronly/util/Half.h || die + sed -e 's/std::memcpy/memcpy/g' -i torch/headeronly/util/Half.h || die + + # Typo: https://github.com/pytorch/pytorch/pull/166502 + sed -e 's/gloo_hiop/gloo_hip/' -i cmake/Modules/FindGloo.cmake || die ebegin "HIPifying cuda sources" ${EPYTHON} tools/amd_build/build_amd.py || die diff --git a/sci-ml/caffe2/files/caffe2-2.9.0-rocm-distributed-link.patch b/sci-ml/caffe2/files/caffe2-2.9.0-rocm-distributed-link.patch new file mode 100644 index 000000000000..56c11bc84301 --- /dev/null +++ b/sci-ml/caffe2/files/caffe2-2.9.0-rocm-distributed-link.patch @@ -0,0 +1,28 @@ +USE_ROCM=ON USE_DISTRIBUTED=ON: fix *runtime* error: undefined symbol: rsmi_init + +Upstream bug: https://github.com/pytorch/pytorch/issues/158725 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1058,6 +1058,12 @@ if(USE_ROCM) + ) + endif() + ++ if(USE_DISTRIBUTED) ++ list(APPEND 
Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS ++ rocm_smi64 ++ ) ++ endif() ++ + # ---[ Kernel asserts + # Kernel asserts is disabled for ROCm by default. + # It can be turned on by turning on the env USE_ROCM_KERNEL_ASSERT to the build system. +--- a/cmake/public/LoadHIP.cmake ++++ b/cmake/public/LoadHIP.cmake +@@ -170,6 +170,7 @@ if(HIP_FOUND) + find_package_and_print_version(rocthrust REQUIRED) + find_package_and_print_version(hipsolver REQUIRED) + find_package_and_print_version(rocsolver REQUIRED) ++ find_package_and_print_version(rocm_smi REQUIRED) + # workaround cmake 4 build issue + if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0") + message(WARNING "Work around hiprtc cmake failure for cmake >= 4")
