commit:     2909348b11927b54e8f9c9952743f664ff51e51d
Author:     Sv. Lockal <lockalsash <AT> gmail <DOT> com>
AuthorDate: Sun Nov  9 12:24:38 2025 +0000
Commit:     Alfredo Tupone <tupone <AT> gentoo <DOT> org>
CommitDate: Mon Nov 10 07:37:35 2025 +0000
URL:        https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=2909348b

sci-ml/caffe2: pass [rocm?] to sci-ml/gloo, fix build with [distributed]

This commit fixes three separate issues:
* sci-libs/hipBLAS:0/7.0[-rocsolver] caused a runtime import error
* USE="rocm distributed" caused a runtime import error
* USE="distributed" failed to build with the system-wide tensorpipe

Signed-off-by: Sv. Lockal <lockalsash <AT> gmail.com>
Part-of: https://github.com/gentoo/gentoo/pull/44550
Closes: https://github.com/gentoo/gentoo/pull/44550
Signed-off-by: Alfredo Tupone <tupone <AT> gentoo.org>

 ...{caffe2-2.9.0.ebuild => caffe2-2.9.0-r1.ebuild} | 16 +++++++++----
 .../files/caffe2-2.9.0-rocm-distributed-link.patch | 28 ++++++++++++++++++++++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/sci-ml/caffe2/caffe2-2.9.0.ebuild b/sci-ml/caffe2/caffe2-2.9.0-r1.ebuild
similarity index 94%
rename from sci-ml/caffe2/caffe2-2.9.0.ebuild
rename to sci-ml/caffe2/caffe2-2.9.0-r1.ebuild
index 6c189b4d66f6..5fa8fab9c3ef 100644
--- a/sci-ml/caffe2/caffe2-2.9.0.ebuild
+++ b/sci-ml/caffe2/caffe2-2.9.0-r1.ebuild
@@ -80,7 +80,7 @@ RDEPEND="
                cusparselt? ( dev-libs/cusparselt )
        )
        fbgemm? ( sci-ml/FBGEMM )
-       gloo? ( >=sci-ml/gloo-2025.06.04[cuda?] )
+       gloo? ( >=sci-ml/gloo-2025.06.04[cuda?,rocm?] )
        mpi? ( virtual/mpi )
        nnpack? (
                sci-ml/NNPACK
@@ -100,7 +100,7 @@ RDEPEND="
                nccl? ( >=dev-libs/rccl-6.3:= <dev-libs/rccl-7.1:= )
                >=dev-util/hip-6.3:=       <dev-util/hip-7.1:=
                >=dev-util/roctracer-6.3:= <dev-util/roctracer-7.1:=
-               >=sci-libs/hipBLAS-6.3:=   <sci-libs/hipBLAS-7.1:=
+               || ( sci-libs/hipBLAS:0/6.3 sci-libs/hipBLAS:0/6.4 sci-libs/hipBLAS:0/7.0[rocsolver] )
                >=sci-libs/hipBLASLt-6.3:= <sci-libs/hipBLASLt-7.1:=
                >=sci-libs/hipFFT-6.3:=    <sci-libs/hipFFT-7.1:=
                >=sci-libs/hipRAND-6.3:=   <sci-libs/hipRAND-7.1:=
@@ -111,6 +111,7 @@ RDEPEND="
                >=sci-libs/rocRAND-6.3:=   <sci-libs/rocRAND-7.1:=
                >=sci-libs/rocSOLVER-6.3:= <sci-libs/rocSOLVER-7.1:=
                memefficient? ( sci-libs/aotriton-bin:0/0.11 )
+               distributed? ( >=dev-util/rocm-smi-6.3:= <dev-util/rocm-smi-7.1:= )
        )
        distributed? (
                !rocm? ( sci-ml/tensorpipe[cuda?] )
@@ -161,6 +162,7 @@ PATCHES=(
        "${FILESDIR}"/${PN}-2.7.1-aotriton-fixes.patch
        "${FILESDIR}"/${PN}-2.8.0-rocm-minus-flash.patch
        "${FILESDIR}"/${P}-cmake.patch
+       "${FILESDIR}"/${P}-rocm-distributed-link.patch
 )
 
 src_prepare() {
@@ -177,6 +179,9 @@ src_prepare() {
                torch/CMakeLists.txt \
                || die
 
+       # tensorpipe is in system, not a build target of caffe2
+       sed -e '/target_compile_options_if_supported(tensorpipe/d' -i cmake/Dependencies.cmake || die
+
        # Drop third_party from CMake tree
        sed -i \
                -e '/add_subdirectory.*third_party/d' \
@@ -240,11 +245,14 @@ src_prepare() {
                        # Systemwide gcc (for absl and at::TensorBase) + hipcc (llvm>=18) need abi-compat=17.
                        # But systemwide clang>=18 + hipcc (>=llvm-18) need opposite!
                        # See also: https://github.com/llvm/llvm-project/issues/102443#issuecomment-2329726287
-                       sed '/-fclang-abi-compat=17/d' -i cmake/Dependencies.cmake || die
+                       sed -e '/-fclang-abi-compat=17/d' -i cmake/Dependencies.cmake || die
                fi
 
                # Workaround for libc++ issue https://github.com/llvm/llvm-project/issues/100802
-               sed 's/std::memcpy/memcpy/g' -i torch/headeronly/util/Half.h || die
+               sed -e 's/std::memcpy/memcpy/g' -i torch/headeronly/util/Half.h || die
+
+               # Typo: https://github.com/pytorch/pytorch/pull/166502
+               sed -e 's/gloo_hiop/gloo_hip/' -i cmake/Modules/FindGloo.cmake || die
 
                ebegin "HIPifying cuda sources"
                ${EPYTHON} tools/amd_build/build_amd.py || die

diff --git a/sci-ml/caffe2/files/caffe2-2.9.0-rocm-distributed-link.patch b/sci-ml/caffe2/files/caffe2-2.9.0-rocm-distributed-link.patch
new file mode 100644
index 000000000000..56c11bc84301
--- /dev/null
+++ b/sci-ml/caffe2/files/caffe2-2.9.0-rocm-distributed-link.patch
@@ -0,0 +1,28 @@
+USE_ROCM=ON USE_DISTRIBUTED=ON: fix *runtime* error: undefined symbol: rsmi_init
+
+Upstream bug: https://github.com/pytorch/pytorch/issues/158725
+--- a/cmake/Dependencies.cmake
++++ b/cmake/Dependencies.cmake
+@@ -1058,6 +1058,12 @@ if(USE_ROCM)
+       )
+     endif()
+ 
++    if(USE_DISTRIBUTED)
++      list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS
++        rocm_smi64
++      )
++    endif()
++
+     # ---[ Kernel asserts
+     # Kernel asserts is disabled for ROCm by default.
+     # It can be turned on by turning on the env USE_ROCM_KERNEL_ASSERT to the build system.
+--- a/cmake/public/LoadHIP.cmake
++++ b/cmake/public/LoadHIP.cmake
+@@ -170,6 +170,7 @@ if(HIP_FOUND)
+   find_package_and_print_version(rocthrust REQUIRED)
+   find_package_and_print_version(hipsolver REQUIRED)
+   find_package_and_print_version(rocsolver REQUIRED)
++  find_package_and_print_version(rocm_smi REQUIRED)
+   # workaround cmake 4 build issue
+   if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0")
+     message(WARNING "Work around hiprtc cmake failure for cmake >= 4")

Reply via email to