commit: b09dd88412fe2d5eee5a8891e08bfa2d67848da3
Author: Alfredo Tupone <tupone <AT> gentoo <DOT> org>
AuthorDate: Mon Feb 2 09:38:57 2026 +0000
Commit: Alfredo Tupone <tupone <AT> gentoo <DOT> org>
CommitDate: Mon Feb 2 09:39:28 2026 +0000
URL: https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=b09dd884
sci-ml/caffe2: drop 2.8.0-r2
Signed-off-by: Alfredo Tupone <tupone <AT> gentoo.org>
sci-ml/caffe2/Manifest | 2 -
sci-ml/caffe2/caffe2-2.8.0-r2.ebuild | 384 ---------------------
.../files/composable-kernel-6.4.1-expand-isa.patch | 141 --------
3 files changed, 527 deletions(-)
diff --git a/sci-ml/caffe2/Manifest b/sci-ml/caffe2/Manifest
index 35010993e955..ec1ad63665de 100644
--- a/sci-ml/caffe2/Manifest
+++ b/sci-ml/caffe2/Manifest
@@ -1,6 +1,4 @@
DIST composable_kernel-7fe50dc3.tar.gz 5380728 BLAKE2B c89c346d8e2d7a93a9cf26409e477fcdd25c43bc3f99d904c3bfe1bc282c6844ef2f2c80aceabe3bf4494db3457285384d5de5a22281aa426ba7479af82b0caf SHA512 a62f92e2dd7da944bd34bab6cf3bf624f630dc316d29c755e9fd523343c3f7648b0b7e0c9a0c8f5e9654477599ae8be9dac687d4054b0390f064ac2e40fc1cd3
-DIST composable_kernel-8086bbe3.tar.gz 4418862 BLAKE2B b710e3d4586899443ec01044dad19fd2f992c351e2f65ba526dfcc47cc65c095beaf8ac21a8f71c02a0eb524d364e817b27241a9198884f2bdae9924b51e24e4 SHA512 8410b5a1c864d71f3034ef0d9d1245078856d09cc191faec59856c229bf11d89ae291036d735cb5cec4f1d72e6e9e8f6921833147f9619d30cfab8722d3a9f63
DIST flash-attention-2.7.4.gh.tar.gz 5841323 BLAKE2B 432999d763f2b3d732580ddfea5d3e01370351db0656546259a5e500a07516dd03c98828bfb55855dabe4adc651033b5d97ea4725ca46158b9970f0fbc662710 SHA512 05a4afb09e666f7404d6a3f8b5256e7bed6eba60a6f1bde2b7dbb96d318975f0b458c2521c7a38d88e97b6e4c27f29077cf787849daf82586e33f43a3d9a84b3
DIST pytorch-2.10.0.tar.gz 62555251 BLAKE2B 6b48b4d3d3802a82d37231c472032f8c9390bdbfe9e810ff811f5521737b46794db51e87f7aad3928be917923ae8ac9c619b14d8d0cfacfc115dffb38347bd43 SHA512 929b1be42954f22e2091e0696d6175c8f30752925ce0bbe3a60a9393aff1f0afb228fa5efaf1ce26e78cd9e99294d150109e9ba9cb3243c90e9ec06cc082f74d
-DIST pytorch-2.8.0.tar.gz 56565754 BLAKE2B a8f07513b92f9293f8322508f9fc73a462f89fe51cb1f280af371cee19cbe7e2bf900ba2b3c43fd08ea415566db441a6d6310d77f18477e957641be311a361a5 SHA512 448e9dad4aa10f1793d35e6ffe9f0f69b7719d41e6eccceb687a8d0c148e22d03e4f76170a05308ef9323a7aea41aa74605077ae1d68c6d949f13b3340ebf310
DIST pytorch-2.9.1.tar.gz 55764697 BLAKE2B b22e154034f8a25aa3ef949eb6b0456777e11fe5f97de56c6112d93a2e154db425e97848911af458d179f03d7154956f53b715c7b9d7e7f074e0baceac35dad8 SHA512 d7098408d44e0fee9ded4afd6622df6f08757bf02eee878ae25b62a275f82eb16f96a07027c670c6ffdd431c8714c569249bd8518ac8828a504e99908b8c38b1
diff --git a/sci-ml/caffe2/caffe2-2.8.0-r2.ebuild b/sci-ml/caffe2/caffe2-2.8.0-r2.ebuild
deleted file mode 100644
index 71f03fa30c24..000000000000
--- a/sci-ml/caffe2/caffe2-2.8.0-r2.ebuild
+++ /dev/null
@@ -1,384 +0,0 @@
-# Copyright 2022-2025 Gentoo Authors
-# Distributed under the terms of the GNU General Public License v2
-
-EAPI=8
-
-PYTHON_COMPAT=( python3_{11..14} )
-ROCM_VERSION=6.1
-inherit python-single-r1 cmake cuda flag-o-matic prefix rocm toolchain-funcs
-
-MYPN=pytorch
-MYP=${MYPN}-${PV}
-
-# caffe2-2.6.0 depends on future version of composable kernel
-# TODO: replace it with RDEPEND in the future
-CK_COMMIT=8086bbe3a78d931eb96fe12fdc014082e18d18d3
-CK_P=composable_kernel-${CK_COMMIT:0:8}
-
-FLASH_PV=2.7.4
-FLASH_PN=flash-attention
-FLASH_P=${FLASH_PN}-${FLASH_PV}
-FLASH_ATT_URI="https://github.com/Dao-AILab/${FLASH_PN}/archive/refs/tags/v${FLASH_PV}.tar.gz
-	-> ${FLASH_P}.gh.tar.gz"
-
-AOTRITON_PV=0.9.2b
-AOTRITON_PN=aotriton
-AOTRITON_P=${AOTRITON_PN}-${AOTRITON_PV}
-AOTRITON_tar=${AOTRITON_P}-manylinux_2_28_x86_64-rocm6.3-shared.tar.gz
-
-DESCRIPTION="A deep learning framework"
-HOMEPAGE="https://pytorch.org/"
-SRC_URI="
-	https://github.com/pytorch/${MYPN}/archive/refs/tags/v${PV}.tar.gz -> ${MYP}.tar.gz
- rocm? (
-		https://github.com/ROCm/composable_kernel/archive/${CK_COMMIT}.tar.gz
- -> ${CK_P}.tar.gz
- )
- flash? ( ${FLASH_ATT_URI} )
- memefficient? ( ${FLASH_ATT_URI} )
-"
-
-S="${WORKDIR}"/${MYP}
-
-LICENSE="BSD"
-SLOT="0"
-KEYWORDS="~amd64 ~arm64"
-IUSE="cuda cusparselt distributed fbgemm flash gloo memefficient mkl mpi nnpack +numpy
-	onednn openblas opencl openmp qnnpack rocm xnnpack"
-RESTRICT="test"
-REQUIRED_USE="
- ${PYTHON_REQUIRED_USE}
- mpi? ( distributed )
- gloo? ( distributed )
- ?? ( cuda rocm )
- rocm? (
- || ( ${ROCM_REQUIRED_USE} )
- !flash
- )
-"
-
-RDEPEND="
- ${PYTHON_DEPS}
- dev-cpp/abseil-cpp:=
- dev-cpp/gflags:=
- >=dev-cpp/glog-0.5.0
- dev-cpp/nlohmann_json
- dev-cpp/opentelemetry-cpp
- dev-libs/cpuinfo
- dev-libs/libfmt:=
- dev-libs/protobuf:=
- dev-libs/pthreadpool
- dev-libs/sleef
- sci-ml/foxi
- ~sci-ml/kineto-0.4.0_p20250617
- sci-ml/onnx
- virtual/lapack
- cuda? (
- dev-libs/cudnn
- >=sci-ml/cudnn-frontend-1.12.0:=
- >=dev-util/nvidia-cuda-toolkit-12.9:=[profiler]
- cusparselt? ( dev-libs/cusparselt )
- )
- fbgemm? ( sci-ml/FBGEMM )
- gloo? ( <=sci-ml/gloo-2025.06.04[cuda?] )
- mpi? ( virtual/mpi )
- nnpack? ( sci-ml/NNPACK )
- numpy? ( $(python_gen_cond_dep '
- dev-python/numpy[${PYTHON_USEDEP}]
- ') )
- onednn? ( =sci-ml/oneDNN-3.5* )
- opencl? ( virtual/opencl )
- qnnpack? (
- !sci-libs/QNNPACK
- sci-ml/gemmlowp
- )
- rocm? (
- >=dev-libs/rccl-6.1 <dev-libs/rccl-6.5
- >=dev-util/hip-6.1 <dev-util/hip-6.5
- >=dev-util/roctracer-6.1 <dev-util/roctracer-6.5
- >=sci-libs/hipBLAS-6.1 <sci-libs/hipBLAS-6.5
- >=sci-libs/hipBLASLt-6.1 <sci-libs/hipBLASLt-6.5
- >=sci-libs/hipCUB-6.1 <sci-libs/hipCUB-6.5
- >=sci-libs/hipFFT-6.1 <sci-libs/hipFFT-6.5
- >=sci-libs/hipRAND-6.1 <sci-libs/hipRAND-6.5
- >=sci-libs/hipSOLVER-6.1 <sci-libs/hipSOLVER-6.5
- >=sci-libs/hipSPARSE-6.1 <sci-libs/hipSPARSE-6.5
- >=sci-libs/miopen-6.1 <sci-libs/miopen-6.5
- >=sci-libs/rocPRIM-6.1 <sci-libs/rocPRIM-6.5
- >=sci-libs/rocThrust-6.1 <sci-libs/rocThrust-6.5
- memefficient? ( sci-libs/aotriton-bin:0/0.9 )
- )
- distributed? (
- sci-ml/tensorpipe[cuda?]
- dev-cpp/cpp-httplib
- )
- xnnpack? ( >=sci-ml/XNNPACK-2024.11 )
- mkl? ( sci-libs/mkl )
- openblas? ( sci-libs/openblas )
-"
-
-DEPEND="
- ${RDEPEND}
- dev-libs/flatbuffers
- dev-libs/FXdiv
- dev-libs/pocketfft
- dev-libs/psimd
- sci-ml/FP16
- $(python_gen_cond_dep '
- dev-python/pybind11[${PYTHON_USEDEP}]
- dev-python/pyyaml[${PYTHON_USEDEP}]
- dev-python/typing-extensions[${PYTHON_USEDEP}]
- ')
- cuda? ( ~dev-libs/cutlass-3.9.2[tools(+)] )
- onednn? ( sci-ml/ideep )
- qnnpack? ( dev-libs/clog )
-"
-
-PATCHES=(
- "${FILESDIR}"/${PN}-2.5.1-unbundle_fmt.patch
- "${FILESDIR}"/${PN}-2.5.1-unbundle_kineto.patch
- "${FILESDIR}"/${P}-unbundle_pocketfft.patch
- "${FILESDIR}"/${PN}-2.5.1-cudnn_include_fix.patch
- "${FILESDIR}"/${P}-gentoo.patch
- "${FILESDIR}"/${PN}-2.4.0-cpp-httplib.patch
- "${FILESDIR}"/${PN}-2.5.1-glog-0.6.0.patch
- "${FILESDIR}"/${PN}-2.5.1-newfix-functorch-install.patch
- "${FILESDIR}"/${PN}-2.6.0-rocm-fix-std-cpp17.patch
- "${FILESDIR}"/${P}-cmake.patch
- "${FILESDIR}"/${PN}-2.7.0-glog-0.7.1.patch
- "${FILESDIR}"/${PN}-2.7.1-aotriton-fixes.patch
- "${FILESDIR}"/${PN}-2.8.0-rocm-minus-flash.patch
-)
-
-src_prepare() {
- if use flash || use memefficient; then
- mv "${WORKDIR}"/${FLASH_P}/* third_party/${FLASH_PN}/ || die
- fi
- filter-lto #bug 862672
-
- # Unbundle fmt
- sed -i \
- -e 's|::fmt-header-only||' \
- c10/CMakeLists.txt \
- cmake/Dependencies.cmake \
- torch/CMakeLists.txt \
- || die
-
- # Drop third_party from CMake tree
- sed -i \
- -e '/add_subdirectory.*third_party/d' \
- CMakeLists.txt \
- cmake/Dependencies.cmake \
- cmake/ProtoBuf.cmake \
- aten/src/ATen/CMakeLists.txt \
- || die
- # Change libc10* path
- sed -i \
- -e "/EXPORT/s|DESTINATION lib)|DESTINATION $(get_libdir))|" \
- c10/cuda/CMakeLists.txt \
- c10/CMakeLists.txt \
- c10/hip/CMakeLists.txt \
- || die
-
- # Change libaotriton path
- sed -i \
- -e "s|}/lib|}/$(get_libdir)|g" \
- cmake/External/aotriton.cmake \
- || die
-
- # Noisy warnings from Logging.h
- sed -i 's/-Wextra-semi//' cmake/public/utils.cmake || die
-
- cmake_src_prepare
- pushd torch/csrc/jit/serialization || die
- flatc --cpp --gen-mutable --scoped-enums mobile_bytecode.fbs || die
- popd
-
- # prefixify the hardcoded paths, after all patches are applied
- hprefixify \
- aten/CMakeLists.txt \
- caffe2/CMakeLists.txt \
- cmake/Metal.cmake \
- cmake/Modules/*.cmake \
- cmake/Modules_CUDA_fix/FindCUDNN.cmake \
- cmake/Modules_CUDA_fix/upstream/FindCUDA/make2cmake.cmake \
-		cmake/Modules_CUDA_fix/upstream/FindPackageHandleStandardArgs.cmake \
- cmake/public/LoadHIP.cmake \
- cmake/public/cuda.cmake \
- cmake/Dependencies.cmake \
- torch/CMakeLists.txt \
- CMakeLists.txt
-
- if use rocm; then
- sed -e "s:/opt/rocm:/usr:" \
- -e "s:lib/cmake:$(get_libdir)/cmake:g" \
- -i cmake/public/LoadHIP.cmake || die
-
-		# TODO: delete, when caffe2 depends on systemwide composable_kernel
-		sed -e "s:third_party/composable_kernel:../composable_kernel-${CK_COMMIT}:g" \
- -i aten/src/ATen/CMakeLists.txt || die
-
- # Bug 959808: fix for gfx101x targets
-		pushd "${WORKDIR}/composable_kernel-${CK_COMMIT}" > /dev/null || die
- eapply "${FILESDIR}"/composable-kernel-6.4.1-expand-isa.patch
- popd > /dev/null || die
-
- if tc-is-clang; then
-			# Systemwide gcc (for absl and at::TensorBase) + hipcc (llvm>=18) need abi-compat=17.
-			# But systemwide clang>=18 + hipcc (>=llvm-18) need opposite!
-			# See also: https://github.com/llvm/llvm-project/issues/102443#issuecomment-2329726287
-			sed '/-fclang-abi-compat=17/d' -i cmake/Dependencies.cmake || die
- fi
-
-		# Workaround for libc++ issue https://github.com/llvm/llvm-project/issues/100802
- sed 's/std::memcpy/memcpy/g' -i c10/util/Half.h || die
-
- ebegin "HIPifying cuda sources"
- ${EPYTHON} tools/amd_build/build_amd.py || die
- eend $?
- fi
-}
-
-src_configure() {
- if use cuda && [[ -z ${TORCH_CUDA_ARCH_LIST} ]]; then
-		ewarn "WARNING: caffe2 is being built with its default CUDA compute capabilities: 3.5 and 7.0."
-		ewarn "These may not be optimal for your GPU."
-		ewarn ""
-		ewarn "To configure caffe2 with the CUDA compute capability that is optimal for your GPU,"
-		ewarn "set TORCH_CUDA_ARCH_LIST in your make.conf, and re-emerge caffe2."
-		ewarn "For example, to use CUDA capability 7.5 & 3.5, add: TORCH_CUDA_ARCH_LIST=7.5 3.5"
-		ewarn "For a Maxwell model GPU, an example value would be: TORCH_CUDA_ARCH_LIST=Maxwell"
-		ewarn ""
-		ewarn "You can look up your GPU's CUDA compute capability at https://developer.nvidia.com/cuda-gpus"
-		ewarn "or by running /opt/cuda/extras/demo_suite/deviceQuery | grep 'CUDA Capability'"
- fi
-
- local mycmakeargs=(
- -DBUILD_CUSTOM_PROTOBUF=OFF
- -DLIBSHM_INSTALL_LIB_SUBDIR="${EPREFIX}"/usr/$(get_libdir)
- -DPython_EXECUTABLE="${PYTHON}"
- -DTORCH_INSTALL_LIB_DIR="${EPREFIX}"/usr/$(get_libdir)
- -DUSE_CCACHE=OFF
- -DUSE_CUDA=$(usex cuda)
- -DUSE_DISTRIBUTED=$(usex distributed)
- -DUSE_FAKELOWP=OFF
- -DUSE_FBGEMM=$(usex fbgemm)
- -DUSE_FLASH_ATTENTION=$(usex flash)
- -DUSE_GFLAGS=ON
- -DUSE_GLOG=ON
- -DUSE_GLOO=$(usex gloo)
- -DUSE_ITT=OFF
- -DUSE_KINETO=ON
- -DUSE_KLEIDIAI=OFF # TODO
- -DUSE_MAGMA=OFF # TODO: In GURU as sci-libs/magma
- -DUSE_MEM_EFF_ATTENTION=$(usex memefficient)
- -DUSE_MKLDNN=$(usex onednn)
- -DUSE_MPI=$(usex mpi)
- -DUSE_NCCL=OFF
- -DUSE_NNPACK=$(usex nnpack)
- -DUSE_NUMA=OFF
- -DUSE_NUMPY=$(usex numpy)
- -DUSE_OPENCL=$(usex opencl)
- -DUSE_OPENMP=$(usex openmp)
- -DUSE_PYTORCH_QNNPACK=$(usex qnnpack)
- -DUSE_PYTORCH_METAL=OFF
- -DUSE_ROCM=$(usex rocm)
- -DUSE_SYSTEM_CPUINFO=ON
- -DUSE_SYSTEM_EIGEN_INSTALL=ON
- -DUSE_SYSTEM_FP16=ON
- -DUSE_SYSTEM_FXDIV=ON
- -DUSE_SYSTEM_GLOO=ON
- -DUSE_SYSTEM_NVTX=ON
- -DUSE_SYSTEM_ONNX=ON
- -DUSE_SYSTEM_PSIMD=ON
- -DUSE_SYSTEM_PTHREADPOOL=ON
- -DUSE_SYSTEM_PYBIND11=ON
- -DUSE_SYSTEM_SLEEF=ON
- -DUSE_SYSTEM_XNNPACK=$(usex xnnpack)
- -DUSE_TENSORPIPE=$(usex distributed)
- -DUSE_UCC=OFF
- -DUSE_VALGRIND=OFF
- -DUSE_XNNPACK=$(usex xnnpack)
- -DUSE_XPU=OFF
- -Wno-dev
- )
-
- if use mkl; then
- mycmakeargs+=(-DBLAS=MKL)
- elif use openblas; then
- mycmakeargs+=(-DBLAS=OpenBLAS)
- else
- mycmakeargs+=(-DBLAS=Generic -DBLAS_LIBRARIES=)
- fi
-
- if use cuda; then
- addpredict "/dev/nvidiactl" # bug 867706
- addpredict "/dev/char"
- addpredict "/proc/self/task" # bug 926116
-
- mycmakeargs+=(
- -DUSE_CUDNN=ON
-			-DTORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5 7.0}"
-			-DUSE_NCCL=OFF # TODO: NVIDIA Collective Communication Library
- -DCMAKE_CUDA_FLAGS="$(cuda_gccdir -f | tr -d \")"
- -DUSE_CUSPARSELT=$(usex cusparselt)
- )
- elif use rocm; then
- export PYTORCH_ROCM_ARCH="$(get_amdgpu_flags)"
-
- if use memefficient; then
- export AOTRITON_INSTALLED_PREFIX="${ESYSROOT}/usr"
- fi
-
- mycmakeargs+=(
- -DUSE_NCCL=ON
- -DUSE_SYSTEM_NCCL=ON
- -DCMAKE_REQUIRE_FIND_PACKAGE_HIP=ON
- )
-
- # ROCm libraries produce too much warnings
-		append-cxxflags -Wno-deprecated-declarations -Wno-unused-result -Wno-unused-value
- fi
-
- if use onednn; then
- mycmakeargs+=(
- -DMKLDNN_FOUND=ON
- -DMKLDNN_LIBRARIES=dnnl
-			-DMKLDNN_INCLUDE_DIR="${ESYSROOT}/usr/include/oneapi/dnnl"
- )
- fi
-
- cmake_src_configure
-}
-
-src_compile() {
- PYTORCH_BUILD_VERSION=${PV} \
- PYTORCH_BUILD_NUMBER=0 \
- cmake_src_compile
-}
-
-python_install() {
- python_domodule python/torch
- mkdir "${D}"$(python_get_sitedir)/torch/bin || die
- mkdir "${D}"$(python_get_sitedir)/torch/lib || die
- mkdir "${D}"$(python_get_sitedir)/torch/include || die
- ln -s ../../../../../include/torch \
-		"${D}$(python_get_sitedir)"/torch/include/torch || die # bug 923269
- ln -s ../../../../../bin/torch_shm_manager \
- "${D}"/$(python_get_sitedir)/torch/bin/torch_shm_manager || die
- ln -s ../../../../../$(get_libdir)/libtorch_global_deps.so \
-		"${D}"/$(python_get_sitedir)/torch/lib/libtorch_global_deps.so || die
-}
-
-src_install() {
- cmake_src_install
-
- # Used by pytorch ebuild
- insinto "/var/lib/${PN}"
- doins "${BUILD_DIR}"/CMakeCache.txt
- dostrip -x /var/lib/${PN}/functorch.so
-
- rm -rf python
- mkdir -p python/torch || die
- cp torch/version.py python/torch/ || die
- python_install
-}
diff --git a/sci-ml/caffe2/files/composable-kernel-6.4.1-expand-isa.patch b/sci-ml/caffe2/files/composable-kernel-6.4.1-expand-isa.patch
deleted file mode 100644
index 8a3fb4e1ec6d..000000000000
--- a/sci-ml/caffe2/files/composable-kernel-6.4.1-expand-isa.patch
+++ /dev/null
@@ -1,141 +0,0 @@
-Fix for "undeclared identifier 'CK_BUFFER_RESOURCE_3RD_DWORD'" for AMDGPU_TARGETS="gfx1012".
-Combines of 3 patches from https://github.com/ROCm/composable_kernel/issues/775#issuecomment-2726315348
-
-Bug: https://bugs.gentoo.org/947583
-Bug: https://bugs.gentoo.org/show_bug.cgi?id=959808
---- a/include/ck/ck.hpp
-+++ b/include/ck/ck.hpp
-@@ -82,7 +82,7 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
- #define CK_BUFFER_RESOURCE_3RD_DWORD -1
 #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx9__)
- #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
--#elif defined(__gfx103__)
-+#elif defined(__gfx101__) || defined(__gfx103__)
- #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
- #elif defined(__gfx11__) || defined(__gfx12__)
- #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
-@@ -90,12 +90,12 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
-
- // FMA instruction
- #ifndef __HIP_DEVICE_COMPILE__ // for host code, define nothing
--#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
--#define CK_USE_AMD_V_MAC_F32
--#elif defined(__gfx906__) || defined(__gfx9__) || defined(__gfx103__) // for GPU code
-+#elif defined(__gfx906__) || defined(__gfx9__) || defined(__gfx103__) || defined(__gfx1011__) || defined(__gfx1012__) // for GPU code
- #define CK_USE_AMD_V_FMAC_F32
- #define CK_USE_AMD_V_DOT2_F32_F16
- #define CK_USE_AMD_V_DOT4_I32_I8
-+#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx101__) // for GPU code
-+#define CK_USE_AMD_V_MAC_F32
- #elif defined(__gfx11__) || defined(__gfx12__)
- #define CK_USE_AMD_V_FMAC_F32
- #define CK_USE_AMD_V_DOT2_F32_F16
---- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
-+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
-@@ -71,7 +71,7 @@ __global__ void
- const Block2CTileMap block_2_ctile_map)
- {
- #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
--    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__) || \
-+    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx101__) || defined(__gfx103__) || defined(__gfx11__) || \
- defined(__gfx12__))
-
- const index_t num_blocks_per_batch =
---- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
-+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
-@@ -51,7 +51,7 @@ __global__ void
- const Block2CTileMap block_2_ctile_map)
- {
- #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx9__) || \
--    defined(__gfx103__) || defined(__gfx11__) || defined(__gfx12__))
-+    defined(__gfx101__) || defined(__gfx103__) || defined(__gfx11__) || defined(__gfx12__))
-
- constexpr index_t shared_block_size =
- GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
---- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp
-+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp
-@@ -48,7 +48,7 @@ __global__ void
- const Block2CTileMap block_2_ctile_map,
- const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
- {
--#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
-+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx101__) || defined(__gfx103__) || \
-    defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__) || \
- defined(__gfx12__))
- const index_t num_blocks_per_batch =
---- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
-+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
-@@ -90,7 +90,7 @@ __global__ void
- const Block2CTileMap block_2_ctile_map,
- const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
- {
--#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
-+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx101__) || defined(__gfx103__) || \
-    defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__) || \
- defined(__gfx12__))
- // offset base pointer for each work-group
---- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
-+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
-@@ -106,7 +106,7 @@ __global__ void
- const Block2CTileMap block_2_ctile_map,
- const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
- {
--#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
-+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx101__) || defined(__gfx103__) || \
- defined(__gfx11__) || defined(__gfx12__))
- // offset base pointer for each work-group
- const index_t num_blocks_per_batch =
---- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
-+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
-@@ -40,7 +40,7 @@ __global__ void
-                 const CDEElementwiseOperation cde_element_op)
- {
- #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
--    defined(__gfx90a__) || defined(__gfx103__) || defined(__gfx11__) || defined(__gfx94__) || \
-+    defined(__gfx90a__) || defined(__gfx101__) || defined(__gfx103__) || defined(__gfx11__) || defined(__gfx94__) || \
- defined(__gfx12__))
- __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
---- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dpp.hpp
-+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dpp.hpp
-@@ -28,7 +28,7 @@ __global__ void
- #endif
- kernel_gemm_dpp(const typename GridwiseGemm::Argument karg)
- {
--#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx103__) || defined(__gfx11__))
-+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx101__) || defined(__gfx103__) || defined(__gfx11__))
- __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
- const auto a_grid_desc_ak0_m_ak1 = amd_wave_read_first_lane(
---- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
-+++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
-@@ -36,7 +36,7 @@ __global__ void
-                 const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
- {
- #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
--    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__) || \
-+    defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx101__) || defined(__gfx103__) || defined(__gfx11__) || \
- defined(__gfx12__))
- GridwiseTensorRearrangeKernel::Run(in_grid_desc,
- p_in_global,
---- a/include/ck_tile/core/config.hpp
-+++ b/include/ck_tile/core/config.hpp
-@@ -10,6 +10,9 @@
- #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
- #define __gfx94__
- #endif
-+#if defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__)
-+#define __gfx101__
-+#endif
- #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
- defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
- defined(__gfx10_3_generic__)
-@@ -199,7 +202,7 @@
- #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || \
- defined(__gfx9__) // for GPU code
- #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x00020000
--#elif defined(__gfx103__) // for GPU code
-+#elif defined(__gfx101__) || defined(__gfx103__) // for GPU code
- #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31014000
- #elif defined(__gfx11__) || defined(__gfx12__) // for GPU code
- #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31004000