yaoyaoding commented on code in PR #300:
URL: https://github.com/apache/tvm-ffi/pull/300#discussion_r2629601847


##########
cmake/Utils/EmbedCubin.cmake:
##########
@@ -15,205 +15,163 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# Do not let cmake to link cudart.
+set(CMAKE_CUDA_RUNTIME_LIBRARY None)
+
+# We need this to simulate `CUDA_{CUBIN,FATBIN}_COMPILATION` in 
`add_tvm_ffi_{cubin,fatbin}`, to
+# copy `a.cu.o` to `a.cubin`/`a.fatbin`.
+set(COPY_SCRIPT "${CMAKE_BINARY_DIR}/cuda_copy_utils.cmake")
+file(
+  WRITE ${COPY_SCRIPT}
+  "
+# Arguments: OBJECTS (semicolon-separated list), OUT_DIR, EXT
+string(REPLACE \"\\\"\" \"\" ext_strip \"\${EXT}\")
+string(REPLACE \"\\\"\" \"\" out_dir_strip \"\${OUT_DIR}\")
+foreach(obj_raw \${OBJECTS})
+  string(REPLACE \"\\\"\" \"\" obj \"\${obj_raw}\")
+
+  # Extract filename: /path/to/kernel.cu.o -> kernel
+  # Note: CMake objects are usually named source.cu.o, so we strip extensions 
twice.
+  get_filename_component(fname \${obj} NAME_WE)
+  get_filename_component(fname \${fname} NAME_WE)
+
+  # If OUT_DIR is provided, use it. Otherwise, use the object's directory.
+  if(NOT out_dir_strip STREQUAL \"\")
+      set(final_dir \"\${out_dir_strip}\")
+  else()
+      get_filename_component(final_dir \${obj} DIRECTORY)
+  endif()
+
+  message(\"Copying \${obj} -> \${final_dir}/\${fname}.\${ext_strip}\")
+  execute_process(
+    COMMAND \${CMAKE_COMMAND} -E copy_if_different
+    \"\${obj}\"
+    \"\${final_dir}/\${fname}.\${ext_strip}\"
+  )
+endforeach()
+"
+)
+
 # ~~~
-# tvm_ffi_generate_cubin(
-#   OUTPUT <output_cubin_file>
-#   SOURCE <cuda_source_file>
-#   [ARCH <architecture>]
-#   [OPTIONS <extra_nvcc_options>...]
-#   [DEPENDS <additional_dependencies>...]
-# )
+# add_tvm_ffi_cubin(<target_name> CUDA <source_file>)
 #
-# Compiles a CUDA source file to CUBIN format using nvcc.
+# Creates an object library that compiles CUDA source to CUBIN format.
+# This function uses CMake's native CUDA support and respects 
CMAKE_CUDA_ARCHITECTURES.
+# This is a compatibility util for cmake < 3.27, user can create
+# cmake target with `CUDA_CUBIN_COMPILATION` for cmake >= 3.27.
 #
 # Parameters:
-#   OUTPUT: Path to the output CUBIN file (e.g., kernel.cubin)
-#   SOURCE: Path to the CUDA source file (e.g., kernel.cu)
-#   ARCH: Target GPU architecture (default: native for auto-detection)
-#         Examples: sm_75, sm_80, sm_86, compute_80, native
-#   OPTIONS: Additional nvcc compiler options (e.g., -O3, --use_fast_math)
-#   DEPENDS: Optional additional dependencies
-#
-# The function will:
-#   1. Find the CUDA compiler (nvcc)
-#   2. Compile the SOURCE to CUBIN with specified architecture and options
-#   3. Create the output CUBIN file
+#   target_name: Name of the object library target
+#   CUDA: One CUDA source file
 #
 # Example:
-#   tvm_ffi_generate_cubin(
-#     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/kernel.cubin
-#     SOURCE src/kernel.cu
-#     ARCH native
-#     OPTIONS -O3 --use_fast_math
-#   )
+#   add_tvm_ffi_cubin(my_kernel_cubin CUDA kernel.cu)
 # ~~~
-
-# cmake-lint: disable=C0111,C0103
-function (tvm_ffi_generate_cubin)
-  # Parse arguments
-  set(options "")
-  set(oneValueArgs OUTPUT SOURCE ARCH)
-  set(multiValueArgs OPTIONS DEPENDS)
-  cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" 
${ARGN})
-
-  # Validate required arguments
-  if (NOT ARG_OUTPUT)
-    message(FATAL_ERROR "tvm_ffi_generate_cubin: OUTPUT is required")
-  endif ()
-  if (NOT ARG_SOURCE)
-    message(FATAL_ERROR "tvm_ffi_generate_cubin: SOURCE is required")
+function (add_tvm_ffi_cubin target_name)
+  cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
+  if (NOT ARG_CUDA)
+    message(FATAL_ERROR "add_tvm_ffi_cubin: CUDA source is required")
   endif ()
 
-  # Default architecture to native if not specified
-  if (NOT ARG_ARCH)
-    set(ARG_ARCH "native")
-  endif ()
+  add_library(${target_name} OBJECT ${ARG_CUDA})
+  target_compile_options(${target_name} PRIVATE 
$<$<COMPILE_LANGUAGE:CUDA>:--cubin>)
 
-  # Ensure CUDA compiler is available
-  if (NOT CMAKE_CUDA_COMPILER)
-    message(
-      FATAL_ERROR
-        "tvm_ffi_generate_cubin: CMAKE_CUDA_COMPILER not found. Enable CUDA 
language in project()."
-    )
+  add_custom_target(
+    ${target_name}_bin ALL
+    COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" 
-DOUT_DIR="" -DEXT="cubin"
+            -P "${COPY_SCRIPT}"
+    DEPENDS ${target_name}
+    COMMENT "Generating .cubin files for ${target_name}"
+    VERBATIM
+  )
+endfunction ()
+
+# ~~~
+# add_tvm_ffi_fatbin(<target_name> CUDA <source_file>)
+#
+# Creates an object library that compiles CUDA source to FATBIN format.
+# This function uses CMake's native CUDA support and respects 
CMAKE_CUDA_ARCHITECTURES.
+# This is a compatibility util for cmake < 3.27, user can create
+# cmake target with `CUDA_FATBIN_COMPILATION` for cmake >= 3.27.
+#
+# Parameters:
+#   target_name: Name of the object library target
+#   CUDA: One CUDA source file
+#
+# Example:
+#   add_tvm_ffi_fatbin(my_kernel_cubin CUDA kernel.cu)
+# ~~~
+function (add_tvm_ffi_fatbin target_name)
+  cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
+  if (NOT ARG_CUDA)
+    message(FATAL_ERROR "add_tvm_ffi_fatbin: CUDA source is required")
   endif ()
 
-  # Get absolute paths
-  get_filename_component(ARG_SOURCE_ABS "${ARG_SOURCE}" ABSOLUTE)
-  get_filename_component(ARG_OUTPUT_ABS "${ARG_OUTPUT}" ABSOLUTE)
+  add_library(${target_name} OBJECT ${ARG_CUDA})
+  target_compile_options(${target_name} PRIVATE 
$<$<COMPILE_LANGUAGE:CUDA>:--fatbin>)
 
-  # Build nvcc command
-  add_custom_command(
-    OUTPUT "${ARG_OUTPUT_ABS}"
-    COMMAND ${CMAKE_CUDA_COMPILER} --cubin -arch=${ARG_ARCH} ${ARG_OPTIONS} 
"${ARG_SOURCE_ABS}" -o
-            "${ARG_OUTPUT_ABS}"
-    DEPENDS "${ARG_SOURCE_ABS}" ${ARG_DEPENDS}
-    COMMENT "Compiling ${ARG_SOURCE} to CUBIN (arch: ${ARG_ARCH})"
+  add_custom_target(
+    ${target_name}_bin ALL
+    COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" 
-DOUT_DIR=""
+            -DEXT="fatbin" -P "${COPY_SCRIPT}"
+    DEPENDS ${target_name}
+    COMMENT "Generating .fatbin files for ${target_name}"
     VERBATIM
   )
 endfunction ()
 
 # ~~~
-# tvm_ffi_embed_cubin(
-#   OUTPUT <output_object_file>
-#   SOURCE <source_file>
-#   CUBIN <cubin_file>
-#   NAME <symbol_name>
-#   [DEPENDS <additional_dependencies>...]
-# )
+# tvm_ffi_embed_bin_into(<target_name> <library_name>
+#                        BIN <cubin_or_fatbin>
+#                        INTERMEDIATE_FILE <intermediate_path>)
 #
-# Compiles a C++ source file and embeds a CUBIN file into it, creating a
-# combined object file that can be linked into a shared library or executable.
+# Embed one cubin/fatbin into given target with specified library name,
+# can be loaded with `TVM_FFI_EMBED_CUBIN(library_name)`.
+# Can only have one object in target and one cubin/fatbin.
 #
-# Parameters:
-#   OUTPUT: Path to the output object file (e.g., lib_embedded_with_cubin.o)
-#   SOURCE: Path to the C++ source file that uses TVM_FFI_EMBED_CUBIN macro
-#   CUBIN: Path to the CUBIN file to embed (can be a file path or a custom 
target output)
-#   NAME: Name used in the TVM_FFI_EMBED_CUBIN macro (e.g., "env" for 
TVM_FFI_EMBED_CUBIN(env))
-#   DEPENDS: Optional additional dependencies (e.g., custom targets)
+# The reason of this design is to integrate with cmake's workflow.
 #
-# The function will:
-#   1. Compile the SOURCE file to an intermediate object file
-#   2. Use the tvm_ffi.utils.embed_cubin Python utility to merge the object 
file
-#      with the CUBIN data
-#   3. Create symbols: __tvm_ffi__cubin_<NAME> and __tvm_ffi__cubin_<NAME>_end
+# Parameters:
+#   target_name: Name of the object library target
+#   library_name: Name of the kernel library
+#   BIN: CUBIN or FATBIN file
+#   INTERMEDIATE_FILE: Optional, location to copy original object file to.

Review Comment:
   what's the purpose of exposing `INTERMEDIATE_FILE` to the user?



##########
include/tvm/ffi/extra/cuda/unify_api.h:
##########
@@ -0,0 +1,183 @@
+/*

Review Comment:
   It's not clear to me whether we should unify the CUDA runtime API and driver 
API in a standalone header and expose it to the users - people might rely on 
these headers, but that might not be the intention of this PR.
   
   I prefer to merge this file into cubin_launcher.h, and add `_` as the 
prefix to the unified functions so that they stay internal and we can refactor 
the implementation in the future when needed.
   
   @tqchen , what do you think about `unify_api.h`?



##########
cmake/Utils/EmbedCubin.cmake:
##########
@@ -15,205 +15,163 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# Do not let cmake to link cudart.
+set(CMAKE_CUDA_RUNTIME_LIBRARY None)

Review Comment:
   consider restoring this option after we create the targets in this utility, 
since downstream projects might configure this option.



##########
examples/cubin_launcher/dynamic_cubin/src/lib_dynamic.cc:
##########
@@ -80,14 +80,19 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) 
{
 
   // Get CUDA stream
   DLDevice device = x.device();
-  cudaStream_t stream =
-      static_cast<cudaStream_t>(TVMFFIEnvGetStream(device.device_type, 
device.device_id));
+  tvm::ffi::StreamHandle stream =
+      
static_cast<tvm::ffi::StreamHandle>(TVMFFIEnvGetStream(device.device_type, 
device.device_id));
 
   // Launch kernel
-  cudaError_t result = g_add_one_kernel->Launch(args, grid, block, stream);
+  tvm::ffi::ResultHandle result = g_add_one_kernel->Launch(args, grid, block, 
stream);

Review Comment:
   Also, "Handle" is usually used for a "pointer" / resource index. 



##########
examples/cubin_launcher/dynamic_cubin/src/lib_dynamic.cc:
##########
@@ -80,14 +80,19 @@ void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) 
{
 
   // Get CUDA stream
   DLDevice device = x.device();
-  cudaStream_t stream =
-      static_cast<cudaStream_t>(TVMFFIEnvGetStream(device.device_type, 
device.device_id));
+  tvm::ffi::StreamHandle stream =
+      
static_cast<tvm::ffi::StreamHandle>(TVMFFIEnvGetStream(device.device_type, 
device.device_id));
 
   // Launch kernel
-  cudaError_t result = g_add_one_kernel->Launch(args, grid, block, stream);
+  tvm::ffi::ResultHandle result = g_add_one_kernel->Launch(args, grid, block, 
stream);

Review Comment:
   The name `tvm::ffi::ResultHandle` is also too generic. Is it CUDA specific? 
If so, it would be better to add 'CUDA' to the name.



##########
docs/guides/cubin_launcher.rst:
##########
@@ -374,6 +411,7 @@ C++ Macros
 ~~~~~~~~~~
 
 - :c:macro:`TVM_FFI_EMBED_CUBIN`: Declare embedded CUBIN module
+- :c:macro:`TVM_FFI_LOAD_LIBRARY_FROM_BYTES`: Load CUBIN from byte array

Review Comment:
   In the global context of tvm-ffi, this name is a little confusing; consider 
adding `CUBIN` to the name so that others know it's a cubin-related utility 
macro.



##########
cmake/Utils/EmbedCubin.cmake:
##########
@@ -15,205 +15,163 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# Do not let cmake to link cudart.
+set(CMAKE_CUDA_RUNTIME_LIBRARY None)
+
+# We need this to simulate `CUDA_{CUBIN,FATBIN}_COMPILATION` in 
`add_tvm_ffi_{cubin,fatbin}`, to
+# copy `a.cu.o` to `a.cubin`/`a.fatbin`.
+set(COPY_SCRIPT "${CMAKE_BINARY_DIR}/cuda_copy_utils.cmake")
+file(
+  WRITE ${COPY_SCRIPT}
+  "
+# Arguments: OBJECTS (semicolon-separated list), OUT_DIR, EXT
+string(REPLACE \"\\\"\" \"\" ext_strip \"\${EXT}\")
+string(REPLACE \"\\\"\" \"\" out_dir_strip \"\${OUT_DIR}\")
+foreach(obj_raw \${OBJECTS})
+  string(REPLACE \"\\\"\" \"\" obj \"\${obj_raw}\")
+
+  # Extract filename: /path/to/kernel.cu.o -> kernel
+  # Note: CMake objects are usually named source.cu.o, so we strip extensions 
twice.
+  get_filename_component(fname \${obj} NAME_WE)
+  get_filename_component(fname \${fname} NAME_WE)
+
+  # If OUT_DIR is provided, use it. Otherwise, use the object's directory.
+  if(NOT out_dir_strip STREQUAL \"\")
+      set(final_dir \"\${out_dir_strip}\")
+  else()
+      get_filename_component(final_dir \${obj} DIRECTORY)
+  endif()
+
+  message(\"Copying \${obj} -> \${final_dir}/\${fname}.\${ext_strip}\")
+  execute_process(
+    COMMAND \${CMAKE_COMMAND} -E copy_if_different
+    \"\${obj}\"
+    \"\${final_dir}/\${fname}.\${ext_strip}\"
+  )
+endforeach()
+"
+)
+
 # ~~~
-# tvm_ffi_generate_cubin(
-#   OUTPUT <output_cubin_file>
-#   SOURCE <cuda_source_file>
-#   [ARCH <architecture>]
-#   [OPTIONS <extra_nvcc_options>...]
-#   [DEPENDS <additional_dependencies>...]
-# )
+# add_tvm_ffi_cubin(<target_name> CUDA <source_file>)
 #
-# Compiles a CUDA source file to CUBIN format using nvcc.
+# Creates an object library that compiles CUDA source to CUBIN format.
+# This function uses CMake's native CUDA support and respects 
CMAKE_CUDA_ARCHITECTURES.
+# This is a compatibility util for cmake < 3.27, user can create
+# cmake target with `CUDA_CUBIN_COMPILATION` for cmake >= 3.27.
 #
 # Parameters:
-#   OUTPUT: Path to the output CUBIN file (e.g., kernel.cubin)
-#   SOURCE: Path to the CUDA source file (e.g., kernel.cu)
-#   ARCH: Target GPU architecture (default: native for auto-detection)
-#         Examples: sm_75, sm_80, sm_86, compute_80, native
-#   OPTIONS: Additional nvcc compiler options (e.g., -O3, --use_fast_math)
-#   DEPENDS: Optional additional dependencies
-#
-# The function will:
-#   1. Find the CUDA compiler (nvcc)
-#   2. Compile the SOURCE to CUBIN with specified architecture and options
-#   3. Create the output CUBIN file
+#   target_name: Name of the object library target
+#   CUDA: One CUDA source file
 #
 # Example:
-#   tvm_ffi_generate_cubin(
-#     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/kernel.cubin
-#     SOURCE src/kernel.cu
-#     ARCH native
-#     OPTIONS -O3 --use_fast_math
-#   )
+#   add_tvm_ffi_cubin(my_kernel_cubin CUDA kernel.cu)
 # ~~~
-
-# cmake-lint: disable=C0111,C0103
-function (tvm_ffi_generate_cubin)
-  # Parse arguments
-  set(options "")
-  set(oneValueArgs OUTPUT SOURCE ARCH)
-  set(multiValueArgs OPTIONS DEPENDS)
-  cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" 
${ARGN})
-
-  # Validate required arguments
-  if (NOT ARG_OUTPUT)
-    message(FATAL_ERROR "tvm_ffi_generate_cubin: OUTPUT is required")
-  endif ()
-  if (NOT ARG_SOURCE)
-    message(FATAL_ERROR "tvm_ffi_generate_cubin: SOURCE is required")
+function (add_tvm_ffi_cubin target_name)
+  cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
+  if (NOT ARG_CUDA)
+    message(FATAL_ERROR "add_tvm_ffi_cubin: CUDA source is required")
   endif ()
 
-  # Default architecture to native if not specified
-  if (NOT ARG_ARCH)
-    set(ARG_ARCH "native")
-  endif ()
+  add_library(${target_name} OBJECT ${ARG_CUDA})
+  target_compile_options(${target_name} PRIVATE 
$<$<COMPILE_LANGUAGE:CUDA>:--cubin>)
 
-  # Ensure CUDA compiler is available
-  if (NOT CMAKE_CUDA_COMPILER)
-    message(
-      FATAL_ERROR
-        "tvm_ffi_generate_cubin: CMAKE_CUDA_COMPILER not found. Enable CUDA 
language in project()."
-    )
+  add_custom_target(
+    ${target_name}_bin ALL
+    COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" 
-DOUT_DIR="" -DEXT="cubin"
+            -P "${COPY_SCRIPT}"
+    DEPENDS ${target_name}
+    COMMENT "Generating .cubin files for ${target_name}"
+    VERBATIM
+  )
+endfunction ()
+
+# ~~~
+# add_tvm_ffi_fatbin(<target_name> CUDA <source_file>)
+#
+# Creates an object library that compiles CUDA source to FATBIN format.
+# This function uses CMake's native CUDA support and respects 
CMAKE_CUDA_ARCHITECTURES.
+# This is a compatibility util for cmake < 3.27, user can create
+# cmake target with `CUDA_FATBIN_COMPILATION` for cmake >= 3.27.
+#
+# Parameters:
+#   target_name: Name of the object library target
+#   CUDA: One CUDA source file
+#
+# Example:
+#   add_tvm_ffi_fatbin(my_kernel_cubin CUDA kernel.cu)
+# ~~~
+function (add_tvm_ffi_fatbin target_name)
+  cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
+  if (NOT ARG_CUDA)
+    message(FATAL_ERROR "add_tvm_ffi_fatbin: CUDA source is required")
   endif ()
 
-  # Get absolute paths
-  get_filename_component(ARG_SOURCE_ABS "${ARG_SOURCE}" ABSOLUTE)
-  get_filename_component(ARG_OUTPUT_ABS "${ARG_OUTPUT}" ABSOLUTE)
+  add_library(${target_name} OBJECT ${ARG_CUDA})
+  target_compile_options(${target_name} PRIVATE 
$<$<COMPILE_LANGUAGE:CUDA>:--fatbin>)
 
-  # Build nvcc command
-  add_custom_command(
-    OUTPUT "${ARG_OUTPUT_ABS}"
-    COMMAND ${CMAKE_CUDA_COMPILER} --cubin -arch=${ARG_ARCH} ${ARG_OPTIONS} 
"${ARG_SOURCE_ABS}" -o
-            "${ARG_OUTPUT_ABS}"
-    DEPENDS "${ARG_SOURCE_ABS}" ${ARG_DEPENDS}
-    COMMENT "Compiling ${ARG_SOURCE} to CUBIN (arch: ${ARG_ARCH})"
+  add_custom_target(
+    ${target_name}_bin ALL
+    COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" 
-DOUT_DIR=""
+            -DEXT="fatbin" -P "${COPY_SCRIPT}"
+    DEPENDS ${target_name}
+    COMMENT "Generating .fatbin files for ${target_name}"
     VERBATIM
   )
 endfunction ()
 
 # ~~~
-# tvm_ffi_embed_cubin(
-#   OUTPUT <output_object_file>
-#   SOURCE <source_file>
-#   CUBIN <cubin_file>
-#   NAME <symbol_name>
-#   [DEPENDS <additional_dependencies>...]
-# )
+# tvm_ffi_embed_bin_into(<target_name> <library_name>
+#                        BIN <cubin_or_fatbin>
+#                        INTERMEDIATE_FILE <intermediate_path>)
 #
-# Compiles a C++ source file and embeds a CUBIN file into it, creating a
-# combined object file that can be linked into a shared library or executable.
+# Embed one cubin/fatbin into given target with specified library name,
+# can be loaded with `TVM_FFI_EMBED_CUBIN(library_name)`.
+# Can only have one object in target and one cubin/fatbin.
 #
-# Parameters:
-#   OUTPUT: Path to the output object file (e.g., lib_embedded_with_cubin.o)
-#   SOURCE: Path to the C++ source file that uses TVM_FFI_EMBED_CUBIN macro
-#   CUBIN: Path to the CUBIN file to embed (can be a file path or a custom 
target output)
-#   NAME: Name used in the TVM_FFI_EMBED_CUBIN macro (e.g., "env" for 
TVM_FFI_EMBED_CUBIN(env))
-#   DEPENDS: Optional additional dependencies (e.g., custom targets)
+# The reason of this design is to integrate with cmake's workflow.
 #
-# The function will:
-#   1. Compile the SOURCE file to an intermediate object file
-#   2. Use the tvm_ffi.utils.embed_cubin Python utility to merge the object 
file
-#      with the CUBIN data
-#   3. Create symbols: __tvm_ffi__cubin_<NAME> and __tvm_ffi__cubin_<NAME>_end
+# Parameters:
+#   target_name: Name of the object library target
+#   library_name: Name of the kernel library
+#   BIN: CUBIN or FATBIN file
+#   INTERMEDIATE_FILE: Optional, location to copy original object file to.
 #
 # Example:
-#   tvm_ffi_embed_cubin(
-#     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib_embedded_with_cubin.o
-#     SOURCE src/lib_embedded.cc
-#     CUBIN ${CMAKE_CURRENT_BINARY_DIR}/kernel.cubin
-#     NAME env
-#   )
-#
-#   add_library(lib_embedded SHARED 
${CMAKE_CURRENT_BINARY_DIR}/lib_embedded_with_cubin.o)
-#   target_link_libraries(lib_embedded PRIVATE tvm_ffi_header CUDA::cudart)
-#
-# Note: The .note.GNU-stack section is automatically added to mark the stack as
-#       non-executable, so you don't need to add linker options manually
+#   tvm_ffi_embed_bin_into(lib_embedded env BIN 
"$<TARGET_OBJECTS:kernel_fatbin>")
 # ~~~
+function (tvm_ffi_embed_bin_into target_name kernel_name)
+  cmake_parse_arguments(ARG "" "BIN;INTERMEDIATE_FILE" "" ${ARGN})
 
-# cmake-lint: disable=C0111,C0103
-function (tvm_ffi_embed_cubin)
-  # Parse arguments
-  set(options "")
-  set(oneValueArgs OUTPUT SOURCE CUBIN NAME)
-  set(multiValueArgs DEPENDS)
-  cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" 
${ARGN})
-
-  # Validate required arguments
-  if (NOT ARG_OUTPUT)
-    message(FATAL_ERROR "tvm_ffi_embed_cubin: OUTPUT is required")
-  endif ()
-  if (NOT ARG_SOURCE)
-    message(FATAL_ERROR "tvm_ffi_embed_cubin: SOURCE is required")
-  endif ()
-  if (NOT ARG_CUBIN)
-    message(FATAL_ERROR "tvm_ffi_embed_cubin: CUBIN is required")
-  endif ()
-  if (NOT ARG_NAME)
-    message(FATAL_ERROR "tvm_ffi_embed_cubin: NAME is required")
+  if (NOT ARG_BIN)
+    message(FATAL_ERROR "tvm_ffi_embed_object: BIN is required")
   endif ()
 
-  # Ensure Python is found (prefer virtualenv)
-  if (NOT Python_EXECUTABLE)
-    set(Python_FIND_VIRTUALENV FIRST)
-    find_package(
-      Python
-      COMPONENTS Interpreter
-      REQUIRED
-    )
-  endif ()
+  get_filename_component(LIB_ABS "$<TARGET_OBJECTS:${target_name}>" ABSOLUTE)
+  if (NOT ARG_INTERMEDIATE_FILE)
+    get_filename_component(OUTPUT_DIR_ABS "${LIB_ABS}" DIRECTORY)
 
-  # Get absolute paths
-  get_filename_component(ARG_SOURCE_ABS "${ARG_SOURCE}" ABSOLUTE)
-  get_filename_component(ARG_OUTPUT_ABS "${ARG_OUTPUT}" ABSOLUTE)
-
-  # Generate intermediate object file path
-  get_filename_component(OUTPUT_DIR "${ARG_OUTPUT_ABS}" DIRECTORY)
-  get_filename_component(OUTPUT_NAME "${ARG_OUTPUT_ABS}" NAME_WE)
-  set(INTERMEDIATE_OBJ "${OUTPUT_DIR}/${OUTPUT_NAME}_intermediate.o")
-
-  # Get include directories from tvm_ffi_header
-  get_target_property(TVM_FFI_INCLUDES tvm_ffi_header 
INTERFACE_INCLUDE_DIRECTORIES)
-
-  # Convert list to -I flags
-  set(INCLUDE_FLAGS "")
-  foreach (inc_dir ${TVM_FFI_INCLUDES})
-    list(APPEND INCLUDE_FLAGS "-I${inc_dir}")
-  endforeach ()
-
-  # Add CUDA include directories if CUDAToolkit is found
-  if (TARGET CUDA::cudart)
-    get_target_property(CUDA_INCLUDES CUDA::cudart 
INTERFACE_INCLUDE_DIRECTORIES)
-    foreach (inc_dir ${CUDA_INCLUDES})
-      list(APPEND INCLUDE_FLAGS "-I${inc_dir}")
-    endforeach ()
+    set(final_output "${OUTPUT_DIR_ABS}/${kernel_name}_intermediate.o")
+  else ()
+    get_filename_component(final_output "${ARG_INTERMEDIATE_FILE}" ABSOLUTE)
   endif ()
 
-  # Step 1: Compile source file to intermediate object file
   add_custom_command(
-    OUTPUT "${INTERMEDIATE_OBJ}"
-    COMMAND ${CMAKE_CXX_COMPILER} -c -fPIC -std=c++17 ${INCLUDE_FLAGS} 
"${ARG_SOURCE_ABS}" -o
-            "${INTERMEDIATE_OBJ}"
-    DEPENDS "${ARG_SOURCE_ABS}"
-    COMMENT "Compiling ${ARG_SOURCE} to intermediate object file"
-    VERBATIM
+    TARGET ${target_name}
+    PRE_LINK
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different 
"$<TARGET_OBJECTS:${target_name}>"

Review Comment:
   Can we directly use `$<TARGET_OBJECTS:${target_name}>` in the subsequent 
`add_custom_command` to replace `${final_output}`? Some comments would be 
helpful for future developers.



##########
cmake/Utils/EmbedCubin.cmake:
##########
@@ -15,205 +15,163 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# Do not let cmake to link cudart.
+set(CMAKE_CUDA_RUNTIME_LIBRARY None)
+
+# We need this to simulate `CUDA_{CUBIN,FATBIN}_COMPILATION` in 
`add_tvm_ffi_{cubin,fatbin}`, to
+# copy `a.cu.o` to `a.cubin`/`a.fatbin`.
+set(COPY_SCRIPT "${CMAKE_BINARY_DIR}/cuda_copy_utils.cmake")

Review Comment:
   Can this inline script be a standalone script under cmake/Utils?



##########
cmake/Utils/EmbedCubin.cmake:
##########
@@ -15,205 +15,163 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# Do not let cmake to link cudart.
+set(CMAKE_CUDA_RUNTIME_LIBRARY None)
+
+# We need this to simulate `CUDA_{CUBIN,FATBIN}_COMPILATION` in 
`add_tvm_ffi_{cubin,fatbin}`, to
+# copy `a.cu.o` to `a.cubin`/`a.fatbin`.
+set(COPY_SCRIPT "${CMAKE_BINARY_DIR}/cuda_copy_utils.cmake")
+file(
+  WRITE ${COPY_SCRIPT}
+  "
+# Arguments: OBJECTS (semicolon-separated list), OUT_DIR, EXT
+string(REPLACE \"\\\"\" \"\" ext_strip \"\${EXT}\")
+string(REPLACE \"\\\"\" \"\" out_dir_strip \"\${OUT_DIR}\")
+foreach(obj_raw \${OBJECTS})
+  string(REPLACE \"\\\"\" \"\" obj \"\${obj_raw}\")
+
+  # Extract filename: /path/to/kernel.cu.o -> kernel
+  # Note: CMake objects are usually named source.cu.o, so we strip extensions 
twice.
+  get_filename_component(fname \${obj} NAME_WE)
+  get_filename_component(fname \${fname} NAME_WE)
+
+  # If OUT_DIR is provided, use it. Otherwise, use the object's directory.
+  if(NOT out_dir_strip STREQUAL \"\")
+      set(final_dir \"\${out_dir_strip}\")
+  else()
+      get_filename_component(final_dir \${obj} DIRECTORY)
+  endif()
+
+  message(\"Copying \${obj} -> \${final_dir}/\${fname}.\${ext_strip}\")
+  execute_process(
+    COMMAND \${CMAKE_COMMAND} -E copy_if_different
+    \"\${obj}\"
+    \"\${final_dir}/\${fname}.\${ext_strip}\"
+  )
+endforeach()
+"
+)
+
 # ~~~
-# tvm_ffi_generate_cubin(
-#   OUTPUT <output_cubin_file>
-#   SOURCE <cuda_source_file>
-#   [ARCH <architecture>]
-#   [OPTIONS <extra_nvcc_options>...]
-#   [DEPENDS <additional_dependencies>...]
-# )
+# add_tvm_ffi_cubin(<target_name> CUDA <source_file>)
 #
-# Compiles a CUDA source file to CUBIN format using nvcc.
+# Creates an object library that compiles CUDA source to CUBIN format.
+# This function uses CMake's native CUDA support and respects 
CMAKE_CUDA_ARCHITECTURES.
+# This is a compatibility util for cmake < 3.27, user can create
+# cmake target with `CUDA_CUBIN_COMPILATION` for cmake >= 3.27.
 #
 # Parameters:
-#   OUTPUT: Path to the output CUBIN file (e.g., kernel.cubin)
-#   SOURCE: Path to the CUDA source file (e.g., kernel.cu)
-#   ARCH: Target GPU architecture (default: native for auto-detection)
-#         Examples: sm_75, sm_80, sm_86, compute_80, native
-#   OPTIONS: Additional nvcc compiler options (e.g., -O3, --use_fast_math)
-#   DEPENDS: Optional additional dependencies
-#
-# The function will:
-#   1. Find the CUDA compiler (nvcc)
-#   2. Compile the SOURCE to CUBIN with specified architecture and options
-#   3. Create the output CUBIN file
+#   target_name: Name of the object library target
+#   CUDA: One CUDA source file
 #
 # Example:
-#   tvm_ffi_generate_cubin(
-#     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/kernel.cubin
-#     SOURCE src/kernel.cu
-#     ARCH native
-#     OPTIONS -O3 --use_fast_math
-#   )
+#   add_tvm_ffi_cubin(my_kernel_cubin CUDA kernel.cu)
 # ~~~
-
-# cmake-lint: disable=C0111,C0103
-function (tvm_ffi_generate_cubin)
-  # Parse arguments
-  set(options "")
-  set(oneValueArgs OUTPUT SOURCE ARCH)
-  set(multiValueArgs OPTIONS DEPENDS)
-  cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" 
${ARGN})
-
-  # Validate required arguments
-  if (NOT ARG_OUTPUT)
-    message(FATAL_ERROR "tvm_ffi_generate_cubin: OUTPUT is required")
-  endif ()
-  if (NOT ARG_SOURCE)
-    message(FATAL_ERROR "tvm_ffi_generate_cubin: SOURCE is required")
+function (add_tvm_ffi_cubin target_name)
+  cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
+  if (NOT ARG_CUDA)
+    message(FATAL_ERROR "add_tvm_ffi_cubin: CUDA source is required")
   endif ()
 
-  # Default architecture to native if not specified
-  if (NOT ARG_ARCH)
-    set(ARG_ARCH "native")
-  endif ()
+  add_library(${target_name} OBJECT ${ARG_CUDA})
+  target_compile_options(${target_name} PRIVATE 
$<$<COMPILE_LANGUAGE:CUDA>:--cubin>)
 
-  # Ensure CUDA compiler is available
-  if (NOT CMAKE_CUDA_COMPILER)
-    message(
-      FATAL_ERROR
-        "tvm_ffi_generate_cubin: CMAKE_CUDA_COMPILER not found. Enable CUDA 
language in project()."
-    )
+  add_custom_target(
+    ${target_name}_bin ALL
+    COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" 
-DOUT_DIR="" -DEXT="cubin"
+            -P "${COPY_SCRIPT}"
+    DEPENDS ${target_name}
+    COMMENT "Generating .cubin files for ${target_name}"
+    VERBATIM
+  )
+endfunction ()
+
+# ~~~
+# add_tvm_ffi_fatbin(<target_name> CUDA <source_file>)
+#
+# Creates an object library that compiles CUDA source to FATBIN format.
+# This function uses CMake's native CUDA support and respects 
CMAKE_CUDA_ARCHITECTURES.
+# This is a compatibility util for cmake < 3.27, user can create
+# cmake target with `CUDA_FATBIN_COMPILATION` for cmake >= 3.27.
+#
+# Parameters:
+#   target_name: Name of the object library target
+#   CUDA: One CUDA source file
+#
+# Example:
+#   add_tvm_ffi_fatbin(my_kernel_cubin CUDA kernel.cu)
+# ~~~
+function (add_tvm_ffi_fatbin target_name)
+  cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
+  if (NOT ARG_CUDA)
+    message(FATAL_ERROR "add_tvm_ffi_fatbin: CUDA source is required")
   endif ()
 
-  # Get absolute paths
-  get_filename_component(ARG_SOURCE_ABS "${ARG_SOURCE}" ABSOLUTE)
-  get_filename_component(ARG_OUTPUT_ABS "${ARG_OUTPUT}" ABSOLUTE)
+  add_library(${target_name} OBJECT ${ARG_CUDA})
+  target_compile_options(${target_name} PRIVATE 
$<$<COMPILE_LANGUAGE:CUDA>:--fatbin>)
 
-  # Build nvcc command
-  add_custom_command(
-    OUTPUT "${ARG_OUTPUT_ABS}"
-    COMMAND ${CMAKE_CUDA_COMPILER} --cubin -arch=${ARG_ARCH} ${ARG_OPTIONS} 
"${ARG_SOURCE_ABS}" -o
-            "${ARG_OUTPUT_ABS}"
-    DEPENDS "${ARG_SOURCE_ABS}" ${ARG_DEPENDS}
-    COMMENT "Compiling ${ARG_SOURCE} to CUBIN (arch: ${ARG_ARCH})"
+  add_custom_target(
+    ${target_name}_bin ALL
+    COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" 
-DOUT_DIR=""
+            -DEXT="fatbin" -P "${COPY_SCRIPT}"
+    DEPENDS ${target_name}
+    COMMENT "Generating .fatbin files for ${target_name}"
     VERBATIM
   )
 endfunction ()
 
 # ~~~
-# tvm_ffi_embed_cubin(
-#   OUTPUT <output_object_file>
-#   SOURCE <source_file>
-#   CUBIN <cubin_file>
-#   NAME <symbol_name>
-#   [DEPENDS <additional_dependencies>...]
-# )
+# tvm_ffi_embed_bin_into(<target_name> <library_name>
+#                        BIN <cubin_or_fatbin>
+#                        INTERMEDIATE_FILE <intermediate_path>)
 #
-# Compiles a C++ source file and embeds a CUBIN file into it, creating a
-# combined object file that can be linked into a shared library or executable.
+# Embed one cubin/fatbin into given target with specified library name,
+# can be loaded with `TVM_FFI_EMBED_CUBIN(library_name)`.
+# Can only have one object in target and one cubin/fatbin.
 #
-# Parameters:
-#   OUTPUT: Path to the output object file (e.g., lib_embedded_with_cubin.o)
-#   SOURCE: Path to the C++ source file that uses TVM_FFI_EMBED_CUBIN macro
-#   CUBIN: Path to the CUBIN file to embed (can be a file path or a custom 
target output)
-#   NAME: Name used in the TVM_FFI_EMBED_CUBIN macro (e.g., "env" for 
TVM_FFI_EMBED_CUBIN(env))
-#   DEPENDS: Optional additional dependencies (e.g., custom targets)
+# The reason of this design is to integrate with cmake's workflow.
 #
-# The function will:
-#   1. Compile the SOURCE file to an intermediate object file
-#   2. Use the tvm_ffi.utils.embed_cubin Python utility to merge the object 
file
-#      with the CUBIN data
-#   3. Create symbols: __tvm_ffi__cubin_<NAME> and __tvm_ffi__cubin_<NAME>_end
+# Parameters:
+#   target_name: Name of the object library target
+#   library_name: Name of the kernel library

Review Comment:
   Alternatively, we can make the signature like
   ```cmake
   tvm_ffi_embed_bin_into(
       <target_name> 
       SYMBOL <symbol_name>
       BIN <cubin_or_fatbin>
   )
   ```



##########
docs/guides/cubin_launcher.rst:
##########
@@ -374,6 +411,7 @@ C++ Macros
 ~~~~~~~~~~
 
 - :c:macro:`TVM_FFI_EMBED_CUBIN`: Declare embedded CUBIN module
+- :c:macro:`TVM_FFI_LOAD_LIBRARY_FROM_BYTES`: Load CUBIN from byte array

Review Comment:
   Like `TVM_FFI_EMBED_CUBIN_FROM_BYTES`



##########
cmake/Utils/EmbedCubin.cmake:
##########
@@ -15,205 +15,163 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# Do not let cmake to link cudart.
+set(CMAKE_CUDA_RUNTIME_LIBRARY None)
+
+# We need this to simulate `CUDA_{CUBIN,FATBIN}_COMPILATION` in 
`add_tvm_ffi_{cubin,fatbin}`, to
+# copy `a.cu.o` to `a.cubin`/`a.fatbin`.
+set(COPY_SCRIPT "${CMAKE_BINARY_DIR}/cuda_copy_utils.cmake")
+file(
+  WRITE ${COPY_SCRIPT}
+  "
+# Arguments: OBJECTS (semicolon-separated list), OUT_DIR, EXT
+string(REPLACE \"\\\"\" \"\" ext_strip \"\${EXT}\")
+string(REPLACE \"\\\"\" \"\" out_dir_strip \"\${OUT_DIR}\")
+foreach(obj_raw \${OBJECTS})
+  string(REPLACE \"\\\"\" \"\" obj \"\${obj_raw}\")
+
+  # Extract filename: /path/to/kernel.cu.o -> kernel
+  # Note: CMake objects are usually named source.cu.o, so we strip extensions 
twice.
+  get_filename_component(fname \${obj} NAME_WE)
+  get_filename_component(fname \${fname} NAME_WE)
+
+  # If OUT_DIR is provided, use it. Otherwise, use the object's directory.
+  if(NOT out_dir_strip STREQUAL \"\")
+      set(final_dir \"\${out_dir_strip}\")
+  else()
+      get_filename_component(final_dir \${obj} DIRECTORY)
+  endif()
+
+  message(\"Copying \${obj} -> \${final_dir}/\${fname}.\${ext_strip}\")
+  execute_process(
+    COMMAND \${CMAKE_COMMAND} -E copy_if_different
+    \"\${obj}\"
+    \"\${final_dir}/\${fname}.\${ext_strip}\"
+  )
+endforeach()
+"
+)
+
 # ~~~
-# tvm_ffi_generate_cubin(
-#   OUTPUT <output_cubin_file>
-#   SOURCE <cuda_source_file>
-#   [ARCH <architecture>]
-#   [OPTIONS <extra_nvcc_options>...]
-#   [DEPENDS <additional_dependencies>...]
-# )
+# add_tvm_ffi_cubin(<target_name> CUDA <source_file>)
 #
-# Compiles a CUDA source file to CUBIN format using nvcc.
+# Creates an object library that compiles CUDA source to CUBIN format.
+# This function uses CMake's native CUDA support and respects 
CMAKE_CUDA_ARCHITECTURES.
+# This is a compatibility util for cmake < 3.27, user can create
+# cmake target with `CUDA_CUBIN_COMPILATION` for cmake >= 3.27.
 #
 # Parameters:
-#   OUTPUT: Path to the output CUBIN file (e.g., kernel.cubin)
-#   SOURCE: Path to the CUDA source file (e.g., kernel.cu)
-#   ARCH: Target GPU architecture (default: native for auto-detection)
-#         Examples: sm_75, sm_80, sm_86, compute_80, native
-#   OPTIONS: Additional nvcc compiler options (e.g., -O3, --use_fast_math)
-#   DEPENDS: Optional additional dependencies
-#
-# The function will:
-#   1. Find the CUDA compiler (nvcc)
-#   2. Compile the SOURCE to CUBIN with specified architecture and options
-#   3. Create the output CUBIN file
+#   target_name: Name of the object library target
+#   CUDA: One CUDA source file
 #
 # Example:
-#   tvm_ffi_generate_cubin(
-#     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/kernel.cubin
-#     SOURCE src/kernel.cu
-#     ARCH native
-#     OPTIONS -O3 --use_fast_math
-#   )
+#   add_tvm_ffi_cubin(my_kernel_cubin CUDA kernel.cu)
 # ~~~
-
-# cmake-lint: disable=C0111,C0103
-function (tvm_ffi_generate_cubin)
-  # Parse arguments
-  set(options "")
-  set(oneValueArgs OUTPUT SOURCE ARCH)
-  set(multiValueArgs OPTIONS DEPENDS)
-  cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" 
${ARGN})
-
-  # Validate required arguments
-  if (NOT ARG_OUTPUT)
-    message(FATAL_ERROR "tvm_ffi_generate_cubin: OUTPUT is required")
-  endif ()
-  if (NOT ARG_SOURCE)
-    message(FATAL_ERROR "tvm_ffi_generate_cubin: SOURCE is required")
+function (add_tvm_ffi_cubin target_name)
+  cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
+  if (NOT ARG_CUDA)
+    message(FATAL_ERROR "add_tvm_ffi_cubin: CUDA source is required")
   endif ()
 
-  # Default architecture to native if not specified
-  if (NOT ARG_ARCH)
-    set(ARG_ARCH "native")
-  endif ()
+  add_library(${target_name} OBJECT ${ARG_CUDA})
+  target_compile_options(${target_name} PRIVATE 
$<$<COMPILE_LANGUAGE:CUDA>:--cubin>)
 
-  # Ensure CUDA compiler is available
-  if (NOT CMAKE_CUDA_COMPILER)
-    message(
-      FATAL_ERROR
-        "tvm_ffi_generate_cubin: CMAKE_CUDA_COMPILER not found. Enable CUDA 
language in project()."
-    )
+  add_custom_target(
+    ${target_name}_bin ALL
+    COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" 
-DOUT_DIR="" -DEXT="cubin"
+            -P "${COPY_SCRIPT}"
+    DEPENDS ${target_name}
+    COMMENT "Generating .cubin files for ${target_name}"
+    VERBATIM
+  )
+endfunction ()
+
+# ~~~
+# add_tvm_ffi_fatbin(<target_name> CUDA <source_file>)
+#
+# Creates an object library that compiles CUDA source to FATBIN format.
+# This function uses CMake's native CUDA support and respects 
CMAKE_CUDA_ARCHITECTURES.
+# This is a compatibility util for cmake < 3.27, user can create
+# cmake target with `CUDA_FATBIN_COMPILATION` for cmake >= 3.27.
+#
+# Parameters:
+#   target_name: Name of the object library target
+#   CUDA: One CUDA source file
+#
+# Example:
+#   add_tvm_ffi_fatbin(my_kernel_cubin CUDA kernel.cu)
+# ~~~
+function (add_tvm_ffi_fatbin target_name)
+  cmake_parse_arguments(ARG "" "CUDA" "" ${ARGN})
+  if (NOT ARG_CUDA)
+    message(FATAL_ERROR "add_tvm_ffi_fatbin: CUDA source is required")
   endif ()
 
-  # Get absolute paths
-  get_filename_component(ARG_SOURCE_ABS "${ARG_SOURCE}" ABSOLUTE)
-  get_filename_component(ARG_OUTPUT_ABS "${ARG_OUTPUT}" ABSOLUTE)
+  add_library(${target_name} OBJECT ${ARG_CUDA})
+  target_compile_options(${target_name} PRIVATE 
$<$<COMPILE_LANGUAGE:CUDA>:--fatbin>)
 
-  # Build nvcc command
-  add_custom_command(
-    OUTPUT "${ARG_OUTPUT_ABS}"
-    COMMAND ${CMAKE_CUDA_COMPILER} --cubin -arch=${ARG_ARCH} ${ARG_OPTIONS} 
"${ARG_SOURCE_ABS}" -o
-            "${ARG_OUTPUT_ABS}"
-    DEPENDS "${ARG_SOURCE_ABS}" ${ARG_DEPENDS}
-    COMMENT "Compiling ${ARG_SOURCE} to CUBIN (arch: ${ARG_ARCH})"
+  add_custom_target(
+    ${target_name}_bin ALL
+    COMMAND ${CMAKE_COMMAND} -DOBJECTS="$<TARGET_OBJECTS:${target_name}>" 
-DOUT_DIR=""
+            -DEXT="fatbin" -P "${COPY_SCRIPT}"
+    DEPENDS ${target_name}
+    COMMENT "Generating .fatbin files for ${target_name}"
     VERBATIM
   )
 endfunction ()
 
 # ~~~
-# tvm_ffi_embed_cubin(
-#   OUTPUT <output_object_file>
-#   SOURCE <source_file>
-#   CUBIN <cubin_file>
-#   NAME <symbol_name>
-#   [DEPENDS <additional_dependencies>...]
-# )
+# tvm_ffi_embed_bin_into(<target_name> <library_name>
+#                        BIN <cubin_or_fatbin>
+#                        INTERMEDIATE_FILE <intermediate_path>)
 #
-# Compiles a C++ source file and embeds a CUBIN file into it, creating a
-# combined object file that can be linked into a shared library or executable.
+# Embed one cubin/fatbin into given target with specified library name,
+# can be loaded with `TVM_FFI_EMBED_CUBIN(library_name)`.
+# Can only have one object in target and one cubin/fatbin.
 #
-# Parameters:
-#   OUTPUT: Path to the output object file (e.g., lib_embedded_with_cubin.o)
-#   SOURCE: Path to the C++ source file that uses TVM_FFI_EMBED_CUBIN macro
-#   CUBIN: Path to the CUBIN file to embed (can be a file path or a custom 
target output)
-#   NAME: Name used in the TVM_FFI_EMBED_CUBIN macro (e.g., "env" for 
TVM_FFI_EMBED_CUBIN(env))
-#   DEPENDS: Optional additional dependencies (e.g., custom targets)
+# The reason of this design is to integrate with cmake's workflow.
 #
-# The function will:
-#   1. Compile the SOURCE file to an intermediate object file
-#   2. Use the tvm_ffi.utils.embed_cubin Python utility to merge the object 
file
-#      with the CUBIN data
-#   3. Create symbols: __tvm_ffi__cubin_<NAME> and __tvm_ffi__cubin_<NAME>_end
+# Parameters:
+#   target_name: Name of the object library target
+#   library_name: Name of the kernel library

Review Comment:
   ```suggestion
   #   symbol_name: Name of the symbol in TVM_FFI_EMBED_CUBIN macro.
   ```



##########
include/tvm/ffi/extra/cuda/unify_api.h:
##########
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_FFI_EXTRA_CUDA_UNIFY_API_H_
+#define TVM_FFI_EXTRA_CUDA_UNIFY_API_H_
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <tvm/ffi/error.h>
+#include <tvm/ffi/extra/cuda/base.h>
+
+#include <filesystem>
+#include <string>
+
+#ifndef TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+#if CUDART_VERSION >= 12080
+// Use Runtime API by default if possible
+#define TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API 0
+#else
+#define TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API 1
+#endif  // CUDART_VERSION >= 12080
+#else
+#if (!(TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API)) && (CUDART_VERSION < 12080)
+#define _STRINGIFY(x) #x
+#define STR(x) _STRINGIFY(x)
+static_assert(false, "Runtime API only supported for CUDA >= 12.8, got CUDA 
Runtime version: " STR(
+                         CUDART_VERSION));
+#endif
+#endif
+
+namespace tvm::ffi {
+
+#if TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+
+using StreamHandle = CUstream;
+using ResultHandle = CUresult;
+
+using LibraryHandle = CUlibrary;
+using KernelHandle = CUkernel;
+using LaunchConfigHandle = CUlaunchConfig;
+using LaunchAttrHandle = CUlaunchAttribute;
+
+using DeviceAttrHandle = CUdevice_attribute;
+using DeviceHandle = CUdevice;
+
+#define FFI_CUDA_SUCCESS CUDA_SUCCESS
+
+#define load_function cuLibraryGetKernel
+#define get_device_count cuDeviceGetCount
+#define get_device_attr cuDeviceGetAttribute
+#define unload_library cuLibraryUnload
+
+#else
+
+using StreamHandle = cudaStream_t;
+using ResultHandle = cudaError_t;
+
+using LibraryHandle = cudaLibrary_t;
+using KernelHandle = cudaKernel_t;
+using LaunchConfigHandle = cudaLaunchConfig_t;
+using LaunchAttrHandle = cudaLaunchAttribute;
+
+using DeviceAttrHandle = cudaDeviceAttr;
+using DeviceHandle = int;
+
+#define FFI_CUDA_SUCCESS cudaSuccess
+
+#define load_function cudaLibraryGetKernel
+#define get_device_count cudaGetDeviceCount
+#define get_device_attr cudaDeviceGetAttribute
+#define unload_library cudaLibraryUnload
+
+#endif
+
+#define TVM_FFI_CHECK_RUNTIME_CUDA_ERROR(stmt)                                 
     \
+  do {                                                                         
     \
+    cudaError_t __err = (stmt);                                                
     \
+    if (__err != cudaSuccess) {                                                
     \
+      const char* __err_name = cudaGetErrorName(__err);                        
     \
+      const char* __err_str = cudaGetErrorString(__err);                       
     \
+      TVM_FFI_THROW(RuntimeError) << "CUDA Runtime Error: " << __err_name << " 
("   \
+                                  << static_cast<int>(__err) << "): " << 
__err_str; \
+    }                                                                          
     \
+  } while (0)
+
+#define TVM_FFI_CHECK_DRIVER_CUDA_ERROR(stmt)                                  
\
+  do {                                                                         
\
+    CUresult __err = (stmt);                                                   
\
+    if (__err != CUDA_SUCCESS) {                                               
\
+      const char *name, *info;                                                 
\
+      cuGetErrorName(__err, &name);                                            
\
+      cuGetErrorString(__err, &info);                                          
\
+      TVM_FFI_THROW(RuntimeError) << "CUDA Driver Error: " << name << " ("     
\
+                                  << static_cast<int>(__err) << "): " << info; 
\
+    }                                                                          
\
+  } while (0)
+
+static ResultHandle load_image(LibraryHandle* library, const void* image) {
+#if TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+  return cuLibraryLoadData(library, image, nullptr, nullptr, 0, nullptr, 
nullptr, 0);
+#else
+  return cudaLibraryLoadData(library, image, nullptr, nullptr, 0, nullptr, 
nullptr, 0);
+#endif
+}
+
+static DeviceHandle idx_to_device(int idx) {
+#if TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+  CUdevice o;
+  TVM_FFI_CHECK_DRIVER_CUDA_ERROR(cuDeviceGet(&o, idx));
+  return o;
+#else
+  return idx;
+#endif
+}
+
+static ResultHandle launch_kernel(KernelHandle kernel, void** args, 
tvm::ffi::dim3 grid,
+                                  tvm::ffi::dim3 block, StreamHandle stream,
+                                  uint32_t dyn_smem_bytes = 0) {
+#if TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+  return cuLaunchKernel(reinterpret_cast<CUfunction>(kernel), grid.x, grid.y, 
grid.z, block.x,

Review Comment:
   ```suggestion
     return cuLaunchKernel(kernel, grid.x, grid.y, grid.z, block.x,
   ```



##########
include/tvm/ffi/extra/cuda/unify_api.h:
##########
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_FFI_EXTRA_CUDA_UNIFY_API_H_
+#define TVM_FFI_EXTRA_CUDA_UNIFY_API_H_
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <tvm/ffi/error.h>
+#include <tvm/ffi/extra/cuda/base.h>
+
+#include <filesystem>
+#include <string>
+
+#ifndef TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+#if CUDART_VERSION >= 12080
+// Use Runtime API by default if possible
+#define TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API 0
+#else
+#define TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API 1
+#endif  // CUDART_VERSION >= 12080
+#else
+#if (!(TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API)) && (CUDART_VERSION < 12080)
+#define _STRINGIFY(x) #x
+#define STR(x) _STRINGIFY(x)
+static_assert(false, "Runtime API only supported for CUDA >= 12.8, got CUDA 
Runtime version: " STR(
+                         CUDART_VERSION));
+#endif
+#endif
+
+namespace tvm::ffi {
+
+#if TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+
+using StreamHandle = CUstream;
+using ResultHandle = CUresult;
+
+using LibraryHandle = CUlibrary;
+using KernelHandle = CUkernel;
+using LaunchConfigHandle = CUlaunchConfig;
+using LaunchAttrHandle = CUlaunchAttribute;
+
+using DeviceAttrHandle = CUdevice_attribute;
+using DeviceHandle = CUdevice;
+
+#define FFI_CUDA_SUCCESS CUDA_SUCCESS
+
+#define load_function cuLibraryGetKernel
+#define get_device_count cuDeviceGetCount
+#define get_device_attr cuDeviceGetAttribute
+#define unload_library cuLibraryUnload
+
+#else
+
+using StreamHandle = cudaStream_t;
+using ResultHandle = cudaError_t;
+
+using LibraryHandle = cudaLibrary_t;
+using KernelHandle = cudaKernel_t;
+using LaunchConfigHandle = cudaLaunchConfig_t;
+using LaunchAttrHandle = cudaLaunchAttribute;
+
+using DeviceAttrHandle = cudaDeviceAttr;
+using DeviceHandle = int;
+
+#define FFI_CUDA_SUCCESS cudaSuccess
+
+#define load_function cudaLibraryGetKernel
+#define get_device_count cudaGetDeviceCount
+#define get_device_attr cudaDeviceGetAttribute
+#define unload_library cudaLibraryUnload
+
+#endif
+
+#define TVM_FFI_CHECK_RUNTIME_CUDA_ERROR(stmt)                                 
     \
+  do {                                                                         
     \
+    cudaError_t __err = (stmt);                                                
     \
+    if (__err != cudaSuccess) {                                                
     \
+      const char* __err_name = cudaGetErrorName(__err);                        
     \
+      const char* __err_str = cudaGetErrorString(__err);                       
     \
+      TVM_FFI_THROW(RuntimeError) << "CUDA Runtime Error: " << __err_name << " 
("   \
+                                  << static_cast<int>(__err) << "): " << 
__err_str; \
+    }                                                                          
     \
+  } while (0)
+
+#define TVM_FFI_CHECK_DRIVER_CUDA_ERROR(stmt)                                  
\
+  do {                                                                         
\
+    CUresult __err = (stmt);                                                   
\
+    if (__err != CUDA_SUCCESS) {                                               
\
+      const char *name, *info;                                                 
\
+      cuGetErrorName(__err, &name);                                            
\
+      cuGetErrorString(__err, &info);                                          
\
+      TVM_FFI_THROW(RuntimeError) << "CUDA Driver Error: " << name << " ("     
\
+                                  << static_cast<int>(__err) << "): " << info; 
\
+    }                                                                          
\
+  } while (0)
+
+static ResultHandle load_image(LibraryHandle* library, const void* image) {
+#if TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+  return cuLibraryLoadData(library, image, nullptr, nullptr, 0, nullptr, 
nullptr, 0);
+#else
+  return cudaLibraryLoadData(library, image, nullptr, nullptr, 0, nullptr, 
nullptr, 0);
+#endif
+}
+
+static DeviceHandle idx_to_device(int idx) {
+#if TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+  CUdevice o;
+  TVM_FFI_CHECK_DRIVER_CUDA_ERROR(cuDeviceGet(&o, idx));
+  return o;
+#else
+  return idx;
+#endif
+}
+
+static ResultHandle launch_kernel(KernelHandle kernel, void** args, 
tvm::ffi::dim3 grid,
+                                  tvm::ffi::dim3 block, StreamHandle stream,
+                                  uint32_t dyn_smem_bytes = 0) {
+#if TVM_FFI_CUBIN_LAUNCHER_USE_DRIVER_API
+  return cuLaunchKernel(reinterpret_cast<CUfunction>(kernel), grid.x, grid.y, 
grid.z, block.x,

Review Comment:
   We can use 
[cuKernelGetFunction](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__LIBRARY.html#group__CUDA__LIBRARY_1ge4cf9abafaba338acb977585b0d7374a)
 to get a `CUfunction` from a `CUkernel`. According to the documentation, a 
`CUfunction` is context-specific while a `CUkernel` is context-agnostic. Thus, the 
`reinterpret_cast` here may not be appropriate, since they are different types.
   
   `cuLaunchKernel` accepts both `CUkernel` and `CUfunction`, so the 
kernel launch itself still works.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to