https://github.com/silee2 created https://github.com/llvm/llvm-project/pull/65539:
From 863a72b4e099f4aa24e43fdaaeb2ab0e171a0381 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik....@intel.com>
Date: Wed, 30 Aug 2023 13:44:02 -0700
Subject: [PATCH 01/13] Add SyclRuntimeWrappers and add CMake option
 MLIR_ENABLE_SYCL_RUNNER

---
 mlir/CMakeLists.txt                         |   1 +
 mlir/cmake/modules/FindLevelZero.cmake      | 221 ++++++++++
 mlir/cmake/modules/FindSyclRuntime.cmake    |  68 +++
 mlir/lib/ExecutionEngine/CMakeLists.txt     |  35 ++
 .../ExecutionEngine/SyclRuntimeWrappers.cpp | 386 ++++++++++++++++++
 5 files changed, 711 insertions(+)
 create mode 100644 mlir/cmake/modules/FindLevelZero.cmake
 create mode 100644 mlir/cmake/modules/FindSyclRuntime.cmake
 create mode 100644 mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp

diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index fa4f6e76f985fb5..4a67e018273819f 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -116,6 +116,7 @@ add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ENABLE_ROCM_CONVERSIONS})
 set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the mlir CUDA runner")
 set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the mlir ROCm runner")
+set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the mlir SYCL runner")
 set(MLIR_ENABLE_SPIRV_CPU_RUNNER 0 CACHE BOOL "Enable building the mlir SPIR-V cpu runner")
 set(MLIR_ENABLE_VULKAN_RUNNER 0 CACHE BOOL "Enable building the mlir Vulkan runner")
 set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL
diff --git a/mlir/cmake/modules/FindLevelZero.cmake b/mlir/cmake/modules/FindLevelZero.cmake
new file mode 100644
index 000000000000000..012187f0afc0b07
--- /dev/null
+++ b/mlir/cmake/modules/FindLevelZero.cmake
@@ -0,0 +1,221 @@
+# CMake find_package() module for level-zero
+#
+# Example usage:
+#
+# find_package(LevelZero)
+#
+# If successful, the following variables will be defined:
+# LevelZero_FOUND
+# LevelZero_INCLUDE_DIRS
+# LevelZero_LIBRARY
+# LevelZero_LIBRARIES_DIR
+#
+# By default, the module searches the standard paths to locate the "ze_api.h"
+# header and the ze_loader shared library. When using a custom level-zero
+# installation, set the environment variable "LEVEL_ZERO_DIR" to tell the
+# module to get the level-zero library and headers from that location.
+
+include(FindPackageHandleStandardArgs)
+
+# Search path priority
+# 1. CMake Variable LEVEL_ZERO_DIR
+# 2. Environment Variable LEVEL_ZERO_DIR
+
+if(NOT LEVEL_ZERO_DIR)
+    if(DEFINED ENV{LEVEL_ZERO_DIR})
+        set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}")
+    endif()
+endif()
+
+if(LEVEL_ZERO_DIR)
+    find_path(LevelZero_INCLUDE_DIR
+        NAMES level_zero/ze_api.h
+        PATHS ${LEVEL_ZERO_DIR}/include
+        NO_DEFAULT_PATH
+    )
+
+    if(LINUX)
+        find_library(LevelZero_LIBRARY
+            NAMES ze_loader
+            PATHS ${LEVEL_ZERO_DIR}/lib
+                  ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
+            NO_DEFAULT_PATH
+        )
+    else()
+        find_library(LevelZero_LIBRARY
+            NAMES ze_loader
+            PATHS ${LEVEL_ZERO_DIR}/lib
+            NO_DEFAULT_PATH
+        )
+    endif()
+else()
+    find_path(LevelZero_INCLUDE_DIR
+        NAMES level_zero/ze_api.h
+    )
+
+    find_library(LevelZero_LIBRARY
+        NAMES ze_loader
+    )
+endif()
+
+# Compares two version strings that are expected to be in x.y.z format
+# and reports whether the argument VERSION_STR1 is greater than or equal to
+# VERSION_STR2. The strings are compared lexicographically after conversion to
+# lists of equal lengths, with the shorter list getting zero-padded.
+function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
+    # Convert the strings to lists
+    string(REPLACE "." ";" VL1 ${VERSION_STR1})
+    string(REPLACE "." ";" VL2 ${VERSION_STR2})
+    # Get lengths of both lists
+    list(LENGTH VL1 VL1_LEN)
+    list(LENGTH VL2 VL2_LEN)
+    set(LEN ${VL1_LEN})
+    # If they differ in size, pad the shorter list with 0s
+    if(VL1_LEN GREATER VL2_LEN)
+        math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+        foreach(IDX RANGE 1 ${DIFF} 1)
+            list(APPEND VL2 "0")
+        endforeach()
+    elseif(VL2_LEN GREATER VL1_LEN)
+        math(EXPR DIFF "${VL2_LEN} - ${VL1_LEN}" OUTPUT_FORMAT DECIMAL)
+        foreach(IDX RANGE 1 ${DIFF} 1)
+            list(APPEND VL1 "0")
+        endforeach()
+        set(LEN ${VL2_LEN})
+    endif()
+    math(EXPR LEN_SUB_ONE "${LEN}-1")
+    foreach(IDX RANGE 0 ${LEN_SUB_ONE} 1)
+        list(GET VL1 ${IDX} VAL1)
+        list(GET VL2 ${IDX} VAL2)
+
+        if(${VAL1} GREATER ${VAL2})
+            set(${OUTPUT} TRUE PARENT_SCOPE)
+            break()
+        elseif(${VAL1} LESS ${VAL2})
+            set(${OUTPUT} FALSE PARENT_SCOPE)
+            break()
+        else()
+            set(${OUTPUT} TRUE PARENT_SCOPE)
+        endif()
+    endforeach()
+
+endfunction(compare_versions)
+
+# Compiles and runs a small program to extract the LevelZero loader version.
+function(get_l0_loader_version)
+
+    set(L0_VERSIONEER_SRC
+        [====[
+        #include <iostream>
+        #include <level_zero/loader/ze_loader.h>
+        #include <string>
+        int main() {
+            ze_result_t result;
+            std::string loader("loader");
+            zel_component_version_t *versions;
+            size_t size = 0;
+            result = zeInit(0);
+            if (result != ZE_RESULT_SUCCESS) {
+                std::cerr << "Failed to init ze driver" << std::endl;
+                return -1;
+            }
+            zelLoaderGetVersions(&size, nullptr);
+            versions = new zel_component_version_t[size];
+            zelLoaderGetVersions(&size, versions);
+            for (size_t i = 0; i < size; i++) {
+                if (loader.compare(versions[i].component_name) == 0) {
+                    std::cout << versions[i].component_lib_version.major << "."
+                              << versions[i].component_lib_version.minor << "."
+                              << versions[i].component_lib_version.patch;
+                    break;
+                }
+            }
+            delete[] versions;
+            return 0;
+        }
+        ]====]
+    )
+
+    set(L0_VERSIONEER_FILE ${CMAKE_BINARY_DIR}/temp/l0_versioneer.cpp)
+
+    file(WRITE ${L0_VERSIONEER_FILE} "${L0_VERSIONEER_SRC}")
+
+    # We need both directories in the include path, as ze_loader.h
+    # includes "ze_api.h" and not "level_zero/ze_api.h".
+    list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+    list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero)
+    list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR)
+    try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE
+        "${CMAKE_BINARY_DIR}"
+        "${L0_VERSIONEER_FILE}"
+        LINK_LIBRARIES ${LevelZero_LIBRARY}
+        CMAKE_FLAGS
+            "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
+        RUN_OUTPUT_VARIABLE L0_VERSION
+    )
+    if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN))
+        set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE)
+        message(STATUS "Found Level Zero version: ${L0_VERSION}")
+    else()
+        message(FATAL_ERROR
+            "Could not compile a level-zero program to extract loader version"
+        )
+    endif()
+endfunction(get_l0_loader_version)
+
+if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY)
+    list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}")
+    list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+    if(OpenCL_FOUND)
+        list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+    endif()
+
+    cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH)
+    set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH})
+
+    if(NOT TARGET LevelZero::LevelZero)
+        add_library(LevelZero::LevelZero INTERFACE IMPORTED)
+        set_target_properties(LevelZero::LevelZero
+            PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
+        )
+        set_target_properties(LevelZero::LevelZero
+            PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
+        )
+    endif()
+endif()
+
+# Check if a specific version of Level Zero is required
+if(LevelZero_FIND_VERSION)
+    get_l0_loader_version()
+    set(VERSION_GT_FIND_VERSION FALSE)
+    compare_versions(
+        ${LevelZero_VERSION}
+        ${LevelZero_FIND_VERSION}
+        VERSION_GT_FIND_VERSION
+    )
+    if(${VERSION_GT_FIND_VERSION})
+        set(LevelZero_FOUND TRUE)
+    else()
+        set(LevelZero_FOUND FALSE)
+    endif()
+else()
+    set(LevelZero_FOUND TRUE)
+endif()
+
+find_package_handle_standard_args(LevelZero
+    REQUIRED_VARS
+        LevelZero_FOUND
+        LevelZero_INCLUDE_DIRS
+        LevelZero_LIBRARY
+        LevelZero_LIBRARIES_DIR
+    HANDLE_COMPONENTS
+)
+mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS)
+
+if(LevelZero_FOUND)
+    find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}"
+        "(found version ${LevelZero_VERSION})"
+    )
+else()
+    find_package_message(LevelZero "Could not find LevelZero" "")
+endif()
diff --git a/mlir/cmake/modules/FindSyclRuntime.cmake b/mlir/cmake/modules/FindSyclRuntime.cmake
new file mode 100644
index 000000000000000..38b065a3f284c2c
--- /dev/null
+++ b/mlir/cmake/modules/FindSyclRuntime.cmake
@@ -0,0 +1,68 @@
+# CMake find_package() module for the SYCL Runtime
+#
+# Example usage:
+#
+# find_package(SyclRuntime)
+#
+# If successful, the following variables will be defined:
+# SyclRuntime_FOUND
+# SyclRuntime_INCLUDE_DIRS
+# SyclRuntime_LIBRARY
+# SyclRuntime_LIBRARY_DIR
+#
+
+include(FindPackageHandleStandardArgs)
+
+if(NOT DEFINED ENV{CMPLR_ROOT})
+    message(WARNING "Please make sure to install the Intel DPC++ Compiler and run setvars.(sh/bat)")
+    message(WARNING "You can download the standalone Intel DPC++ Compiler from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html#compilers")
+else()
+    if(LINUX OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux"))
+        set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/linux")
+    elseif(WIN32)
+        set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/windows")
+    endif()
+    list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include")
+    list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include/sycl")
+
+    set(SyclRuntime_LIBRARY_DIR "${SyclRuntime_ROOT}/lib")
+
+    message(STATUS "SyclRuntime_LIBRARY_DIR: ${SyclRuntime_LIBRARY_DIR}")
+    find_library(SyclRuntime_LIBRARY
+        NAMES sycl
+        PATHS ${SyclRuntime_LIBRARY_DIR}
+        NO_DEFAULT_PATH
+    )
+endif()
+
+if(SyclRuntime_LIBRARY)
+    set(SyclRuntime_FOUND TRUE)
+    if(NOT TARGET SyclRuntime::SyclRuntime)
+        add_library(SyclRuntime::SyclRuntime INTERFACE IMPORTED)
+        set_target_properties(SyclRuntime::SyclRuntime
+            PROPERTIES INTERFACE_LINK_LIBRARIES "${SyclRuntime_LIBRARY}"
+        )
+        set_target_properties(SyclRuntime::SyclRuntime
+            PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${SyclRuntime_INCLUDE_DIRS}"
+        )
+    endif()
+else()
+    set(SyclRuntime_FOUND FALSE)
+endif()
+
+find_package_handle_standard_args(SyclRuntime
+    REQUIRED_VARS
+        SyclRuntime_FOUND
+        SyclRuntime_INCLUDE_DIRS
+        SyclRuntime_LIBRARY
+        SyclRuntime_LIBRARY_DIR
+    HANDLE_COMPONENTS
+)
+
+mark_as_advanced(SyclRuntime_LIBRARY SyclRuntime_INCLUDE_DIRS)
+
+if(SyclRuntime_FOUND)
+    find_package_message(SyclRuntime "Found SyclRuntime: ${SyclRuntime_LIBRARY}" "")
+else()
+    find_package_message(SyclRuntime "Could not find SyclRuntime" "")
+endif()
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index ea33c2c6ed261e1..47b1e82d60ff03f 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -328,4 +328,39 @@ if(LLVM_ENABLE_PIC)
       hip::host hip::amdhip64
   )
 endif()
+
+  if(MLIR_ENABLE_SYCL_RUNNER)
+    find_package(SyclRuntime)
+
+    if(NOT SyclRuntime_FOUND)
+      message(FATAL_ERROR "SyclRuntime not found. Please check the oneAPI installation and run setvars.sh.")
+    endif()
+
+    find_package(LevelZero)
+
+    if(NOT LevelZero_FOUND)
+      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
+    endif()
+
+    add_mlir_library(sycl-runtime
+      SHARED
+      SyclRuntimeWrappers.cpp
+
+      EXCLUDE_FROM_LIBMLIR
+    )
+
+    check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+    if(NOT CXX_HAS_FRTTI_FLAG)
+      message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
+    endif()
+    target_compile_options (sycl-runtime PUBLIC -fexceptions -frtti)
+
+    target_include_directories(sycl-runtime PRIVATE
+      ${MLIR_INCLUDE_DIRS}
+    )
+
+    target_link_libraries(sycl-runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+
+    set_property(TARGET sycl-runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+  endif()
 endif()
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
new file mode 100644
index 000000000000000..ed5ed2170f411c3
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -0,0 +1,386 @@
+//===- SyclRuntimeWrappers.cpp - MLIR Sycl API wrapper library ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <algorithm> +#include <array> +#include <atomic> +#include <cassert> +#include <cfloat> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <stdexcept> +#include <tuple> +#include <vector> + +#include <CL/sycl.hpp> +#include <level_zero/ze_api.h> +#include <map> +#include <mutex> +#include <sycl/ext/oneapi/backend/level_zero.hpp> + +#ifdef _WIN32 +#define SYCL_RUNTIME_EXPORT __declspec(dllexport) +#else +#define SYCL_RUNTIME_EXPORT +#endif // _WIN32 + +namespace { + +template <typename F> auto catchAll(F &&func) { + try { + return func(); + } catch (const std::exception &e) { + fprintf(stdout, "An exception was thrown: %s\n", e.what()); + fflush(stdout); + abort(); + } catch (...) { + fprintf(stdout, "An unknown exception was thrown\n"); + fflush(stdout); + abort(); + } +} + +#define L0_SAFE_CALL(call) \ + { \ + ze_result_t status = (call); \ + if (status != ZE_RESULT_SUCCESS) { \ + fprintf(stdout, "L0 error %d\n", status); \ + fflush(stdout); \ + abort(); \ + } \ + } + +} // namespace + +struct SpirvModule { + ze_module_handle_t module = nullptr; + ~SpirvModule(); +}; + +namespace { +// Create a Map for the spirv module lookup +std::map<void *, SpirvModule> moduleCache; +std::mutex mutexLock; +} // namespace + +SpirvModule::~SpirvModule() { + L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module)); +} + +struct ParamDesc { + void *data; + size_t size; + + bool operator==(const ParamDesc &rhs) const { + return data == rhs.data && size == rhs.size; + } + + bool operator!=(const ParamDesc &rhs) const { return !(*this == rhs); } +}; + +template <typename T> size_t countUntil(T *ptr, T &&elem) { + assert(ptr); + auto curr = ptr; + while (*curr != elem) { + ++curr; + } + return static_cast<size_t>(curr - ptr); +} + +static sycl::device getDefaultDevice() { + auto platformList = sycl::platform::get_platforms(); + for (const auto &platform : platformList) { + auto platformName = platform.get_info<sycl::info::platform::name>(); + bool isLevelZero = platformName.find("Level-Zero") != std::string::npos; + if (!isLevelZero) + continue; + + return platform.get_devices()[0]; + } +} + +struct GPUSYCLQUEUE { + + sycl::device syclDevice_; + sycl::context syclContext_; + sycl::queue syclQueue_; + + GPUSYCLQUEUE(sycl::property_list propList) { + + syclDevice_ = getDefaultDevice(); + syclContext_ = sycl::context(syclDevice_); + syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList); + } + + GPUSYCLQUEUE(sycl::device *device, sycl::context *context, + sycl::property_list propList) { + syclDevice_ = *device; + syclContext_ = *context; + syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList); + } + GPUSYCLQUEUE(sycl::device *device, sycl::property_list propList) { + + syclDevice_ = *device; + syclContext_ = sycl::context(syclDevice_); + syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList); + } + + GPUSYCLQUEUE(sycl::context *context, sycl::property_list propList) { + + syclDevice_ = getDefaultDevice(); + syclContext_ = *context; + syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList); + } + +}; // end of GPUSYCLQUEUE + +static void *allocDeviceMemory(GPUSYCLQUEUE *queue, size_t size, + size_t alignment, bool isShared) { + void *memPtr = nullptr; + if (isShared) { + memPtr = sycl::aligned_alloc_shared(alignment, size, queue->syclQueue_); + } else { + memPtr = sycl::aligned_alloc_device(alignment, size, 
queue->syclQueue_); + } + if (memPtr == nullptr) { + throw std::runtime_error( + "aligned_alloc_shared() failed to allocate memory!"); + } + return memPtr; +} + +static void deallocDeviceMemory(GPUSYCLQUEUE *queue, void *ptr) { + sycl::free(ptr, queue->syclQueue_); +} + +static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data, + size_t dataSize) { + assert(data); + auto syclQueue = queue->syclQueue_; + ze_module_handle_t zeModule; + + auto it = moduleCache.find((void *)data); + // Check the map if the module is present/cached. + if (it != moduleCache.end()) { + return it->second.module; + } + + ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC, + nullptr, + ZE_MODULE_FORMAT_IL_SPIRV, + dataSize, + (const uint8_t *)data, + nullptr, + nullptr}; + auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>( + syclQueue.get_device()); + auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>( + syclQueue.get_context()); + L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr)); + std::lock_guard<std::mutex> entryLock(mutexLock); + moduleCache[(void *)data].module = zeModule; + return zeModule; +} + +static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule, + const char *name) { + assert(zeModule); + assert(name); + auto syclQueue = queue->syclQueue_; + ze_kernel_handle_t zeKernel; + sycl::kernel *syclKernel; + ze_kernel_desc_t desc = {}; + desc.pKernelName = name; + + L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel)); + sycl::kernel_bundle<sycl::bundle_state::executable> kernelBundle = + sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero, + sycl::bundle_state::executable>( + {zeModule}, syclQueue.get_context()); + + auto kernel = sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>( + {kernelBundle, zeKernel}, syclQueue.get_context()); + syclKernel = new sycl::kernel(kernel); + return syclKernel; +} + +static sycl::event enqueueKernel(sycl::queue queue, sycl::kernel *kernel, + sycl::nd_range<3> NdRange, ParamDesc *params, + size_t sharedMemBytes) { + auto paramsCount = countUntil(params, ParamDesc{nullptr, 0}); + // The assumption is, if there is a param for the shared local memory, + // then that will always be the last argument. 
+ if (sharedMemBytes) { + paramsCount = paramsCount - 1; + } + sycl::event event = queue.submit([&](sycl::handler &cgh) { + for (size_t i = 0; i < paramsCount; i++) { + auto param = params[i]; + cgh.set_arg(static_cast<uint32_t>(i), + *(static_cast<void **>(param.data))); + } + if (sharedMemBytes) { + // TODO: Handle other data types + using share_mem_t = + sycl::accessor<float, 1, sycl::access::mode::read_write, + sycl::access::target::local>; + share_mem_t local_buffer = + share_mem_t(sharedMemBytes / sizeof(float), cgh); + cgh.set_arg(paramsCount, local_buffer); + cgh.parallel_for(NdRange, *kernel); + } else { + cgh.parallel_for(NdRange, *kernel); + } + }); + return event; +} + +static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel, + size_t gridX, size_t gridY, size_t gridZ, + size_t blockX, size_t blockY, size_t blockZ, + size_t sharedMemBytes, ParamDesc *params) { + auto syclQueue = queue->syclQueue_; + auto syclGlobalRange = + ::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX); + auto syclLocalRange = ::sycl::range<3>(blockZ, blockY, blockX); + sycl::nd_range<3> syclNdRange( + sycl::nd_range<3>(syclGlobalRange, syclLocalRange)); + + if (getenv("IMEX_ENABLE_PROFILING")) { + auto executionTime = 0.0f; + auto maxTime = 0.0f; + auto minTime = FLT_MAX; + auto rounds = 100; + auto warmups = 3; + + if (getenv("IMEX_PROFILING_RUNS")) { + auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L); + if (runs) + rounds = runs; + } + + if (getenv("IMEX_PROFILING_WARMUPS")) { + auto runs = strtol(getenv("IMEX_PROFILING_WARMUPS"), NULL, 10L); + if (warmups) + warmups = runs; + } + + // warmups + for (int r = 0; r < warmups; r++) { + enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes); + } + + for (int r = 0; r < rounds; r++) { + sycl::event event = + enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes); + + auto startTime = event.get_profiling_info< + cl::sycl::info::event_profiling::command_start>(); + auto endTime = event.get_profiling_info< + cl::sycl::info::event_profiling::command_end>(); + auto gap = float(endTime - startTime) / 1000000.0f; + executionTime += gap; + if (gap > maxTime) + maxTime = gap; + if (gap < minTime) + minTime = gap; + } + + fprintf(stdout, + "the kernel execution time is (ms):" + "avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n", + executionTime / rounds, minTime, maxTime, rounds); + } else { + enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes); + } +} + +// Wrappers + +extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device, + void *context) { + auto propList = sycl::property_list{}; + if (getenv("IMEX_ENABLE_PROFILING")) { + propList = sycl::property_list{sycl::property::queue::enable_profiling()}; + } + return catchAll([&]() { + if (!device && !context) { + return new GPUSYCLQUEUE(propList); + } else if (device && context) { + // TODO: Check if the pointers/address is valid and holds the correct + // device and context + return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), + static_cast<sycl::context *>(context), propList); + } else if (device && !context) { + return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), propList); + } else { + return new GPUSYCLQUEUE(static_cast<sycl::context *>(context), propList); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUSYCLQUEUE *queue) { + catchAll([&]() { delete queue; }); +} + +extern "C" SYCL_RUNTIME_EXPORT void * +gpuMemAlloc(GPUSYCLQUEUE *queue, size_t size, size_t alignment, bool 
isShared) { + return catchAll([&]() { + if (queue) { + return allocDeviceMemory(queue, size, alignment, isShared); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUSYCLQUEUE *queue, void *ptr) { + catchAll([&]() { + if (queue && ptr) { + deallocDeviceMemory(queue, ptr); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT ze_module_handle_t +gpuModuleLoad(GPUSYCLQUEUE *queue, const void *data, size_t dataSize) { + return catchAll([&]() { + if (queue) { + return loadModule(queue, data, dataSize); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT sycl::kernel * +gpuKernelGet(GPUSYCLQUEUE *queue, ze_module_handle_t module, const char *name) { + return catchAll([&]() { + if (queue) { + return getKernel(queue, module, name); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void +gpuLaunchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel, size_t gridX, + size_t gridY, size_t gridZ, size_t blockX, size_t blockY, + size_t blockZ, size_t sharedMemBytes, void *params) { + return catchAll([&]() { + if (queue) { + launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ, + sharedMemBytes, static_cast<ParamDesc *>(params)); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void gpuWait(GPUSYCLQUEUE *queue) { + + catchAll([&]() { + if (queue) { + queue->syclQueue_.wait(); + } + }); +} >From 19bc391f87aba81196c0f4233b2a9ab808ade282 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" <sang.ik....@intel.com> Date: Wed, 30 Aug 2023 20:59:43 +0000 Subject: [PATCH 02/13] Fix config and build issues. --- mlir/lib/ExecutionEngine/CMakeLists.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index 47b1e82d60ff03f..101d9baafcccc49 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -6,6 +6,7 @@ set(LLVM_OPTIONAL_SOURCES CRunnerUtils.cpp CudaRuntimeWrappers.cpp SparseTensorRuntime.cpp + SyclRuntimeWrappers.cpp ExecutionEngine.cpp Float16bits.cpp RocmRuntimeWrappers.cpp @@ -342,7 +343,7 @@ if(LLVM_ENABLE_PIC) message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.") endif() - add_mlir_library(sycl-runtime + add_mlir_library(mlir_sycl_runtime SHARED SyclRuntimeWrappers.cpp @@ -353,14 +354,14 @@ if(LLVM_ENABLE_PIC) if(NOT CXX_HAS_FRTTI_FLAG) message(FATAL_ERROR "CXX compiler does not accept flag -frtti") endif() - target_compile_options (sycl-runtime PUBLIC -fexceptions -frtti) + target_compile_options (mlir_sycl_runtime PUBLIC -fexceptions -frtti) - target_include_directories(sycl-runtime PRIVATE + target_include_directories(mlir_sycl_runtime PRIVATE ${MLIR_INCLUDE_DIRS} ) - target_link_libraries(sycl-runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime) + target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime) - set_property(TARGET sycl-runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}") + set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}") endif() endif() >From 330d04db7155bbf416ee422934b47eac3dea70ad Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" <sang.ik....@intel.com> Date: Thu, 31 Aug 2023 15:09:11 -0700 Subject: [PATCH 03/13] Suppress clang compiler error. 
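The wrapper library declares its SPIR-V module cache and the guarding mutex at
namespace scope; their constructors run at program startup, which clang's
-Wglobal-constructors rejects under -Werror, so the declarations are wrapped in
diagnostic pragmas. A minimal sketch of the pattern being silenced (illustrative
names, not code from this patch):

    #include <map>
    #include <mutex>

    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Wglobal-constructors"
    // std::map's constructor is not constexpr, so this global requires a
    // constructor call at program startup, which the warning flags.
    static std::map<void *, int> cache;
    static std::mutex cacheLock;
    #pragma clang diagnostic pop
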
---
 mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index ed5ed2170f411c3..439d31134aa8b9a 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -58,6 +58,9 @@ template <typename F> auto catchAll(F &&func) {
 
 } // namespace
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+
 struct SpirvModule {
   ze_module_handle_t module = nullptr;
   ~SpirvModule();
@@ -73,6 +76,8 @@ SpirvModule::~SpirvModule() {
   L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module));
 }
 
+#pragma clang diagnostic pop
+
 struct ParamDesc {
   void *data;
   size_t size;

From 203d23eefe5a32f4c13313579bd1d9b9630e2413 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik....@intel.com>
Date: Wed, 6 Sep 2023 09:01:06 -0700
Subject: [PATCH 04/13] Add gpu serialize to spirv pass.

---
 .../mlir/Dialect/GPU/Transforms/Passes.td |  4 ++
 mlir/lib/Dialect/GPU/CMakeLists.txt       |  1 +
 .../GPU/Transforms/SerializeToSPIRV.cpp   | 70 +++++++++++++++++++
 .../GPU/Transforms/serialize-spirv.mlir   | 53 ++++++++++++++
 4 files changed, 128 insertions(+)
 create mode 100644 mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
 create mode 100644 mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir

diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index fc20bd2ed921aea..f285f45448ecc7e 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -187,4 +187,8 @@ def GpuROCDLAttachTarget: Pass<"rocdl-attach-target", ""> {
   ];
 }
 
+def GpuSerializeToSPIRVPass : Pass<"gpu-serialize-to-spirv", "ModuleOp"> {
+  let summary = "Serialize SPIR-V dialect modules to SPIR-V binaries";
+}
+
 #endif // MLIR_DIALECT_GPU_PASSES
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 6244132c073a4a6..e2c1dc7adf646f7 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -58,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
   Transforms/SerializeToBlob.cpp
   Transforms/SerializeToCubin.cpp
   Transforms/SerializeToHsaco.cpp
+  Transforms/SerializeToSPIRV.cpp
   Transforms/ShuffleRewriter.cpp
   Transforms/ROCDLAttachTarget.cpp
 
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
new file mode 100644
index 000000000000000..f013f531371de86
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
@@ -0,0 +1,70 @@
+//===- SerializeToSPIRV.cpp - Convert GPU kernel to SPIRV blob ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass iterates over all the SPIR-V modules in the top-level module,
+/// serializes each SPIR-V module to a SPIR-V binary, and then attaches the
+/// binary blob as a string attribute to the corresponding gpu module.
+/// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/Transforms/Passes.h" + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" +#include "mlir/Target/SPIRV/Serialization.h" + +namespace mlir { +#define GEN_PASS_DEF_GPUSERIALIZETOSPIRVPASS +#include "mlir/Dialect/GPU/Transforms/Passes.h.inc" +} // namespace mlir + +using namespace mlir; + +struct GpuSerializeToSPIRVPass : public mlir::impl::GpuSerializeToSPIRVPassBase<GpuSerializeToSPIRVPass> { +public: + void runOnOperation() override { + auto mod = getOperation(); + llvm::SmallVector<uint32_t, 0> spvBinary; + for (mlir::gpu::GPUModuleOp gpuMod : mod.getOps<gpu::GPUModuleOp>()) { + auto name = gpuMod.getName(); + // check that the spv module has the same name with gpu module except the + // prefix "__spv__" + auto isSameMod = [&](spirv::ModuleOp spvMod) -> bool { + auto spvModName = spvMod.getName(); + return spvModName->consume_front("__spv__") && spvModName == name; + }; + auto spvMods = mod.getOps<spirv::ModuleOp>(); + auto it = llvm::find_if(spvMods, isSameMod); + if (it == spvMods.end()) { + gpuMod.emitError() << "Unable to find corresponding SPIR-V module"; + signalPassFailure(); + return; + } + auto spvMod = *it; + + spvBinary.clear(); + // serialize the spv module to spv binary + if (mlir::failed(spirv::serialize(spvMod, spvBinary))) { + spvMod.emitError() << "Failed to serialize SPIR-V module"; + signalPassFailure(); + return; + } + + // attach the spv binary to the gpu module + auto spvData = + llvm::StringRef(reinterpret_cast<const char *>(spvBinary.data()), + spvBinary.size() * sizeof(uint32_t)); + auto spvAttr = mlir::StringAttr::get(&getContext(), spvData); + gpuMod->setAttr(gpu::getDefaultGpuBinaryAnnotation(), spvAttr); + spvMod->erase(); + } + } +}; diff --git a/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir new file mode 100644 index 000000000000000..d70e18f3401d38d --- /dev/null +++ b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir @@ -0,0 +1,53 @@ +// RUN: mlir-opt -gpu-serialize-to-spirv %s | FileCheck %s +module attributes {gpu.container_module, spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, #spirv.resource_limits<>>} { + // CHECK: gpu.module @addt_kernel attributes {gpu.binary = + spirv.module @__spv__addt_kernel Physical64 OpenCL requires #spirv.vce<v1.0, [Int64, Addresses, Kernel], []> { + spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi64>, Input> + spirv.func @addt_kernel(%arg0: !spirv.ptr<f32, CrossWorkgroup>, %arg1: !spirv.ptr<f32, CrossWorkgroup>, %arg2: !spirv.ptr<f32, CrossWorkgroup>) "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, workgroup_attributions = 0 : i64} { + %cst5_i64 = spirv.Constant 5 : i64 + %__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input> + %0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi64> + %1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi64> + %__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : 
!spirv.ptr<vector<3xi64>, Input> + %2 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi64> + %3 = spirv.CompositeExtract %2[1 : i32] : vector<3xi64> + spirv.Branch ^bb1 + ^bb1: // pred: ^bb0 + %4 = spirv.IMul %1, %cst5_i64 : i64 + %5 = spirv.IAdd %4, %3 : i64 + %6 = spirv.InBoundsPtrAccessChain %arg0[%5] : !spirv.ptr<f32, CrossWorkgroup>, i64 + %7 = spirv.Load "CrossWorkgroup" %6 ["Aligned", 4] : f32 + %8 = spirv.IMul %1, %cst5_i64 : i64 + %9 = spirv.IAdd %8, %3 : i64 + %10 = spirv.InBoundsPtrAccessChain %arg1[%9] : !spirv.ptr<f32, CrossWorkgroup>, i64 + %11 = spirv.Load "CrossWorkgroup" %10 ["Aligned", 4] : f32 + %12 = spirv.FAdd %7, %11 : f32 + %13 = spirv.IMul %1, %cst5_i64 : i64 + %14 = spirv.IAdd %13, %3 : i64 + %15 = spirv.InBoundsPtrAccessChain %arg2[%14] : !spirv.ptr<f32, CrossWorkgroup>, i64 + spirv.Store "CrossWorkgroup" %15, %12 ["Aligned", 4] : f32 + spirv.Return + } + spirv.EntryPoint "Kernel" @addt_kernel, @__builtin_var_WorkgroupId__ + } + gpu.module @addt_kernel { + gpu.func @addt_kernel(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: memref<?xf32>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %c5 = arith.constant 5 : index + %0 = gpu.block_id x + %1 = gpu.block_id y + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %2 = arith.muli %0, %c5 : index + %3 = arith.addi %2, %1 : index + %4 = memref.load %arg0[%3] : memref<?xf32> + %5 = arith.muli %0, %c5 : index + %6 = arith.addi %5, %1 : index + %7 = memref.load %arg1[%6] : memref<?xf32> + %8 = arith.addf %4, %7 : f32 + %9 = arith.muli %0, %c5 : index + %10 = arith.addi %9, %1 : index + memref.store %8, %arg2[%10] : memref<?xf32> + gpu.return + } + } +} >From a755e8f6ea0d919d62640b3aa41db93f793812dd Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" <sang.ik....@intel.com> Date: Wed, 6 Sep 2023 09:09:22 -0700 Subject: [PATCH 05/13] Add dependency. 
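The new SerializeToSPIRV.cpp transform (patch 04) calls into the SPIR-V
serializer, so MLIRGPUTransforms now has to link MLIRSPIRVSerialization. A
minimal sketch of the call that pulls in the dependency, using spirv::serialize
the same way the pass does (the helper name is illustrative):

    #include <cstdint>
    #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
    #include "mlir/Target/SPIRV/Serialization.h" // library: MLIRSPIRVSerialization
    #include "llvm/ADT/SmallVector.h"

    // Serializes a spirv.module to SPIR-V binary words; callers can then
    // reinterpret the words as bytes and attach them as a StringAttr.
    static mlir::LogicalResult serializeToBlob(mlir::spirv::ModuleOp spvMod,
                                               llvm::SmallVectorImpl<uint32_t> &blob) {
      return mlir::spirv::serialize(spvMod, blob);
    }
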
--- mlir/lib/Dialect/GPU/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index e2c1dc7adf646f7..38fa60ba06f59a9 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -97,6 +97,7 @@ add_mlir_dialect_library(MLIRGPUTransforms MLIRSupport MLIRROCDLTarget MLIRTransformUtils + MLIRSPIRVSerialization ) add_subdirectory(TransformOps) >From 1d5d04661841e50e29dac0620c33478b4a5f572b Mon Sep 17 00:00:00 2001 From: Nishant Patel <nishant.b.p...@intel.com> Date: Wed, 6 Sep 2023 17:17:30 +0000 Subject: [PATCH 06/13] Add Sycl Runtime Wrappers --- .../GPUCommon/GPUToLLVMConversion.cpp | 24 +- .../ExecutionEngine/CudaRuntimeWrappers.cpp | 5 +- .../ExecutionEngine/RocmRuntimeWrappers.cpp | 4 +- .../ExecutionEngine/SyclRuntimeWrappers.cpp | 301 ++++-------------- ...ower-launch-func-to-gpu-runtime-calls.mlir | 7 +- .../Integration/GPU/SYCL/gpu-to-spirv.mlir | 50 +++ 6 files changed, 144 insertions(+), 247 deletions(-) create mode 100644 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index eddf3e9a47d0bc8..111cfbf93f26a9b 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -101,7 +101,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> { FunctionCallBuilder moduleLoadCallBuilder = { "mgpuModuleLoad", llvmPointerType /* void *module */, - {llvmPointerType /* void *cubin */}}; + {llvmPointerType, /* void *cubin */ + llvmInt64Type /* size_t size */}}; FunctionCallBuilder moduleUnloadCallBuilder = { "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}}; FunctionCallBuilder moduleGetFunctionCallBuilder = { @@ -125,7 +126,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> { llvmInt32Type, /* unsigned int sharedMemBytes */ llvmPointerType, /* void *hstream */ llvmPointerPointerType, /* void **kernelParams */ - llvmPointerPointerType /* void **extra */ + llvmPointerPointerType, /* void **extra */ + llvmInt64Type /* size_t paramsCount */ }}; FunctionCallBuilder streamCreateCallBuilder = { "mgpuStreamCreate", llvmPointerType /* void *stream */, {}}; @@ -1134,7 +1136,21 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); - auto module = moduleLoadCallBuilder.create(loc, rewriter, data); + // SPIRV requires binary size + auto gpuBlob = binaryAttr.getValue(); + auto gpuBlobSize = rewriter.create<mlir::LLVM::ConstantOp>( + loc, llvmInt64Type, + mlir::IntegerAttr::get(llvmInt64Type, + static_cast<int64_t>(gpuBlob.size()))); + + auto paramsCount = rewriter.create<mlir::LLVM::ConstantOp>( + loc, llvmInt64Type, + mlir::IntegerAttr::get( + llvmInt64Type, + static_cast<int64_t>(launchOp.getNumKernelOperands()))); + + auto module = + moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize}); // Get the function from the module. The name corresponds to the name of // the kernel function. 
auto kernelName = generateKernelNameConstant( @@ -1158,7 +1174,7 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(), adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams, - /*extra=*/nullpointer}); + /*extra=*/nullpointer, paramsCount}); if (launchOp.getAsyncToken()) { // Async launch: make dependent ops use the same stream. diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 1dba677ebe66365..8a53d99c778a63a 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -119,7 +119,8 @@ static bool cusparseLt_initiated = false; #endif // MLIR_ENABLE_CUDA_CUSPARSELT #endif // MLIR_ENABLE_CUDA_CUSPARSE -extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule +mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { ScopedContext scopedContext; CUmodule module = nullptr; CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); @@ -144,7 +145,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, CUstream stream, void **params, - void **extra) { + void **extra, size_t /*paramsCount*/) { ScopedContext scopedContext; int32_t maxShmem = 0; CUdevice device = getDefaultCuDevice(); diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index bd3868a8e196f6f..998ff5b8b829f88 100644 --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -32,7 +32,7 @@ thread_local static int32_t defaultDevice = 0; -extern "C" hipModule_t mgpuModuleLoad(void *data) { +extern "C" hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { hipModule_t module = nullptr; HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data)); return module; @@ -57,7 +57,7 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, hipStream_t stream, void **params, - void **extra) { + void **extra, size_t /*paramsCount*/) { HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)); diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp index 439d31134aa8b9a..60ac27bd84e72fd 100644 --- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp @@ -1,10 +1,14 @@ -//===- SyclRuntimeWrappers.cpp - MLIR Sycl API wrapper library ------------===// +//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +// Implements C wrappers around the sycl runtime library. 
+// +//===----------------------------------------------------------------------===// #include <algorithm> #include <array> @@ -32,7 +36,8 @@ namespace { -template <typename F> auto catchAll(F &&func) { +template <typename F> +auto catchAll(F &&func) { try { return func(); } catch (const std::exception &e) { @@ -58,46 +63,6 @@ template <typename F> auto catchAll(F &&func) { } // namespace -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wglobal-constructors" - -struct SpirvModule { - ze_module_handle_t module = nullptr; - ~SpirvModule(); -}; - -namespace { -// Create a Map for the spirv module lookup -std::map<void *, SpirvModule> moduleCache; -std::mutex mutexLock; -} // namespace - -SpirvModule::~SpirvModule() { - L0_SAFE_CALL(zeModuleDestroy(SpirvModule::module)); -} - -#pragma clang diagnostic pop - -struct ParamDesc { - void *data; - size_t size; - - bool operator==(const ParamDesc &rhs) const { - return data == rhs.data && size == rhs.size; - } - - bool operator!=(const ParamDesc &rhs) const { return !(*this == rhs); } -}; - -template <typename T> size_t countUntil(T *ptr, T &&elem) { - assert(ptr); - auto curr = ptr; - while (*curr != elem) { - ++curr; - } - return static_cast<size_t>(curr - ptr); -} - static sycl::device getDefaultDevice() { auto platformList = sycl::platform::get_platforms(); for (const auto &platform : platformList) { @@ -108,74 +73,39 @@ static sycl::device getDefaultDevice() { return platform.get_devices()[0]; } + throw std::runtime_error("getDefaultDevice failed"); } -struct GPUSYCLQUEUE { +// Create global device and context +sycl::device syclDevice = getDefaultDevice(); +sycl::context syclContext = sycl::context(syclDevice); - sycl::device syclDevice_; - sycl::context syclContext_; +struct QUEUE { sycl::queue syclQueue_; - GPUSYCLQUEUE(sycl::property_list propList) { - - syclDevice_ = getDefaultDevice(); - syclContext_ = sycl::context(syclDevice_); - syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList); - } - - GPUSYCLQUEUE(sycl::device *device, sycl::context *context, - sycl::property_list propList) { - syclDevice_ = *device; - syclContext_ = *context; - syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList); - } - GPUSYCLQUEUE(sycl::device *device, sycl::property_list propList) { - - syclDevice_ = *device; - syclContext_ = sycl::context(syclDevice_); - syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList); - } - - GPUSYCLQUEUE(sycl::context *context, sycl::property_list propList) { - - syclDevice_ = getDefaultDevice(); - syclContext_ = *context; - syclQueue_ = sycl::queue(syclContext_, syclDevice_, propList); - } - -}; // end of GPUSYCLQUEUE + QUEUE() { syclQueue_ = sycl::queue(syclContext, syclDevice); } +}; -static void *allocDeviceMemory(GPUSYCLQUEUE *queue, size_t size, - size_t alignment, bool isShared) { +static void *allocDeviceMemory(QUEUE *queue, size_t size, bool isShared) { void *memPtr = nullptr; if (isShared) { - memPtr = sycl::aligned_alloc_shared(alignment, size, queue->syclQueue_); + memPtr = sycl::aligned_alloc_shared(64, size, syclDevice, syclContext); } else { - memPtr = sycl::aligned_alloc_device(alignment, size, queue->syclQueue_); + memPtr = sycl::aligned_alloc_device(64, size, syclDevice, syclContext); } if (memPtr == nullptr) { - throw std::runtime_error( - "aligned_alloc_shared() failed to allocate memory!"); + throw std::runtime_error("mem allocation failed!"); } return memPtr; } -static void deallocDeviceMemory(GPUSYCLQUEUE *queue, void *ptr) { +static void deallocDeviceMemory(QUEUE 
*queue, void *ptr) { sycl::free(ptr, queue->syclQueue_); } -static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data, - size_t dataSize) { +static ze_module_handle_t loadModule(const void *data, size_t dataSize) { assert(data); - auto syclQueue = queue->syclQueue_; ze_module_handle_t zeModule; - - auto it = moduleCache.find((void *)data); - // Check the map if the module is present/cached. - if (it != moduleCache.end()) { - return it->second.module; - } - ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC, nullptr, ZE_MODULE_FORMAT_IL_SPIRV, @@ -183,21 +113,17 @@ static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data, (const uint8_t *)data, nullptr, nullptr}; - auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>( - syclQueue.get_device()); - auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>( - syclQueue.get_context()); + auto zeDevice = + sycl::get_native<sycl::backend::ext_oneapi_level_zero>(syclDevice); + auto zeContext = + sycl::get_native<sycl::backend::ext_oneapi_level_zero>(syclContext); L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr)); - std::lock_guard<std::mutex> entryLock(mutexLock); - moduleCache[(void *)data].module = zeModule; return zeModule; } -static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule, - const char *name) { +static sycl::kernel *getKernel(ze_module_handle_t zeModule, const char *name) { assert(zeModule); assert(name); - auto syclQueue = queue->syclQueue_; ze_kernel_handle_t zeKernel; sycl::kernel *syclKernel; ze_kernel_desc_t desc = {}; @@ -206,186 +132,87 @@ static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule, L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel)); sycl::kernel_bundle<sycl::bundle_state::executable> kernelBundle = sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero, - sycl::bundle_state::executable>( - {zeModule}, syclQueue.get_context()); + sycl::bundle_state::executable>({zeModule}, + syclContext); auto kernel = sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>( - {kernelBundle, zeKernel}, syclQueue.get_context()); + {kernelBundle, zeKernel}, syclContext); syclKernel = new sycl::kernel(kernel); return syclKernel; } -static sycl::event enqueueKernel(sycl::queue queue, sycl::kernel *kernel, - sycl::nd_range<3> NdRange, ParamDesc *params, - size_t sharedMemBytes) { - auto paramsCount = countUntil(params, ParamDesc{nullptr, 0}); - // The assumption is, if there is a param for the shared local memory, - // then that will always be the last argument. 
- if (sharedMemBytes) { - paramsCount = paramsCount - 1; - } - sycl::event event = queue.submit([&](sycl::handler &cgh) { - for (size_t i = 0; i < paramsCount; i++) { - auto param = params[i]; - cgh.set_arg(static_cast<uint32_t>(i), - *(static_cast<void **>(param.data))); - } - if (sharedMemBytes) { - // TODO: Handle other data types - using share_mem_t = - sycl::accessor<float, 1, sycl::access::mode::read_write, - sycl::access::target::local>; - share_mem_t local_buffer = - share_mem_t(sharedMemBytes / sizeof(float), cgh); - cgh.set_arg(paramsCount, local_buffer); - cgh.parallel_for(NdRange, *kernel); - } else { - cgh.parallel_for(NdRange, *kernel); - } - }); - return event; -} - -static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel, - size_t gridX, size_t gridY, size_t gridZ, - size_t blockX, size_t blockY, size_t blockZ, - size_t sharedMemBytes, ParamDesc *params) { - auto syclQueue = queue->syclQueue_; +static void launchKernel(QUEUE *queue, sycl::kernel *kernel, size_t gridX, + size_t gridY, size_t gridZ, size_t blockX, + size_t blockY, size_t blockZ, size_t sharedMemBytes, + void **params, size_t paramsCount) { auto syclGlobalRange = ::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX); auto syclLocalRange = ::sycl::range<3>(blockZ, blockY, blockX); sycl::nd_range<3> syclNdRange( sycl::nd_range<3>(syclGlobalRange, syclLocalRange)); - if (getenv("IMEX_ENABLE_PROFILING")) { - auto executionTime = 0.0f; - auto maxTime = 0.0f; - auto minTime = FLT_MAX; - auto rounds = 100; - auto warmups = 3; - - if (getenv("IMEX_PROFILING_RUNS")) { - auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L); - if (runs) - rounds = runs; - } - - if (getenv("IMEX_PROFILING_WARMUPS")) { - auto runs = strtol(getenv("IMEX_PROFILING_WARMUPS"), NULL, 10L); - if (warmups) - warmups = runs; - } - - // warmups - for (int r = 0; r < warmups; r++) { - enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes); - } - - for (int r = 0; r < rounds; r++) { - sycl::event event = - enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes); - - auto startTime = event.get_profiling_info< - cl::sycl::info::event_profiling::command_start>(); - auto endTime = event.get_profiling_info< - cl::sycl::info::event_profiling::command_end>(); - auto gap = float(endTime - startTime) / 1000000.0f; - executionTime += gap; - if (gap > maxTime) - maxTime = gap; - if (gap < minTime) - minTime = gap; + queue->syclQueue_.submit([&](sycl::handler &cgh) { + for (size_t i = 0; i < paramsCount; i++) { + cgh.set_arg(static_cast<uint32_t>(i), *(static_cast<void **>(params[i]))); } - - fprintf(stdout, - "the kernel execution time is (ms):" - "avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n", - executionTime / rounds, minTime, maxTime, rounds); - } else { - enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes); - } + cgh.parallel_for(syclNdRange, *kernel); + }); } // Wrappers -extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device, - void *context) { - auto propList = sycl::property_list{}; - if (getenv("IMEX_ENABLE_PROFILING")) { - propList = sycl::property_list{sycl::property::queue::enable_profiling()}; - } - return catchAll([&]() { - if (!device && !context) { - return new GPUSYCLQUEUE(propList); - } else if (device && context) { - // TODO: Check if the pointers/address is valid and holds the correct - // device and context - return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), - static_cast<sycl::context *>(context), propList); - } else if (device && 
!context) { - return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), propList); - } else { - return new GPUSYCLQUEUE(static_cast<sycl::context *>(context), propList); - } - }); +extern "C" SYCL_RUNTIME_EXPORT QUEUE *mgpuStreamCreate() { + + return catchAll([&]() { return new QUEUE(); }); } -extern "C" SYCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUSYCLQUEUE *queue) { +extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamDestroy(QUEUE *queue) { catchAll([&]() { delete queue; }); } -extern "C" SYCL_RUNTIME_EXPORT void * -gpuMemAlloc(GPUSYCLQUEUE *queue, size_t size, size_t alignment, bool isShared) { +extern "C" SYCL_RUNTIME_EXPORT void *mgpuMemAlloc(uint64_t size, QUEUE *queue, + bool isShared) { return catchAll([&]() { - if (queue) { - return allocDeviceMemory(queue, size, alignment, isShared); - } + return allocDeviceMemory(queue, static_cast<size_t>(size), true); }); } -extern "C" SYCL_RUNTIME_EXPORT void gpuMemFree(GPUSYCLQUEUE *queue, void *ptr) { +extern "C" SYCL_RUNTIME_EXPORT void mgpuMemFree(void *ptr, QUEUE *queue) { catchAll([&]() { - if (queue && ptr) { + if (ptr) { deallocDeviceMemory(queue, ptr); } }); } extern "C" SYCL_RUNTIME_EXPORT ze_module_handle_t -gpuModuleLoad(GPUSYCLQUEUE *queue, const void *data, size_t dataSize) { - return catchAll([&]() { - if (queue) { - return loadModule(queue, data, dataSize); - } - }); +mgpuModuleLoad(const void *data, size_t gpuBlobSize) { + return catchAll([&]() { return loadModule(data, gpuBlobSize); }); } extern "C" SYCL_RUNTIME_EXPORT sycl::kernel * -gpuKernelGet(GPUSYCLQUEUE *queue, ze_module_handle_t module, const char *name) { - return catchAll([&]() { - if (queue) { - return getKernel(queue, module, name); - } - }); +mgpuModuleGetFunction(ze_module_handle_t module, const char *name) { + return catchAll([&]() { return getKernel(module, name); }); } extern "C" SYCL_RUNTIME_EXPORT void -gpuLaunchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel, size_t gridX, - size_t gridY, size_t gridZ, size_t blockX, size_t blockY, - size_t blockZ, size_t sharedMemBytes, void *params) { +mgpuLaunchKernel(sycl::kernel *kernel, size_t gridX, size_t gridY, size_t gridZ, + size_t blockX, size_t blockY, size_t blockZ, + size_t sharedMemBytes, QUEUE *queue, void **params, + void **extra, size_t paramsCount) { return catchAll([&]() { - if (queue) { - launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ, - sharedMemBytes, static_cast<ParamDesc *>(params)); - } + launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ, + sharedMemBytes, params, paramsCount); }); } -extern "C" SYCL_RUNTIME_EXPORT void gpuWait(GPUSYCLQUEUE *queue) { +extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamSynchronize(QUEUE *queue) { - catchAll([&]() { - if (queue) { - queue->syclQueue_.wait(); - } - }); + catchAll([&]() { queue->syclQueue_.wait(); }); +} + +extern "C" SYCL_RUNTIME_EXPORT void +mgpuModuleUnload(ze_module_handle_t module) { + + catchAll([&]() { L0_SAFE_CALL(zeModuleDestroy(module)); }); } diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir index 2cdc4e8dbb1ad67..96e8a6dbd35b171 100644 --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -34,8 +34,10 @@ module attributes {gpu.container_module} { // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]] // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 
0] // CHECK-SAME: -> !llvm.ptr + // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant + // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant - // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]]) + // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]]) // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}}) // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate @@ -53,10 +55,11 @@ module attributes {gpu.container_module} { // CHECK: llvm.getelementptr %[[MEMREF]][0, 5] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]] // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.null : !llvm.ptr + // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]], // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]], - // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]]) + // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]]) // CHECK: llvm.call @mgpuStreamSynchronize // CHECK: llvm.call @mgpuStreamDestroy // CHECK: llvm.call @mgpuModuleUnload diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir new file mode 100644 index 000000000000000..0be0e31a3d71e94 --- /dev/null +++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir @@ -0,0 +1,50 @@ +module @add attributes {gpu.container_module} { + memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]> + memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]> + func.func @main() { + %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64> + %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64> + %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64> + %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64> + call @printMemrefI64(%cast) : (memref<*xi64>) -> () + return + } + func.func private @printMemrefI64(memref<*xi64>) + func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %0 = gpu.wait async + %memref, %asyncToken = gpu.alloc async [%0] (): memref<3x3xi64> + gpu.wait [%asyncToken] + memref.copy %arg1, %memref : memref<3x3xi64> to memref<3x3xi64> + %1 = gpu.wait async + %memref_0, %asyncToken_1 = gpu.alloc async [%1] () : memref<3x3xi64> + gpu.wait [%asyncToken_1] + memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64> + %2 = gpu.wait async + %memref_2, %asyncToken_3 = gpu.alloc async [%2] () : memref<3x3xi64> + %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %memref : memref<3x3xi64>, %memref_2 : memref<3x3xi64>) + gpu.wait [%3] + %alloc = memref.alloc() : memref<3x3xi64> + memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64> + %4 = gpu.wait async + %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64> + %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64> + %7 = gpu.dealloc async [%6] %memref : memref<3x3xi64> + gpu.wait [%7] + return %alloc : memref<3x3xi64> + } + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, 
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
+      %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
+      %4 = arith.addi %2, %3 : i64
+      memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
+      gpu.return
+    }
+  }
+}
+
+
\ No newline at end of file
>From 66fe69f30c015c2619d840a4ef91b18219ebf7c7 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik....@intel.com>
Date: Wed, 6 Sep 2023 19:23:39 +0000
Subject: [PATCH 07/13] Temp save. Builds but Integration test fails.

---
 mlir/test/CMakeLists.txt                         |  4 ++++
 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir | 11 +++++++++--
 mlir/test/lit.cfg.py                             |  3 +++
 mlir/test/lit.site.cfg.py.in                     |  1 +
 4 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 66a9cb01106ba5d..874e7718f4a36d1 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -135,6 +135,10 @@ if(MLIR_ENABLE_ROCM_RUNNER)
   list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)
 endif()
 
+if(MLIR_ENABLE_SYCL_RUNNER)
+  list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
+endif()
+
 list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests)
 
 if(LLVM_BUILD_EXAMPLES)
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index 0be0e31a3d71e94..6ff9d4bf6ca8f5a 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -1,3 +1,10 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
 module @add attributes {gpu.container_module} {
   memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
   memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
@@ -34,7 +41,7 @@ module @add attributes {gpu.container_module} {
     gpu.wait [%7]
     return %alloc : memref<3x3xi64>
   }
-  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
     gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       %0 = gpu.block_id x
       %1 = gpu.block_id y
@@ -47,4 +54,4 @@ module @add attributes {gpu.container_module} {
   }
 }
 
-
\ No newline at end of file
+
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index f265ac794c6f6db..5d3a4dc575a7b28 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -119,6 +119,9 @@ def add_runtime(name):
 if config.enable_cuda_runner:
     tools.extend([add_runtime("mlir_cuda_runtime")])
 
+if config.enable_sycl_runner:
+    tools.extend([add_runtime("mlir_sycl_runtime")])
+
 # The following tools are optional
 tools.extend(
     [
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index ef1fdbc0cba07c0..897c12f3abcac75 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -30,6 +30,7 @@ config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
 config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
 config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
+config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
 config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@
>From b0758f0ed491b7264cbdd10fc4e82d280e5298f2 Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.p...@intel.com>
Date: Wed, 6 Sep 2023 17:17:30 +0000
Subject: [PATCH 08/13] Update Sycl Runtime Wrappers

---
 mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 12 ++++++++++--
 mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp      |  3 ++-
 mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp      |  3 ++-
 .../GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir   |  3 ++-
 mlir/test/Conversion/GPUCommon/typed-pointers.mlir    |  3 ++-
 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir      |  8 +++-----
 6 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 111cfbf93f26a9b..2b92c1cd8b00e50 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -169,7 +169,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       "mgpuMemAlloc",
       llvmPointerType /* void * */,
       {llvmIntPtrType /* intptr_t sizeBytes */,
-       llvmPointerType /* void *stream */}};
+       llvmPointerType /* void *stream */,
+       llvmInt64Type /* size_t isHostShared */}};
   FunctionCallBuilder deallocCallBuilder = {
       "mgpuMemFree",
       llvmVoidType,
@@ -801,6 +802,8 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
 
   auto loc = allocOp.getLoc();
 
+  bool isShared = allocOp.getHostShared();
+
   // Get shape of the memref as values: static sizes are constant
   // values and dynamic sizes are passed to 'alloc' as operands.
   SmallVector<Value, 4> shape;
@@ -813,8 +816,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
   // descriptor.
   Type elementPtrType = this->getElementPtrType(memRefType);
   auto stream = adaptor.getAsyncDependencies().front();
+
+  auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared));
+
   Value allocatedPtr =
-      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
+      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
+          .getResult();
   if (!getTypeConverter()->useOpaquePointers())
     allocatedPtr =
         rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 8a53d99c778a63a..79dc2eed38f06a9 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -211,7 +211,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event,
   CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/,
+                              bool /*isHostShared*/) {
   ScopedContext scopedContext;
   CUdeviceptr ptr;
   CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index 998ff5b8b829f88..b50fd7eb9d05929 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -99,7 +99,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
   HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/,
+                              bool /*isHostShared*/) {
   void *ptr;
   HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes));
   return ptr;
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index 2506c6ceb990ef5..f365dcb02daf4c2 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
     %0 = gpu.wait async
    // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
    // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-   // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+   // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+   // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
    %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
    // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
    // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
index 2fa6c854c567819..e27162c7dbc1902 100644
--- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
+++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
    %0 = gpu.wait async
    // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
    // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-   // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+   // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+   // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
    %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
    // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
    // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index 6ff9d4bf6ca8f5a..b6180d322ce9329 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -21,15 +21,15 @@ module @add attributes {gpu.container_module} {
     %c3 = arith.constant 3 : index
     %c1 = arith.constant 1 : index
     %0 = gpu.wait async
-    %memref, %asyncToken = gpu.alloc async [%0] (): memref<3x3xi64>
+    %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<3x3xi64>
     gpu.wait [%asyncToken]
     memref.copy %arg1, %memref : memref<3x3xi64> to memref<3x3xi64>
     %1 = gpu.wait async
-    %memref_0, %asyncToken_1 = gpu.alloc async [%1] () : memref<3x3xi64>
+    %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<3x3xi64>
     gpu.wait [%asyncToken_1]
     memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
     %2 = gpu.wait async
-    %memref_2, %asyncToken_3 = gpu.alloc async [%2] () : memref<3x3xi64>
+    %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<3x3xi64>
     %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %memref : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
     gpu.wait [%3]
     %alloc = memref.alloc() : memref<3x3xi64>
@@ -53,5 +53,3 @@ module @add attributes {gpu.container_module} {
     }
   }
 }
-
-
>From c37e8d84ea6a9ec84b27ffe30f47b6acfbbda202 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik....@intel.com>
Date: Wed, 6 Sep 2023 20:35:53 +0000
Subject: [PATCH 09/13] Add f32 integration test.

---
 mlir/test/Integration/GPU/SYCL/addf.mlir | 58 ++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 mlir/test/Integration/GPU/SYCL/addf.mlir

diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
new file mode 100644
index 000000000000000..b9fc36547220e77
--- /dev/null
+++ b/mlir/test/Integration/GPU/SYCL/addf.mlir
@@ -0,0 +1,58 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {
+  gpu.container_module,
+  spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
+} {
+  memref.global "private" constant @__constant_9xf32_0 : memref<9xf32> = dense<[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]>
+  memref.global "private" constant @__constant_9xf32 : memref<9xf32> = dense<[2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_9xf32 : memref<9xf32>
+    %1 = memref.get_global @__constant_9xf32_0 : memref<9xf32>
+    %2 = call @test(%0, %1) : (memref<9xf32>, memref<9xf32>) -> memref<9xf32>
+    %cast = memref.cast %2 : memref<9xf32> to memref<*xf32>
+    call @printMemrefI64(%cast) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefI64(memref<*xf32>)
+  func.func @test(%arg0: memref<9xf32>, %arg1: memref<9xf32>) -> memref<9xf32> {
+    %c9 = arith.constant 9 : index
+    %c1 = arith.constant 1 : index
+    %0 = gpu.wait async
+    %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<9xf32>
+    gpu.wait [%asyncToken]
+    memref.copy %arg1, %memref : memref<9xf32> to memref<9xf32>
+    %1 = gpu.wait async
+    %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<9xf32>
+    gpu.wait [%asyncToken_1]
+    memref.copy %arg0, %memref_0 : memref<9xf32> to memref<9xf32>
+    %2 = gpu.wait async
+    %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<9xf32>
+    %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c9, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<9xf32>, %memref : memref<9xf32>, %memref_2 : memref<9xf32>)
+    gpu.wait [%3]
+    %alloc = memref.alloc() : memref<9xf32>
+    memref.copy %memref_2, %alloc : memref<9xf32> to memref<9xf32>
+    %4 = gpu.wait async
+    %5 = gpu.dealloc async [%4] %memref_2 : memref<9xf32>
+    %6 = gpu.dealloc async [%5] %memref_0 : memref<9xf32>
+    %7 = gpu.dealloc async [%6] %memref : memref<9xf32>
+    gpu.wait [%7]
+    return %alloc : memref<9xf32>
+  }
+  gpu.module @test_kernel {
+    gpu.func @test_kernel(%arg0: memref<9xf32>, %arg1: memref<9xf32>, %arg2: memref<9xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 9, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      //%0 = gpu.block_id x
+      //%2 = memref.load %arg0[%0] : memref<9xf32>
+      //%3 = memref.load %arg1[%0] : memref<9xf32>
+      //%4 = arith.addf %2, %3 : f32
+      //memref.store %4, %arg2[%0] : memref<9xf32>
+      gpu.return
+    }
+  }
+  // CHECK: [3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3]
+}
>From 84d584d5ba21a8177eca4818a076daf1d471b38c Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik....@intel.com>
Date: Wed, 6 Sep 2023 21:26:30 +0000
Subject: [PATCH 10/13] Add new option to convert-gpu-to-spirv pass to handle
 OpenCL

---
 mlir/include/mlir/Conversion/Passes.td            | 5 ++++-
 mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index ed37abf85275bf3..3bb6006a467fe37 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -568,7 +568,10 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
   let options = [
     Option<"use64bitIndex", "use-64bit-index",
            "bool", /*default=*/"false",
-           "Use 64-bit integers to convert index types">
+           "Use 64-bit integers to convert index types">,
+    Option<"useOpenCL", "use-opencl",
+           "bool", /*default=*/"false",
+           "Use OpenCL instead of Vulkan">
   ];
 }
 
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index f37c70a771f5916..a52c99ec9daec16 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -71,7 +71,8 @@ void GPUToSPIRVPass::runOnOperation() {
   std::unique_ptr<ConversionTarget> target =
       spirv::getMemorySpaceToStorageClassTarget(*context);
   spirv::MemorySpaceToStorageClassMap memorySpaceMap =
-      spirv::mapMemorySpaceToVulkanStorageClass;
+      this->useOpenCL ? spirv::mapMemorySpaceToOpenCLStorageClass :
+                        spirv::mapMemorySpaceToVulkanStorageClass;
 
   spirv::MemorySpaceToStorageClassConverter converter(memorySpaceMap);
   RewritePatternSet patterns(context);
>From 2f8ab53057dd7a1433bc4e867a265ee4a215bb2b Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik....@intel.com>
Date: Wed, 6 Sep 2023 21:27:10 +0000
Subject: [PATCH 11/13] Update pass pipeline for integration tests.

---
 mlir/test/Integration/GPU/SYCL/addf.mlir         | 2 +-
 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
index b9fc36547220e77..984b1e0ae528723 100644
--- a/mlir/test/Integration/GPU/SYCL/addf.mlir
+++ b/mlir/test/Integration/GPU/SYCL/addf.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index b6180d322ce9329..c52e723af68f32b 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
>From 41bc1125acb049e655aa9c934b8e3da1f64cc26f Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik....@intel.com>
Date: Wed, 6 Sep 2023 21:55:10 +0000
Subject: [PATCH 12/13] Fix.
---
 mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp |  4 ----
 mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp      |  5 +++++
 mlir/test/Integration/GPU/SYCL/addf.mlir              | 11 ++++++-----
 mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir      |  3 +++
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 2b92c1cd8b00e50..808431b82472471 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -789,10 +789,6 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
 LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::AllocOp allocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
-  if (adaptor.getHostShared())
-    return rewriter.notifyMatchFailure(
-        allocOp, "host_shared allocation is not supported");
-
   MemRefType memRefType = allocOp.getType();
 
   if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index 60ac27bd84e72fd..6b40d4a6922c9f0 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -76,10 +76,15 @@ static sycl::device getDefaultDevice() {
   throw std::runtime_error("getDefaultDevice failed");
 }
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+
 // Create global device and context
 sycl::device syclDevice = getDefaultDevice();
 sycl::context syclContext = sycl::context(syclDevice);
 
+#pragma clang diagnostic pop
+
 struct QUEUE {
   sycl::queue syclQueue_;
 
diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
index 984b1e0ae528723..d4e00ddedf3877c 100644
--- a/mlir/test/Integration/GPU/SYCL/addf.mlir
+++ b/mlir/test/Integration/GPU/SYCL/addf.mlir
@@ -6,8 +6,7 @@
 // RUN: | FileCheck %s
 
 module @add attributes {
-  gpu.container_module,
-  spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
+  gpu.container_module
 } {
   memref.global "private" constant @__constant_9xf32_0 : memref<9xf32> = dense<[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]>
   memref.global "private" constant @__constant_9xf32 : memref<9xf32> = dense<[2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2]>
@@ -16,10 +15,10 @@ module @add attributes {
     %1 = memref.get_global @__constant_9xf32_0 : memref<9xf32>
     %2 = call @test(%0, %1) : (memref<9xf32>, memref<9xf32>) -> memref<9xf32>
     %cast = memref.cast %2 : memref<9xf32> to memref<*xf32>
-    call @printMemrefI64(%cast) : (memref<*xf32>) -> ()
+    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
     return
   }
-  func.func private @printMemrefI64(memref<*xf32>)
+  func.func private @printMemrefF32(memref<*xf32>)
   func.func @test(%arg0: memref<9xf32>, %arg1: memref<9xf32>) -> memref<9xf32> {
     %c9 = arith.constant 9 : index
     %c1 = arith.constant 1 : index
@@ -44,7 +43,9 @@ module @add attributes {
     gpu.wait [%7]
     return %alloc : memref<9xf32>
   }
-  gpu.module @test_kernel {
+  gpu.module @test_kernel attributes {
+    spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
+  }{
     gpu.func @test_kernel(%arg0: memref<9xf32>, %arg1: memref<9xf32>, %arg2: memref<9xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 9, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
       //%0 = gpu.block_id x
       //%2 = memref.load %arg0[%0] : memref<9xf32>
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
index c52e723af68f32b..36d132d0c94d32e 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -52,4 +52,7 @@ module @add attributes {gpu.container_module} {
       gpu.return
     }
   }
+  // CHECK: [2, 4100, 6],
+  // CHECK: [16777224, 10, 4294971404],
+  // CHECK: [16777230, 1103806595088, 1099511627794]
 }
>From aae8d757874239e9362eb60ea0eebfd572a0303f Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik....@intel.com>
Date: Wed, 6 Sep 2023 21:56:16 +0000
Subject: [PATCH 13/13] Remove unneeded test.

---
 mlir/test/Integration/GPU/SYCL/addf.mlir | 59 ------------------------
 1 file changed, 59 deletions(-)
 delete mode 100644 mlir/test/Integration/GPU/SYCL/addf.mlir

diff --git a/mlir/test/Integration/GPU/SYCL/addf.mlir b/mlir/test/Integration/GPU/SYCL/addf.mlir
deleted file mode 100644
index d4e00ddedf3877c..000000000000000
--- a/mlir/test/Integration/GPU/SYCL/addf.mlir
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
-// RUN: | mlir-cpu-runner \
-// RUN:   --shared-libs=%mlir_sycl_runtime \
-// RUN:   --shared-libs=%mlir_runner_utils \
-// RUN:   --entry-point-result=void \
-// RUN: | FileCheck %s
-
-module @add attributes {
-  gpu.container_module
-} {
-  memref.global "private" constant @__constant_9xf32_0 : memref<9xf32> = dense<[1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1]>
-  memref.global "private" constant @__constant_9xf32 : memref<9xf32> = dense<[2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2, 2.2]>
-  func.func @main() {
-    %0 = memref.get_global @__constant_9xf32 : memref<9xf32>
-    %1 = memref.get_global @__constant_9xf32_0 : memref<9xf32>
-    %2 = call @test(%0, %1) : (memref<9xf32>, memref<9xf32>) -> memref<9xf32>
-    %cast = memref.cast %2 : memref<9xf32> to memref<*xf32>
-    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
-    return
-  }
-  func.func private @printMemrefF32(memref<*xf32>)
-  func.func @test(%arg0: memref<9xf32>, %arg1: memref<9xf32>) -> memref<9xf32> {
-    %c9 = arith.constant 9 : index
-    %c1 = arith.constant 1 : index
-    %0 = gpu.wait async
-    %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<9xf32>
-    gpu.wait [%asyncToken]
-    memref.copy %arg1, %memref : memref<9xf32> to memref<9xf32>
-    %1 = gpu.wait async
-    %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<9xf32>
-    gpu.wait [%asyncToken_1]
-    memref.copy %arg0, %memref_0 : memref<9xf32> to memref<9xf32>
-    %2 = gpu.wait async
-    %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<9xf32>
-    %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c9, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<9xf32>, %memref : memref<9xf32>, %memref_2 : memref<9xf32>)
-    gpu.wait [%3]
-    %alloc = memref.alloc() : memref<9xf32>
-    memref.copy %memref_2, %alloc : memref<9xf32> to memref<9xf32>
-    %4 = gpu.wait async
-    %5 = gpu.dealloc async [%4] %memref_2 : memref<9xf32>
-    %6 = gpu.dealloc async [%5] %memref_0 : memref<9xf32>
-    %7 = gpu.dealloc async [%6] %memref : memref<9xf32>
-    gpu.wait [%7]
-    return %alloc : memref<9xf32>
-  }
-  gpu.module @test_kernel attributes {
-    spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>
-  }{
-    gpu.func @test_kernel(%arg0: memref<9xf32>, %arg1: memref<9xf32>, %arg2: memref<9xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 9, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
-      //%0 = gpu.block_id x
-      //%2 = memref.load %arg0[%0] : memref<9xf32>
-      //%3 = memref.load %arg1[%0] : memref<9xf32>
-      //%4 = arith.addf %2, %3 : f32
-      //memref.store %4, %arg2[%0] : memref<9xf32>
-      gpu.return
-    }
-  }
-  // CHECK: [3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3]
-}
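Usage note: with the series applied, configuring the build with MLIR_ENABLE_SYCL_RUNNER enabled adds mlir_sycl_runtime to the lit test dependencies and exports config.enable_sycl_runner, so lit substitutes %mlir_sycl_runtime in the RUN lines above and the remaining gpu-to-spirv.mlir integration test runs end to end through mlir-cpu-runner and FileCheck; the addf.mlir test was removed again by PATCH 13 as unneeded.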