atmnpatel updated this revision to Diff 405407.
atmnpatel marked 7 inline comments as done.
atmnpatel added a comment.
updates
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D113359/new/
https://reviews.llvm.org/D113359
Files:
clang/lib/Basic/TargetInfo.cpp
clang/lib/Basic/Targets/X86.h
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/CodeGen/CodeGenModule.cpp
clang/lib/Driver/ToolChains/Gnu.cpp
clang/lib/Frontend/CompilerInvocation.cpp
llvm/include/llvm/ADT/Triple.h
llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
llvm/lib/Support/Triple.cpp
openmp/CMakeLists.txt
openmp/libomptarget/DeviceRTL/CMakeLists.txt
openmp/libomptarget/DeviceRTL/include/ThreadEnvironment.h
openmp/libomptarget/DeviceRTL/src/Debug.cpp
openmp/libomptarget/DeviceRTL/src/Mapping.cpp
openmp/libomptarget/DeviceRTL/src/Misc.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
openmp/libomptarget/DeviceRTL/src/Utils.cpp
openmp/libomptarget/plugins/CMakeLists.txt
openmp/libomptarget/plugins/vgpu/CMakeLists.txt
openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.cpp
openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h
openmp/libomptarget/plugins/vgpu/src/rtl.cpp
openmp/libomptarget/src/rtl.cpp
openmp/libomptarget/test/CMakeLists.txt
Index: openmp/libomptarget/test/CMakeLists.txt
===================================================================
--- openmp/libomptarget/test/CMakeLists.txt
+++ openmp/libomptarget/test/CMakeLists.txt
@@ -18,6 +18,9 @@
string(REGEX MATCHALL "([^\ ]+\ |[^\ ]+$)" SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}")
foreach(CURRENT_TARGET IN LISTS SYSTEM_TARGETS)
+ IF ("${CURRENT_TARGET}" MATCHES "-vgpu")
+ continue()
+ ENDIF()
string(STRIP "${CURRENT_TARGET}" CURRENT_TARGET)
add_openmp_testsuite(check-libomptarget-${CURRENT_TARGET}
"Running libomptarget tests"
Index: openmp/libomptarget/src/rtl.cpp
===================================================================
--- openmp/libomptarget/src/rtl.cpp
+++ openmp/libomptarget/src/rtl.cpp
@@ -21,17 +21,22 @@
#include <mutex>
#include <string>
-// List of all plugins that can support offloading.
-static const char *RTLNames[] = {
- /* PowerPC target */ "libomptarget.rtl.ppc64.so",
- /* x86_64 target */ "libomptarget.rtl.x86_64.so",
- /* CUDA target */ "libomptarget.rtl.cuda.so",
- /* AArch64 target */ "libomptarget.rtl.aarch64.so",
- /* SX-Aurora VE target */ "libomptarget.rtl.ve.so",
- /* AMDGPU target */ "libomptarget.rtl.amdgpu.so",
- /* Remote target */ "libomptarget.rtl.rpc.so",
+struct PluginInfoTy {
+ std::string Name;
+ bool IsHost;
};
+// List of all plugins that can support offloading.
+static const PluginInfoTy Plugins[] = {
+ /* PowerPC target */ {"libomptarget.rtl.ppc64.so", true},
+ /* x86_64 target */ {"libomptarget.rtl.x86_64.so", true},
+ /* CUDA target */ {"libomptarget.rtl.cuda.so", false},
+ /* AArch64 target */ {"libomptarget.rtl.aarch64.so", true},
+ /* SX-Aurora VE target */ {"libomptarget.rtl.ve.so", false},
+ /* AMDGPU target */ {"libomptarget.rtl.amdgpu.so", false},
+ /* Remote target */ {"libomptarget.rtl.rpc.so", false},
+ /* Virtual GPU target */ {"libomptarget.rtl.vgpu.so", false}};
+
PluginManager *PM;
#if OMPTARGET_PROFILE_ENABLED
@@ -86,21 +91,37 @@
return;
}
+ // TODO: add ability to inspect image and decide automatically
+ bool UseVGPU = false;
+ if (auto *EnvFlag = std::getenv("LIBOMPTARGET_USE_VGPU"))
+ UseVGPU = true;
+
DP("Loading RTLs...\n");
// Attempt to open all the plugins and, if they exist, check if the interface
// is correct and if they are supporting any devices.
- for (auto *Name : RTLNames) {
- DP("Loading library '%s'...\n", Name);
- void *dynlib_handle = dlopen(Name, RTLD_NOW);
+ for (auto &[Name, IsHost] : Plugins) {
+ DP("Loading library '%s'...\n", Name.c_str());
+
+ int Flags = RTLD_NOW;
+
+ if (Name.compare("libomptarget.rtl.vgpu.so") == 0)
+ Flags |= RTLD_GLOBAL;
+
+ if (UseVGPU && IsHost) {
+ DP("Skipping library '%s': VGPU was requested.\n", Name.c_str());
+ continue;
+ }
+
+ void *dynlib_handle = dlopen(Name.c_str(), Flags);
if (!dynlib_handle) {
// Library does not exist or cannot be found.
- DP("Unable to load library '%s': %s!\n", Name, dlerror());
+ DP("Unable to load library '%s': %s!\n", Name.c_str(), dlerror());
continue;
}
- DP("Successfully loaded library '%s'!\n", Name);
+ DP("Successfully loaded library '%s'!\n", Name.c_str());
AllRTLs.emplace_back();
Index: openmp/libomptarget/plugins/vgpu/src/rtl.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/rtl.cpp
@@ -0,0 +1,615 @@
+//===------RTLs/vgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for virtual (x86) GPU
+//
+//===----------------------------------------------------------------------===//
+
+#include <atomic>
+#include <barrier>
+#include <cassert>
+#include <cmath>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <functional>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <thread>
+#include <unistd.h>
+#include <vector>
+
+#include "Debug.h"
+#include "ThreadEnvironment.h"
+#include "ThreadEnvironmentImpl.h"
+#include "omptarget.h"
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic ELF - 64bit
+#endif
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#include "elf_common.h"
+
+#define OFFLOADSECTIONNAME "omp_offloading_entries"
+
+#define DEBUG false
+
+/// Everything libffi needs to invoke one target entry point.
+///
+/// Each kernel argument is passed as a pointer: Ptrs holds the adjusted
+/// argument values (TgtArgs[i] + TgtOffsets[i]) and Args holds the addresses
+/// of the elements of Ptrs, which is the layout ffi_call expects.
+struct FFICallTy {
+  ffi_cif CIF;
+  std::vector<ffi_type *> ArgsTypes;
+  std::vector<void *> Args;
+  std::vector<void *> Ptrs;
+  void (*Entry)(void) = nullptr;
+
+  /// \param ArgNum      number of kernel arguments.
+  /// \param TgtArgs     base pointers of the arguments (ArgNum entries).
+  /// \param TgtOffsets  byte offsets added to the base pointers.
+  /// \param TgtEntryPtr address of the function to call.
+  FFICallTy(int32_t ArgNum, void **TgtArgs, ptrdiff_t *TgtOffsets,
+            void *TgtEntryPtr)
+      : ArgsTypes(ArgNum, &ffi_type_pointer), Args(ArgNum), Ptrs(ArgNum) {
+    for (int32_t i = 0; i < ArgNum; ++i) {
+      Ptrs[i] = (void *)((intptr_t)TgtArgs[i] + TgtOffsets[i]);
+      Args[i] = &Ptrs[i];
+    }
+
+    // data() instead of &ArgsTypes[0]: indexing an empty vector is undefined
+    // behavior and kernels without arguments are legal.
+    ffi_status Status = ffi_prep_cif(&CIF, FFI_DEFAULT_ABI, ArgNum,
+                                     &ffi_type_void, ArgsTypes.data());
+
+    assert(Status == FFI_OK && "Unable to prepare target launch!");
+    // Keep builds without assertions free of unused-variable warnings.
+    (void)Status;
+
+    *((void **)&Entry) = TgtEntryPtr;
+  }
+
+  // Args stores addresses of the elements of Ptrs, so a member-wise copy
+  // would leave the copy pointing into the source object.
+  FFICallTy(const FFICallTy &) = delete;
+  FFICallTy &operator=(const FFICallTy &) = delete;
+};
+
+/// A dynamic library loaded for this target.
+struct DynLibTy {
+  /// Path of the file the library was loaded from. The path is stored by
+  /// value: __tgt_rtl_load_binary builds it in a stack array, and the entry
+  /// must stay usable until ~RTLDeviceInfoTy removes the file.
+  std::string FileName;
+  /// Handle returned by dlopen, or null if loading failed.
+  void *Handle;
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+};
+
+thread_local ThreadEnvironmentTy *ThreadEnvironment;
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+  /// One list of entry tables per device; the most recently created table is
+  /// the one consulted by the accessors below.
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+  std::list<DynLibTy> DynLibs;
+
+  // Record entry point associated with device.
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+                          __tgt_offload_entry *end) {
+    assert(device_id >= 0 && device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  // Return true if the entry is associated with device.
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id >= 0 && device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd;
+         i < e; ++i) {
+      if (i->addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table.
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id >= 0 && device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    return &E.Table;
+  }
+
+  // This plugin exposes a single device, hence one list of entry tables.
+  RTLDeviceInfoTy() : FuncGblEntries(1) {}
+
+  ~RTLDeviceInfoTy() {
+    // Close dynamic libraries and delete the files they were loaded from.
+    for (auto &lib : DynLibs) {
+      if (lib.Handle) {
+        dlclose(lib.Handle);
+        remove(lib.FileName.c_str());
+      }
+    }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo;
+
+std::vector<CTAEnvironmentTy *> CTAEnvironments;
+std::vector<WarpEnvironmentTy *> WarpEnvironments;
+
+struct VGPUTy {
+ struct KernelTy {
+ FFICallTy *Call;
+ int NumTeams;
+
+ KernelTy(FFICallTy *Call, int NumTeams) : Call(Call), NumTeams(NumTeams) {}
+ };
+
+ struct VGPUStreamTy {
+ std::queue<KernelTy> Kernels;
+ std::mutex Mtx;
+
+ void emplace(FFICallTy *Call, int NumTeams) {
+ std::lock_guard Guard(Mtx);
+ Kernels.emplace(Call, NumTeams);
+ }
+
+ KernelTy front() {
+ std::lock_guard Guard(Mtx);
+ return Kernels.front();
+ }
+
+ void pop() {
+ std::lock_guard Guard(Mtx);
+ Kernels.pop();
+ }
+
+ bool empty() {
+ std::lock_guard Guard(Mtx);
+ return Kernels.empty();
+ }
+ };
+
+ struct AsyncInfoQueueTy {
+ std::deque<__tgt_async_info *> Streams;
+ std::mutex Mtx;
+
+ bool empty() {
+ std::lock_guard Guard(Mtx);
+ return Streams.empty();
+ }
+
+ __tgt_async_info *front() {
+ std::lock_guard Guard(Mtx);
+ return Streams.front();
+ }
+
+ void pop() {
+ std::lock_guard Guard(Mtx);
+ Streams.pop_front();
+ }
+
+ void emplace(__tgt_async_info *AsyncInfo) {
+ std::lock_guard Guard(Mtx);
+ Streams.emplace_back(AsyncInfo);
+ }
+ } ExecutionQueue;
+
+ VGPUStreamTy *getStream(__tgt_async_info *AsyncInfo) {
+ assert(AsyncInfo != nullptr && "async_info ptr was null");
+
+ if (!AsyncInfo->Queue)
+ AsyncInfo->Queue = new VGPUStreamTy();
+
+ return reinterpret_cast<VGPUStreamTy *>(AsyncInfo->Queue);
+ }
+
+ std::atomic<bool> Running;
+ std::vector<std::thread> Threads;
+ int WarpsPerCTA = -1;
+ int NumCTAs = -1;
+ int NumThreads = -1;
+
+ std::unique_ptr<std::barrier<std::function<void(void)>>> Barrier;
+ std::condition_variable WorkAvailable;
+ std::mutex WorkDoneMtx;
+ std::condition_variable WorkDone;
+
+  /// Determine the virtual GPU geometry and publish it to the CTA and warp
+  /// environment classes.
+  ///
+  /// The environment variables VGPU_NUM_THREADS, VGPU_THREADS_PER_WARP and
+  /// VGPU_WARPS_PER_CTA override the defaults (hardware concurrency, one warp
+  /// spanning all threads, one warp per CTA). Values that are not positive
+  /// integers are ignored. If the resulting geometry does not divide the
+  /// thread count evenly, a single CTA with one warp of NumThreads threads is
+  /// used instead, so NumCTAs is always at least 1.
+  void configureArchitecture() {
+    // Returns the positive integer stored in the variable, or -1 if the
+    // variable is unset or unusable. strtol is used instead of std::stoi
+    // because the latter throws on malformed input.
+    auto ReadPositiveEnv = [](const char *Name) -> int {
+      const char *Env = std::getenv(Name);
+      if (!Env)
+        return -1;
+      char *End = nullptr;
+      const long Value = std::strtol(Env, &End, 10);
+      const bool Parsed = End != Env && *End == '\0';
+      const bool InRange =
+          Value > 0 && static_cast<long>(static_cast<int>(Value)) == Value;
+      if (!Parsed || !InRange) {
+        REPORT("Ignoring invalid value '%s' for %s\n", Env, Name);
+        return -1;
+      }
+      return static_cast<int>(Value);
+    };
+
+    int ThreadsPerWarp = -1;
+
+    if (int Value = ReadPositiveEnv("VGPU_NUM_THREADS"); Value != -1)
+      NumThreads = Value;
+    if (int Value = ReadPositiveEnv("VGPU_THREADS_PER_WARP"); Value != -1)
+      ThreadsPerWarp = Value;
+    if (int Value = ReadPositiveEnv("VGPU_WARPS_PER_CTA"); Value != -1)
+      WarpsPerCTA = Value;
+
+    if (NumThreads == -1) {
+      // hardware_concurrency() may return 0 when the value is not computable.
+      const unsigned Hardware = std::thread::hardware_concurrency();
+      NumThreads = Hardware ? static_cast<int>(Hardware) : 1;
+    }
+    if (ThreadsPerWarp == -1)
+      ThreadsPerWarp = NumThreads;
+    if (WarpsPerCTA == -1)
+      WarpsPerCTA = 1;
+
+    // Validate before dividing: the CTA size must evenly divide the thread
+    // count, otherwise NumCTAs would be 0 or threads would be left over.
+    const long long ThreadsPerCTA =
+        static_cast<long long>(ThreadsPerWarp) * WarpsPerCTA;
+    if (ThreadsPerCTA > NumThreads || NumThreads % ThreadsPerCTA != 0) {
+      REPORT("Invalid VGPU Config (NumThreads: %d, ThreadsPerWarp: %d, "
+             "WarpsPerCTA: %d), using a single CTA with one warp\n",
+             NumThreads, ThreadsPerWarp, WarpsPerCTA);
+      ThreadsPerWarp = NumThreads;
+      WarpsPerCTA = 1;
+    }
+
+    NumCTAs = NumThreads / (ThreadsPerWarp * WarpsPerCTA);
+
+    DP("NumThreads: %d, ThreadsPerWarp: %d, WarpsPerCTA: %d\n", NumThreads,
+       ThreadsPerWarp, WarpsPerCTA);
+
+    CTAEnvironmentTy::configure(NumThreads, NumCTAs);
+    WarpEnvironmentTy::configure(ThreadsPerWarp);
+  }
+
+ VGPUTy() : Running(true) {
+ configureArchitecture();
+
+ Barrier = std::make_unique<BarrierTy>(NumThreads, []() {});
+ Threads.reserve(NumThreads);
+
+ auto GlobalThreadIdx = 0;
+ for (auto CTAIdx = 0; CTAIdx < CTAEnvironmentTy::NumCTAs; CTAIdx++) {
+ auto *CTAEnv = new CTAEnvironmentTy();
+ for (auto WarpIdx = 0; WarpIdx < WarpsPerCTA; WarpIdx++) {
+ auto *WarpEnv = new WarpEnvironmentTy();
+ for (auto ThreadIdx = 0; ThreadIdx < WarpEnvironmentTy::ThreadsPerWarp;
+ ThreadIdx++) {
+ Threads.emplace_back([this, GlobalThreadIdx, CTAEnv, WarpEnv]() {
+ ThreadEnvironment = new ThreadEnvironmentTy(WarpEnv, CTAEnv);
+ while (Running) {
+ {
+ std::unique_lock<std::mutex> UniqueLock(ExecutionQueue.Mtx);
+
+ WorkAvailable.wait(UniqueLock, [&]() {
+ if (!Running)
+ return true;
+
+ bool IsEmpty = ExecutionQueue.Streams.empty();
+
+ return !IsEmpty;
+ });
+ }
+
+ if (ExecutionQueue.empty())
+ continue;
+
+ while (!ExecutionQueue.empty()) {
+ auto *Stream = getStream(ExecutionQueue.front());
+ while (!Stream->empty()) {
+ auto [Call, NumTeams] = Stream->front();
+
+ runKernel(CTAEnv, Call, NumTeams);
+
+ if (GlobalThreadIdx == 0) {
+ Stream->pop();
+ delete Call;
+ }
+
+ Barrier->arrive_and_wait();
+ }
+ if (GlobalThreadIdx == 0) {
+ ExecutionQueue.pop();
+ WorkDone.notify_all();
+ }
+ Barrier->arrive_and_wait();
+ }
+ }
+ delete ThreadEnvironment;
+ });
+ GlobalThreadIdx = (GlobalThreadIdx + 1) % NumThreads;
+ }
+ WarpEnvironments.push_back(WarpEnv);
+ }
+ CTAEnvironments.push_back(CTAEnv);
+ }
+ }
+
+  /// Execute the kernel described by Call for NumTeams thread blocks.
+  ///
+  /// Called by every worker thread. Blocks are handed out in rounds of
+  /// NumCTAs: in each round the CTA with id C runs block TeamIdx + C, and all
+  /// threads meet at the barrier before the next round starts.
+  ///
+  /// \param CTAEnv   the CTA the calling thread belongs to.
+  /// \param Call     prepared libffi call for the kernel entry point.
+  /// \param NumTeams total number of thread blocks to execute.
+  void runKernel(CTAEnvironmentTy *CTAEnv, FFICallTy *Call, int NumTeams) {
+    const int CTAId = static_cast<int>(CTAEnv->getId());
+    for (int TeamIdx = 0; TeamIdx < NumTeams; TeamIdx += NumCTAs) {
+      const int BlockId = TeamIdx + CTAId;
+      // Check the block id, not the CTA id: in the last round there may be
+      // fewer remaining blocks than CTAs, and those CTAs must sit idle.
+      if (BlockId < NumTeams) {
+        ThreadEnvironment->setBlockEnv(
+            new ThreadBlockEnvironmentTy(BlockId, NumTeams));
+        // data() stays valid for kernels without arguments.
+        ffi_call(&Call->CIF, Call->Entry, nullptr, Call->Args.data());
+        ThreadEnvironment->resetBlockEnv();
+      }
+      // Idle threads still have to arrive, the barrier counts every thread.
+      Barrier->arrive_and_wait();
+    }
+  }
+
+ ~VGPUTy() {
+ awaitAll();
+
+ Running = false;
+ WorkAvailable.notify_all();
+
+ for (auto &Thread : Threads) {
+ if (Thread.joinable())
+ Thread.join();
+ }
+
+ for (auto *CTAEnv : CTAEnvironments)
+ delete CTAEnv;
+
+ for (auto *WarpEnv : WarpEnvironments)
+ delete WarpEnv;
+ }
+
+ void await(__tgt_async_info *AsyncInfo) {
+ std::unique_lock UniqueLock(getStream(AsyncInfo)->Mtx);
+ WorkDone.wait(UniqueLock,
+ [&]() { return getStream(AsyncInfo)->Kernels.empty(); });
+ }
+
+ void awaitAll() {
+ while (!ExecutionQueue.empty()) {
+ await(ExecutionQueue.front());
+ }
+ }
+
+ void scheduleAsync(__tgt_async_info *AsyncInfo, FFICallTy *Call,
+ int NumTeams) {
+ if (NumTeams == 0)
+ NumTeams = NumCTAs;
+ auto *Stream = getStream(AsyncInfo);
+ Stream->emplace(Call, NumTeams);
+ ExecutionQueue.emplace(AsyncInfo);
+ WorkAvailable.notify_all();
+ }
+};
+
+VGPUTy VGPU;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+// If we don't have a valid ELF ID we can just fail.
+#if TARGET_ELF_ID < 1
+ return 0;
+#else
+ return elf_check_machine(image, TARGET_ELF_ID);
+#endif
+}
+
+int32_t __tgt_rtl_number_of_devices() { return 1; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; }
+
+/// Load a device image and return the table of its offload entries.
+///
+/// The image is an ELF shared object held in memory. The function locates
+/// the offload entries section in the ELF data, writes the image to a
+/// temporary file, loads that file with dlopen and computes the run-time
+/// address of the entries from the library base address.
+///
+/// \param device_id device to load for (only device 0 exists).
+/// \param image     in-memory image plus the host-side entry range.
+/// \returns the entries table, or null on any failure. The temporary file is
+///          removed again on failures that occur before the library has been
+///          recorded in DeviceInfo.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  DP("Dev %d: load binary from " DPxMOD " image\n", device_id,
+     DPxPTR(image->ImageStart));
+
+  assert(device_id >= 0 && device_id < 1 && "bad dev id");
+
+  size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+  size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+  DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return nullptr;
+  }
+
+  // Obtain elf handler
+  Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return nullptr;
+  }
+
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Invalid Elf kind!\n");
+    elf_end(e);
+    return nullptr;
+  }
+
+  // Find the entries section offset
+  Elf_Scn *section = 0;
+  Elf64_Off entries_offset = 0;
+
+  size_t shstrndx;
+
+  if (elf_getshdrstrndx(e, &shstrndx)) {
+    DP("Unable to get ELF strings index!\n");
+    elf_end(e);
+    return nullptr;
+  }
+
+  while ((section = elf_nextscn(e, section))) {
+    GElf_Shdr hdr;
+    // Skip sections whose header or name cannot be read instead of handing
+    // uninitialized data or a null pointer to strcmp.
+    if (!gelf_getshdr(section, &hdr))
+      continue;
+
+    const char *section_name = elf_strptr(e, shstrndx, hdr.sh_name);
+    if (section_name && !strcmp(section_name, OFFLOADSECTIONNAME)) {
+      entries_offset = hdr.sh_addr;
+      break;
+    }
+  }
+
+  // The ELF descriptor is only needed to find the section offset.
+  elf_end(e);
+
+  if (!entries_offset) {
+    DP("Entries Section Offset Not Found\n");
+    return nullptr;
+  }
+
+  DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset));
+
+  // load dynamic library and get the entry points. We use the dl library
+  // to do the loading of the library, but we could do it directly to avoid
+  // the dump to the temporary file.
+  //
+  // 1) Create tmp file with the library contents.
+  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+
+  if (tmp_fd == -1) {
+    DP("Unable to create a temporary file for the target library\n");
+    return nullptr;
+  }
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+
+  if (!ftmp) {
+    DP("Unable to open the temporary file '%s'\n", tmp_name);
+    // fdopen did not take ownership of the descriptor.
+    close(tmp_fd);
+    remove(tmp_name);
+    return nullptr;
+  }
+
+  // A short write would make dlopen load a truncated image.
+  const bool Written = fwrite(image->ImageStart, ImageSize, 1, ftmp) == 1;
+  const bool Closed = fclose(ftmp) == 0;
+
+  if (!Written || !Closed) {
+    DP("Unable to write the target library to '%s'\n", tmp_name);
+    remove(tmp_name);
+    return nullptr;
+  }
+
+  DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_NOW | RTLD_GLOBAL)};
+
+  if (!Lib.Handle) {
+    DP("Target library loading error: %s\n", dlerror());
+    remove(tmp_name);
+    return nullptr;
+  }
+
+  DeviceInfo.DynLibs.push_back(Lib);
+
+  // NOTE(review): treats the dlopen handle as a link_map pointer to obtain
+  // the load address; this depends on the C library implementation.
+  struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+  // The place where the entries info is loaded is the library base address
+  // plus the offset determined from the ELF file.
+  Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+  DP("Pointer to first entry to be loaded is (" DPxMOD ").\n",
+     DPxPTR(entries_addr));
+
+  // Table of pointers to all the entries in the target.
+  __tgt_offload_entry *entries_begin = (__tgt_offload_entry *)entries_addr;
+
+  if (!entries_begin) {
+    DP("Can't obtain entries begin\n");
+    return nullptr;
+  }
+
+  __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+  DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
+     DPxPTR(entries_begin), DPxPTR(entries_end));
+  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// Explicit memory allocator of this plugin. Every known allocation kind is
+// served by malloc; an unknown kind is reported and yields a null pointer.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr,
+                           int32_t kind) {
+  const bool IsKnownKind =
+      kind == TARGET_ALLOC_DEVICE || kind == TARGET_ALLOC_HOST ||
+      kind == TARGET_ALLOC_SHARED || kind == TARGET_ALLOC_DEFAULT;
+
+  if (!IsKnownKind) {
+    REPORT("Invalid target data allocation kind");
+    return NULL;
+  }
+
+  return malloc(size);
+}
+
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+ int64_t size) {
+ VGPU.awaitAll();
+ memcpy(tgt_ptr, hst_ptr, size);
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+ int64_t size) {
+ VGPU.awaitAll();
+ memcpy(hst_ptr, tgt_ptr, size);
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+ free(tgt_ptr);
+ return OFFLOAD_SUCCESS;
+}
+
+/// Wait until all kernels queued on the stream of async_info have been
+/// dequeued, then destroy the stream.
+///
+/// \returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL if async_info is null.
+int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) {
+  if (!async_info)
+    return OFFLOAD_FAIL;
+
+  VGPU.await(async_info);
+
+  // NOTE(review): the stream is destroyed as soon as its kernel queue is seen
+  // empty; confirm that no worker thread can still be polling it then.
+  delete static_cast<VGPUTy::VGPUStreamTy *>(async_info->Queue);
+  async_info->Queue = nullptr;
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+ void **tgt_args,
+ ptrdiff_t *tgt_offsets,
+ int32_t arg_num, int32_t team_num,
+ int32_t thread_limit,
+ uint64_t loop_tripcount) {
+ __tgt_async_info AsyncInfo;
+ int rc = __tgt_rtl_run_target_team_region_async(
+ device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+ thread_limit, loop_tripcount, &AsyncInfo);
+
+ if (rc != OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
+
+ return __tgt_rtl_synchronize(device_id, &AsyncInfo);
+}
+
+/// Queue a kernel launch on the stream associated with async_info.
+///
+/// The call description is heap-allocated here and handed to the virtual
+/// GPU, whose worker threads delete it after the kernel has run.
+/// thread_limit and loop_tripcount are not used by this plugin.
+///
+/// \returns OFFLOAD_SUCCESS; the kernel itself runs asynchronously.
+int32_t __tgt_rtl_run_target_team_region_async(
+    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount /*not used*/,
+    __tgt_async_info *async_info) {
+  DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+  auto *Call = new FFICallTy(arg_num, tgt_args, tgt_offsets, tgt_entry_ptr);
+
+  // A raw pointer is passed by value; moving it would be a no-op.
+  VGPU.scheduleAsync(async_info, Call, team_num);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+ void **tgt_args, ptrdiff_t *tgt_offsets,
+ int32_t arg_num) {
+ return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+ tgt_offsets, arg_num, 1, 1, 0);
+}
+
+int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
+ void *tgt_entry_ptr, void **tgt_args,
+ ptrdiff_t *tgt_offsets,
+ int32_t arg_num,
+ __tgt_async_info *async_info) {
+ return __tgt_rtl_run_target_team_region_async(device_id, tgt_entry_ptr,
+ tgt_args, tgt_offsets, arg_num,
+ 1, 1, 0, async_info);
+}
+
+#ifdef __cplusplus
+}
+#endif
Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h
@@ -0,0 +1,137 @@
+//===---- ThreadEnvironmentImpl.h - Virtual GPU thread environment - C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
+
+#include "ThreadEnvironment.h"
+#include <barrier>
+#include <cstdio>
+#include <functional>
+#include <map>
+#include <thread>
+#include <vector>
+
+using BarrierTy = std::barrier<std::function<void(void)>>;
+
+class WarpEnvironmentTy {
+ static unsigned Idx;
+
+ const unsigned ID;
+
+ std::vector<int32_t> ShuffleBuffer;
+
+ BarrierTy Barrier;
+ BarrierTy ShuffleBarrier;
+ BarrierTy ShuffleDownBarrier;
+
+public:
+ static void configure(unsigned NumThreadsInWarp);
+
+ static unsigned ThreadsPerWarp;
+
+ WarpEnvironmentTy();
+
+ unsigned getWarpId() const;
+ int getNumThreads() const;
+
+ void sync(int Ordering);
+ void writeShuffleBuffer(int32_t Var, unsigned LaneId);
+
+ int32_t getShuffleBuffer(unsigned LaneId);
+
+ void waitShuffleBarrier();
+ void waitShuffleDownBarrier();
+};
+
+class CTAEnvironmentTy {
+ static unsigned Idx;
+
+public:
+ unsigned ID;
+ static unsigned NumThreads;
+ static unsigned NumCTAs;
+
+ BarrierTy Barrier;
+ BarrierTy SyncThreads;
+ BarrierTy NamedBarrier;
+
+ static void configure(unsigned TotalNumThreads, unsigned NumBlocksInCTA);
+
+ CTAEnvironmentTy();
+
+ unsigned getId() const;
+ unsigned getNumThreads() const;
+
+ unsigned getNumBlocks() const;
+
+ void fence(int Ordering);
+ void syncThreads();
+ void namedBarrier();
+};
+
+class ThreadBlockEnvironmentTy {
+ unsigned ID;
+ unsigned NumBlocks;
+
+public:
+ ThreadBlockEnvironmentTy(unsigned ID, unsigned NumBlocks);
+
+ unsigned getId() const;
+ unsigned getNumBlocks() const;
+};
+
+namespace VGPUImpl {
+class ThreadEnvironmentTy {
+ static unsigned Idx;
+ unsigned ThreadIdInWarp;
+ unsigned ThreadIdInBlock;
+ unsigned GlobalThreadIdx;
+
+ WarpEnvironmentTy *WarpEnvironment;
+ ThreadBlockEnvironmentTy *ThreadBlockEnvironment;
+ CTAEnvironmentTy *CTAEnvironment;
+
+public:
+ ThreadEnvironmentTy(WarpEnvironmentTy *WE, CTAEnvironmentTy *CTAE);
+
+ void setBlockEnv(ThreadBlockEnvironmentTy *TBE);
+
+ void resetBlockEnv();
+
+ unsigned getThreadIdInWarp() const;
+ unsigned getThreadIdInBlock() const;
+ unsigned getGlobalThreadId() const;
+
+ unsigned getBlockSize() const;
+
+ unsigned getBlockId() const;
+
+ unsigned getNumberOfBlocks() const;
+ unsigned getKernelSize() const;
+
+ // FIXME: This is wrong
+ LaneMaskTy getActiveMask() const;
+
+ void fenceTeam(int Ordering);
+ void syncWarp(int Ordering);
+
+ int32_t shuffle(uint64_t Mask, int32_t Var, uint64_t SrcLane);
+
+ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta);
+
+ void namedBarrier(bool Generic);
+
+ void fenceKernel(int32_t MemoryOrder);
+
+ unsigned getWarpSize() const;
+};
+
+} // namespace VGPUImpl
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.cpp
@@ -0,0 +1,171 @@
+//===-- ThreadEnvironmentImpl.cpp - Virtual GPU thread environment - C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+
+#include "ThreadEnvironmentImpl.h"
+#include <barrier>
+#include <cstdio>
+#include <functional>
+#include <map>
+#include <thread>
+#include <vector>
+
+void WarpEnvironmentTy::configure(unsigned NumThreads) {
+ ThreadsPerWarp = NumThreads;
+}
+
+WarpEnvironmentTy::WarpEnvironmentTy()
+ : ID(Idx++), ShuffleBuffer(ThreadsPerWarp),
+ Barrier(ThreadsPerWarp, []() {}), ShuffleBarrier(ThreadsPerWarp, []() {}),
+ ShuffleDownBarrier(ThreadsPerWarp, []() {}) {}
+
+unsigned WarpEnvironmentTy::getWarpId() const { return ID; }
+
+int WarpEnvironmentTy::getNumThreads() const { return ThreadsPerWarp; }
+
+void WarpEnvironmentTy::sync(int Ordering) { Barrier.arrive_and_wait(); }
+
+void WarpEnvironmentTy::writeShuffleBuffer(int32_t Var, unsigned LaneId) {
+ ShuffleBuffer[LaneId] = Var;
+}
+
+int32_t WarpEnvironmentTy::getShuffleBuffer(unsigned LaneId) {
+ return ShuffleBuffer[LaneId];
+}
+
+void WarpEnvironmentTy::waitShuffleBarrier() {
+ ShuffleBarrier.arrive_and_wait();
+}
+
+/// Block until every thread of the warp has reached the shuffle-down
+/// rendezvous. Uses its own barrier so that shuffleDown exchanges do not
+/// share barrier state with plain shuffle exchanges.
+void WarpEnvironmentTy::waitShuffleDownBarrier() {
+  ShuffleDownBarrier.arrive_and_wait();
+}
+
+unsigned WarpEnvironmentTy::Idx = 0;
+unsigned WarpEnvironmentTy::ThreadsPerWarp = 0;
+
+void CTAEnvironmentTy::configure(unsigned TotalNumThreads, unsigned NumBlocks) {
+ NumThreads = TotalNumThreads / NumBlocks;
+ NumCTAs = NumBlocks;
+}
+
+CTAEnvironmentTy::CTAEnvironmentTy()
+ : ID(Idx++), Barrier(NumThreads, []() {}), SyncThreads(NumThreads, []() {}),
+ NamedBarrier(NumThreads, []() {}) {}
+
+unsigned CTAEnvironmentTy::getId() const { return ID; }
+unsigned CTAEnvironmentTy::getNumThreads() const { return NumThreads; }
+
+unsigned CTAEnvironmentTy::getNumBlocks() const { return NumCTAs; }
+
+void CTAEnvironmentTy::fence(int Ordering) { Barrier.arrive_and_wait(); }
+void CTAEnvironmentTy::syncThreads() { SyncThreads.arrive_and_wait(); }
+void CTAEnvironmentTy::namedBarrier() { NamedBarrier.arrive_and_wait(); }
+
+unsigned CTAEnvironmentTy::Idx = 0;
+unsigned CTAEnvironmentTy::NumThreads = 0;
+unsigned CTAEnvironmentTy::NumCTAs = 0;
+
+ThreadBlockEnvironmentTy::ThreadBlockEnvironmentTy(unsigned ID,
+ unsigned NumBlocks)
+ : ID(ID), NumBlocks(NumBlocks) {}
+
+unsigned ThreadBlockEnvironmentTy::getId() const { return ID; }
+unsigned ThreadBlockEnvironmentTy::getNumBlocks() const { return NumBlocks; }
+
+namespace VGPUImpl {
+/// Create the environment of one virtual GPU thread.
+///
+/// The lane id is taken from a counter shared by all threads, the id within
+/// the block from the warp id and lane id, and the global id from the CTA id
+/// and the id within the block. No thread block is attached until
+/// setBlockEnv is called.
+///
+/// NOTE(review): Idx is a plain static counter; if constructors run
+/// concurrently on several threads the increment is unsynchronized and lane
+/// ids depend on construction order. Confirm with the caller.
+ThreadEnvironmentTy::ThreadEnvironmentTy(WarpEnvironmentTy *WE,
+                                         CTAEnvironmentTy *CTAE)
+    : ThreadIdInWarp(Idx++ % WE->getNumThreads()),
+      ThreadIdInBlock(WE->getWarpId() * WE->getNumThreads() + ThreadIdInWarp),
+      GlobalThreadIdx(CTAE->getId() * CTAE->getNumThreads() + ThreadIdInBlock),
+      WarpEnvironment(WE),
+      // Must start out null: resetBlockEnv deletes this pointer.
+      ThreadBlockEnvironment(nullptr), CTAEnvironment(CTAE) {}
+
+void ThreadEnvironmentTy::setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+ ThreadBlockEnvironment = TBE;
+}
+
+void ThreadEnvironmentTy::resetBlockEnv() {
+ delete ThreadBlockEnvironment;
+ ThreadBlockEnvironment = nullptr;
+}
+
+unsigned ThreadEnvironmentTy::getThreadIdInWarp() const {
+ return ThreadIdInWarp;
+}
+unsigned ThreadEnvironmentTy::getThreadIdInBlock() const {
+ return ThreadIdInBlock;
+}
+unsigned ThreadEnvironmentTy::getGlobalThreadId() const {
+ return GlobalThreadIdx;
+}
+
+unsigned ThreadEnvironmentTy::getBlockSize() const {
+ return CTAEnvironment->getNumThreads();
+}
+
+unsigned ThreadEnvironmentTy::getBlockId() const {
+ return ThreadBlockEnvironment->getId();
+}
+
+unsigned ThreadEnvironmentTy::getNumberOfBlocks() const {
+ return ThreadBlockEnvironment->getNumBlocks();
+}
+unsigned ThreadEnvironmentTy::getKernelSize() const {
+ return getBlockSize() * getNumberOfBlocks();
+}
+
+// FIXME: This is wrong
+LaneMaskTy ThreadEnvironmentTy::getActiveMask() const { return ~0U; }
+
+void ThreadEnvironmentTy::fenceTeam(int Ordering) {
+ CTAEnvironment->fence(Ordering);
+}
+void ThreadEnvironmentTy::syncWarp(int Ordering) {
+ WarpEnvironment->sync(Ordering);
+}
+
+/// Warp shuffle: every lane publishes Var and receives the value published
+/// by lane SrcLane.
+///
+/// \param Mask    participating lanes; not evaluated, all lanes of the warp
+///                have to take part.
+/// \param Var     value contributed by the calling lane.
+/// \param SrcLane lane to read from, wrapped to the warp size.
+/// \returns the value written by the source lane.
+int32_t ThreadEnvironmentTy::shuffle(uint64_t Mask, int32_t Var,
+                                     uint64_t SrcLane) {
+  // Make sure no lane is still reading the buffer of a previous exchange.
+  WarpEnvironment->waitShuffleBarrier();
+  WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+  // After this barrier every lane has published its value.
+  WarpEnvironment->waitShuffleBarrier();
+  // Read the source lane's slot; reading the own slot would just return Var.
+  return WarpEnvironment->getShuffleBuffer(
+      static_cast<unsigned>(SrcLane % getWarpSize()));
+}
+
+int32_t ThreadEnvironmentTy::shuffleDown(uint64_t Mask, int32_t Var,
+ uint32_t Delta) {
+ WarpEnvironment->waitShuffleDownBarrier();
+ WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+ WarpEnvironment->waitShuffleDownBarrier();
+ Var = WarpEnvironment->getShuffleBuffer((ThreadIdInWarp + Delta) %
+ getWarpSize());
+ return Var;
+}
+
+void ThreadEnvironmentTy::namedBarrier(bool Generic) {
+ if (Generic) {
+ CTAEnvironment->namedBarrier();
+ } else {
+ CTAEnvironment->syncThreads();
+ }
+}
+
+void ThreadEnvironmentTy::fenceKernel(int32_t MemoryOrder) {
+ std::atomic_thread_fence(static_cast<std::memory_order>(MemoryOrder));
+}
+
+unsigned ThreadEnvironmentTy::getWarpSize() const {
+ return WarpEnvironment->getNumThreads();
+}
+
+unsigned ThreadEnvironmentTy::Idx = 0;
+
+} // namespace VGPUImpl
Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
@@ -0,0 +1,73 @@
+//===---- ThreadEnvironment.h - Virtual GPU thread environment ----- C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
+
+using LaneMaskTy = uint64_t;
+
+// Forward declaration
+class WarpEnvironmentTy;
+class ThreadBlockEnvironmentTy;
+class CTAEnvironmentTy;
+namespace VGPUImpl {
+class ThreadEnvironmentTy;
+void initLock(uint32_t *Lock);
+void destroyLock(uint32_t *Lock);
+void setLock(uint32_t *Lock);
+void unsetLock(uint32_t *Lock);
+bool testLock(uint32_t *Lock);
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+} // namespace VGPUImpl
+
+class ThreadEnvironmentTy {
+ VGPUImpl::ThreadEnvironmentTy *Impl;
+
+public:
+ ThreadEnvironmentTy(WarpEnvironmentTy *WE, CTAEnvironmentTy *CTAE);
+
+ ~ThreadEnvironmentTy();
+
+ unsigned getThreadIdInWarp() const;
+
+ unsigned getThreadIdInBlock() const;
+
+ unsigned getGlobalThreadId() const;
+
+ unsigned getBlockSize() const;
+
+ unsigned getKernelSize() const;
+
+ unsigned getBlockId() const;
+
+ unsigned getNumberOfBlocks() const;
+
+ LaneMaskTy getActiveMask() const;
+
+ unsigned getWarpSize() const;
+
+ int32_t shuffle(uint64_t Mask, int32_t Var, uint64_t SrcLane);
+
+ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta);
+
+ void fenceKernel(int32_t MemoryOrder);
+
+ void fenceTeam(int MemoryOrder);
+
+ void syncWarp(int Mask);
+
+ void namedBarrier(bool Generic);
+
+ void setBlockEnv(ThreadBlockEnvironmentTy *TBE);
+
+ void resetBlockEnv();
+};
+
+ThreadEnvironmentTy *getThreadEnvironment(void);
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
@@ -0,0 +1,117 @@
+//===---- ThreadEnvironment.cpp - Virtual GPU Thread Environment -- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of VGPU environment classes.
+//
+//===----------------------------------------------------------------------===//
+//
+#include <cstdint>
+
+#include "ThreadEnvironment.h"
+#include "ThreadEnvironmentImpl.h"
+#include <barrier>
+#include <mutex>
+
+// Held for the duration of every VGPUImpl::atomicInc call, whatever the
+// address being updated.
+std::mutex AtomicIncLock;
+
+/// Wrapping increment. Stores 0 to *Address when the current value is greater
+/// than or equal to Val, otherwise stores the current value plus one. Returns
+/// the value read before the update. Ordering is not consulted.
+uint32_t VGPUImpl::atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  const std::lock_guard<std::mutex> Guard(AtomicIncLock);
+  const uint32_t Old = *Address;
+  *Address = (Old >= Val) ? 0 : Old + 1;
+  return Old;
+}
+
+// Lock support.
+//
+// The lock state lives in the caller-provided 32-bit word: 0 means unlocked,
+// 1 means locked. A heap-allocated std::mutex cannot be used through this
+// interface: assigning its address to the by-value pointer parameter never
+// reaches the caller, and the word itself is not storage for a std::mutex.
+// The GCC/Clang __atomic builtins are used so no additional header is needed.
+
+/// Puts the lock word into the unlocked state.
+void VGPUImpl::initLock(uint32_t *Lock) {
+  __atomic_store_n(Lock, 0u, __ATOMIC_RELEASE);
+}
+
+/// Nothing is allocated per lock, so there is nothing to release.
+void VGPUImpl::destroyLock(uint32_t *Lock) { (void)Lock; }
+
+/// Acquires the lock, spinning until it becomes available.
+void VGPUImpl::setLock(uint32_t *Lock) {
+  while (!VGPUImpl::testLock(Lock)) {
+    // Wait with plain loads until the word reads as unlocked before retrying
+    // the compare-exchange.
+    while (__atomic_load_n(Lock, __ATOMIC_RELAXED) != 0u)
+      ;
+  }
+}
+
+/// Releases the lock.
+void VGPUImpl::unsetLock(uint32_t *Lock) {
+  __atomic_store_n(Lock, 0u, __ATOMIC_RELEASE);
+}
+
+/// Tries to acquire the lock without waiting. Returns true if the lock was
+/// acquired by this call and false if it was already held.
+bool VGPUImpl::testLock(uint32_t *Lock) {
+  uint32_t Expected = 0u;
+  return __atomic_compare_exchange_n(Lock, &Expected, 1u, /*weak=*/false,
+                                     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+}
+
+// Declared here only; each thread has its own copy of this pointer.
+extern thread_local ThreadEnvironmentTy *ThreadEnvironment;
+
+/// Returns the calling thread's ThreadEnvironment pointer.
+ThreadEnvironmentTy *getThreadEnvironment() {
+  ThreadEnvironmentTy *Current = ThreadEnvironment;
+  return Current;
+}
+
+// ThreadEnvironmentTy creates its VGPUImpl counterpart on construction,
+// deletes it on destruction, and forwards every other call to it. The
+// definitions below follow the declaration order of the class.
+
+ThreadEnvironmentTy::ThreadEnvironmentTy(WarpEnvironmentTy *WE,
+                                         CTAEnvironmentTy *CTAE)
+    : Impl(new VGPUImpl::ThreadEnvironmentTy(WE, CTAE)) {}
+
+ThreadEnvironmentTy::~ThreadEnvironmentTy() { delete Impl; }
+
+// Queries.
+
+unsigned ThreadEnvironmentTy::getThreadIdInWarp() const {
+  return Impl->getThreadIdInWarp();
+}
+
+unsigned ThreadEnvironmentTy::getThreadIdInBlock() const {
+  return Impl->getThreadIdInBlock();
+}
+
+unsigned ThreadEnvironmentTy::getGlobalThreadId() const {
+  return Impl->getGlobalThreadId();
+}
+
+unsigned ThreadEnvironmentTy::getBlockSize() const {
+  return Impl->getBlockSize();
+}
+
+unsigned ThreadEnvironmentTy::getKernelSize() const {
+  return Impl->getKernelSize();
+}
+
+unsigned ThreadEnvironmentTy::getBlockId() const {
+  return Impl->getBlockId();
+}
+
+unsigned ThreadEnvironmentTy::getNumberOfBlocks() const {
+  return Impl->getNumberOfBlocks();
+}
+
+LaneMaskTy ThreadEnvironmentTy::getActiveMask() const {
+  return Impl->getActiveMask();
+}
+
+unsigned ThreadEnvironmentTy::getWarpSize() const {
+  return Impl->getWarpSize();
+}
+
+// Shuffles.
+
+int32_t ThreadEnvironmentTy::shuffle(uint64_t Mask, int32_t Var,
+                                     uint64_t SrcLane) {
+  return Impl->shuffle(Mask, Var, SrcLane);
+}
+
+int32_t ThreadEnvironmentTy::shuffleDown(uint64_t Mask, int32_t Var,
+                                         uint32_t Delta) {
+  return Impl->shuffleDown(Mask, Var, Delta);
+}
+
+// Fences and barriers.
+
+void ThreadEnvironmentTy::fenceKernel(int32_t MemoryOrder) {
+  Impl->fenceKernel(MemoryOrder);
+}
+
+void ThreadEnvironmentTy::fenceTeam(int MemoryOrder) {
+  Impl->fenceTeam(MemoryOrder);
+}
+
+void ThreadEnvironmentTy::syncWarp(int Mask) { Impl->syncWarp(Mask); }
+
+void ThreadEnvironmentTy::namedBarrier(bool Generic) {
+  Impl->namedBarrier(Generic);
+}
+
+// Block environment management.
+
+void ThreadEnvironmentTy::setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+  Impl->setBlockEnv(TBE);
+}
+
+void ThreadEnvironmentTy::resetBlockEnv() { Impl->resetBlockEnv(); }
Index: openmp/libomptarget/plugins/vgpu/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/CMakeLists.txt
@@ -0,0 +1,74 @@
+###===----------------------------------------------------------------------===##
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build the VGPU plugin for virtual GPU offloading.
+#
+##===----------------------------------------------------------------------===#
+
+if (NOT(LIBOMPTARGET_ENABLE_EXPERIMENTAL_VGPU_PLUGIN))
+ return()
+endif()
+
+# Configure the virtual GPU offloading plugin.
+#
+#   tmachine         - pattern matched against the end of CMAKE_SYSTEM_PROCESSOR.
+#   tmachine_name    - name used in status messages and for -DTARGET_NAME.
+#   tmachine_libname - suffix of the library target omptarget.rtl.<libname>.
+#   tmachine_triple  - triple appended to LIBOMPTARGET_SYSTEM_TARGETS.
+#   elf_machine_id   - value passed as -DTARGET_ELF_ID.
+#
+# The plugin is only configured when the processor matches and both libelf and
+# libffi were found; otherwise a status message is printed.
+macro(build_generic_elf64_vgpu tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
+  if(LIBOMPTARGET_DEP_LIBELF_FOUND)
+    if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+      # The triple already ends in the plugin name; do not append it again.
+      libomptarget_say("Building ${tmachine_triple} offloading plugin.")
+
+      include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+      include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+      include_directories(${LIBOMPTARGET_INCLUDE_DIR})
+
+      # Define macro to be used as prefix of the runtime messages for this target.
+      add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+      # Define macro with the ELF ID for this target.
+      add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+      add_library("omptarget.rtl.${tmachine_libname}" SHARED
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/ThreadEnvironment.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/ThreadEnvironmentImpl.cpp)
+
+      # Install plugin under the lib destination folder.
+      install(TARGETS "omptarget.rtl.${tmachine_libname}"
+        LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+      set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES CXX_STANDARD 20)
+
+      target_link_libraries(
+        "omptarget.rtl.${tmachine_libname}"
+        elf_common
+        ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+        ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+        dl
+        # ${OPENMP_PTHREAD_LIB}
+        "-rdynamic"
+        "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+      )
+
+      list(APPEND LIBOMPTARGET_TESTED_PLUGINS
+        "omptarget.rtl.${tmachine_libname}")
+
+      # Report to the parent scope that we are building a plugin.
+      set(LIBOMPTARGET_SYSTEM_TARGETS
+        "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+      set(LIBOMPTARGET_TESTED_PLUGINS
+        "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
+    else()
+      libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
+    endif()
+  else()
+    libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.")
+  endif()
+else()
+  libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.")
+endif()
+endmacro()
+
+build_generic_elf64_vgpu("x86_64" "vgpu" "vgpu" "x86_64-vgpu" "62")
+
Index: openmp/libomptarget/plugins/CMakeLists.txt
===================================================================
--- openmp/libomptarget/plugins/CMakeLists.txt
+++ openmp/libomptarget/plugins/CMakeLists.txt
@@ -75,6 +75,7 @@
add_subdirectory(ppc64)
add_subdirectory(ppc64le)
add_subdirectory(ve)
+add_subdirectory(vgpu)
add_subdirectory(x86_64)
add_subdirectory(remote)
Index: openmp/libomptarget/DeviceRTL/src/Utils.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -14,6 +14,7 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
+#include "ThreadEnvironment.h"
#pragma omp declare target
@@ -32,10 +33,9 @@
namespace impl {
-/// AMDGCN Implementation
+/// AMDGCN/Generic Implementation
///
///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
static_assert(sizeof(unsigned long) == 8, "");
@@ -47,8 +47,6 @@
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}
-#pragma omp end declare variant
-
/// NVPTX Implementation
///
///{
@@ -113,6 +111,24 @@
#pragma omp end declare variant
} // namespace impl
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {kind(cpu)})
+
+namespace impl {
+
+/// Forwards the shuffle request to the calling thread's environment.
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  return Env->shuffle(Mask, Var, SrcLane);
+}
+
+/// Forwards the shuffle-down request to the calling thread's environment.
+/// Width is not passed on.
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  return Env->shuffleDown(Mask, Var, Delta);
+}
+
+} // namespace impl
+#pragma omp end declare variant
+///}
+
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
return impl::Pack(LowBits, HighBits);
}
Index: openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -16,6 +16,7 @@
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
+#include "ThreadEnvironment.h"
#include "Types.h"
#include "Utils.h"
@@ -283,6 +284,64 @@
} // namespace impl
+/// Virtual GPU Implementation
+///
+/// Every operation is delegated either to the VGPUImpl free functions or to
+/// the calling thread's ThreadEnvironmentTy.
+///{
+#pragma omp begin declare variant match(device = {kind(cpu)})
+
+namespace impl {
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return VGPUImpl::atomicInc(Address, Val, Ordering);
+}
+
+/// No initialization is performed for the named barrier.
+void namedBarrierInit() {}
+
+/// Checks that the thread count is a multiple of the warp size, then enters
+/// the barrier with Generic set to true.
+void namedBarrier() {
+  const uint32_t ThreadCount = omp_get_num_threads();
+  ASSERT(ThreadCount % mapping::getWarpSize() == 0);
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  Env->namedBarrier(true);
+}
+
+void fenceTeam(int Ordering) {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  Env->fenceTeam(Ordering);
+}
+
+void fenceKernel(int Ordering) {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  Env->fenceKernel(Ordering);
+}
+
+// Simply call fenceKernel because there is no need to sync with host
+void fenceSystem(int Ordering) { fenceKernel(Ordering); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  Env->syncWarp(Mask);
+}
+
+/// Enters the barrier with Generic set to false.
+void syncThreads() {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  Env->namedBarrier(false);
+}
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If it is
+// called before it is defined here, the overload won't happen. Investigate
+// later!
+void unsetLock(omp_lock_t *Lock) {
+  uint32_t *Word = (uint32_t *)Lock;
+  VGPUImpl::unsetLock(Word);
+}
+
+int testLock(omp_lock_t *Lock) {
+  uint32_t *Word = (uint32_t *)Lock;
+  return VGPUImpl::testLock(Word);
+}
+
+void initLock(omp_lock_t *Lock) {
+  uint32_t *Word = (uint32_t *)Lock;
+  VGPUImpl::initLock(Word);
+}
+
+void destroyLock(omp_lock_t *Lock) {
+  uint32_t *Word = (uint32_t *)Lock;
+  VGPUImpl::destroyLock(Word);
+}
+
+void setLock(omp_lock_t *Lock) {
+  uint32_t *Word = (uint32_t *)Lock;
+  VGPUImpl::setLock(Word);
+}
+
+/// Intentionally empty.
+void syncThreadsAligned() {}
+
+} // namespace impl
+
+#pragma omp end declare variant
+///}
+
void synchronize::init(bool IsSPMD) {
if (!IsSPMD)
impl::namedBarrierInit();
Index: openmp/libomptarget/DeviceRTL/src/Misc.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -18,10 +18,9 @@
namespace _OMP {
namespace impl {
-/// AMDGCN Implementation
+/// Generic Implementation - AMDGCN, VGPU
///
///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
double getWTick() { return ((double)1E-9); }
@@ -33,8 +32,6 @@
return 0;
}
-#pragma omp end declare variant
-
/// NVPTX Implementation
///
///{
Index: openmp/libomptarget/DeviceRTL/src/Mapping.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -17,10 +17,85 @@
#pragma omp declare target
+#include "ThreadEnvironment.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace _OMP;
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {kind(cpu)})
+
+namespace _OMP {
+namespace impl {
+
+constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::VirtualGpuGridValues;
+}
+
+/// Returns a lane mask with one bit set for every lane index in
+/// [0, mapping::getWarpSize()).
+LaneMaskTy activemask() {
+  LaneMaskTy Mask = 0;
+  // Shift a LaneMaskTy-wide one. A plain `1` is an int: `1 << 31` yields a
+  // negative value that sign-extends into the upper 32 bits of the mask, and
+  // shifting an int by 32 or more is undefined.
+  for (uint32_t Lane = 0, E = mapping::getWarpSize(); Lane < E; ++Lane)
+    Mask |= (LaneMaskTy)1 << Lane;
+  return Mask;
+}
+
+/// Returns the bits of mapping::activemask() that lie below the calling
+/// thread's lane index.
+LaneMaskTy lanemaskLT() {
+  const uint32_t LaneId = mapping::getThreadIdInWarp();
+  const LaneMaskTy Active = mapping::activemask();
+  // (1 << LaneId) - 1 has exactly the bits [0, LaneId) set.
+  const LaneMaskTy Below = ((LaneMaskTy)1 << LaneId) - (LaneMaskTy)1;
+  return Below & Active;
+}
+
+/// Returns the bits of mapping::activemask() that lie above the calling
+/// thread's lane index; the last lane of the warp gets an empty mask.
+LaneMaskTy lanemaskGT() {
+  const uint32_t LaneId = mapping::getThreadIdInWarp();
+  const bool IsLastLane = LaneId == (mapping::getWarpSize() - 1);
+  // Return before shifting by LaneId + 1 (presumably to keep the shift amount
+  // below the mask width when the warp fills the mask).
+  if (IsLastLane)
+    return 0;
+  const LaneMaskTy Active = mapping::activemask();
+  const LaneMaskTy Above = (~((LaneMaskTy)0)) << (LaneId + 1);
+  return Above & Active;
+}
+
+/// Lane index of the calling thread: the block-local thread id masked with
+/// (warp size - 1). NOTE(review): this equals a modulo only if the warp size
+/// is a power of two.
+uint32_t getThreadIdInWarp() {
+  const uint32_t LaneBits = mapping::getWarpSize() - 1;
+  return mapping::getThreadIdInBlock() & LaneBits;
+}
+
+uint32_t getThreadIdInBlock() {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  return Env->getThreadIdInBlock();
+}
+
+/// Reports the block size of the calling thread's environment.
+uint32_t getNumHardwareThreadsInBlock() {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  return Env->getBlockSize();
+}
+
+uint32_t getKernelSize() {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  return Env->getKernelSize();
+}
+
+uint32_t getBlockId() {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  return Env->getBlockId();
+}
+
+uint32_t getNumberOfBlocks() {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  return Env->getNumberOfBlocks();
+}
+
+/// Reports mapping::getBlockSize() as the number of processor elements.
+uint32_t getNumberOfProcessorElements() {
+  const uint32_t BlockSize = mapping::getBlockSize();
+  return BlockSize;
+}
+
+/// Block-local thread id divided by the warp size.
+uint32_t getWarpId() {
+  const uint32_t ThreadId = mapping::getThreadIdInBlock();
+  return ThreadId / mapping::getWarpSize();
+}
+
+uint32_t getWarpSize() {
+  ThreadEnvironmentTy *Env = getThreadEnvironment();
+  return Env->getWarpSize();
+}
+
+/// Block size divided by the warp size, rounded up.
+uint32_t getNumberOfWarpsInBlock() {
+  const uint32_t BlockSize = mapping::getBlockSize();
+  const uint32_t WarpSize = mapping::getWarpSize();
+  return (BlockSize + WarpSize - 1) / WarpSize;
+}
+
+} // namespace impl
+} // namespace _OMP
+
+#pragma omp end declare variant
+
namespace _OMP {
namespace impl {
Index: openmp/libomptarget/DeviceRTL/src/Debug.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -49,6 +49,15 @@
} // namespace impl
#pragma omp end declare variant
+#pragma omp begin declare variant match(device = {kind(cpu)})
+int32_t vprintf(const char *, void *);
+namespace impl {
+static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
+ return vprintf(Format, Arguments);
+}
+} // namespace impl
+#pragma omp end declare variant
+
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
return impl::omp_vprintf(Format, Arguments, Size);
}
Index: openmp/libomptarget/DeviceRTL/include/ThreadEnvironment.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/DeviceRTL/include/ThreadEnvironment.h
@@ -0,0 +1,11 @@
+//===--- ThreadEnvironment.h - OpenMP VGPU Dummy Header File ------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Dummy header file to avoid preprocessor errors in device runtime.
+//
+//===----------------------------------------------------------------------===//
Index: openmp/libomptarget/DeviceRTL/CMakeLists.txt
===================================================================
--- openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -157,9 +157,8 @@
add_custom_command(OUTPUT ${outfile}
COMMAND ${CLANG_TOOL}
- ${bc_flags}
- -Xclang -target-cpu -Xclang ${target_cpu}
${target_bc_flags}
+ ${bc_flags}
${infile} -o ${outfile}
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
@@ -227,9 +226,11 @@
# Generate a Bitcode library for all the compute capabilities the user requested
foreach(sm ${nvptx_sm_list})
- compileDeviceRTLLibrary(sm_${sm} nvptx -target nvptx64-nvidia-cuda -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0")
+ compileDeviceRTLLibrary(sm_${sm} nvptx -Xclang -target-cpu -Xclang sm_${sm} -target nvptx64-nvidia-cuda -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0")
endforeach()
foreach(mcpu ${amdgpu_mcpus})
- compileDeviceRTLLibrary(${mcpu} amdgpu -target amdgcn-amd-amdhsa -D__AMDGCN__ -nogpulib)
+ compileDeviceRTLLibrary(${mcpu} amdgpu -Xclang -target-cpu -Xclang ${mcpu} -target amdgcn-amd-amdhsa -D__AMDGCN__ -nogpulib)
endforeach()
+
+compileDeviceRTLLibrary(x86_64 vgpu -target x86_64-vgpu -std=c++20 -I${devicertl_base_directory}/../plugins/vgpu/src)
Index: openmp/CMakeLists.txt
===================================================================
--- openmp/CMakeLists.txt
+++ openmp/CMakeLists.txt
@@ -44,6 +44,8 @@
set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe)
set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe)
endif()
+
+ list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include)
endif()
# Check and set up common compiler flags.
Index: llvm/lib/Support/Triple.cpp
===================================================================
--- llvm/lib/Support/Triple.cpp
+++ llvm/lib/Support/Triple.cpp
@@ -185,6 +185,8 @@
case PC: return "pc";
case SCEI: return "scei";
case SUSE: return "suse";
+ case OpenMP_VGPU:
+ return "openmp_vgpu";
}
llvm_unreachable("Invalid VendorType!");
@@ -492,22 +494,23 @@
static Triple::VendorType parseVendor(StringRef VendorName) {
return StringSwitch<Triple::VendorType>(VendorName)
- .Case("apple", Triple::Apple)
- .Case("pc", Triple::PC)
- .Case("scei", Triple::SCEI)
- .Case("sie", Triple::SCEI)
- .Case("fsl", Triple::Freescale)
- .Case("ibm", Triple::IBM)
- .Case("img", Triple::ImaginationTechnologies)
- .Case("mti", Triple::MipsTechnologies)
- .Case("nvidia", Triple::NVIDIA)
- .Case("csr", Triple::CSR)
- .Case("myriad", Triple::Myriad)
- .Case("amd", Triple::AMD)
- .Case("mesa", Triple::Mesa)
- .Case("suse", Triple::SUSE)
- .Case("oe", Triple::OpenEmbedded)
- .Default(Triple::UnknownVendor);
+ .Case("apple", Triple::Apple)
+ .Case("pc", Triple::PC)
+ .Case("scei", Triple::SCEI)
+ .Case("sie", Triple::SCEI)
+ .Case("fsl", Triple::Freescale)
+ .Case("ibm", Triple::IBM)
+ .Case("img", Triple::ImaginationTechnologies)
+ .Case("mti", Triple::MipsTechnologies)
+ .Case("nvidia", Triple::NVIDIA)
+ .Case("csr", Triple::CSR)
+ .Case("myriad", Triple::Myriad)
+ .Case("amd", Triple::AMD)
+ .Case("mesa", Triple::Mesa)
+ .Case("suse", Triple::SUSE)
+ .Case("oe", Triple::OpenEmbedded)
+ .Case("openmp_vgpu", Triple::OpenMP_VGPU)
+ .Default(Triple::UnknownVendor);
}
static Triple::OSType parseOS(StringRef OSName) {
Index: llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -114,6 +114,38 @@
128, // GV_Default_WG_Size
};
+/// For Virtual GPUs
+static constexpr GV VirtualGpuGridValues = {
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
+ 1024, // GV_Max_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size
+ 128, // GV_Default_WG_Size
+};
+
+static const unsigned OpenMPVGPUAddrSpaceMap[] = {
+ 0, // Default
+ 1, // opencl_global
+ 3, // opencl_local
+ 4, // opencl_constant
+ 0, // opencl_private
+ 0, // opencl_generic
+ 1, // opencl_global_device
+ 1, // opencl_global_host
+ 1, // cuda_device
+ 4, // cuda_constant
+ 3, // cuda_shared
+ 1, // sycl_global
+ 0, // sycl_global_device
+ 0, // sycl_global_host
+ 3, // sycl_local
+ 0, // sycl_private
+ 270, // ptr32_sptr
+ 271, // ptr32_uptr
+ 272 // ptr64
+};
+
} // namespace omp
} // namespace llvm
Index: llvm/include/llvm/ADT/Triple.h
===================================================================
--- llvm/include/llvm/ADT/Triple.h
+++ llvm/include/llvm/ADT/Triple.h
@@ -165,7 +165,8 @@
Mesa,
SUSE,
OpenEmbedded,
- LastVendorType = OpenEmbedded
+ OpenMP_VGPU,
+ LastVendorType = OpenMP_VGPU
};
enum OSType {
UnknownOS,
@@ -691,6 +692,9 @@
return getArch() == Triple::nvptx || getArch() == Triple::nvptx64;
}
+ /// Tests whether the target is OpenMP VGPU.
+ bool isOpenMPVGPU() const { return getVendor() == llvm::Triple::OpenMP_VGPU; }
+
/// Tests whether the target is AMDGCN
bool isAMDGCN() const { return getArch() == Triple::amdgcn; }
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -3985,7 +3985,8 @@
}
// Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
- Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
+ Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice &&
+ (T.isNVPTX() || T.isAMDGCN() || T.isOpenMPVGPU()) &&
Args.hasArg(options::OPT_fopenmp_cuda_mode);
// Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
Index: clang/lib/Driver/ToolChains/Gnu.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Gnu.cpp
+++ clang/lib/Driver/ToolChains/Gnu.cpp
@@ -3069,4 +3069,13 @@
if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
options::OPT_fno_use_init_array, true))
CC1Args.push_back("-fno-use-init-array");
+
+ if (DriverArgs.hasArg(options::OPT_S))
+ return;
+
+ if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) {
+ std::string BitcodeSuffix = getTripleString() + "-openmp_vgpu";
+ clang::driver::tools::addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args,
+ BitcodeSuffix, getTriple());
+ }
}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -249,7 +249,9 @@
OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
break;
default:
- if (LangOpts.OpenMPSimd)
+ if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU)
+ OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
+ else if (LangOpts.OpenMPSimd)
OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
else
OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1119,10 +1119,11 @@
CGM.addCompilerUsedGlobal(GVMode);
}
-void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
- llvm::Constant *Addr,
- uint64_t Size, int32_t,
- llvm::GlobalValue::LinkageTypes) {
+void CGOpenMPRuntimeGPU::createOffloadEntry(
+ llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t Flags,
+ llvm::GlobalValue::LinkageTypes Linkage) {
+ if (CGM.getTarget().getTriple().getVendor() == llvm::Triple::OpenMP_VGPU)
+ return CGOpenMPRuntime::createOffloadEntry(ID, Addr, Size, Flags, Linkage);
// TODO: Add support for global variables on the device after declare target
// support.
llvm::Function *Fn = dyn_cast<llvm::Function>(Addr);
Index: clang/lib/Basic/Targets/X86.h
===================================================================
--- clang/lib/Basic/Targets/X86.h
+++ clang/lib/Basic/Targets/X86.h
@@ -17,6 +17,7 @@
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/X86TargetParser.h"
@@ -388,6 +389,10 @@
uint64_t getPointerAlignV(unsigned AddrSpace) const override {
return getPointerWidthV(AddrSpace);
}
+
+ const llvm::omp::GV &getGridValue() const override {
+ return llvm::omp::VirtualGpuGridValues;
+ }
};
// X86-32 generic target
Index: clang/lib/Basic/TargetInfo.cpp
===================================================================
--- clang/lib/Basic/TargetInfo.cpp
+++ clang/lib/Basic/TargetInfo.cpp
@@ -151,6 +151,9 @@
MaxOpenCLWorkGroupSize = 1024;
ProgramAddrSpace = 0;
+
+ if (Triple.getVendor() == llvm::Triple::OpenMP_VGPU)
+ AddrSpaceMap = &llvm::omp::OpenMPVGPUAddrSpaceMap;
}
// Out of line virtual dtor for TargetInfo.
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits