atmnpatel updated this revision to Diff 386426.
atmnpatel added a comment.
small nit fix
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D113359/new/
https://reviews.llvm.org/D113359
Files:
clang/lib/Basic/Targets/X86.h
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/CodeGen/CodeGenModule.cpp
clang/lib/Driver/ToolChains/Gnu.cpp
clang/lib/Frontend/CompilerInvocation.cpp
llvm/include/llvm/ADT/Triple.h
llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
llvm/lib/Support/Triple.cpp
openmp/CMakeLists.txt
openmp/libomptarget/DeviceRTL/CMakeLists.txt
openmp/libomptarget/DeviceRTL/src/Debug.cpp
openmp/libomptarget/DeviceRTL/src/Kernel.cpp
openmp/libomptarget/DeviceRTL/src/Mapping.cpp
openmp/libomptarget/DeviceRTL/src/Misc.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
openmp/libomptarget/DeviceRTL/src/Utils.cpp
openmp/libomptarget/plugins/CMakeLists.txt
openmp/libomptarget/plugins/vgpu/CMakeLists.txt
openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h
openmp/libomptarget/plugins/vgpu/src/rtl.cpp
openmp/libomptarget/src/rtl.cpp
Index: openmp/libomptarget/src/rtl.cpp
===================================================================
--- openmp/libomptarget/src/rtl.cpp
+++ openmp/libomptarget/src/rtl.cpp
@@ -30,6 +30,7 @@
/* SX-Aurora VE target */ "libomptarget.rtl.ve.so",
/* AMDGPU target */ "libomptarget.rtl.amdgpu.so",
/* Remote target */ "libomptarget.rtl.rpc.so",
+ /* Virtual GPU target */ "libomptarget.rtl.vgpu.so",
};
PluginManager *PM;
@@ -79,7 +80,13 @@
// is correct and if they are supporting any devices.
for (auto *Name : RTLNames) {
DP("Loading library '%s'...\n", Name);
- void *dynlib_handle = dlopen(Name, RTLD_NOW);
+
+ int Flags = RTLD_NOW;
+
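+  // Load the virtual GPU plugin with RTLD_GLOBAL so the symbols it defines
+  // (e.g. the thread environment accessors) remain visible to the target
+  // images the plugin later dlopens itself.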
+ if (strcmp(Name, "libomptarget.rtl.vgpu.so") == 0)
+ Flags |= RTLD_GLOBAL;
+
+ void *dynlib_handle = dlopen(Name, Flags);
if (!dynlib_handle) {
// Library does not exist or cannot be found.
Index: openmp/libomptarget/plugins/vgpu/src/rtl.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/rtl.cpp
@@ -0,0 +1,623 @@
+//===------ RTLs/vgpu/src/rtl.cpp - Target RTLs Implementation ---- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for virtual (x86) GPU
+//
+//===----------------------------------------------------------------------===//
+
+#include <barrier>
+#include <cassert>
+#include <cmath>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <functional>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+#include "Debug.h"
+#include "ThreadEnvironment.h"
+#include "ThreadEnvironmentImpl.h"
+#include "omptarget.h"
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME VGPU
+#endif
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
+
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#include "elf_common.h"
+
+#define NUMBER_OF_DEVICES 1
+#define OFFLOADSECTIONNAME "omp_offloading_entries"
+
+#define DEBUG false
+
+/// A dynamic library loaded for this target.
+struct DynLibTy {
+ char *FileName;
+ void *Handle;
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+ __tgt_target_table Table;
+};
+
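+/// Per-worker-thread view of the virtual GPU thread currently being emulated;
+/// created and destroyed by the worker threads in VGPUTy.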
+thread_local ThreadEnvironmentTy *ThreadEnvironment;
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+ std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+ std::list<DynLibTy> DynLibs;
+
+ // Record entry point associated with device.
+ void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+ __tgt_offload_entry *end) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncGblEntries[device_id].emplace_back();
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ E.Table.EntriesBegin = begin;
+ E.Table.EntriesEnd = end;
+ }
+
+ // Return true if the entry is associated with device.
+ bool findOffloadEntry(int32_t device_id, void *addr) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd;
+ i < e; ++i) {
+ if (i->addr == addr)
+ return true;
+ }
+
+ return false;
+ }
+
+ // Return the pointer to the target entries table.
+ __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ return &E.Table;
+ }
+
+ RTLDeviceInfoTy(int32_t num_devices) { FuncGblEntries.resize(num_devices); }
+
+ ~RTLDeviceInfoTy() {
+ // Close dynamic libraries
+ for (auto &lib : DynLibs) {
+ if (lib.Handle) {
+ dlclose(lib.Handle);
+ remove(lib.FileName);
+ }
+ }
+ }
+};
+
+static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
+
+std::vector<CTAEnvironmentTy *> CTAEnvironments;
+std::vector<WarpEnvironmentTy *> WarpEnvironments;
+
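+/// State of the emulated device: a pool of host threads organized into CTAs
+/// and warps that cooperatively execute the kernels queued on the streams of
+/// the execution queue.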
+struct VGPUTy {
+ struct KernelTy {
+ ffi_cif *Cif;
+ std::function<void(void)> Kernel;
+ int NumTeams;
+
+ KernelTy(ffi_cif *Cif, std::function<void(void)> Kernel, int NumTeams)
+ : Cif(Cif), Kernel(Kernel), NumTeams(NumTeams) {}
+ };
+
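+  /// A stream of kernels, executed in FIFO order. Streams back the Queue
+  /// member of __tgt_async_info.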
+ struct VGPUStreamTy {
+ std::queue<KernelTy> Kernels;
+ std::mutex Mtx;
+
+ void emplace(ffi_cif *Cif, std::function<void(void)> F, int NumTeams) {
+ std::lock_guard Guard(Mtx);
+ Kernels.emplace(Cif, F, NumTeams);
+ }
+
+ KernelTy front() {
+ std::lock_guard Guard(Mtx);
+ return Kernels.front();
+ }
+
+ void pop() {
+ std::lock_guard Guard(Mtx);
+ Kernels.pop();
+ }
+
+ bool empty() {
+ std::lock_guard Guard(Mtx);
+ return Kernels.empty();
+ }
+ };
+
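+  /// Queue of async info objects whose streams still have kernels pending.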
+ struct AsyncInfoQueueTy {
+ std::deque<__tgt_async_info *> Streams;
+ std::mutex Mtx;
+
+ bool empty() {
+ std::lock_guard Guard(Mtx);
+ return Streams.empty();
+ }
+
+ __tgt_async_info *front() {
+ std::lock_guard Guard(Mtx);
+ return Streams.front();
+ }
+
+ void pop() {
+ std::lock_guard Guard(Mtx);
+ Streams.pop_front();
+ }
+
+ void emplace(__tgt_async_info *AsyncInfo) {
+ std::lock_guard Guard(Mtx);
+ Streams.emplace_back(AsyncInfo);
+ }
+ } ExecutionQueue;
+
+ VGPUStreamTy *getStream(__tgt_async_info *AsyncInfo) {
+ assert(AsyncInfo != nullptr && "async_info ptr was null");
+
+ if (!AsyncInfo->Queue)
+ AsyncInfo->Queue = new VGPUStreamTy();
+
+ return reinterpret_cast<VGPUStreamTy *>(AsyncInfo->Queue);
+ }
+
+ std::atomic<bool> Running;
+ std::vector<std::thread> Threads;
+ int WarpsPerCTA;
+ int NumCTAs;
+
+ std::unique_ptr<std::barrier<std::function<void(void)>>> Barrier;
+ std::condition_variable WorkAvailable;
+ std::mutex WorkDoneMtx;
+ std::condition_variable WorkDone;
+
+ VGPUTy(int NumThreads = -1, int ThreadsPerWarp = -1, int WarpsPerCTA = -1)
+ : Running(true) {
+ if (const char *Env = std::getenv("VGPU_NUM_THREADS"))
+ NumThreads = std::stoi(Env);
+ if (const char *Env = std::getenv("VGPU_THREADS_PER_WARP"))
+ ThreadsPerWarp = std::stoi(Env);
+ if (const char *Env = std::getenv("VGPU_WARPS_PER_CTA"))
+ WarpsPerCTA = std::stoi(Env);
+
+ if (NumThreads == -1)
+ NumThreads = std::thread::hardware_concurrency();
+ if (ThreadsPerWarp == -1)
+ ThreadsPerWarp = NumThreads;
+ if (WarpsPerCTA == -1)
+ WarpsPerCTA = 1;
+
+ NumCTAs = NumThreads / (ThreadsPerWarp * WarpsPerCTA);
+
+ // printf("NumThreads: %d, ThreadsPerWarp: %d, WarpsPerCTA: %d\n",
+ // NumThreads,
+ // ThreadsPerWarp, WarpsPerCTA);
+
+ assert(NumThreads % ThreadsPerWarp == 0 && NumThreads % WarpsPerCTA == 0 &&
+ "Invalid VGPU Config");
+
+ Barrier = std::make_unique<std::barrier<std::function<void(void)>>>(
+ NumThreads, []() {});
+
+ Threads.reserve(NumThreads);
+
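+    // Spawn one host thread per emulated GPU thread. Each worker waits for
+    // streams on the execution queue and runs every queued kernel, iterating
+    // over the requested teams in rounds of NumCTAs.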
+ auto GlobalThreadIdx = 0;
+ for (auto CTAIdx = 0; CTAIdx < NumCTAs; CTAIdx++) {
+ auto *CTAEnv =
+ new CTAEnvironmentTy(CTAIdx, NumThreads / NumCTAs, NumCTAs);
+ for (auto WarpIdx = 0; WarpIdx < WarpsPerCTA; WarpIdx++) {
+ auto *WarpEnv = new WarpEnvironmentTy(WarpIdx, ThreadsPerWarp);
+ for (auto ThreadIdx = 0; ThreadIdx < ThreadsPerWarp; ThreadIdx++) {
+ Threads.emplace_back(
+ [this, ThreadIdx, GlobalThreadIdx, CTAEnv, WarpEnv]() {
+ ThreadEnvironment =
+ new ThreadEnvironmentTy(ThreadIdx, WarpEnv, CTAEnv);
+ std::function<void(void)> Kernel;
+ while (Running) {
+ {
+ std::unique_lock<std::mutex> UniqueLock(ExecutionQueue.Mtx);
+
+ WorkAvailable.wait(UniqueLock, [&]() {
+ if (!Running) {
+ return true;
+ }
+ bool IsEmpty = ExecutionQueue.Streams.empty();
+
+ return !IsEmpty;
+ });
+ }
+
+ if (ExecutionQueue.empty()) {
+ continue;
+ }
+
+ while (!ExecutionQueue.empty()) {
+ auto *Stream = getStream(ExecutionQueue.front());
+ while (!Stream->empty()) {
+ auto KernelInfo = Stream->front();
+ Kernel = KernelInfo.Kernel;
+
+ const unsigned NumTeams = KernelInfo.NumTeams;
+ unsigned TeamIdx = 0;
+                  while (TeamIdx < NumTeams) {
+                    if (TeamIdx + CTAEnv->getId() < NumTeams) {
+ ThreadEnvironment->setBlockEnv(
+ new ThreadBlockEnvironmentTy(
+ TeamIdx + CTAEnv->getId(), NumTeams));
+ Kernel();
+ ThreadEnvironment->resetBlockEnv();
+ }
+ Barrier->arrive_and_wait();
+ TeamIdx += NumCTAs;
+ }
+
+ if (GlobalThreadIdx == 0) {
+ delete KernelInfo.Cif;
+ Stream->pop();
+ }
+
+ Barrier->arrive_and_wait();
+ }
+ if (GlobalThreadIdx == 0) {
+ ExecutionQueue.pop();
+ WorkDone.notify_all();
+ }
+ Barrier->arrive_and_wait();
+ }
+ }
+ delete ThreadEnvironment;
+ });
+ GlobalThreadIdx = (GlobalThreadIdx + 1) % NumThreads;
+ }
+ WarpEnvironments.push_back(WarpEnv);
+ }
+ CTAEnvironments.push_back(CTAEnv);
+ }
+ }
+
+ ~VGPUTy() {
+ awaitAll();
+
+ Running = false;
+ WorkAvailable.notify_all();
+
+ for (auto &Thread : Threads) {
+ if (Thread.joinable()) {
+ Thread.join();
+ }
+ }
+
+ for (auto *CTAEnv : CTAEnvironments)
+ delete CTAEnv;
+
+ for (auto *WarpEnv : WarpEnvironments)
+ delete WarpEnv;
+ }
+
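+  /// Block until every kernel queued on the stream of \p AsyncInfo has
+  /// finished.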
+ void await(__tgt_async_info *AsyncInfo) {
+ std::unique_lock UniqueLock(getStream(AsyncInfo)->Mtx);
+ WorkDone.wait(UniqueLock,
+ [&]() { return getStream(AsyncInfo)->Kernels.empty(); });
+ }
+
+ void awaitAll() {
+ while (!ExecutionQueue.empty()) {
+ await(ExecutionQueue.front());
+ }
+ }
+
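+  /// Enqueue a kernel on the stream of \p AsyncInfo and wake up the workers.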
+ void scheduleAsync(__tgt_async_info *AsyncInfo, ffi_cif *Cif,
+ std::function<void(void)> F, int NumTeams) {
+ if (NumTeams == 0)
+ NumTeams = NumCTAs;
+ auto *Stream = getStream(AsyncInfo);
+ Stream->emplace(Cif, F, NumTeams);
+ ExecutionQueue.emplace(AsyncInfo);
+ WorkAvailable.notify_all();
+ }
+};
+
+VGPUTy VGPU;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+// If we don't have a valid ELF ID we can just fail.
+#if TARGET_ELF_ID < 1
+ return 0;
+#else
+ return elf_check_machine(image, TARGET_ELF_ID);
+#endif
+}
+
+int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; }
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+ __tgt_device_image *image) {
+
+ DP("Dev %d: load binary from " DPxMOD " image\n", device_id,
+ DPxPTR(image->ImageStart));
+
+ assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id");
+
+ size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+ size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+ DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+ // Is the library version incompatible with the header file?
+ if (elf_version(EV_CURRENT) == EV_NONE) {
+ DP("Incompatible ELF library!\n");
+ return NULL;
+ }
+
+ // Obtain elf handler
+ Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+ if (!e) {
+ DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+ return NULL;
+ }
+
+ if (elf_kind(e) != ELF_K_ELF) {
+ DP("Invalid Elf kind!\n");
+ elf_end(e);
+ return NULL;
+ }
+
+ // Find the entries section offset
+ Elf_Scn *section = 0;
+ Elf64_Off entries_offset = 0;
+
+ size_t shstrndx;
+
+ if (elf_getshdrstrndx(e, &shstrndx)) {
+ DP("Unable to get ELF strings index!\n");
+ elf_end(e);
+ return NULL;
+ }
+
+ while ((section = elf_nextscn(e, section))) {
+ GElf_Shdr hdr;
+ gelf_getshdr(section, &hdr);
+
+ if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) {
+ entries_offset = hdr.sh_addr;
+ break;
+ }
+ }
+
+ if (!entries_offset) {
+ DP("Entries Section Offset Not Found\n");
+ elf_end(e);
+ return NULL;
+ }
+
+ DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset));
+
+  // Load the dynamic library and get the entry points. We use the dl library
+  // to do the loading, but we could do it directly to avoid writing the image
+  // to a temporary file.
+ //
+ // 1) Create tmp file with the library contents.
+ // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+ char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+ int tmp_fd = mkstemp(tmp_name);
+
+ if (tmp_fd == -1) {
+ elf_end(e);
+ return NULL;
+ }
+
+ FILE *ftmp = fdopen(tmp_fd, "wb");
+
+ if (!ftmp) {
+ elf_end(e);
+ return NULL;
+ }
+
+ fwrite(image->ImageStart, ImageSize, 1, ftmp);
+ fclose(ftmp);
+
+ DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_NOW | RTLD_GLOBAL)};
+
+ if (!Lib.Handle) {
+ DP("Target library loading error: %s\n", dlerror());
+ elf_end(e);
+ return NULL;
+ }
+
+ DeviceInfo.DynLibs.push_back(Lib);
+
+ struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+ // The place where the entries info is loaded is the library base address
+ // plus the offset determined from the ELF file.
+ Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+ DP("Pointer to first entry to be loaded is (" DPxMOD ").\n",
+ DPxPTR(entries_addr));
+
+ // Table of pointers to all the entries in the target.
+ __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr;
+
+ __tgt_offload_entry *entries_begin = &entries_table[0];
+ __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+ if (!entries_begin) {
+ DP("Can't obtain entries begin\n");
+ elf_end(e);
+ return NULL;
+ }
+
+ DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
+ DPxPTR(entries_begin), DPxPTR(entries_end));
+ DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+ elf_end(e);
+
+ return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// Sample implementation of explicit memory allocator. For this plugin all
+// kinds are equivalent to each other.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr,
+ int32_t kind) {
+ void *ptr = NULL;
+
+ switch (kind) {
+ case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_HOST:
+ case TARGET_ALLOC_SHARED:
+ case TARGET_ALLOC_DEFAULT:
+ ptr = malloc(size);
+ break;
+ default:
+ REPORT("Invalid target data allocation kind");
+ }
+
+ return ptr;
+}
+
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+ int64_t size) {
+ VGPU.awaitAll();
+ memcpy(tgt_ptr, hst_ptr, size);
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+ int64_t size) {
+ VGPU.awaitAll();
+ memcpy(hst_ptr, tgt_ptr, size);
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+ free(tgt_ptr);
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) {
+ VGPU.await(async_info);
+ delete (VGPUTy::VGPUStreamTy *)async_info->Queue;
+ async_info->Queue = nullptr;
+ return 0;
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+ void **tgt_args,
+ ptrdiff_t *tgt_offsets,
+ int32_t arg_num, int32_t team_num,
+ int32_t thread_limit,
+ uint64_t loop_tripcount) {
+ __tgt_async_info AsyncInfo;
+ int rc = __tgt_rtl_run_target_team_region_async(
+ device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+ thread_limit, loop_tripcount, &AsyncInfo);
+
+ if (rc != OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
+
+ return __tgt_rtl_synchronize(device_id, &AsyncInfo);
+}
+
+int32_t __tgt_rtl_run_target_team_region_async(
+ int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+ ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+ int32_t thread_limit, uint64_t loop_tripcount /*not used*/,
+ __tgt_async_info *async_info) {
+ ffi_cif *cif = new ffi_cif();
+
+ // All args are references.
+ std::shared_ptr<std::vector<ffi_type *>> args_types =
+ std::make_shared<std::vector<ffi_type *>>(arg_num, &ffi_type_pointer);
+ std::shared_ptr<std::vector<void *>> args =
+ std::make_shared<std::vector<void *>>(arg_num);
+ std::shared_ptr<std::vector<void *>> ptrs =
+ std::make_shared<std::vector<void *>>(arg_num);
+
+ for (int32_t i = 0; i < arg_num; ++i) {
+ (*ptrs)[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+ (*args)[i] = &(*ptrs)[i];
+ }
+
+ ffi_status status = ffi_prep_cif(cif, FFI_DEFAULT_ABI, arg_num,
+ &ffi_type_void, &(*args_types)[0]);
+
+ assert(status == FFI_OK && "Unable to prepare target launch!");
+
+ if (status != FFI_OK)
+ return OFFLOAD_FAIL;
+
+ DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+ void (*entry)(void);
+ *((void **)&entry) = tgt_entry_ptr;
+
+ VGPU.scheduleAsync(
+ async_info, cif,
+      [&]() {
+        // args, ptrs, and args_types are captured by reference; they stay
+        // alive because the launch is synchronized via VGPU.await() below.
+        ffi_call(cif, entry, NULL, &(*args)[0]);
+      },
+ team_num);
+ VGPU.await(async_info);
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+ void **tgt_args, ptrdiff_t *tgt_offsets,
+ int32_t arg_num) {
+ return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+ tgt_offsets, arg_num, 1, 1, 0);
+}
+
+int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
+ void *tgt_entry_ptr, void **tgt_args,
+ ptrdiff_t *tgt_offsets,
+ int32_t arg_num,
+ __tgt_async_info *async_info) {
+ return __tgt_rtl_run_target_team_region_async(device_id, tgt_entry_ptr,
+ tgt_args, tgt_offsets, arg_num,
+ 1, 1, 0, async_info);
+}
+
+#ifdef __cplusplus
+}
+#endif
Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h
@@ -0,0 +1,168 @@
+//===---- ThreadEnvironmentImpl.h - Virtual GPU thread environment - C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
+
+#include "ThreadEnvironment.h"
+#include <barrier>
+#include <cstdio>
+#include <functional>
+#include <map>
+#include <thread>
+#include <vector>
+
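+/// State shared by the threads of one warp: the buffer used to emulate
+/// shuffles and the barriers that keep the lanes in lockstep.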
+class WarpEnvironmentTy {
+ const unsigned ID;
+ const unsigned NumThreads;
+
+ std::vector<int32_t> ShuffleBuffer;
+
+ std::barrier<std::function<void(void)>> Barrier;
+ std::barrier<std::function<void(void)>> ShuffleBarrier;
+ std::barrier<std::function<void(void)>> ShuffleDownBarrier;
+
+public:
+ WarpEnvironmentTy(unsigned ID, unsigned NumThreads)
+ : ID(ID), NumThreads(NumThreads), ShuffleBuffer(NumThreads),
+ Barrier(NumThreads, []() {}), ShuffleBarrier(NumThreads, []() {}),
+ ShuffleDownBarrier(NumThreads, []() {}) {}
+
+ unsigned getWarpId() const { return ID; }
+ int getNumThreads() const { return NumThreads; }
+
+ void sync() { Barrier.arrive_and_wait(); }
+ void writeShuffleBuffer(int32_t Var, unsigned LaneId) {
+ ShuffleBuffer[LaneId] = Var;
+ }
+
+ int32_t getShuffleBuffer(unsigned LaneId) { return ShuffleBuffer[LaneId]; }
+
+ void waitShuffleBarrier() { ShuffleBarrier.arrive_and_wait(); }
+
+  void waitShuffleDownBarrier() { ShuffleDownBarrier.arrive_and_wait(); }
+};
+
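+/// State shared by the threads of one CTA: the barriers used for fences,
+/// syncThreads, and named barriers across the whole thread block.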
+class CTAEnvironmentTy {
+public:
+ unsigned ID;
+ unsigned NumThreads;
+ unsigned NumBlocks;
+
+ std::barrier<std::function<void(void)>> Barrier;
+ std::barrier<std::function<void(void)>> SyncThreads;
+ std::barrier<std::function<void(void)>> NamedBarrier;
+
+ CTAEnvironmentTy(unsigned ID, unsigned NumThreads, unsigned NumBlocks)
+ : ID(ID), NumThreads(NumThreads), NumBlocks(NumBlocks),
+ Barrier(NumThreads, []() {}), SyncThreads(NumThreads, []() {}),
+ NamedBarrier(NumThreads, []() {}) {}
+
+ unsigned getId() const { return ID; }
+ unsigned getNumThreads() const { return NumThreads; }
+
+ unsigned getNumBlocks() const { return NumBlocks; }
+
+ void fence() { Barrier.arrive_and_wait(); }
+ void syncThreads() { SyncThreads.arrive_and_wait(); }
+ void namedBarrier() { NamedBarrier.arrive_and_wait(); }
+};
+
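+/// Identity of the team currently executed by a CTA; recreated for every
+/// round of team execution.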
+class ThreadBlockEnvironmentTy {
+ unsigned ID;
+ unsigned NumBlocks;
+
+public:
+ ThreadBlockEnvironmentTy(unsigned ID, unsigned NumBlocks)
+ : ID(ID), NumBlocks(NumBlocks) {}
+
+ unsigned getId() const { return ID; }
+ unsigned getNumBlocks() const { return NumBlocks; }
+};
+
+namespace VGPUImpl {
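+/// Per-lane state combining warp, CTA, and current team information; backs
+/// the thread-level queries used by the DeviceRTL.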
+class ThreadEnvironmentTy {
+ unsigned ThreadIdInWarp;
+ unsigned ThreadIdInBlock;
+ unsigned GlobalThreadIdx;
+
+ WarpEnvironmentTy *WarpEnvironment;
+ ThreadBlockEnvironmentTy *ThreadBlockEnvironment;
+ CTAEnvironmentTy *CTAEnvironment;
+
+public:
+ ThreadEnvironmentTy(unsigned ThreadId, WarpEnvironmentTy *WE,
+ CTAEnvironmentTy *CTAE)
+ : ThreadIdInWarp(ThreadId),
+ ThreadIdInBlock(WE->getWarpId() * WE->getNumThreads() + ThreadId),
+ GlobalThreadIdx(CTAE->getId() * CTAE->getNumThreads() +
+ ThreadIdInBlock),
+ WarpEnvironment(WE), CTAEnvironment(CTAE) {}
+
+ void setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+ ThreadBlockEnvironment = TBE;
+ }
+
+ void resetBlockEnv() {
+ delete ThreadBlockEnvironment;
+ ThreadBlockEnvironment = nullptr;
+ }
+
+ unsigned getThreadIdInWarp() const { return ThreadIdInWarp; }
+ unsigned getThreadIdInBlock() const { return ThreadIdInBlock; }
+ unsigned getGlobalThreadId() const { return GlobalThreadIdx; }
+
+ unsigned getBlockSize() const { return CTAEnvironment->getNumThreads(); }
+
+ unsigned getBlockId() const { return ThreadBlockEnvironment->getId(); }
+
+ unsigned getNumberOfBlocks() const {
+ return ThreadBlockEnvironment->getNumBlocks();
+ }
+  // FIXME: Not implemented yet; return 0 to avoid falling off the end of a
+  // non-void function.
+  unsigned getKernelSize() const { return 0; }
+
+ // FIXME: This is wrong
+ LaneMaskTy getActiveMask() const { return ~0U; }
+
+ void fenceTeam() { CTAEnvironment->fence(); }
+ void syncWarp() { WarpEnvironment->sync(); }
+
+ int32_t shuffle(int32_t Var, uint64_t SrcLane) {
+ WarpEnvironment->waitShuffleBarrier();
+ WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+ WarpEnvironment->waitShuffleBarrier();
+ Var = WarpEnvironment->getShuffleBuffer(ThreadIdInWarp);
+ return Var;
+ }
+
+ int32_t shuffleDown(int32_t Var, uint32_t Delta) {
+ WarpEnvironment->waitShuffleDownBarrier();
+ WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+ WarpEnvironment->waitShuffleDownBarrier();
+ Var = WarpEnvironment->getShuffleBuffer((ThreadIdInWarp + Delta) %
+ getWarpSize());
+ return Var;
+ }
+
+ void namedBarrier(bool Generic) {
+ if (Generic) {
+ CTAEnvironment->namedBarrier();
+ } else {
+ CTAEnvironment->syncThreads();
+ }
+ }
+
+ void fenceKernel(int32_t MemoryOrder) {
+ std::atomic_thread_fence(static_cast<std::memory_order>(MemoryOrder));
+ }
+
+ unsigned getWarpSize() const { return WarpEnvironment->getNumThreads(); }
+};
+} // namespace VGPUImpl
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
@@ -0,0 +1,72 @@
+//===---- ThreadEnvironment.h - Virtual GPU thread environment ----- C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
+
+using LaneMaskTy = uint64_t;
+
+// Forward declaration
+class WarpEnvironmentTy;
+class ThreadBlockEnvironmentTy;
+class CTAEnvironmentTy;
+namespace VGPUImpl {
+class ThreadEnvironmentTy;
+void setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set, uint32_t OmpSpin,
+ uint32_t BlockId,
+ uint32_t(atomicCAS)(uint32_t *, uint32_t, uint32_t, int));
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+} // namespace VGPUImpl
+
+class ThreadEnvironmentTy {
+ VGPUImpl::ThreadEnvironmentTy *Impl;
+
+public:
+ ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE,
+ CTAEnvironmentTy *CTAE);
+
+ ~ThreadEnvironmentTy();
+
+ unsigned getThreadIdInWarp() const;
+
+ unsigned getThreadIdInBlock() const;
+
+ unsigned getGlobalThreadId() const;
+
+ unsigned getBlockSize() const;
+
+ unsigned getKernelSize() const;
+
+ unsigned getBlockId() const;
+
+ unsigned getNumberOfBlocks() const;
+
+ LaneMaskTy getActiveMask() const;
+
+ unsigned getWarpSize() const;
+
+ int32_t shuffle(int32_t Var, uint64_t SrcLane);
+
+ int32_t shuffleDown(int32_t Var, uint32_t Delta);
+
+ void fenceKernel(int32_t MemoryOrder);
+
+ void fenceTeam();
+
+ void syncWarp();
+
+ void namedBarrier(bool Generic);
+
+ void setBlockEnv(ThreadBlockEnvironmentTy *TBE);
+
+ void resetBlockEnv();
+};
+
+ThreadEnvironmentTy *getThreadEnvironment(void);
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
@@ -0,0 +1,120 @@
+//===---- ThreadEnvironment.cpp - Virtual GPU thread environment --- C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of VGPU environment classes.
+//
+//===----------------------------------------------------------------------===//
+
+// clang-format off
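+// <cstdint> must precede ThreadEnvironment.h, which uses the fixed-width
+// integer types without including the header itself.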
+#include <cstdint>
+#include "ThreadEnvironment.h"
+#include "ThreadEnvironmentImpl.h"
+#include <barrier>
+#include <mutex>
+// clang-format on
+
+std::mutex AtomicIncLock;
+
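+// CUDA-style atomicInc emulation: returns the old value and wraps *Address
+// back to zero once it reaches Val.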
+uint32_t VGPUImpl::atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+ std::lock_guard G(AtomicIncLock);
+ uint32_t V = *Address;
+ if (V >= Val)
+ *Address = 0;
+ else
+ *Address += 1;
+ return V;
+}
+
+void VGPUImpl::setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set,
+ uint32_t OmpSpin, uint32_t BlockId,
+ uint32_t(atomicCAS)(uint32_t *, uint32_t, uint32_t,
+ int)) {
+  // TODO: Not sure spinning is a good idea here...
+ while (atomicCAS((uint32_t *)Lock, Unset, Set, __ATOMIC_SEQ_CST) != Unset) {
+ std::clock_t start = std::clock();
+ std::clock_t now;
+ for (;;) {
+ now = std::clock();
+ std::clock_t cycles =
+ now > start ? now - start : now + (0xffffffff - start);
+ if (cycles >= 1000 * BlockId) {
+ break;
+ }
+ }
+ } // wait for 0 to be the read value
+}
+
+extern thread_local ThreadEnvironmentTy *ThreadEnvironment;
+
+ThreadEnvironmentTy *getThreadEnvironment() { return ThreadEnvironment; }
+
+ThreadEnvironmentTy::ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE,
+ CTAEnvironmentTy *CTAE)
+ : Impl(new VGPUImpl::ThreadEnvironmentTy(Id, WE, CTAE)) {}
+
+ThreadEnvironmentTy::~ThreadEnvironmentTy() { delete Impl; }
+
+void ThreadEnvironmentTy::fenceTeam() { Impl->fenceTeam(); }
+
+void ThreadEnvironmentTy::syncWarp() { Impl->syncWarp(); }
+
+unsigned ThreadEnvironmentTy::getThreadIdInWarp() const {
+ return Impl->getThreadIdInWarp();
+}
+
+unsigned ThreadEnvironmentTy::getThreadIdInBlock() const {
+ return Impl->getThreadIdInBlock();
+}
+
+unsigned ThreadEnvironmentTy::getGlobalThreadId() const {
+ return Impl->getGlobalThreadId();
+}
+
+unsigned ThreadEnvironmentTy::getBlockSize() const {
+ return Impl->getBlockSize();
+}
+
+unsigned ThreadEnvironmentTy::getKernelSize() const {
+ return Impl->getKernelSize();
+}
+
+unsigned ThreadEnvironmentTy::getBlockId() const { return Impl->getBlockId(); }
+
+unsigned ThreadEnvironmentTy::getNumberOfBlocks() const {
+ return Impl->getNumberOfBlocks();
+}
+
+LaneMaskTy ThreadEnvironmentTy::getActiveMask() const {
+ return Impl->getActiveMask();
+}
+
+int32_t ThreadEnvironmentTy::shuffle(int32_t Var, uint64_t SrcLane) {
+ return Impl->shuffle(Var, SrcLane);
+}
+
+int32_t ThreadEnvironmentTy::shuffleDown(int32_t Var, uint32_t Delta) {
+ return Impl->shuffleDown(Var, Delta);
+}
+
+void ThreadEnvironmentTy::fenceKernel(int32_t MemoryOrder) {
+ return Impl->fenceKernel(MemoryOrder);
+}
+
+void ThreadEnvironmentTy::namedBarrier(bool Generic) {
+ Impl->namedBarrier(Generic);
+}
+
+void ThreadEnvironmentTy::setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+ Impl->setBlockEnv(TBE);
+}
+
+void ThreadEnvironmentTy::resetBlockEnv() { Impl->resetBlockEnv(); }
+
+unsigned ThreadEnvironmentTy::getWarpSize() const {
+ return Impl->getWarpSize();
+}
Index: openmp/libomptarget/plugins/vgpu/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/CMakeLists.txt
@@ -0,0 +1,58 @@
+set(tmachine_name "vgpu")
+set(tmachine_libname "vgpu")
+set(tmachine_triple "x86_64-vgpu")
+set(elf_machine_id "62")
+
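+# The plugin requires libelf to parse the offload image and libffi to invoke
+# the kernel entry points.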
+if(LIBOMPTARGET_DEP_LIBELF_FOUND)
+ if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+
+ libomptarget_say("Building ${tmachine_name} offloading plugin.")
+
+ include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+ include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+ include_directories(${LIBOMPTARGET_INCLUDE_DIR})
+
+ # Define macro to be used as prefix of the runtime messages for this target.
+ add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+ # Define macro with the ELF ID for this target.
+ add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+ add_library("omptarget.rtl.${tmachine_libname}" SHARED
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/ThreadEnvironment.cpp)
+
+ # Install plugin under the lib destination folder.
+ install(TARGETS "omptarget.rtl.${tmachine_libname}"
+ LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+ set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES CXX_STANDARD 20)
+ target_compile_options("omptarget.rtl.${tmachine_libname}" PRIVATE "-stdlib=libc++")
+
+ target_link_libraries(
+ "omptarget.rtl.${tmachine_libname}"
+ elf_common
+ ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+ ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+ dl
+ ${OPENMP_PTHREAD_LIB}
+ "-rdynamic"
+ c++
+ #"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+ )
+
+ list(APPEND LIBOMPTARGET_TESTED_PLUGINS
+ "omptarget.rtl.${tmachine_libname}")
+
+ # Report to the parent scope that we are building a plugin.
+ set(LIBOMPTARGET_SYSTEM_TARGETS
+ "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+ set(LIBOMPTARGET_TESTED_PLUGINS
+ "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
+
+ else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+ libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
+ endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+else(LIBOMPTARGET_DEP_LIBELF_FOUND)
+ libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.")
+endif(LIBOMPTARGET_DEP_LIBELF_FOUND)
Index: openmp/libomptarget/plugins/CMakeLists.txt
===================================================================
--- openmp/libomptarget/plugins/CMakeLists.txt
+++ openmp/libomptarget/plugins/CMakeLists.txt
@@ -75,6 +75,7 @@
add_subdirectory(ppc64)
add_subdirectory(ppc64le)
add_subdirectory(ve)
+add_subdirectory(vgpu)
add_subdirectory(x86_64)
add_subdirectory(remote)
Index: openmp/libomptarget/DeviceRTL/src/Utils.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -49,6 +49,24 @@
#pragma omp end declare variant
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match( \
+ device = {kind(cpu)}, implementation = {extension(match_any)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+ *LowBits = (uint32_t)(Val & static_cast<uint64_t>(0x00000000FFFFFFFF));
+ *HighBits =
+ (uint32_t)((Val & static_cast<uint64_t>(0xFFFFFFFF00000000)) >> 32);
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+ return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
+}
+
+#pragma omp end declare variant
+
/// NVPTX Implementation
///
///{
@@ -113,6 +131,26 @@
#pragma omp end declare variant
} // namespace impl
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match( \
+ device = {kind(cpu)}, implementation = {extension(match_any)})
+
+#include "ThreadEnvironment.h"
+namespace impl {
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+ return getThreadEnvironment()->shuffle(Var, SrcLane);
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
+ return getThreadEnvironment()->shuffleDown(Var, Delta);
+}
+
+} // namespace impl
+#pragma omp end declare variant
+
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
return impl::Pack(LowBits, HighBits);
}
Index: openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -283,6 +283,73 @@
} // namespace impl
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match( \
+ device = {kind(cpu)}, implementation = {extension(match_any)})
+
+#include "ThreadEnvironment.h"
+namespace impl {
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+ return VGPUImpl::atomicInc(Address, Val, Ordering);
+}
+
+void namedBarrierInit() {}
+
+void namedBarrier() {
+ uint32_t NumThreads = omp_get_num_threads();
+ ASSERT(NumThreads % mapping::getWarpSize() == 0);
+ getThreadEnvironment()->namedBarrier(true);
+}
+
+void fenceTeam(int) { getThreadEnvironment()->fenceTeam(); }
+
+void fenceKernel(int memory_order) {
+ getThreadEnvironment()->fenceKernel(memory_order);
+}
+
+// Simply call fenceKernel because there is no need to sync with host
+void fenceSystem(int) { fenceKernel(0); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) {
+ getThreadEnvironment()->syncWarp();
+}
+
+void syncThreads() { getThreadEnvironment()->namedBarrier(false); }
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If it is
+// called before it is defined here, the overload won't happen. Investigate
+// later!
+void unsetLock(omp_lock_t *Lock) {
+ (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+}
+
+int testLock(omp_lock_t *Lock) {
+ return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+}
+
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void setLock(omp_lock_t *Lock) {
+ VGPUImpl::setLock((uint32_t *)Lock, UNSET, SET, OMP_SPIN,
+ mapping::getBlockId(), atomicCAS);
+}
+
+void syncThreadsAligned() {}
+
+} // namespace impl
+
+#pragma omp end declare variant
+///}
+
void synchronize::init(bool IsSPMD) {
if (!IsSPMD)
impl::namedBarrierInit();
Index: openmp/libomptarget/DeviceRTL/src/Misc.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -18,10 +18,9 @@
namespace _OMP {
namespace impl {
-/// AMDGCN Implementation
+/// Generic Implementation - AMDGCN, VGPU
///
///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
double getWTick() { return ((double)1E-9); }
@@ -33,8 +32,6 @@
return 0;
}
-#pragma omp end declare variant
-
/// NVPTX Implementation
///
///{
Index: openmp/libomptarget/DeviceRTL/src/Mapping.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -21,6 +21,83 @@
using namespace _OMP;
+/// Virtual GPU Implementation
+///
+///{
+#pragma omp begin declare variant match( \
+ device = {kind(cpu)}, implementation = {extension(match_any)})
+
+#include "ThreadEnvironment.h"
+
+namespace _OMP {
+namespace impl {
+
+constexpr const llvm::omp::GV &getGridValue() {
+ return llvm::omp::VirtualGpuGridValues;
+}
+
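+// All lanes of an emulated warp are always active: build a mask with the low
+// getWarpSize() bits set.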
+LaneMaskTy activemask() {
+ uint64_t B = 0;
+ uint32_t N = mapping::getWarpSize();
+ while (N)
+ B |= (1 << (--N));
+ return B;
+}
+
+LaneMaskTy lanemaskLT() {
+ const uint32_t Lane = mapping::getThreadIdInWarp();
+ LaneMaskTy Ballot = mapping::activemask();
+ LaneMaskTy Mask = ((LaneMaskTy)1 << Lane) - (LaneMaskTy)1;
+ return Mask & Ballot;
+}
+
+LaneMaskTy lanemaskGT() {
+ const uint32_t Lane = mapping::getThreadIdInWarp();
+ if (Lane == (mapping::getWarpSize() - 1))
+ return 0;
+ LaneMaskTy Ballot = mapping::activemask();
+ LaneMaskTy Mask = (~((LaneMaskTy)0)) << (Lane + 1);
+ return Mask & Ballot;
+}
+
+uint32_t getThreadIdInWarp() {
+ return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1);
+}
+
+uint32_t getThreadIdInBlock() {
+ return getThreadEnvironment()->getThreadIdInBlock();
+}
+
+uint32_t getNumHardwareThreadsInBlock() {
+ return getThreadEnvironment()->getBlockSize();
+}
+
+uint32_t getKernelSize() { return getThreadEnvironment()->getKernelSize(); }
+
+uint32_t getBlockId() { return getThreadEnvironment()->getBlockId(); }
+
+uint32_t getNumberOfBlocks() {
+ return getThreadEnvironment()->getNumberOfBlocks();
+}
+
+uint32_t getNumberOfProcessorElements() { return mapping::getBlockSize(); }
+
+uint32_t getWarpId() {
+ return mapping::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getWarpSize() { return getThreadEnvironment()->getWarpSize(); }
+
+uint32_t getNumberOfWarpsInBlock() {
+ return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
+ mapping::getWarpSize();
+}
+
+} // namespace impl
+} // namespace _OMP
+
+#pragma omp end declare variant
+
namespace _OMP {
namespace impl {
Index: openmp/libomptarget/DeviceRTL/src/Kernel.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -110,6 +110,22 @@
state::ParallelRegionFn = nullptr;
}
+#pragma omp begin declare variant match( \
+ device = {kind(cpu)}, implementation = {extension(match_any)})
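+// Virtual GPU variant of __kmpc_target_deinit: in generic mode the workers
+// are additionally synchronized so they all leave the state machine before
+// the kernel returns.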
+void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) {
+ FunctionTracingRAII();
+ const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
+ state::assumeInitialState(IsSPMD);
+ if (IsSPMD)
+ return;
+
+ // Signal the workers to exit the state machine and exit the kernel.
+ state::ParallelRegionFn = nullptr;
+
+ synchronize::threads();
+}
+#pragma omp end declare variant
+
int8_t __kmpc_is_spmd_exec_mode() {
FunctionTracingRAII();
return mapping::isSPMDMode();
Index: openmp/libomptarget/DeviceRTL/src/Debug.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -49,6 +49,16 @@
} // namespace impl
#pragma omp end declare variant
+#pragma omp begin declare variant match( \
+ device = {kind(cpu)}, implementation = {extension(match_any)})
+int32_t vprintf(const char *, void *);
+namespace impl {
+static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
+ return vprintf(Format, Arguments);
+}
+} // namespace impl
+#pragma omp end declare variant
+
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
return impl::omp_vprintf(Format, Arguments, Size);
}
Index: openmp/libomptarget/DeviceRTL/CMakeLists.txt
===================================================================
--- openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -132,6 +132,7 @@
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-I${include_directory}
-I${devicertl_base_directory}/../include
+ -I${devicertl_base_directory}/../plugins/vgpu/src
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
)
@@ -153,7 +154,6 @@
add_custom_command(OUTPUT ${outfile}
COMMAND ${CLANG_TOOL}
${bc_flags}
- -Xclang -target-cpu -Xclang ${target_cpu}
${target_bc_flags}
${infile} -o ${outfile}
DEPENDS ${infile}
@@ -222,9 +222,11 @@
# Generate a Bitcode library for all the compute capabilities the user requested
foreach(sm ${nvptx_sm_list})
- compileDeviceRTLLibrary(sm_${sm} nvptx -target nvptx64 -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0")
+ compileDeviceRTLLibrary(sm_${sm} nvptx -Xclang -target-cpu -Xclang sm_${sm} -target nvptx64 -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0")
endforeach()
foreach(mcpu ${amdgpu_mcpus})
- compileDeviceRTLLibrary(${mcpu} amdgpu -target amdgcn-amd-amdhsa -D__AMDGCN__ -fvisibility=default -nogpulib)
+ compileDeviceRTLLibrary(${mcpu} amdgpu -Xclang -target-cpu -Xclang ${mcpu} -target amdgcn-amd-amdhsa -D__AMDGCN__ -fvisibility=default -nogpulib)
endforeach()
+
+compileDeviceRTLLibrary(vgpu x86_64-vgpu -target x86_64-vgpu -std=c++20 -stdlib=libc++)
Index: openmp/CMakeLists.txt
===================================================================
--- openmp/CMakeLists.txt
+++ openmp/CMakeLists.txt
@@ -39,6 +39,8 @@
set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe)
set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe)
endif()
+
+ list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include)
endif()
# Check and set up common compiler flags.
Index: llvm/lib/Support/Triple.cpp
===================================================================
--- llvm/lib/Support/Triple.cpp
+++ llvm/lib/Support/Triple.cpp
@@ -185,6 +185,8 @@
case PC: return "pc";
case SCEI: return "scei";
case SUSE: return "suse";
+ case OpenMP_VGPU:
+ return "vgpu";
}
llvm_unreachable("Invalid VendorType!");
@@ -492,22 +494,23 @@
static Triple::VendorType parseVendor(StringRef VendorName) {
return StringSwitch<Triple::VendorType>(VendorName)
- .Case("apple", Triple::Apple)
- .Case("pc", Triple::PC)
- .Case("scei", Triple::SCEI)
- .Case("sie", Triple::SCEI)
- .Case("fsl", Triple::Freescale)
- .Case("ibm", Triple::IBM)
- .Case("img", Triple::ImaginationTechnologies)
- .Case("mti", Triple::MipsTechnologies)
- .Case("nvidia", Triple::NVIDIA)
- .Case("csr", Triple::CSR)
- .Case("myriad", Triple::Myriad)
- .Case("amd", Triple::AMD)
- .Case("mesa", Triple::Mesa)
- .Case("suse", Triple::SUSE)
- .Case("oe", Triple::OpenEmbedded)
- .Default(Triple::UnknownVendor);
+ .Case("apple", Triple::Apple)
+ .Case("pc", Triple::PC)
+ .Case("scei", Triple::SCEI)
+ .Case("sie", Triple::SCEI)
+ .Case("fsl", Triple::Freescale)
+ .Case("ibm", Triple::IBM)
+ .Case("img", Triple::ImaginationTechnologies)
+ .Case("mti", Triple::MipsTechnologies)
+ .Case("nvidia", Triple::NVIDIA)
+ .Case("csr", Triple::CSR)
+ .Case("myriad", Triple::Myriad)
+ .Case("amd", Triple::AMD)
+ .Case("mesa", Triple::Mesa)
+ .Case("suse", Triple::SUSE)
+ .Case("oe", Triple::OpenEmbedded)
+ .Case("vgpu", Triple::OpenMP_VGPU)
+ .Default(Triple::UnknownVendor);
}
static Triple::OSType parseOS(StringRef OSName) {
Index: llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -114,6 +114,16 @@
128, // GV_Default_WG_Size
};
+/// For Virtual GPUs
+static constexpr GV VirtualGpuGridValues = {
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
+ 1024, // GV_Max_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size
+    128, // GV_Default_WG_Size
+};
+
} // namespace omp
} // namespace llvm
Index: llvm/include/llvm/ADT/Triple.h
===================================================================
--- llvm/include/llvm/ADT/Triple.h
+++ llvm/include/llvm/ADT/Triple.h
@@ -164,7 +164,8 @@
Mesa,
SUSE,
OpenEmbedded,
- LastVendorType = OpenEmbedded
+ OpenMP_VGPU,
+ LastVendorType = OpenMP_VGPU
};
enum OSType {
UnknownOS,
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -3983,7 +3983,9 @@
}
// Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
- Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
+ Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice &&
+ (T.isNVPTX() || T.isAMDGCN() ||
+ T.getVendor() == llvm::Triple::OpenMP_VGPU) &&
Args.hasArg(options::OPT_fopenmp_cuda_mode);
// Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
Index: clang/lib/Driver/ToolChains/Gnu.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Gnu.cpp
+++ clang/lib/Driver/ToolChains/Gnu.cpp
@@ -3074,4 +3074,13 @@
if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
options::OPT_fno_use_init_array, true))
CC1Args.push_back("-fno-use-init-array");
+
+ if (DriverArgs.hasArg(options::OPT_S))
+ return;
+
+ if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) {
+ std::string BitcodeSuffix = "x86_64-vgpu";
+ clang::driver::tools::addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args,
+ BitcodeSuffix, getTriple());
+ }
}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -249,7 +249,9 @@
OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
break;
default:
- if (LangOpts.OpenMPSimd)
+ if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) {
+ OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
+ } else if (LangOpts.OpenMPSimd)
OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
else
OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1119,10 +1119,11 @@
CGM.addCompilerUsedGlobal(GVMode);
}
-void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
- llvm::Constant *Addr,
- uint64_t Size, int32_t,
- llvm::GlobalValue::LinkageTypes) {
+void CGOpenMPRuntimeGPU::createOffloadEntry(
+ llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t Flags,
+ llvm::GlobalValue::LinkageTypes Linkage) {
+ if (CGM.getTarget().getTriple().getVendor() == llvm::Triple::OpenMP_VGPU)
+ return CGOpenMPRuntime::createOffloadEntry(ID, Addr, Size, Flags, Linkage);
// TODO: Add support for global variables on the device after declare target
// support.
if (!isa<llvm::Function>(Addr))
Index: clang/lib/Basic/Targets/X86.h
===================================================================
--- clang/lib/Basic/Targets/X86.h
+++ clang/lib/Basic/Targets/X86.h
@@ -17,6 +17,7 @@
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/X86TargetParser.h"
@@ -45,6 +46,28 @@
272 // ptr64
};
+static const unsigned X86VGPUAddrSpaceMap[] = {
+ 0, // Default
+ 1, // opencl_global
+ 3, // opencl_local
+ 4, // opencl_constant
+ 0, // opencl_private
+ 0, // opencl_generic
+ 1, // opencl_global_device
+ 1, // opencl_global_host
+ 1, // cuda_device
+ 4, // cuda_constant
+ 3, // cuda_shared
+ 1, // sycl_global
+ 0, // sycl_global_device
+ 0, // sycl_global_host
+ 3, // sycl_local
+ 0, // sycl_private
+ 270, // ptr32_sptr
+ 271, // ptr32_uptr
+ 272 // ptr64
+};
+
// X86 target abstract base class; x86-32 and x86-64 are very close, so
// most of the implementation can be shared.
class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
@@ -162,6 +185,9 @@
getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF();
if (IsWinCOFF)
MaxVectorAlign = MaxTLSAlign = 8192u * getCharWidth();
+
+ if (Triple.getVendor() == llvm::Triple::OpenMP_VGPU)
+ AddrSpaceMap = &X86VGPUAddrSpaceMap;
}
const char *getLongDoubleMangling() const override {
@@ -388,6 +414,10 @@
uint64_t getPointerAlignV(unsigned AddrSpace) const override {
return getPointerWidthV(AddrSpace);
}
+
+ const llvm::omp::GV &getGridValue() const override {
+ return llvm::omp::VirtualGpuGridValues;
+ }
};
// X86-32 generic target