================ @@ -1038,6 +1048,109 @@ struct CUDADeviceTy : public GenericDeviceTy { using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>; using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>; + Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, + bool IsCtor) { + // Perform a quick check for the named kernel in the image. The kernel + // should be created by the 'nvptx-lower-ctor-dtor' pass. + GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); + GlobalTy Global(IsCtor ? "nvptx$device$init" : "nvptx$device$fini", + sizeof(void *)); + if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) { + consumeError(std::move(Err)); + return Plugin::success(); + } + + // The Nvidia backend cannot handle creating the ctor / dtor array + // automatically so we must create it ourselves. The backend will emit + // several globals that contain function pointers we can call. These are + // prefixed with a known name due to Nvidia's lack of section support. + const ELF64LEObjectFile *ELFObj = + Handler.getOrCreateELFObjectFile(*this, Image); + if (!ELFObj) + return Plugin::error("Unable to create ELF object for image %p", + Image.getStart()); + + // Search for all symbols that contain a constructor or destructor. + SmallVector<std::pair<StringRef, uint16_t>> Funcs; + for (ELFSymbolRef Sym : ELFObj->symbols()) { + auto NameOrErr = Sym.getName(); + if (!NameOrErr) + return NameOrErr.takeError(); + + if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_" + : "__fini_array_object_")) + continue; + + uint16_t priority; + if (NameOrErr->rsplit('_').second.getAsInteger(10, priority)) + return Plugin::error("Invalid priority for constructor or destructor"); + + Funcs.emplace_back(*NameOrErr, priority); + } + + // Sort the created array to be in priority order. 
+ llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; }); + + // Allocate a buffer to store all of the known constructor / destructor + // functions in so we can iterate them on the device. + void *Buffer = + allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_SHARED); + if (!Buffer) + return Plugin::error("Failed to allocate memory for global buffer"); + + auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer); + auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size(); + + std::size_t Idx = 0; + for (auto [Name, Priority] : Funcs) { + GlobalTy FunctionAddr(Name.str(), sizeof(void *), &GlobalPtrStart[Idx++]); + if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr)) + return std::move(Err); + } + + // Copy the created buffer to the appropriate symbols so the kernel can + // iterate through them. + GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start", + sizeof(void *), &GlobalPtrStart); + if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal)) + return std::move(Err); + + GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end", + sizeof(void *), &GlobalPtrStop); + if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal)) + return std::move(Err); + + // Launch the kernel to execute the functions in the buffer. + GenericKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>(); + if (!CUDAKernel) + return Plugin::error("Failed to allocate memory for CUDA kernel"); + + new (CUDAKernel) + CUDAKernelTy(IsCtor ? "nvptx$device$init" : "nvptx$device$fini"); ---------------- jdoerfert wrote:
> IsCtor ? "nvptx$device$init" : "nvptx$device$fini" Compute this once and reuse the result; do the same for the other such ternaries as well. https://github.com/llvm/llvm-project/pull/71739 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits