From 79170cbc979b8a51f7400a39a808a8799ccc4ec2 Mon Sep 17 00:00:00 2001
From: Roman Arzumanyan <rarzumanyan@nvidia.com>
Date: Wed, 7 Feb 2018 14:41:48 +0300
Subject: [PATCH] CUDA linkage changed to dynamic loading for filters:
 vf_scale_cuda, vf_thumbnail_cuda

---
 compat/cuda/dynlink_cuda.h      | 260 +++++++++++++++++++------
 compat/cuda/dynlink_loader.h    | 404 +++++++++++++++++++++++++++++++++++++++-
 libavfilter/vf_scale_cuda.c     |  49 ++---
 libavfilter/vf_thumbnail_cuda.c |  62 +++---
 4 files changed, 646 insertions(+), 129 deletions(-)

diff --git a/compat/cuda/dynlink_cuda.h b/compat/cuda/dynlink_cuda.h
index 3a13611ce6..da20d06558 100644
--- a/compat/cuda/dynlink_cuda.h
+++ b/compat/cuda/dynlink_cuda.h
@@ -29,70 +29,220 @@
 #define AV_COMPAT_DYNLINK_CUDA_H
 
 #include <stddef.h>
+#include <cuda.h>
 
-#define CUDA_VERSION 7050
-
-#if defined(_WIN32) || defined(__CYGWIN__)
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
-
-#define CU_CTX_SCHED_BLOCKING_SYNC 4
-
-typedef int CUdevice;
-typedef void* CUarray;
-typedef void* CUcontext;
-typedef void* CUstream;
-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
-typedef unsigned long long CUdeviceptr;
-#else
-typedef unsigned int CUdeviceptr;
-#endif
-
-typedef enum cudaError_enum {
-    CUDA_SUCCESS = 0
-} CUresult;
-
-typedef enum CUmemorytype_enum {
-    CU_MEMORYTYPE_HOST = 1,
-    CU_MEMORYTYPE_DEVICE = 2
-} CUmemorytype;
-
-typedef struct CUDA_MEMCPY2D_st {
-    size_t srcXInBytes;
-    size_t srcY;
-    CUmemorytype srcMemoryType;
-    const void *srcHost;
-    CUdeviceptr srcDevice;
-    CUarray srcArray;
-    size_t srcPitch;
-
-    size_t dstXInBytes;
-    size_t dstY;
-    CUmemorytype dstMemoryType;
-    void *dstHost;
-    CUdeviceptr dstDevice;
-    CUarray dstArray;
-    size_t dstPitch;
-
-    size_t WidthInBytes;
-    size_t Height;
-} CUDA_MEMCPY2D;
-
+/*
+ * Function types for the CUDA driver API entry points: one t<name> typedef per
+ * symbol, used to declare the pointers that the loader resolves at run time.
+ * NOTE(review): types that take size_t or CUdeviceptr appear to follow the
+ * _v2 driver ABI, yet only a few carry a _v2 suffix -- confirm that the
+ * loader binds each one to the matching exported symbol name.
+ */
+typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char **pStr);
+typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char **pStr);
 typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
-typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
+typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);
 typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
+typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
 typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
+typedef CUresult CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
+typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
 typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
+typedef CUresult CUDAAPI tcuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev);
+typedef CUresult CUDAAPI tcuDevicePrimaryCtxRelease(CUdevice dev);
+typedef CUresult CUDAAPI tcuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+typedef CUresult CUDAAPI tcuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active);
+typedef CUresult CUDAAPI tcuDevicePrimaryCtxReset(CUdevice dev);
 typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
+typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx);
 typedef CUresult CUDAAPI tcuCtxPushCurrent_v2(CUcontext *pctx);
 typedef CUresult CUDAAPI tcuCtxPopCurrent_v2(CUcontext *pctx);
-typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx);
+typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
+typedef CUresult CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);
+typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice *device);
+typedef CUresult CUDAAPI tcuCtxGetFlags(unsigned int *flags);
+typedef CUresult CUDAAPI tcuCtxSynchronize(void);
+typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
+typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
+typedef CUresult CUDAAPI tcuCtxGetCacheConfig(CUfunc_cache *pconfig);
+typedef CUresult CUDAAPI tcuCtxSetCacheConfig(CUfunc_cache config);
+typedef CUresult CUDAAPI tcuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
+typedef CUresult CUDAAPI tcuCtxSetSharedMemConfig(CUsharedconfig config);
+typedef CUresult CUDAAPI tcuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
+typedef CUresult CUDAAPI tcuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
+typedef CUresult CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
+typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
+typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
+typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
+typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
+typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
+typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
+typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+typedef CUresult CUDAAPI tcuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+typedef CUresult CUDAAPI tcuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult CUDAAPI tcuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult CUDAAPI tcuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut);
+typedef CUresult CUDAAPI tcuLinkDestroy(CUlinkState state);
+typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
 typedef CUresult CUDAAPI tcuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
+typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
 typedef CUresult CUDAAPI tcuMemFree_v2(CUdeviceptr dptr);
-typedef CUresult CUDAAPI tcuMemcpy2D_v2(const CUDA_MEMCPY2D *pcopy);
-typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pstr);
-typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pstr);
+typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
+typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
+typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
+typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);
+typedef CUresult CUDAAPI tcuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags);
+typedef CUresult CUDAAPI tcuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId);
+typedef CUresult CUDAAPI tcuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
+typedef CUresult CUDAAPI tcuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
+typedef CUresult CUDAAPI tcuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
+typedef CUresult CUDAAPI tcuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
+typedef CUresult CUDAAPI tcuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+typedef CUresult CUDAAPI tcuIpcCloseMemHandle(CUdeviceptr dptr);
+typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);
+typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy);
+typedef CUresult CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
+typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
+typedef CUresult CUDAAPI tcuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
+typedef CUresult CUDAAPI tcuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
+typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
+typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
+typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult CUDAAPI tcuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
+typedef CUresult CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray);
+typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
+typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
+typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
+typedef CUresult CUDAAPI tcuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
+typedef CUresult CUDAAPI tcuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device);
+typedef CUresult CUDAAPI tcuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count);
+typedef CUresult CUDAAPI tcuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count);
+typedef CUresult CUDAAPI tcuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr);
+typedef CUresult CUDAAPI tcuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr);
+typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
+typedef CUresult CUDAAPI tcuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority);
+typedef CUresult CUDAAPI tcuStreamGetPriority(CUstream hStream, int *priority);
+typedef CUresult CUDAAPI tcuStreamGetFlags(CUstream hStream, unsigned int *flags);
+typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+typedef CUresult CUDAAPI tcuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
+typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
+typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
+typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
+typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
+typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
+typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
+typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
+typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
+typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+typedef CUresult CUDAAPI tcuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+typedef CUresult CUDAAPI tcuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+typedef CUresult CUDAAPI tcuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
+typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
+typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
+typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
+typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
+typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
+typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
+typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
+typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
+typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
+typedef CUresult CUDAAPI tcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+typedef CUresult CUDAAPI tcuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
+typedef CUresult CUDAAPI tcuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+typedef CUresult CUDAAPI tcuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
+typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+typedef CUresult CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
+typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult CUDAAPI tcuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult CUDAAPI tcuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
+typedef CUresult CUDAAPI tcuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+typedef CUresult CUDAAPI tcuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
+typedef CUresult CUDAAPI tcuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor);
+typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
+typedef CUresult CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+typedef CUresult CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
+typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+typedef CUresult CUDAAPI tcuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc);
+typedef CUresult CUDAAPI tcuTexObjectDestroy(CUtexObject texObject);
+typedef CUresult CUDAAPI tcuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject);
+typedef CUresult CUDAAPI tcuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject);
+typedef CUresult CUDAAPI tcuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject);
+typedef CUresult CUDAAPI tcuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc);
+typedef CUresult CUDAAPI tcuSurfObjectDestroy(CUsurfObject surfObject);
+typedef CUresult CUDAAPI tcuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject);
+typedef CUresult CUDAAPI tcuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
+typedef CUresult CUDAAPI tcuDeviceGetP2PAttribute(int *value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
+typedef CUresult CUDAAPI tcuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
+typedef CUresult CUDAAPI tcuCtxDisablePeerAccess(CUcontext peerContext);
+typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
+typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
 
 #endif
diff --git a/compat/cuda/dynlink_loader.h b/compat/cuda/dynlink_loader.h
index fa43782c9a..89efd624fe 100644
--- a/compat/cuda/dynlink_loader.h
+++ b/compat/cuda/dynlink_loader.h
@@ -112,20 +112,214 @@ error:                              \
 
 #ifdef AV_COMPAT_DYNLINK_CUDA_H
 typedef struct CudaFunctions {
+    /* CUDA driver API function pointers. cuda_load_functions() fills these
+     * in through LOAD_SYMBOL, using the symbol name given at each call. */
+    tcuGetErrorString *cuGetErrorString;
+    tcuGetErrorName *cuGetErrorName;
     tcuInit *cuInit;
-    tcuDeviceGetCount *cuDeviceGetCount;
+    tcuDriverGetVersion *cuDriverGetVersion;
     tcuDeviceGet *cuDeviceGet;
+    tcuDeviceGetCount *cuDeviceGetCount;
     tcuDeviceGetName *cuDeviceGetName;
+    tcuDeviceTotalMem *cuDeviceTotalMem;
+    tcuDeviceGetAttribute *cuDeviceGetAttribute;
+    tcuDeviceGetProperties *cuDeviceGetProperties;
     tcuDeviceComputeCapability *cuDeviceComputeCapability;
+    tcuDevicePrimaryCtxRetain *cuDevicePrimaryCtxRetain;
+    tcuDevicePrimaryCtxRelease *cuDevicePrimaryCtxRelease;
+    tcuDevicePrimaryCtxSetFlags *cuDevicePrimaryCtxSetFlags;
+    tcuDevicePrimaryCtxGetState *cuDevicePrimaryCtxGetState;
+    tcuDevicePrimaryCtxReset *cuDevicePrimaryCtxReset;
     tcuCtxCreate_v2 *cuCtxCreate;
+    tcuCtxDestroy_v2 *cuCtxDestroy;
     tcuCtxPushCurrent_v2 *cuCtxPushCurrent;
     tcuCtxPopCurrent_v2 *cuCtxPopCurrent;
-    tcuCtxDestroy_v2 *cuCtxDestroy;
+    tcuCtxSetCurrent *cuCtxSetCurrent;
+    tcuCtxGetCurrent *cuCtxGetCurrent;
+    tcuCtxGetDevice *cuCtxGetDevice;
+    tcuCtxGetFlags *cuCtxGetFlags;
+    tcuCtxSynchronize *cuCtxSynchronize;
+    tcuCtxSetLimit *cuCtxSetLimit;
+    tcuCtxGetLimit *cuCtxGetLimit;
+    tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
+    tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
+    tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
+    tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
+    tcuCtxGetApiVersion *cuCtxGetApiVersion;
+    tcuCtxGetStreamPriorityRange *cuCtxGetStreamPriorityRange;
+    tcuCtxAttach *cuCtxAttach;
+    tcuCtxDetach *cuCtxDetach;
+    tcuModuleLoad *cuModuleLoad;
+    tcuModuleLoadData *cuModuleLoadData;
+    tcuModuleLoadDataEx *cuModuleLoadDataEx;
+    tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
+    tcuModuleUnload *cuModuleUnload;
+    tcuModuleGetFunction *cuModuleGetFunction;
+    tcuModuleGetGlobal *cuModuleGetGlobal;
+    tcuModuleGetTexRef *cuModuleGetTexRef;
+    tcuModuleGetSurfRef *cuModuleGetSurfRef;
+    tcuLinkCreate *cuLinkCreate;
+    tcuLinkAddData *cuLinkAddData;
+    tcuLinkAddFile *cuLinkAddFile;
+    tcuLinkComplete *cuLinkComplete;
+    tcuLinkDestroy *cuLinkDestroy;
+    tcuMemGetInfo *cuMemGetInfo;
     tcuMemAlloc_v2 *cuMemAlloc;
+    tcuMemAllocPitch *cuMemAllocPitch;
     tcuMemFree_v2 *cuMemFree;
+    tcuMemGetAddressRange *cuMemGetAddressRange;
+    tcuMemAllocHost *cuMemAllocHost;
+    tcuMemFreeHost *cuMemFreeHost;
+    tcuMemHostAlloc *cuMemHostAlloc;
+    tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
+    tcuMemHostGetFlags *cuMemHostGetFlags;
+    tcuMemAllocManaged *cuMemAllocManaged;
+    tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
+    tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
+    tcuIpcGetEventHandle *cuIpcGetEventHandle;
+    tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
+    tcuIpcGetMemHandle *cuIpcGetMemHandle;
+    tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
+    tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
+    tcuMemHostRegister *cuMemHostRegister;
+    tcuMemHostUnregister *cuMemHostUnregister;
+    tcuMemcpy *cuMemcpy;
+    tcuMemcpyPeer *cuMemcpyPeer;
+    tcuMemcpyHtoD *cuMemcpyHtoD;
+    tcuMemcpyDtoH *cuMemcpyDtoH;
+    tcuMemcpyDtoD *cuMemcpyDtoD;
+    tcuMemcpyDtoA *cuMemcpyDtoA;
+    tcuMemcpyAtoD *cuMemcpyAtoD;
+    tcuMemcpyHtoA *cuMemcpyHtoA;
+    tcuMemcpyAtoH *cuMemcpyAtoH;
+    tcuMemcpyAtoA *cuMemcpyAtoA;
     tcuMemcpy2D_v2 *cuMemcpy2D;
-    tcuGetErrorName *cuGetErrorName;
-    tcuGetErrorString *cuGetErrorString;
+    tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
+    tcuMemcpy3D *cuMemcpy3D;
+    tcuMemcpy3DPeer *cuMemcpy3DPeer;
+    tcuMemcpyAsync *cuMemcpyAsync;
+    tcuMemcpyPeerAsync *cuMemcpyPeerAsync;
+    tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
+    tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
+    tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
+    tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
+    tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
+    tcuMemcpy2DAsync *cuMemcpy2DAsync;
+    tcuMemcpy3DAsync *cuMemcpy3DAsync;
+    tcuMemcpy3DPeerAsync *cuMemcpy3DPeerAsync;
+    tcuMemsetD8 *cuMemsetD8;
+    tcuMemsetD16 *cuMemsetD16;
+    tcuMemsetD32 *cuMemsetD32;
+    tcuMemsetD2D8 *cuMemsetD2D8;
+    tcuMemsetD2D16 *cuMemsetD2D16;
+    tcuMemsetD2D32 *cuMemsetD2D32;
+    tcuMemsetD8Async *cuMemsetD8Async;
+    tcuMemsetD16Async *cuMemsetD16Async;
+    tcuMemsetD32Async *cuMemsetD32Async;
+    tcuMemsetD2D8Async *cuMemsetD2D8Async;
+    tcuMemsetD2D16Async *cuMemsetD2D16Async;
+    tcuMemsetD2D32Async *cuMemsetD2D32Async;
+    tcuArrayCreate *cuArrayCreate;
+    tcuArrayGetDescriptor *cuArrayGetDescriptor;
+    tcuArrayDestroy *cuArrayDestroy;
+    tcuArray3DCreate *cuArray3DCreate;
+    tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
+    tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
+    tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
+    tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
+    tcuPointerGetAttribute *cuPointerGetAttribute;
+    tcuMemPrefetchAsync *cuMemPrefetchAsync;
+    tcuMemAdvise *cuMemAdvise;
+    tcuMemRangeGetAttribute *cuMemRangeGetAttribute;
+    tcuMemRangeGetAttributes *cuMemRangeGetAttributes;
+    tcuPointerSetAttribute *cuPointerSetAttribute;
+    tcuPointerGetAttributes *cuPointerGetAttributes;
+    tcuStreamCreate *cuStreamCreate;
+    tcuStreamCreateWithPriority *cuStreamCreateWithPriority;
+    tcuStreamGetPriority *cuStreamGetPriority;
+    tcuStreamGetFlags *cuStreamGetFlags;
+    tcuStreamWaitEvent *cuStreamWaitEvent;
+    tcuStreamAddCallback *cuStreamAddCallback;
+    tcuStreamAttachMemAsync *cuStreamAttachMemAsync;
+    tcuStreamQuery *cuStreamQuery;
+    tcuStreamSynchronize *cuStreamSynchronize;
+    tcuStreamDestroy *cuStreamDestroy;
+    tcuEventCreate *cuEventCreate;
+    tcuEventRecord *cuEventRecord;
+    tcuEventQuery *cuEventQuery;
+    tcuEventSynchronize *cuEventSynchronize;
+    tcuEventDestroy *cuEventDestroy;
+    tcuEventElapsedTime *cuEventElapsedTime;
+    tcuStreamWaitValue32 *cuStreamWaitValue32;
+    tcuStreamWriteValue32 *cuStreamWriteValue32;
+    tcuStreamBatchMemOp *cuStreamBatchMemOp;
+    tcuFuncGetAttribute *cuFuncGetAttribute;
+    tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
+    tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
+    tcuLaunchKernel *cuLaunchKernel;
+    tcuFuncSetBlockShape *cuFuncSetBlockShape;
+    tcuFuncSetSharedSize *cuFuncSetSharedSize;
+    tcuParamSetSize *cuParamSetSize;
+    tcuParamSeti *cuParamSeti;
+    tcuParamSetf *cuParamSetf;
+    tcuParamSetv *cuParamSetv;
+    tcuLaunch *cuLaunch;
+    tcuLaunchGrid *cuLaunchGrid;
+    tcuLaunchGridAsync *cuLaunchGridAsync;
+    tcuParamSetTexRef *cuParamSetTexRef;
+    tcuOccupancyMaxActiveBlocksPerMultiprocessor *cuOccupancyMaxActiveBlocksPerMultiprocessor;
+    tcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags *cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+    tcuOccupancyMaxPotentialBlockSize *cuOccupancyMaxPotentialBlockSize;
+    tcuOccupancyMaxPotentialBlockSizeWithFlags *cuOccupancyMaxPotentialBlockSizeWithFlags;
+    tcuTexRefSetArray *cuTexRefSetArray;
+    tcuTexRefSetMipmappedArray *cuTexRefSetMipmappedArray;
+    tcuTexRefSetAddress *cuTexRefSetAddress;
+    tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
+    tcuTexRefSetFormat *cuTexRefSetFormat;
+    tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
+    tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
+    tcuTexRefSetMipmapFilterMode *cuTexRefSetMipmapFilterMode;
+    tcuTexRefSetMipmapLevelBias *cuTexRefSetMipmapLevelBias;
+    tcuTexRefSetMipmapLevelClamp *cuTexRefSetMipmapLevelClamp;
+    tcuTexRefSetMaxAnisotropy *cuTexRefSetMaxAnisotropy;
+    tcuTexRefSetBorderColor *cuTexRefSetBorderColor;
+    tcuTexRefSetFlags *cuTexRefSetFlags;
+    tcuTexRefGetAddress *cuTexRefGetAddress;
+    tcuTexRefGetArray *cuTexRefGetArray;
+    tcuTexRefGetMipmappedArray *cuTexRefGetMipmappedArray;
+    tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
+    tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
+    tcuTexRefGetFormat *cuTexRefGetFormat;
+    tcuTexRefGetMipmapFilterMode *cuTexRefGetMipmapFilterMode;
+    tcuTexRefGetMipmapLevelBias *cuTexRefGetMipmapLevelBias;
+    tcuTexRefGetMipmapLevelClamp *cuTexRefGetMipmapLevelClamp;
+    tcuTexRefGetMaxAnisotropy *cuTexRefGetMaxAnisotropy;
+    tcuTexRefGetBorderColor *cuTexRefGetBorderColor;
+    tcuTexRefGetFlags *cuTexRefGetFlags;
+    tcuTexRefCreate *cuTexRefCreate;
+    tcuTexRefDestroy *cuTexRefDestroy;
+    tcuSurfRefSetArray *cuSurfRefSetArray;
+    tcuSurfRefGetArray *cuSurfRefGetArray;
+    tcuTexObjectCreate *cuTexObjectCreate;
+    tcuTexObjectDestroy *cuTexObjectDestroy;
+    tcuTexObjectGetResourceDesc *cuTexObjectGetResourceDesc;
+    tcuTexObjectGetTextureDesc *cuTexObjectGetTextureDesc;
+    tcuTexObjectGetResourceViewDesc *cuTexObjectGetResourceViewDesc;
+    tcuSurfObjectCreate *cuSurfObjectCreate;
+    tcuSurfObjectDestroy *cuSurfObjectDestroy;
+    tcuSurfObjectGetResourceDesc *cuSurfObjectGetResourceDesc;
+    tcuDeviceCanAccessPeer *cuDeviceCanAccessPeer;
+    tcuDeviceGetP2PAttribute *cuDeviceGetP2PAttribute;
+    tcuCtxEnablePeerAccess *cuCtxEnablePeerAccess;
+    tcuCtxDisablePeerAccess *cuCtxDisablePeerAccess;
+    tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
+    tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
+    tcuGraphicsResourceGetMappedMipmappedArray *cuGraphicsResourceGetMappedMipmappedArray;
+    tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
+    tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
+    tcuGraphicsMapResources *cuGraphicsMapResources;
+    tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
+    tcuGetExportTable *cuGetExportTable;
 
     LIB_HANDLE lib;
 } CudaFunctions;
@@ -191,21 +385,223 @@ static inline int cuda_load_functions(CudaFunctions **functions, void *logctx)
 {
     GENERIC_LOAD_FUNC_PREAMBLE(CudaFunctions, cuda, CUDA_LIBNAME);
 
+    /*
+     * dynlink_cuda.h includes <cuda.h>, whose macros rename versioned entry
+     * points (e.g. cuMemsetD8 -> cuMemsetD8_v2, cuTexRefSetAddress2D ->
+     * cuTexRefSetAddress2D_v3). The struct member named in LOAD_SYMBOL follows
+     * that rename, but the quoted symbol name does not, so the suffix has to
+     * be spelled out here; otherwise the legacy export is resolved and then
+     * called with the current argument types.
+     * NOTE(review): presumably a failed lookup aborts the whole load; confirm
+     * the oldest supported driver exports every symbol listed below.
+     */
+    LOAD_SYMBOL(cuGetErrorString, tcuGetErrorString, "cuGetErrorString");
+    LOAD_SYMBOL(cuGetErrorName, tcuGetErrorName, "cuGetErrorName");
     LOAD_SYMBOL(cuInit, tcuInit, "cuInit");
-    LOAD_SYMBOL(cuDeviceGetCount, tcuDeviceGetCount, "cuDeviceGetCount");
+    LOAD_SYMBOL(cuDriverGetVersion, tcuDriverGetVersion, "cuDriverGetVersion");
     LOAD_SYMBOL(cuDeviceGet, tcuDeviceGet, "cuDeviceGet");
+    LOAD_SYMBOL(cuDeviceGetCount, tcuDeviceGetCount, "cuDeviceGetCount");
     LOAD_SYMBOL(cuDeviceGetName, tcuDeviceGetName, "cuDeviceGetName");
+    LOAD_SYMBOL(cuDeviceTotalMem, tcuDeviceTotalMem, "cuDeviceTotalMem_v2");
+    LOAD_SYMBOL(cuDeviceGetAttribute, tcuDeviceGetAttribute, "cuDeviceGetAttribute");
+    LOAD_SYMBOL(cuDeviceGetProperties, tcuDeviceGetProperties, "cuDeviceGetProperties");
     LOAD_SYMBOL(cuDeviceComputeCapability, tcuDeviceComputeCapability, "cuDeviceComputeCapability");
+    LOAD_SYMBOL(cuDevicePrimaryCtxRetain, tcuDevicePrimaryCtxRetain, "cuDevicePrimaryCtxRetain");
+    LOAD_SYMBOL(cuDevicePrimaryCtxRelease, tcuDevicePrimaryCtxRelease, "cuDevicePrimaryCtxRelease");
+    LOAD_SYMBOL(cuDevicePrimaryCtxSetFlags, tcuDevicePrimaryCtxSetFlags, "cuDevicePrimaryCtxSetFlags");
+    LOAD_SYMBOL(cuDevicePrimaryCtxGetState, tcuDevicePrimaryCtxGetState, "cuDevicePrimaryCtxGetState");
+    LOAD_SYMBOL(cuDevicePrimaryCtxReset, tcuDevicePrimaryCtxReset, "cuDevicePrimaryCtxReset");
     LOAD_SYMBOL(cuCtxCreate, tcuCtxCreate_v2, "cuCtxCreate_v2");
+    LOAD_SYMBOL(cuCtxDestroy, tcuCtxDestroy_v2, "cuCtxDestroy_v2");
     LOAD_SYMBOL(cuCtxPushCurrent, tcuCtxPushCurrent_v2, "cuCtxPushCurrent_v2");
     LOAD_SYMBOL(cuCtxPopCurrent, tcuCtxPopCurrent_v2, "cuCtxPopCurrent_v2");
-    LOAD_SYMBOL(cuCtxDestroy, tcuCtxDestroy_v2, "cuCtxDestroy_v2");
+    LOAD_SYMBOL(cuCtxSetCurrent, tcuCtxSetCurrent, "cuCtxSetCurrent");
+    LOAD_SYMBOL(cuCtxGetCurrent, tcuCtxGetCurrent, "cuCtxGetCurrent");
+    LOAD_SYMBOL(cuCtxGetDevice, tcuCtxGetDevice, "cuCtxGetDevice");
+    LOAD_SYMBOL(cuCtxGetFlags, tcuCtxGetFlags, "cuCtxGetFlags");
+    LOAD_SYMBOL(cuCtxSynchronize, tcuCtxSynchronize, "cuCtxSynchronize");
+    LOAD_SYMBOL(cuCtxSetLimit, tcuCtxSetLimit, "cuCtxSetLimit");
+    LOAD_SYMBOL(cuCtxGetLimit, tcuCtxGetLimit, "cuCtxGetLimit");
+    LOAD_SYMBOL(cuCtxGetCacheConfig, tcuCtxGetCacheConfig, "cuCtxGetCacheConfig");
+    LOAD_SYMBOL(cuCtxSetCacheConfig, tcuCtxSetCacheConfig, "cuCtxSetCacheConfig");
+    LOAD_SYMBOL(cuCtxGetSharedMemConfig, tcuCtxGetSharedMemConfig, "cuCtxGetSharedMemConfig");
+    LOAD_SYMBOL(cuCtxSetSharedMemConfig, tcuCtxSetSharedMemConfig, "cuCtxSetSharedMemConfig");
+    LOAD_SYMBOL(cuCtxGetApiVersion, tcuCtxGetApiVersion, "cuCtxGetApiVersion");
+    LOAD_SYMBOL(cuCtxGetStreamPriorityRange, tcuCtxGetStreamPriorityRange, "cuCtxGetStreamPriorityRange");
+    LOAD_SYMBOL(cuCtxAttach, tcuCtxAttach, "cuCtxAttach");
+    LOAD_SYMBOL(cuCtxDetach, tcuCtxDetach, "cuCtxDetach");
+    LOAD_SYMBOL(cuModuleLoad, tcuModuleLoad, "cuModuleLoad");
+    LOAD_SYMBOL(cuModuleLoadData, tcuModuleLoadData, "cuModuleLoadData");
+    LOAD_SYMBOL(cuModuleLoadDataEx, tcuModuleLoadDataEx, "cuModuleLoadDataEx");
+    LOAD_SYMBOL(cuModuleLoadFatBinary, tcuModuleLoadFatBinary, "cuModuleLoadFatBinary");
+    LOAD_SYMBOL(cuModuleUnload, tcuModuleUnload, "cuModuleUnload");
+    LOAD_SYMBOL(cuModuleGetFunction, tcuModuleGetFunction, "cuModuleGetFunction");
+    LOAD_SYMBOL(cuModuleGetGlobal, tcuModuleGetGlobal, "cuModuleGetGlobal_v2");
+    LOAD_SYMBOL(cuModuleGetTexRef, tcuModuleGetTexRef, "cuModuleGetTexRef");
+    LOAD_SYMBOL(cuModuleGetSurfRef, tcuModuleGetSurfRef, "cuModuleGetSurfRef");
+    LOAD_SYMBOL(cuLinkCreate, tcuLinkCreate, "cuLinkCreate_v2");
+    LOAD_SYMBOL(cuLinkAddData, tcuLinkAddData, "cuLinkAddData_v2");
+    LOAD_SYMBOL(cuLinkAddFile, tcuLinkAddFile, "cuLinkAddFile_v2");
+    LOAD_SYMBOL(cuLinkComplete, tcuLinkComplete, "cuLinkComplete");
+    LOAD_SYMBOL(cuLinkDestroy, tcuLinkDestroy, "cuLinkDestroy");
+    LOAD_SYMBOL(cuMemGetInfo, tcuMemGetInfo, "cuMemGetInfo_v2");
     LOAD_SYMBOL(cuMemAlloc, tcuMemAlloc_v2, "cuMemAlloc_v2");
+    LOAD_SYMBOL(cuMemAllocPitch, tcuMemAllocPitch, "cuMemAllocPitch_v2");
     LOAD_SYMBOL(cuMemFree, tcuMemFree_v2, "cuMemFree_v2");
+    LOAD_SYMBOL(cuMemGetAddressRange, tcuMemGetAddressRange, "cuMemGetAddressRange_v2");
+    LOAD_SYMBOL(cuMemAllocHost, tcuMemAllocHost, "cuMemAllocHost_v2");
+    LOAD_SYMBOL(cuMemFreeHost, tcuMemFreeHost, "cuMemFreeHost");
+    LOAD_SYMBOL(cuMemHostAlloc, tcuMemHostAlloc, "cuMemHostAlloc");
+    LOAD_SYMBOL(cuMemHostGetDevicePointer, tcuMemHostGetDevicePointer, "cuMemHostGetDevicePointer_v2");
+    LOAD_SYMBOL(cuMemHostGetFlags, tcuMemHostGetFlags, "cuMemHostGetFlags");
+    LOAD_SYMBOL(cuMemAllocManaged, tcuMemAllocManaged, "cuMemAllocManaged");
+    LOAD_SYMBOL(cuDeviceGetByPCIBusId, tcuDeviceGetByPCIBusId, "cuDeviceGetByPCIBusId");
+    LOAD_SYMBOL(cuDeviceGetPCIBusId, tcuDeviceGetPCIBusId, "cuDeviceGetPCIBusId");
+    LOAD_SYMBOL(cuIpcGetEventHandle, tcuIpcGetEventHandle, "cuIpcGetEventHandle");
+    LOAD_SYMBOL(cuIpcOpenEventHandle, tcuIpcOpenEventHandle, "cuIpcOpenEventHandle");
+    LOAD_SYMBOL(cuIpcGetMemHandle, tcuIpcGetMemHandle, "cuIpcGetMemHandle");
+    LOAD_SYMBOL(cuIpcOpenMemHandle, tcuIpcOpenMemHandle, "cuIpcOpenMemHandle");
+    LOAD_SYMBOL(cuIpcCloseMemHandle, tcuIpcCloseMemHandle, "cuIpcCloseMemHandle");
+    LOAD_SYMBOL(cuMemHostRegister, tcuMemHostRegister, "cuMemHostRegister_v2");
+    LOAD_SYMBOL(cuMemHostUnregister, tcuMemHostUnregister, "cuMemHostUnregister");
+    LOAD_SYMBOL(cuMemcpy, tcuMemcpy, "cuMemcpy");
+    LOAD_SYMBOL(cuMemcpyPeer, tcuMemcpyPeer, "cuMemcpyPeer");
+    LOAD_SYMBOL(cuMemcpyHtoD, tcuMemcpyHtoD, "cuMemcpyHtoD_v2");
+    LOAD_SYMBOL(cuMemcpyDtoH, tcuMemcpyDtoH, "cuMemcpyDtoH_v2");
+    LOAD_SYMBOL(cuMemcpyDtoD, tcuMemcpyDtoD, "cuMemcpyDtoD_v2");
+    LOAD_SYMBOL(cuMemcpyDtoA, tcuMemcpyDtoA, "cuMemcpyDtoA_v2");
+    LOAD_SYMBOL(cuMemcpyAtoD, tcuMemcpyAtoD, "cuMemcpyAtoD_v2");
+    LOAD_SYMBOL(cuMemcpyHtoA, tcuMemcpyHtoA, "cuMemcpyHtoA_v2");
+    LOAD_SYMBOL(cuMemcpyAtoH, tcuMemcpyAtoH, "cuMemcpyAtoH_v2");
+    LOAD_SYMBOL(cuMemcpyAtoA, tcuMemcpyAtoA, "cuMemcpyAtoA_v2");
     LOAD_SYMBOL(cuMemcpy2D, tcuMemcpy2D_v2, "cuMemcpy2D_v2");
-    LOAD_SYMBOL(cuGetErrorName, tcuGetErrorName, "cuGetErrorName");
-    LOAD_SYMBOL(cuGetErrorString, tcuGetErrorString, "cuGetErrorString");
-
+    LOAD_SYMBOL(cuMemcpy2DUnaligned, tcuMemcpy2DUnaligned, "cuMemcpy2DUnaligned_v2");
+    LOAD_SYMBOL(cuMemcpy3D, tcuMemcpy3D, "cuMemcpy3D_v2");
+    LOAD_SYMBOL(cuMemcpy3DPeer, tcuMemcpy3DPeer, "cuMemcpy3DPeer");
+    LOAD_SYMBOL(cuMemcpyAsync, tcuMemcpyAsync, "cuMemcpyAsync");
+    LOAD_SYMBOL(cuMemcpyPeerAsync, tcuMemcpyPeerAsync, "cuMemcpyPeerAsync");
+    LOAD_SYMBOL(cuMemcpyHtoDAsync, tcuMemcpyHtoDAsync, "cuMemcpyHtoDAsync_v2");
+    LOAD_SYMBOL(cuMemcpyDtoHAsync, tcuMemcpyDtoHAsync, "cuMemcpyDtoHAsync_v2");
+    LOAD_SYMBOL(cuMemcpyDtoDAsync, tcuMemcpyDtoDAsync, "cuMemcpyDtoDAsync_v2");
+    LOAD_SYMBOL(cuMemcpyHtoAAsync, tcuMemcpyHtoAAsync, "cuMemcpyHtoAAsync_v2");
+    LOAD_SYMBOL(cuMemcpyAtoHAsync, tcuMemcpyAtoHAsync, "cuMemcpyAtoHAsync_v2");
+    LOAD_SYMBOL(cuMemcpy2DAsync, tcuMemcpy2DAsync, "cuMemcpy2DAsync_v2");
+    LOAD_SYMBOL(cuMemcpy3DAsync, tcuMemcpy3DAsync, "cuMemcpy3DAsync_v2");
+    LOAD_SYMBOL(cuMemcpy3DPeerAsync, tcuMemcpy3DPeerAsync, "cuMemcpy3DPeerAsync");
+    LOAD_SYMBOL(cuMemsetD8, tcuMemsetD8, "cuMemsetD8_v2");
+    LOAD_SYMBOL(cuMemsetD16, tcuMemsetD16, "cuMemsetD16_v2");
+    LOAD_SYMBOL(cuMemsetD32, tcuMemsetD32, "cuMemsetD32_v2");
+    LOAD_SYMBOL(cuMemsetD2D8, tcuMemsetD2D8, "cuMemsetD2D8_v2");
+    LOAD_SYMBOL(cuMemsetD2D16, tcuMemsetD2D16, "cuMemsetD2D16_v2");
+    LOAD_SYMBOL(cuMemsetD2D32, tcuMemsetD2D32, "cuMemsetD2D32_v2");
+    LOAD_SYMBOL(cuMemsetD8Async, tcuMemsetD8Async, "cuMemsetD8Async");
+    LOAD_SYMBOL(cuMemsetD16Async, tcuMemsetD16Async, "cuMemsetD16Async");
+    LOAD_SYMBOL(cuMemsetD32Async, tcuMemsetD32Async, "cuMemsetD32Async");
+    LOAD_SYMBOL(cuMemsetD2D8Async, tcuMemsetD2D8Async, "cuMemsetD2D8Async");
+    LOAD_SYMBOL(cuMemsetD2D16Async, tcuMemsetD2D16Async, "cuMemsetD2D16Async");
+    LOAD_SYMBOL(cuMemsetD2D32Async, tcuMemsetD2D32Async, "cuMemsetD2D32Async");
+    LOAD_SYMBOL(cuArrayCreate, tcuArrayCreate, "cuArrayCreate_v2");
+    LOAD_SYMBOL(cuArrayGetDescriptor, tcuArrayGetDescriptor, "cuArrayGetDescriptor_v2");
+    LOAD_SYMBOL(cuArrayDestroy, tcuArrayDestroy, "cuArrayDestroy");
+    LOAD_SYMBOL(cuArray3DCreate, tcuArray3DCreate, "cuArray3DCreate_v2");
+    LOAD_SYMBOL(cuArray3DGetDescriptor, tcuArray3DGetDescriptor, "cuArray3DGetDescriptor_v2");
+    LOAD_SYMBOL(cuMipmappedArrayCreate, tcuMipmappedArrayCreate, "cuMipmappedArrayCreate");
+    LOAD_SYMBOL(cuMipmappedArrayGetLevel, tcuMipmappedArrayGetLevel, "cuMipmappedArrayGetLevel");
+    LOAD_SYMBOL(cuMipmappedArrayDestroy, tcuMipmappedArrayDestroy, "cuMipmappedArrayDestroy");
+    LOAD_SYMBOL(cuPointerGetAttribute, tcuPointerGetAttribute, "cuPointerGetAttribute");
+    LOAD_SYMBOL(cuMemPrefetchAsync, tcuMemPrefetchAsync, "cuMemPrefetchAsync");
+    LOAD_SYMBOL(cuMemAdvise, tcuMemAdvise, "cuMemAdvise");
+    LOAD_SYMBOL(cuMemRangeGetAttribute, tcuMemRangeGetAttribute, "cuMemRangeGetAttribute");
+    LOAD_SYMBOL(cuMemRangeGetAttributes, tcuMemRangeGetAttributes, "cuMemRangeGetAttributes");
+    LOAD_SYMBOL(cuPointerSetAttribute, tcuPointerSetAttribute, "cuPointerSetAttribute");
+    LOAD_SYMBOL(cuPointerGetAttributes, tcuPointerGetAttributes, "cuPointerGetAttributes");
+    LOAD_SYMBOL(cuStreamCreate, tcuStreamCreate, "cuStreamCreate");
+    LOAD_SYMBOL(cuStreamCreateWithPriority, tcuStreamCreateWithPriority, "cuStreamCreateWithPriority");
+    LOAD_SYMBOL(cuStreamGetPriority, tcuStreamGetPriority, "cuStreamGetPriority");
+    LOAD_SYMBOL(cuStreamGetFlags, tcuStreamGetFlags, "cuStreamGetFlags");
+    LOAD_SYMBOL(cuStreamWaitEvent, tcuStreamWaitEvent, "cuStreamWaitEvent");
+    LOAD_SYMBOL(cuStreamAddCallback, tcuStreamAddCallback, "cuStreamAddCallback");
+    LOAD_SYMBOL(cuStreamAttachMemAsync, tcuStreamAttachMemAsync, "cuStreamAttachMemAsync");
+    LOAD_SYMBOL(cuStreamQuery, tcuStreamQuery, "cuStreamQuery");
+    LOAD_SYMBOL(cuStreamSynchronize, tcuStreamSynchronize, "cuStreamSynchronize");
+    LOAD_SYMBOL(cuStreamDestroy, tcuStreamDestroy, "cuStreamDestroy_v2");
+    LOAD_SYMBOL(cuEventCreate, tcuEventCreate, "cuEventCreate");
+    LOAD_SYMBOL(cuEventRecord, tcuEventRecord, "cuEventRecord");
+    LOAD_SYMBOL(cuEventQuery, tcuEventQuery, "cuEventQuery");
+    LOAD_SYMBOL(cuEventSynchronize, tcuEventSynchronize, "cuEventSynchronize");
+    LOAD_SYMBOL(cuEventDestroy, tcuEventDestroy, "cuEventDestroy_v2");
+    LOAD_SYMBOL(cuEventElapsedTime, tcuEventElapsedTime, "cuEventElapsedTime");
+    LOAD_SYMBOL(cuStreamWaitValue32, tcuStreamWaitValue32, "cuStreamWaitValue32");
+    LOAD_SYMBOL(cuStreamWriteValue32, tcuStreamWriteValue32, "cuStreamWriteValue32");
+    LOAD_SYMBOL(cuStreamBatchMemOp, tcuStreamBatchMemOp, "cuStreamBatchMemOp");
+    LOAD_SYMBOL(cuFuncGetAttribute, tcuFuncGetAttribute, "cuFuncGetAttribute");
+    LOAD_SYMBOL(cuFuncSetCacheConfig, tcuFuncSetCacheConfig, "cuFuncSetCacheConfig");
+    LOAD_SYMBOL(cuFuncSetSharedMemConfig, tcuFuncSetSharedMemConfig, "cuFuncSetSharedMemConfig");
+    LOAD_SYMBOL(cuLaunchKernel, tcuLaunchKernel, "cuLaunchKernel");
+    LOAD_SYMBOL(cuFuncSetBlockShape, tcuFuncSetBlockShape, "cuFuncSetBlockShape");
+    LOAD_SYMBOL(cuFuncSetSharedSize, tcuFuncSetSharedSize, "cuFuncSetSharedSize");
+    LOAD_SYMBOL(cuParamSetSize, tcuParamSetSize, "cuParamSetSize");
+    LOAD_SYMBOL(cuParamSeti, tcuParamSeti, "cuParamSeti");
+    LOAD_SYMBOL(cuParamSetf, tcuParamSetf, "cuParamSetf");
+    LOAD_SYMBOL(cuParamSetv, tcuParamSetv, "cuParamSetv");
+    LOAD_SYMBOL(cuLaunch, tcuLaunch, "cuLaunch");
+    LOAD_SYMBOL(cuLaunchGrid, tcuLaunchGrid, "cuLaunchGrid");
+    LOAD_SYMBOL(cuLaunchGridAsync, tcuLaunchGridAsync, "cuLaunchGridAsync");
+    LOAD_SYMBOL(cuParamSetTexRef, tcuParamSetTexRef, "cuParamSetTexRef");
+    LOAD_SYMBOL(cuOccupancyMaxActiveBlocksPerMultiprocessor, tcuOccupancyMaxActiveBlocksPerMultiprocessor, "cuOccupancyMaxActiveBlocksPerMultiprocessor");
+    LOAD_SYMBOL(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, tcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags");
+    LOAD_SYMBOL(cuOccupancyMaxPotentialBlockSize, tcuOccupancyMaxPotentialBlockSize, "cuOccupancyMaxPotentialBlockSize");
+    LOAD_SYMBOL(cuOccupancyMaxPotentialBlockSizeWithFlags, tcuOccupancyMaxPotentialBlockSizeWithFlags, "cuOccupancyMaxPotentialBlockSizeWithFlags");
+    LOAD_SYMBOL(cuTexRefSetArray, tcuTexRefSetArray, "cuTexRefSetArray");
+    LOAD_SYMBOL(cuTexRefSetMipmappedArray, tcuTexRefSetMipmappedArray, "cuTexRefSetMipmappedArray");
+    LOAD_SYMBOL(cuTexRefSetAddress, tcuTexRefSetAddress, "cuTexRefSetAddress_v2");
+    LOAD_SYMBOL(cuTexRefSetAddress2D, tcuTexRefSetAddress2D, "cuTexRefSetAddress2D_v3");
+    LOAD_SYMBOL(cuTexRefSetFormat, tcuTexRefSetFormat, "cuTexRefSetFormat");
+    LOAD_SYMBOL(cuTexRefSetAddressMode, tcuTexRefSetAddressMode, "cuTexRefSetAddressMode");
+    LOAD_SYMBOL(cuTexRefSetFilterMode, tcuTexRefSetFilterMode, "cuTexRefSetFilterMode");
+    LOAD_SYMBOL(cuTexRefSetMipmapFilterMode, tcuTexRefSetMipmapFilterMode, "cuTexRefSetMipmapFilterMode");
+    LOAD_SYMBOL(cuTexRefSetMipmapLevelBias, tcuTexRefSetMipmapLevelBias, "cuTexRefSetMipmapLevelBias");
+    LOAD_SYMBOL(cuTexRefSetMipmapLevelClamp, tcuTexRefSetMipmapLevelClamp, "cuTexRefSetMipmapLevelClamp");
+    LOAD_SYMBOL(cuTexRefSetMaxAnisotropy, tcuTexRefSetMaxAnisotropy, "cuTexRefSetMaxAnisotropy");
+    LOAD_SYMBOL(cuTexRefSetBorderColor, tcuTexRefSetBorderColor, "cuTexRefSetBorderColor");
+    LOAD_SYMBOL(cuTexRefSetFlags, tcuTexRefSetFlags, "cuTexRefSetFlags");
+    LOAD_SYMBOL(cuTexRefGetAddress, tcuTexRefGetAddress, "cuTexRefGetAddress_v2");
+    LOAD_SYMBOL(cuTexRefGetArray, tcuTexRefGetArray, "cuTexRefGetArray");
+    LOAD_SYMBOL(cuTexRefGetMipmappedArray, tcuTexRefGetMipmappedArray, "cuTexRefGetMipmappedArray");
+    LOAD_SYMBOL(cuTexRefGetAddressMode, tcuTexRefGetAddressMode, "cuTexRefGetAddressMode");
+    LOAD_SYMBOL(cuTexRefGetFilterMode, tcuTexRefGetFilterMode, "cuTexRefGetFilterMode");
+    LOAD_SYMBOL(cuTexRefGetFormat, tcuTexRefGetFormat, "cuTexRefGetFormat");
+    LOAD_SYMBOL(cuTexRefGetMipmapFilterMode, tcuTexRefGetMipmapFilterMode, "cuTexRefGetMipmapFilterMode");
+    LOAD_SYMBOL(cuTexRefGetMipmapLevelBias, tcuTexRefGetMipmapLevelBias, "cuTexRefGetMipmapLevelBias");
+    LOAD_SYMBOL(cuTexRefGetMipmapLevelClamp, tcuTexRefGetMipmapLevelClamp, "cuTexRefGetMipmapLevelClamp");
+    LOAD_SYMBOL(cuTexRefGetMaxAnisotropy, tcuTexRefGetMaxAnisotropy, "cuTexRefGetMaxAnisotropy");
+    LOAD_SYMBOL(cuTexRefGetBorderColor, tcuTexRefGetBorderColor, "cuTexRefGetBorderColor");
+    LOAD_SYMBOL(cuTexRefGetFlags, tcuTexRefGetFlags, "cuTexRefGetFlags");
+    LOAD_SYMBOL(cuTexRefCreate, tcuTexRefCreate, "cuTexRefCreate");
+    LOAD_SYMBOL(cuTexRefDestroy, tcuTexRefDestroy, "cuTexRefDestroy");
+    LOAD_SYMBOL(cuSurfRefSetArray, tcuSurfRefSetArray, "cuSurfRefSetArray");
+    LOAD_SYMBOL(cuSurfRefGetArray, tcuSurfRefGetArray, "cuSurfRefGetArray");
+    LOAD_SYMBOL(cuTexObjectCreate, tcuTexObjectCreate, "cuTexObjectCreate");
+    LOAD_SYMBOL(cuTexObjectDestroy, tcuTexObjectDestroy, "cuTexObjectDestroy");
+    LOAD_SYMBOL(cuTexObjectGetResourceDesc, tcuTexObjectGetResourceDesc, "cuTexObjectGetResourceDesc");
+    LOAD_SYMBOL(cuTexObjectGetTextureDesc, tcuTexObjectGetTextureDesc, "cuTexObjectGetTextureDesc");
+    LOAD_SYMBOL(cuTexObjectGetResourceViewDesc, tcuTexObjectGetResourceViewDesc, "cuTexObjectGetResourceViewDesc");
+    LOAD_SYMBOL(cuSurfObjectCreate, tcuSurfObjectCreate, "cuSurfObjectCreate");
+    LOAD_SYMBOL(cuSurfObjectDestroy, tcuSurfObjectDestroy, "cuSurfObjectDestroy");
+    LOAD_SYMBOL(cuSurfObjectGetResourceDesc, tcuSurfObjectGetResourceDesc, "cuSurfObjectGetResourceDesc");
+    LOAD_SYMBOL(cuDeviceCanAccessPeer, tcuDeviceCanAccessPeer, "cuDeviceCanAccessPeer");
+    LOAD_SYMBOL(cuDeviceGetP2PAttribute, tcuDeviceGetP2PAttribute, "cuDeviceGetP2PAttribute");
+    LOAD_SYMBOL(cuCtxEnablePeerAccess, tcuCtxEnablePeerAccess, "cuCtxEnablePeerAccess");
+    LOAD_SYMBOL(cuCtxDisablePeerAccess, tcuCtxDisablePeerAccess, "cuCtxDisablePeerAccess");
+    LOAD_SYMBOL(cuGraphicsUnregisterResource, tcuGraphicsUnregisterResource, "cuGraphicsUnregisterResource");
+    LOAD_SYMBOL(cuGraphicsSubResourceGetMappedArray, tcuGraphicsSubResourceGetMappedArray, "cuGraphicsSubResourceGetMappedArray");
+    LOAD_SYMBOL(cuGraphicsResourceGetMappedMipmappedArray, tcuGraphicsResourceGetMappedMipmappedArray, "cuGraphicsResourceGetMappedMipmappedArray");
+    LOAD_SYMBOL(cuGraphicsResourceGetMappedPointer, tcuGraphicsResourceGetMappedPointer, "cuGraphicsResourceGetMappedPointer_v2");
+    LOAD_SYMBOL(cuGraphicsResourceSetMapFlags, tcuGraphicsResourceSetMapFlags, "cuGraphicsResourceSetMapFlags_v2");
+    LOAD_SYMBOL(cuGraphicsMapResources, tcuGraphicsMapResources, "cuGraphicsMapResources");
+    LOAD_SYMBOL(cuGraphicsUnmapResources, tcuGraphicsUnmapResources, "cuGraphicsUnmapResources");
+    LOAD_SYMBOL(cuGetExportTable, tcuGetExportTable, "cuGetExportTable");
+
     GENERIC_LOAD_FUNC_FINALE(cuda);
 }
 #endif
diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 23ac27a7dc..95d896c560 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -20,10 +20,11 @@
 * DEALINGS IN THE SOFTWARE.
 */
 
-#include <cuda.h>
 #include <stdio.h>
 #include <string.h>
 
+#include "compat/cuda/dynlink_loader.h"
+
 #include "libavutil/avstring.h"
 #include "libavutil/common.h"
 #include "libavutil/hwcontext.h"
@@ -95,6 +96,8 @@ typedef struct CUDAScaleContext {
     CUdeviceptr srcBuffer;
     CUdeviceptr dstBuffer;
     int         tex_alignment;
+
+    CudaFunctions *cuda_dl;
 } CUDAScaleContext;
 
 static av_cold int cudascale_init(AVFilterContext *ctx)
@@ -261,49 +264,52 @@ static av_cold int cudascale_config_props(AVFilterLink *outlink)
 
     extern char vf_scale_cuda_ptx[];
 
-    err = cuCtxPushCurrent(cuda_ctx);
+    s->cuda_dl = device_hwctx->internal->cuda_dl;
+
+    err = s->cuda_dl->cuCtxPushCurrent(cuda_ctx);
     if (err != CUDA_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Error pushing cuda context\n");
         ret = AVERROR_UNKNOWN;
         goto fail;
     }
 
-    err = cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx);
+    err = s->cuda_dl->cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx);
     if (err != CUDA_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Error loading module data\n");
         ret = AVERROR_UNKNOWN;
         goto fail;
     }
 
-    cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar");
-    cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2");
-    cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4");
-    cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort");
-    cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2");
-    cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4");
+    /* Look up the kernels and texture references in the module loaded above. */
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar");
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2");
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4");
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort");
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2");
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4");
 
-    cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex");
-    cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex");
-    cuModuleGetTexRef(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex");
-    cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex");
-    cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex");
-    cuModuleGetTexRef(&s->cu_tex_ushort4, s->cu_module, "ushort4_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_ushort4, s->cu_module, "ushort4_tex");
 
-    cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER);
-    cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER);
-    cuTexRefSetFlags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER);
-    cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER);
-    cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER);
-    cuTexRefSetFlags(s->cu_tex_ushort4, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_ushort4, CU_TRSF_READ_AS_INTEGER);
 
-    cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR);
-    cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR);
-    cuTexRefSetFilterMode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR);
-    cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR);
-    cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR);
-    cuTexRefSetFilterMode(s->cu_tex_ushort4, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_ushort4, CU_TR_FILTER_MODE_LINEAR);
 
-    cuCtxPopCurrent(&dummy);
+    s->cuda_dl->cuCtxPopCurrent(&dummy);
 
     if ((ret = ff_scale_eval_dimensions(s,
                                         s->w_expr, s->h_expr,
@@ -358,8 +364,8 @@ static int call_resize_kernel(CUDAScaleContext *s, CUfunction func, CUtexref tex
         desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
     }
 
-    cuTexRefSetAddress2D_v3(tex, &desc, src_devptr, src_pitch * pixel_size);
-    cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, 0, args_uchar, NULL);
+    s->cuda_dl->cuTexRefSetAddress2D_v3(tex, &desc, src_devptr, src_pitch * pixel_size);
+    s->cuda_dl->cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, 0, args_uchar, NULL);
 
     return 0;
 }
@@ -480,7 +486,7 @@ static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in)
         goto fail;
     }
 
-    err = cuCtxPushCurrent(device_hwctx->cuda_ctx);
+    err = s->cuda_dl->cuCtxPushCurrent(device_hwctx->cuda_ctx);
     if (err != CUDA_SUCCESS) {
         ret = AVERROR_UNKNOWN;
         goto fail;
@@ -488,7 +494,7 @@ static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in)
 
     ret = cudascale_scale(ctx, out, in);
 
-    cuCtxPopCurrent(&dummy);
+    s->cuda_dl->cuCtxPopCurrent(&dummy);
     if (ret < 0)
         goto fail;
 
diff --git a/libavfilter/vf_thumbnail_cuda.c b/libavfilter/vf_thumbnail_cuda.c
index 09377ca7f4..1b5fc79f97 100644
--- a/libavfilter/vf_thumbnail_cuda.c
+++ b/libavfilter/vf_thumbnail_cuda.c
@@ -20,7 +20,7 @@
 * DEALINGS IN THE SOFTWARE.
 */
 
-#include <cuda.h>
+#include "compat/cuda/dynlink_loader.h"
 
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_cuda_internal.h"
@@ -70,6 +70,8 @@ typedef struct ThumbnailCudaContext {
     CUtexref    cu_tex_ushort2;
 
     CUdeviceptr data;
+
+    CudaFunctions *cuda_dl;
 } ThumbnailCudaContext;
 
 #define OFFSET(x) offsetof(ThumbnailCudaContext, x)
@@ -171,8 +173,8 @@ static int thumbnail_kernel(ThumbnailCudaContext *s, CUfunction func, CUtexref t
         desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
     }
 
-    cuTexRefSetAddress2D_v3(tex, &desc, src_devptr, src_pitch);
-    cuLaunchKernel(func, DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, 0, args, NULL);
+    s->cuda_dl->cuTexRefSetAddress2D_v3(tex, &desc, src_devptr, src_pitch);
+    s->cuda_dl->cuLaunchKernel(func, DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, 0, args, NULL);
 
     return 0;
 }
@@ -243,11 +245,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
     // keep a reference of each frame
     s->frames[s->n].buf = frame;
 
-    err = cuCtxPushCurrent(device_hwctx->cuda_ctx);
+    err = s->cuda_dl->cuCtxPushCurrent(device_hwctx->cuda_ctx);
     if (err != CUDA_SUCCESS)
         return AVERROR_UNKNOWN;
 
-    cuMemsetD8(s->data, 0, HIST_SIZE * sizeof(int));
+    s->cuda_dl->cuMemsetD8(s->data, 0, HIST_SIZE * sizeof(int));
 
     thumbnail(ctx, (int*)s->data, frame);
 
@@ -260,7 +262,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
     cpy.WidthInBytes = HIST_SIZE * sizeof(int);
     cpy.Height = 1;
 
-    err = cuMemcpy2D(&cpy);
+    err = s->cuda_dl->cuMemcpy2D(&cpy);
     if (err != CUDA_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Error transferring the data from the CUDA frame\n");
         return AVERROR_UNKNOWN;
@@ -274,7 +276,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
             hist[i] = 4 * hist[i];
     }
 
-    cuCtxPopCurrent(&dummy);
+    s->cuda_dl->cuCtxPopCurrent(&dummy);
     if (ret < 0)
         return ret;
 
@@ -292,12 +294,12 @@ static av_cold void uninit(AVFilterContext *ctx)
     ThumbnailCudaContext *s = ctx->priv;
 
     if (s->data) {
-        cuMemFree(s->data);
+        s->cuda_dl->cuMemFree(s->data);
         s->data = 0;
     }
 
     if (s->cu_module) {
-        cuModuleUnload(s->cu_module);
+        s->cuda_dl->cuModuleUnload(s->cu_module);
         s->cu_module = NULL;
     }
 
@@ -344,45 +346,47 @@ static int config_props(AVFilterLink *inlink)
 
     extern char vf_thumbnail_cuda_ptx[];
 
-    err = cuCtxPushCurrent(cuda_ctx);
+    s->cuda_dl = device_hwctx->internal->cuda_dl;
+
+    err = s->cuda_dl->cuCtxPushCurrent(cuda_ctx);
     if (err != CUDA_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Error pushing cuda context\n");
         return AVERROR_UNKNOWN;
     }
 
-    err = cuModuleLoadData(&s->cu_module, vf_thumbnail_cuda_ptx);
+    err = s->cuda_dl->cuModuleLoadData(&s->cu_module, vf_thumbnail_cuda_ptx);
     if (err != CUDA_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Error loading module data\n");
         return AVERROR_UNKNOWN;
     }
 
-    cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Thumbnail_uchar");
-    cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Thumbnail_uchar2");
-    cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Thumbnail_ushort");
-    cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Thumbnail_ushort2");
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Thumbnail_uchar");
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Thumbnail_uchar2");
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Thumbnail_ushort");
+    s->cuda_dl->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Thumbnail_ushort2");
 
-    cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex");
-    cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex");
-    cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex");
-    cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex");
+    s->cuda_dl->cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex");
 
-    cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER);
-    cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER);
-    cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER);
-    cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER);
+    s->cuda_dl->cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER);
 
-    cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR);
-    cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR);
-    cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR);
-    cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR);
+    s->cuda_dl->cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR);
 
-    err = cuMemAlloc(&s->data, HIST_SIZE * sizeof(int));
+    err = s->cuda_dl->cuMemAlloc(&s->data, HIST_SIZE * sizeof(int));
     if (err != CUDA_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Error allocating cuda memory\n");
         return AVERROR_UNKNOWN;
     }
 
-    cuCtxPopCurrent(&dummy);
+    s->cuda_dl->cuCtxPopCurrent(&dummy);
 
     s->hw_frames_ctx = ctx->inputs[0]->hw_frames_ctx;
 
-- 
2.15.1.windows.2

