This is the libgomp plugin side of omp_get_wtime support on NVPTX. Query the
GPU frequency and copy the value into the device image.
At the moment the CUDA driver sets the GPU to a fixed frequency when a CUDA
context is created (the default is to use the highest non-boost frequency, but
it can be altered with the nvidia-smi utility), so as long as dynamic boost is
not implemented, and thermal throttling does not happen, the queried value
should correspond to the actual frequency of %clock64 updates. However, on a
GTX Titan we observed that the driver returns a GPU frequency that is midway
between the actual frequency and the boost frequency -- we consider that a
driver bug. Thus, the implementation comes with the caveat that device-side
measurements are less reliable than host-side ones.
* plugin/plugin-nvptx.c (struct ptx_device): New field (clock_khz).
(nvptx_open_device): Set it.
(nvptx_set_clocktick): New. Use it...
(GOMP_OFFLOAD_load_image): ...here.
---
libgomp/ChangeLog.gomp-nvptx | 7 +++++++
libgomp/plugin/plugin-nvptx.c | 28 +++++++++++++++++++++++++++-
2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index e687586..87e0494 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -287,8 +287,9 @@ struct ptx_device
bool overlap;
bool map;
bool concur;
- int mode;
bool mkern;
+ int mode;
+ int clock_khz;
struct ptx_image_data *images; /* Images loaded on device. */
pthread_mutex_t image_lock; /* Lock for above list. */
@@ -641,6 +642,12 @@ nvptx_open_device (int n)
ptx_dev->mkern = pi;
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->clock_khz = pi;
+
r = cuDeviceGetAttribute (&async_engines,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
if (r != CUDA_SUCCESS)
@@ -1505,6 +1512,23 @@ GOMP_OFFLOAD_version (void)
return GOMP_VERSION;
}
+/* Set __nvptx_clocktick, if present in MODULE, from DEV's clock rate. */
+
+static void
+nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
+{
+ CUdeviceptr dptr;
+ CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
+ if (r == CUDA_ERROR_NOT_FOUND)
+ return; /* MODULE does not define the global; nothing to set. */
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+ double __nvptx_clocktick = 1e-3 / dev->clock_khz; /* kHz -> s per tick. */
+ r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+}
+
/* Load the (partial) program described by TARGET_DATA to device
number ORD. Allocate and return TARGET_TABLE. */
@@ -1590,6 +1614,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
targ_tbl->end = targ_tbl->start + bytes;
}
+ nvptx_set_clocktick (module, dev);
+
return fn_entries + var_entries;
}