Am 27.01.2018 um 02:09 schrieb Felix Kuehling:
This fence is used by KFD to keep memory resident while user mode
queues are enabled. Trying to evict memory will trigger the
enable_signaling callback, which starts a KFD eviction, which
involves preempting user mode queues before signaling the fence.
There is one such fence per process.

Signed-off-by: Felix Kuehling <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/Makefile              |   1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h       |  15 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 196 +++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h         |   5 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c         |  18 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c          |  18 +++
  drivers/gpu/drm/amd/include/kgd_kfd_interface.h  |   6 +
  7 files changed, 256 insertions(+), 3 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index d6e5b72..43dc3f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -130,6 +130,7 @@ amdgpu-y += \
  # add amdkfd interfaces
  amdgpu-y += \
         amdgpu_amdkfd.o \
+        amdgpu_amdkfd_fence.o \
         amdgpu_amdkfd_gfx_v8.o
# add cgs
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 2a519f9..8d92f5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -29,6 +29,8 @@
  #include <linux/mmu_context.h>
  #include <kgd_kfd_interface.h>
+extern const struct kgd2kfd_calls *kgd2kfd;
+
  struct amdgpu_device;
struct kgd_mem {
@@ -37,6 +39,19 @@ struct kgd_mem {
        void *cpu_ptr;
  };
+/* KFD Memory Eviction */
+struct amdgpu_amdkfd_fence {
+       struct dma_fence base;
+       void *mm;
+       spinlock_t lock;
+       char timeline_name[TASK_COMM_LEN];
+};
+
+struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
+                                                      void *mm);
+bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm);
+struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
+
  int amdgpu_amdkfd_init(void);
  void amdgpu_amdkfd_fini(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
new file mode 100644
index 0000000..252e44e
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright 2016-2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/dma-fence.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/stacktrace.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "amdgpu_amdkfd.h"
+
+const struct dma_fence_ops amd_kfd_fence_ops;
+static atomic_t fence_seq = ATOMIC_INIT(0);
+
+static int amd_kfd_fence_signal(struct dma_fence *f);
+
+/* Eviction Fence
+ * Fence helper functions to deal with KFD memory eviction.
+ * Big Idea - Since KFD submissions are done by user queues, a BO cannot be
+ *  evicted unless all the user queues for that process are evicted.
+ *
+ * All the BOs in a process share an eviction fence. When process X wants
+ * to map VRAM memory but TTM can't find enough space, TTM will attempt to
+ * evict BOs from its LRU list. TTM checks if the BO is valuable to evict
+ * by calling ttm_bo_driver->eviction_valuable().
+ *
+ * ttm_bo_driver->eviction_valuable() - will return false if the BO belongs
+ *  to process X. Otherwise, it will return true to indicate BO can be
+ *  evicted by TTM.
+ *
+ * If ttm_bo_driver->eviction_valuable returns true, then TTM will continue
+ * the evcition process for that BO by calling ttm_bo_evict --> amdgpu_bo_move
+ * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler.
+ *
+ * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to
+ *  nofity when the BO is free to move. fence_add_callback --> enable_signaling
+ *  --> amdgpu_amdkfd_fence.enable_signaling
+ *
+ * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce
+ * user queues and signal fence. The work item will also start another delayed
+ * work item to restore BOs
+ */
+
+struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
+                                                      void *mm)
+{
+       struct amdgpu_amdkfd_fence *fence = NULL;
+
+       fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+       if (fence == NULL)
+               return NULL;
+
+       /* mm_struct mm is used as void pointer to identify the parent
+        * KFD process. Don't dereference it. Fence and any threads using
+        * mm is guranteed to be released before process termination.
+        */
+       fence->mm = mm;

That won't work. Fences can live much longer than any process who created them.

I've already found a fence in a BO still living hours after the process was killed and the pid long recycled.

I suggest to make fence->mm a real mm_struct pointer with reference counting and then set it to NULL and drop the reference in enable_signaling.

+       get_task_comm(fence->timeline_name, current);
+       spin_lock_init(&fence->lock);
+
+       dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock,
+                  context, atomic_inc_return(&fence_seq));
+
+       return fence;
+}
+
+struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
+{
+       struct amdgpu_amdkfd_fence *fence;
+
+       if (!f)
+               return NULL;
+
+       fence = container_of(f, struct amdgpu_amdkfd_fence, base);
+       if (fence && f->ops == &amd_kfd_fence_ops)
+               return fence;
+
+       return NULL;
+}
+
+static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f)
+{
+       return "amdgpu_amdkfd_fence";
+}
+
+static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f)
+{
+       struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+
+       return fence->timeline_name;
+}
+
+/**
+ * amd_kfd_fence_enable_signaling - This gets called when TTM wants to evict
+ *  a KFD BO and schedules a job to move the BO.
+ *  If fence is already signaled return true.
+ *  If fence is not signaled schedule a evict KFD process work item.
+ */
+static bool amd_kfd_fence_enable_signaling(struct dma_fence *f)
+{
+       struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+
+       if (!fence)
+               return false;
+
+       if (dma_fence_is_signaled(f))
+               return true;
+
+       if (!kgd2kfd->schedule_evict_and_restore_process(
+                               (struct mm_struct *)fence->mm, f))
+               return true;
+
+       return false;
+}
+
+static int amd_kfd_fence_signal(struct dma_fence *f)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(f->lock, flags);
+       /* Set enabled bit so cb will called */
+       set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags);

Mhm, why is that necessary?

+       ret = dma_fence_signal_locked(f);
+       spin_unlock_irqrestore(f->lock, flags);
+
+       return ret;
+}
+
+/**
+ * amd_kfd_fence_release - callback that fence can be freed
+ *
+ * @fence: fence
+ *
+ * This function is called when the reference count becomes zero.
+ * It just RCU schedules freeing up the fence.
+ */
+static void amd_kfd_fence_release(struct dma_fence *f)
+{
+       struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+       /* Unconditionally signal the fence. The process is getting
+        * terminated.
+        */
+       if (WARN_ON(!fence))
+               return; /* Not an amdgpu_amdkfd_fence */
+
+       amd_kfd_fence_signal(f);
+       kfree_rcu(f, rcu);
+}
+
+/**
+ * amd_kfd_fence_check_mm - Check if @mm is same as that of the fence @f
+ *  if same return TRUE else return FALSE.
+ *
+ * @f: [IN] fence
+ * @mm: [IN] mm that needs to be verified
+ */
+bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm)
+{
+       struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+
+       if (!fence)
+               return false;
+       else if (fence->mm == mm)
+               return true;
+
+       return false;
+}
+
+const struct dma_fence_ops amd_kfd_fence_ops = {
+       .get_driver_name = amd_kfd_fence_get_driver_name,
+       .get_timeline_name = amd_kfd_fence_get_timeline_name,
+       .enable_signaling = amd_kfd_fence_enable_signaling,
+       .signaled = NULL,
+       .wait = dma_fence_default_wait,
+       .release = amd_kfd_fence_release,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 65d5a4e..ca00dd2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -36,8 +36,9 @@
  #define AMDGPU_MAX_UVD_ENC_RINGS      2
/* some special values for the owner field */
-#define AMDGPU_FENCE_OWNER_UNDEFINED   ((void*)0ul)
-#define AMDGPU_FENCE_OWNER_VM          ((void*)1ul)
+#define AMDGPU_FENCE_OWNER_UNDEFINED   ((void *)0ul)
+#define AMDGPU_FENCE_OWNER_VM          ((void *)1ul)
+#define AMDGPU_FENCE_OWNER_KFD         ((void *)2ul)
#define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
  #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index df65c66..0cb31d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -31,6 +31,7 @@
  #include <drm/drmP.h>
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
+#include "amdgpu_amdkfd.h"
struct amdgpu_sync_entry {
        struct hlist_node       node;
@@ -86,10 +87,18 @@ static bool amdgpu_sync_same_dev(struct amdgpu_device *adev,
  static void *amdgpu_sync_get_owner(struct dma_fence *f)
  {
        struct drm_sched_fence *s_fence = to_drm_sched_fence(f);
+       struct amdgpu_amdkfd_fence *kfd_fence;
+
+       if (!f)
+               return AMDGPU_FENCE_OWNER_UNDEFINED;

When you add the extra NULL check here then please move the to_drm_sched_fence() after it as well.

Christian.

if (s_fence)
                return s_fence->owner;
+ kfd_fence = to_amdgpu_amdkfd_fence(f);
+       if (kfd_fence)
+               return AMDGPU_FENCE_OWNER_KFD;
+
        return AMDGPU_FENCE_OWNER_UNDEFINED;
  }
@@ -204,11 +213,18 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
        for (i = 0; i < flist->shared_count; ++i) {
                f = rcu_dereference_protected(flist->shared[i],
                                              reservation_object_held(resv));
+               /* We only want to trigger KFD eviction fences on
+                * evict or move jobs. Skip KFD fences otherwise.
+                */
+               fence_owner = amdgpu_sync_get_owner(f);
+               if (fence_owner == AMDGPU_FENCE_OWNER_KFD &&
+                   owner != AMDGPU_FENCE_OWNER_UNDEFINED)
+                       continue;
+
                if (amdgpu_sync_same_dev(adev, f)) {
                        /* VM updates are only interesting
                         * for other VM updates and moves.
                         */
-                       fence_owner = amdgpu_sync_get_owner(f);
                        if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
                            (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
                            ((owner == AMDGPU_FENCE_OWNER_VM) !=
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index e4bb435..c3f33d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -46,6 +46,7 @@
  #include "amdgpu.h"
  #include "amdgpu_object.h"
  #include "amdgpu_trace.h"
+#include "amdgpu_amdkfd.h"
  #include "bif/bif_4_1_d.h"
#define DRM_FILE_PAGE_OFFSET (0x100000000ULL >> PAGE_SHIFT)
@@ -1170,6 +1171,23 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct 
ttm_buffer_object *bo,
  {
        unsigned long num_pages = bo->mem.num_pages;
        struct drm_mm_node *node = bo->mem.mm_node;
+       struct reservation_object_list *flist;
+       struct dma_fence *f;
+       int i;
+
+       /* If bo is a KFD BO, check if the bo belongs to the current process.
+        * If true, then return false as any KFD process needs all its BOs to
+        * be resident to run successfully
+        */
+       flist = reservation_object_get_list(bo->resv);
+       if (flist) {
+               for (i = 0; i < flist->shared_count; ++i) {
+                       f = rcu_dereference_protected(flist->shared[i],
+                               reservation_object_held(bo->resv));
+                       if (amd_kfd_fence_check_mm(f, current->mm))
+                               return false;
+               }
+       }
switch (bo->mem.mem_type) {
        case TTM_PL_TT:
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h 
b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 94eab54..9e35249 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -30,6 +30,7 @@
#include <linux/types.h>
  #include <linux/bitmap.h>
+#include <linux/dma-fence.h>
struct pci_dev; @@ -286,6 +287,9 @@ struct kfd2kgd_calls {
   *
   * @resume: Notifies amdkfd about a resume action done to a kgd device
   *
+ * @schedule_evict_and_restore_process: Schedules work queue that will prepare
+ * for safe eviction of KFD BOs that belong to the specified process.
+ *
   * This structure contains function callback pointers so the kgd driver
   * will notify to the amdkfd about certain status changes.
   *
@@ -300,6 +304,8 @@ struct kgd2kfd_calls {
        void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry);
        void (*suspend)(struct kfd_dev *kfd);
        int (*resume)(struct kfd_dev *kfd);
+       int (*schedule_evict_and_restore_process)(struct mm_struct *mm,
+                       struct dma_fence *fence);
  };
int kgd2kfd_init(unsigned interface_version,

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to