Change where the fpriv_list mutex is taken and released during the
hard-reset process of the ASIC.

The only section that needs protection is the one where we dereference
pointers that may be freed if the user process aborts or closes its FD.

Because the lock is no longer held while we sleep, the user process can
actually close its FD in case we tell it that an error occurred.

Signed-off-by: Oded Gabbay <[email protected]>
---
 drivers/misc/habanalabs/device.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 5400e65ba5fa..471506b54217 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -574,20 +574,21 @@ static void device_kill_open_processes(struct hl_device 
*hdev)
        else
                pending_total = HL_PENDING_RESET_PER_SEC;
 
-       pending_cnt = pending_total;
-
        /* Flush all processes that are inside hl_open */
        mutex_lock(&hdev->fpriv_list_lock);
+       mutex_unlock(&hdev->fpriv_list_lock);
 
-       while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
-
-               pending_cnt--;
-
-               dev_info(hdev->dev,
-                       "Can't HARD reset, waiting for user to close FD\n");
+       /* Giving time for user to close FD, and for processes that are inside
+        * hl_device_open to finish
+        */
+       if (!list_empty(&hdev->fpriv_list))
                ssleep(1);
-       }
 
+       mutex_lock(&hdev->fpriv_list_lock);
+
+       /* This section must be protected because we are dereferencing
+        * pointers that are freed if the process exits
+        */
        if (!list_empty(&hdev->fpriv_list)) {
                task = get_pid_task(hdev->compute_ctx->hpriv->taskpid,
                                        PIDTYPE_PID);
@@ -600,6 +601,8 @@ static void device_kill_open_processes(struct hl_device 
*hdev)
                }
        }
 
+       mutex_unlock(&hdev->fpriv_list_lock);
+
        /* We killed the open users, but because the driver cleans up after the
         * user contexts are closed (e.g. mmu mappings), we need to wait again
         * to make sure the cleaning phase is finished before continuing with
@@ -609,6 +612,8 @@ static void device_kill_open_processes(struct hl_device 
*hdev)
        pending_cnt = pending_total;
 
        while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
+               dev_info(hdev->dev,
+                       "Waiting for all unmap operations to finish before hard 
reset\n");
 
                pending_cnt--;
 
@@ -618,9 +623,6 @@ static void device_kill_open_processes(struct hl_device 
*hdev)
        if (!list_empty(&hdev->fpriv_list))
                dev_crit(hdev->dev,
                        "Going to hard reset with open user contexts\n");
-
-       mutex_unlock(&hdev->fpriv_list_lock);
-
 }
 
 static void device_hard_reset_pending(struct work_struct *work)
-- 
2.17.1

Reply via email to