On Thu, 25 Sept 2025 at 14:03, Christian König <
[email protected]> wrote:

> There have been multiple complaints that 10 seconds is usually too long.
>
> The original requirement for a longer timeout came from compute tests on
> AMDVLK. Since that is no longer a topic, reduce the timeout back to 2
> seconds for all queues.
>
> While at it also remove any special handling for compute queues under
> SRIOV or pass through.
>
> Signed-off-by: Christian König <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 85 ++++++++++------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 21 ++----
>  2 files changed, 48 insertions(+), 58 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a77000c2e0bb..ceb3c616292c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4278,58 +4278,53 @@ static int
> amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>         long timeout;
>         int ret = 0;
>
> -       /*
> -        * By default timeout for jobs is 10 sec
> -        */
> -       adev->compute_timeout = adev->gfx_timeout =
> msecs_to_jiffies(10000);
> -       adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
> +       /* By default timeout for all queues is 2 sec */
> +       adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
> +               adev->video_timeout = msecs_to_jiffies(2000);
>
> -       if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
> -               while ((timeout_setting = strsep(&input, ",")) &&
> -                               strnlen(timeout_setting,
> AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
> -                       ret = kstrtol(timeout_setting, 0, &timeout);
> -                       if (ret)
> -                               return ret;
> +       if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
> +               return 0;
>
> -                       if (timeout == 0) {
> -                               index++;
> -                               continue;
> -                       } else if (timeout < 0) {
> -                               timeout = MAX_SCHEDULE_TIMEOUT;
> -                               dev_warn(adev->dev, "lockup timeout
> disabled");
> -                               add_taint(TAINT_SOFTLOCKUP,
> LOCKDEP_STILL_OK);
> -                       } else {
> -                               timeout = msecs_to_jiffies(timeout);
> -                       }
> +       while ((timeout_setting = strsep(&input, ",")) &&
> +              strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
> +               ret = kstrtol(timeout_setting, 0, &timeout);
> +               if (ret)
> +                       return ret;
>
> -                       switch (index++) {
> -                       case 0:
> -                               adev->gfx_timeout = timeout;
> -                               break;
> -                       case 1:
> -                               adev->compute_timeout = timeout;
> -                               break;
> -                       case 2:
> -                               adev->sdma_timeout = timeout;
> -                               break;
> -                       case 3:
> -                               adev->video_timeout = timeout;
> -                               break;
> -                       default:
> -                               break;
> -                       }
> +               if (timeout == 0) {
> +                       index++;
> +                       continue;
> +               } else if (timeout < 0) {
> +                       timeout = MAX_SCHEDULE_TIMEOUT;
> +                       dev_warn(adev->dev, "lockup timeout disabled");
> +                       add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
> +               } else {
> +                       timeout = msecs_to_jiffies(timeout);
>                 }
> -               /*
> -                * There is only one value specified and
> -                * it should apply to all non-compute jobs.
> -                */
> -               if (index == 1) {
> -                       adev->sdma_timeout = adev->video_timeout =
> adev->gfx_timeout;
> -                       if (amdgpu_sriov_vf(adev) ||
> amdgpu_passthrough(adev))
> -                               adev->compute_timeout = adev->gfx_timeout;
> +
> +               switch (index++) {
> +               case 0:
> +                       adev->gfx_timeout = timeout;
> +                       break;
> +               case 1:
> +                       adev->compute_timeout = timeout;
> +                       break;
> +               case 2:
> +                       adev->sdma_timeout = timeout;
> +                       break;
> +               case 3:
> +                       adev->video_timeout = timeout;
> +                       break;
> +               default:
> +                       break;
>                 }
>         }
>
> +       /* When only one value specified apply it to all queues. */
> +       if (index == 1)
> +               adev->gfx_timeout = adev->compute_timeout =
> adev->sdma_timeout =
> +                       adev->video_timeout = timeout;
> +
>         return ret;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index ece251cbe8c3..fe45dd1d979e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -353,22 +353,17 @@ module_param_named(svm_default_granularity,
> amdgpu_svm_default_granularity, uint
>   * DOC: lockup_timeout (string)
>   * Set GPU scheduler timeout value in ms.
>   *
> - * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. That is
> there can be one or
> - * multiple values specified. 0 and negative values are invalidated. They
> will be adjusted
> - * to the default timeout.
> + * The format can be [single value] for setting all timeouts at once or
> + * [GFX,Compute,SDMA,Video] to set individual timeouts.
> + * Negative values mean infinity.
>   *
> - * - With one value specified, the setting will apply to all non-compute
> jobs.
> - * - With multiple values specified, the first one will be for GFX.
> - *   The second one is for Compute. The third and fourth ones are
> - *   for SDMA and Video.
> - *
> - * By default(with no lockup_timeout settings), the timeout for all jobs
> is 10000.
> + * By default(with no lockup_timeout settings), the timeout for all
> queues is 2000.
>   */
>  MODULE_PARM_DESC(lockup_timeout,
> -                "GPU lockup timeout in ms (default: 10000 for all jobs. "
> -                "0: keep default value. negative: infinity timeout),
> format: for bare metal [Non-Compute] or [GFX,Compute,SDMA,Video]; "
> -                "for passthrough or sriov [all jobs] or
> [GFX,Compute,SDMA,Video].");
> -module_param_string(lockup_timeout, amdgpu_lockup_timeout,
> sizeof(amdgpu_lockup_timeout), 0444);
> +                "GPU lockup timeout in ms (default: 2000 for all queues. "
> +                "0: keep default value. negative: infinity timeout),
> format: [single value for all] or [GFX,Compute,SDMA,Video].");
> +module_param_string(lockup_timeout, amdgpu_lockup_timeout,
> +                   sizeof(amdgpu_lockup_timeout), 0444);
>
>  /**
>   * DOC: dpm (int)
> --
> 2.43.0
>
Hi

This patch is causing issues with running:

  ~/GravityMark_1.89_linux $ DRI_PRIME=1 ./run_fullscreen_vk_rt.sh

M:      0 us: ../data.zip: 313 files
M:  15.19 ms: Temporal antialiasing
M:  15.21 ms: Fullscreen mode
M:  15.22 ms: Render Statistics
M:  20.77 ms: Build Date: Jun 20 2025
M:  20.80 ms: Build Info: version=20250429; linux; x64; release; vk=1;
gl=45; gles=32; cu=1; fusion
M:  20.81 ms: Build Version: 1.89
M:  48.06 ms: Name: ASUSTeK COMPUTER INC. G513QY ROG Strix G513QY_G513QY
M:  48.09 ms: System: 'Gentoo Linux'
M:  48.10 ms: Kernel: Linux 6.19.0-rc7-drm+ x86_64
M:  48.11 ms: Memory: 62.19 GB
M:  48.13 ms: Uptime: 19.00 s
M:  48.15 ms: CPU: AMD Ryzen 9 5900HX with Radeon Graphics
M:  48.17 ms: GPU 0: [AMD/ATI] Navi 22 [Radeon RX 6700/6700 XT/6750 XT /
6800M/6850M XT] (rev c3)
M:  48.18 ms: Device: VEN_1002&DEV_73DF&SUBSYS_16C21043
M:  48.19 ms: Memory: 11.98 GB
M:  48.21 ms: GPU 1: [AMD/ATI] Cezanne [Radeon Vega Series / Radeon Vega
Mobile Series] (rev c4)
M:  48.22 ms: Device: VEN_1002&DEV_1638&SUBSYS_16C21043
M:  48.23 ms: Memory: 512.00 MB
M:  48.53 ms: Desktop: 2560x1440 1.0
M:  48.55 ms: Screen 0: 2560x1440 0 0 eDP-1
M:  48.57 ms: Set fullscreen mode on 0 screen
M:  51.48 ms: Creating 2560x1440 Vulkan Window
M: 147.88 ms: Render Size: 2560x1440
M: 149.22 ms: Using Fetch Mode
M: 233.88 ms: Device: AMD Radeon RX 6800M (RADV NAVI22)
M: 233.95 ms: Vendor: AMD
M: 233.96 ms: Version: 26.0.99
M: 233.97 ms: DeviceID: 0x73df
M: 234.35 ms: Group Memory: 64.00 KB
M: 234.36 ms: Video Memory: 11.98 GB
M: 234.37 ms: Max Uniform Size: 4.00 GB
M: 234.38 ms: Max Storage Size: 4.00 GB
M: 234.38 ms: Creating SceneManager
M: 416.06 ms: Creating RenderManager
M: 547.17 ms: Ray Tracing Mode
M: 547.20 ms: Creating Scene
M:   1.481 s: Creating 200,000 Asteroids
M:   1.600 s: Updating Scene
M:   1.751 s: GravityMark 1.89 Vulkan RT is Ready in 1.7 s
M:   1.751 s: Starting 2560x1440 Vulkan RT Benchmark
M:   1.751 s: Count: 1
M:   1.752 s: Resizing 2560x1440 frame
M:   1.753 s: Build buffer 44.74 MB
radv/amdgpu: The CS has been cancelled because the context is lost. This
context is guilty of a hard recovery.
E:   4.151 s: VK::error(): device lost
E:   4.152 s: VKContext::Frame::submit(): can't submit command buffer
E:   4.152 s: VKContext::submit(): can't submit frame
E:   4.152 s: VKWindow::present(): can't submit context
E:   4.152 s: GravityMark::render(): can't present window
E:   9.347 s: VK::error(): device lost
E:   9.347 s: VKContext::Frame::wait(): can't wait for fence
E:   9.347 s: VKContext::finish(): can't wait frame
E:   9.347 s: VK::error(): device lost
E:   9.347 s: VKContext::Frame::wait(): can't wait for fence
E:   9.347 s: VKContext::finish(): can't wait frame
E:   9.347 s: VK::error(): device lost
E:   9.347 s: VKContext::Frame::wait(): can't wait for fence
E:   9.347 s: VKContext::finish(): can't wait frame
E:   9.347 s: VKWindow::finish(): can't finish context
M:   9.347 s: Clearing Scene
E:   9.575 s: VK::error(): device lost
E:   9.575 s: VKContext::Frame::wait(): can't wait for fence
E:   9.575 s: VKContext::finish(): can't wait frame
M:   9.575 s: Restore fullscreen mode on 0 screen
E:   9.583 s: VK::error(): device lost
E:   9.583 s: VKContext::Frame::wait(): can't wait for fence
E:   9.583 s: VKContext::finish(): can't wait frame
E:   9.583 s: VK::error(): device lost
E:   9.583 s: VKContext::Frame::wait(): can't wait for fence
E:   9.583 s: VKContext::finish(): can't wait frame


It's only the full screen and RT that seem to have issues

Dmesg:

Feb 15 21:16:13 axion.fireburn.co.uk kernel: [drm] PCIE GART of 512M
enabled (table at 0x00000082FEB00000).
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
PSP is resuming...
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
reserve 0xa00000 from 0x82fd000000 for PSP TMR
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
RAS: optional ras ta ucode is not available
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
SECUREDISPLAY: optional securedisplay ta ucode is not available
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
SMU is resuming...
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
smu driver if version = 0x0000000e, smu fw if version = 0x00000012, smu fw
program = 0, version = 0x00413f00 (65.63.0)
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
SMU driver if version not matched
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
Setting new power limit is not supported!
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
SMU is resumed successfully!
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
kiq ring mec 2 pipe 1 q 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
[drm] DMUB hardware initialized: version=0x02020021
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm]
Cannot find any crtc or sizes
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring gfx_0.0.0 uses VM inv eng 0 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring gfx_0.1.0 uses VM inv eng 1 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring comp_1.0.0 uses VM inv eng 4 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring comp_1.1.0 uses VM inv eng 5 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring comp_1.2.0 uses VM inv eng 6 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring comp_1.3.0 uses VM inv eng 7 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring comp_1.0.1 uses VM inv eng 8 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring comp_1.1.1 uses VM inv eng 9 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring comp_1.2.1 uses VM inv eng 10 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring comp_1.3.1 uses VM inv eng 11 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring kiq_0.2.1.0 uses VM inv eng 12 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring sdma0 uses VM inv eng 13 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring sdma1 uses VM inv eng 14 on hub 0
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring vcn_dec_0 uses VM inv eng 0 on hub 8
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring vcn_enc_0.0 uses VM inv eng 1 on hub 8
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring vcn_enc_0.1 uses VM inv eng 4 on hub 8
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring jpeg_dec uses VM inv eng 5 on hub 8
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm]
Cannot find any crtc or sizes
Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
Dumping IP State
Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
Dumping IP State Completed
Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
[drm] AMDGPU device coredump file has been created
Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
[drm] Check your /sys/class/drm/card0/device/devcoredump/data
Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
ring gfx_0.0.0 timeout, signaled seq=99, emitted seq=100
Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
 Process GravityMark.x64 pid 1794 thread GravityMark.x64 pid 1794
Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
Starting gfx_0.0.0 ring reset
Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
Ring gfx_0.0.0 reset succeeded
Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm]
device wedged, but recovered through reset

I got things working with this patch:

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c7f44422939f..5a3f02a26192 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4206,7 +4206,7 @@ static int
amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)

       /* By default timeout for all queues is 2 sec */
       adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
-               adev->video_timeout = msecs_to_jiffies(2000);
+               adev->video_timeout = msecs_to_jiffies(5000);

       if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
               return 0;

Reply via email to