On Thu, 25 Sept 2025 at 14:03, Christian König < [email protected]> wrote:
> There have been multiple complaints that 10 seconds is usually too long. > > The original requirement for a longer timeout came from compute tests on > AMDVLK; since that is no longer a topic, reduce the timeout back to 2 > seconds for all queues. > > While at it also remove any special handling for compute queues under > SRIOV or pass through. > > Signed-off-by: Christian König <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 85 ++++++++++------------ > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 21 ++---- > 2 files changed, 48 insertions(+), 58 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index a77000c2e0bb..ceb3c616292c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -4278,58 +4278,53 @@ static int > amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) > long timeout; > int ret = 0; > > - /* > - * By default timeout for jobs is 10 sec > - */ > - adev->compute_timeout = adev->gfx_timeout = > msecs_to_jiffies(10000); > - adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; > + /* By default timeout for all queues is 2 sec */ > + adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = > + adev->video_timeout = msecs_to_jiffies(2000); > > - if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { > - while ((timeout_setting = strsep(&input, ",")) && > - strnlen(timeout_setting, > AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { > - ret = kstrtol(timeout_setting, 0, &timeout); > - if (ret) > - return ret; > + if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) > + return 0; > > - if (timeout == 0) { > - index++; > - continue; > - } else if (timeout < 0) { > - timeout = MAX_SCHEDULE_TIMEOUT; > - dev_warn(adev->dev, "lockup timeout > disabled"); > - add_taint(TAINT_SOFTLOCKUP, > LOCKDEP_STILL_OK); > - } else { > - timeout = msecs_to_jiffies(timeout); > - } > + while ((timeout_setting = 
strsep(&input, ",")) && > + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { > + ret = kstrtol(timeout_setting, 0, &timeout); > + if (ret) > + return ret; > > - switch (index++) { > - case 0: > - adev->gfx_timeout = timeout; > - break; > - case 1: > - adev->compute_timeout = timeout; > - break; > - case 2: > - adev->sdma_timeout = timeout; > - break; > - case 3: > - adev->video_timeout = timeout; > - break; > - default: > - break; > - } > + if (timeout == 0) { > + index++; > + continue; > + } else if (timeout < 0) { > + timeout = MAX_SCHEDULE_TIMEOUT; > + dev_warn(adev->dev, "lockup timeout disabled"); > + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); > + } else { > + timeout = msecs_to_jiffies(timeout); > } > - /* > - * There is only one value specified and > - * it should apply to all non-compute jobs. > - */ > - if (index == 1) { > - adev->sdma_timeout = adev->video_timeout = > adev->gfx_timeout; > - if (amdgpu_sriov_vf(adev) || > amdgpu_passthrough(adev)) > - adev->compute_timeout = adev->gfx_timeout; > + > + switch (index++) { > + case 0: > + adev->gfx_timeout = timeout; > + break; > + case 1: > + adev->compute_timeout = timeout; > + break; > + case 2: > + adev->sdma_timeout = timeout; > + break; > + case 3: > + adev->video_timeout = timeout; > + break; > + default: > + break; > } > } > > + /* When only one value specified apply it to all queues. */ > + if (index == 1) > + adev->gfx_timeout = adev->compute_timeout = > adev->sdma_timeout = > + adev->video_timeout = timeout; > + > return ret; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index ece251cbe8c3..fe45dd1d979e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -353,22 +353,17 @@ module_param_named(svm_default_granularity, > amdgpu_svm_default_granularity, uint > * DOC: lockup_timeout (string) > * Set GPU scheduler timeout value in ms. 
> * > - * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. That is > there can be one or > - * multiple values specified. 0 and negative values are invalidated. They > will be adjusted > - * to the default timeout. > + * The format can be [single value] for setting all timeouts at once or > + * [GFX,Compute,SDMA,Video] to set individual timeouts. > + * Negative values mean infinity. > * > - * - With one value specified, the setting will apply to all non-compute > jobs. > - * - With multiple values specified, the first one will be for GFX. > - * The second one is for Compute. The third and fourth ones are > - * for SDMA and Video. > - * > - * By default(with no lockup_timeout settings), the timeout for all jobs > is 10000. > + * By default(with no lockup_timeout settings), the timeout for all > queues is 2000. > */ > MODULE_PARM_DESC(lockup_timeout, > - "GPU lockup timeout in ms (default: 10000 for all jobs. " > - "0: keep default value. negative: infinity timeout), > format: for bare metal [Non-Compute] or [GFX,Compute,SDMA,Video]; " > - "for passthrough or sriov [all jobs] or > [GFX,Compute,SDMA,Video]."); > -module_param_string(lockup_timeout, amdgpu_lockup_timeout, > sizeof(amdgpu_lockup_timeout), 0444); > + "GPU lockup timeout in ms (default: 2000 for all queues. " > + "0: keep default value. 
negative: infinity timeout), > format: [single value for all] or [GFX,Compute,SDMA,Video]."); > +module_param_string(lockup_timeout, amdgpu_lockup_timeout, > + sizeof(amdgpu_lockup_timeout), 0444); > > /** > * DOC: dpm (int) > -- > 2.43.0 > > Hi This patch is causing issues with running: ~/GravityMark_1.89_linux $ DRI_PRIME=1 ./run_fullscreen_vk_rt.sh M: 0 us: ../data.zip: 313 files M: 15.19 ms: Temporal antialiasing M: 15.21 ms: Fullscreen mode M: 15.22 ms: Render Statistics M: 20.77 ms: Build Date: Jun 20 2025 M: 20.80 ms: Build Info: version=20250429; linux; x64; release; vk=1; gl=45; gles=32; cu=1; fusion M: 20.81 ms: Build Version: 1.89 M: 48.06 ms: Name: ASUSTeK COMPUTER INC. G513QY ROG Strix G513QY_G513QY M: 48.09 ms: System: 'Gentoo Linux' M: 48.10 ms: Kernel: Linux 6.19.0-rc7-drm+ x86_64 M: 48.11 ms: Memory: 62.19 GB M: 48.13 ms: Uptime: 19.00 s M: 48.15 ms: CPU: AMD Ryzen 9 5900HX with Radeon Graphics M: 48.17 ms: GPU 0: [AMD/ATI] Navi 22 [Radeon RX 6700/6700 XT/6750 XT / 6800M/6850M XT] (rev c3) M: 48.18 ms: Device: VEN_1002&DEV_73DF&SUBSYS_16C21043 M: 48.19 ms: Memory: 11.98 GB M: 48.21 ms: GPU 1: [AMD/ATI] Cezanne [Radeon Vega Series / Radeon Vega Mobile Series] (rev c4) M: 48.22 ms: Device: VEN_1002&DEV_1638&SUBSYS_16C21043 M: 48.23 ms: Memory: 512.00 MB M: 48.53 ms: Desktop: 2560x1440 1.0 M: 48.55 ms: Screen 0: 2560x1440 0 0 eDP-1 M: 48.57 ms: Set fullscreen mode on 0 screen M: 51.48 ms: Creating 2560x1440 Vulkan Window M: 147.88 ms: Render Size: 2560x1440 M: 149.22 ms: Using Fetch Mode M: 233.88 ms: Device: AMD Radeon RX 6800M (RADV NAVI22) M: 233.95 ms: Vendor: AMD M: 233.96 ms: Version: 26.0.99 M: 233.97 ms: DeviceID: 0x73df M: 234.35 ms: Group Memory: 64.00 KB M: 234.36 ms: Video Memory: 11.98 GB M: 234.37 ms: Max Uniform Size: 4.00 GB M: 234.38 ms: Max Storage Size: 4.00 GB M: 234.38 ms: Creating SceneManager M: 416.06 ms: Creating RenderManager M: 547.17 ms: Ray Tracing Mode M: 547.20 ms: Creating Scene M: 1.481 s: Creating 200,000 Asteroids M: 
1.600 s: Updating Scene M: 1.751 s: GravityMark 1.89 Vulkan RT is Ready in 1.7 s M: 1.751 s: Starting 2560x1440 Vulkan RT Benchmark M: 1.751 s: Count: 1 M: 1.752 s: Resizing 2560x1440 frame M: 1.753 s: Build buffer 44.74 MB radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a hard recovery. E: 4.151 s: VK::error(): device lost E: 4.152 s: VKContext::Frame::submit(): can't submit command buffer E: 4.152 s: VKContext::submit(): can't submit frame E: 4.152 s: VKWindow::present(): can't submit context E: 4.152 s: GravityMark::render(): can't present window E: 9.347 s: VK::error(): device lost E: 9.347 s: VKContext::Frame::wait(): can't wait for fence E: 9.347 s: VKContext::finish(): can't wait frame E: 9.347 s: VK::error(): device lost E: 9.347 s: VKContext::Frame::wait(): can't wait for fence E: 9.347 s: VKContext::finish(): can't wait frame E: 9.347 s: VK::error(): device lost E: 9.347 s: VKContext::Frame::wait(): can't wait for fence E: 9.347 s: VKContext::finish(): can't wait frame E: 9.347 s: VKWindow::finish(): can't finish context M: 9.347 s: Clearing Scene E: 9.575 s: VK::error(): device lost E: 9.575 s: VKContext::Frame::wait(): can't wait for fence E: 9.575 s: VKContext::finish(): can't wait frame M: 9.575 s: Restore fullscreen mode on 0 screen E: 9.583 s: VK::error(): device lost E: 9.583 s: VKContext::Frame::wait(): can't wait for fence E: 9.583 s: VKContext::finish(): can't wait frame E: 9.583 s: VK::error(): device lost E: 9.583 s: VKContext::Frame::wait(): can't wait for fence E: 9.583 s: VKContext::finish(): can't wait frame It's only the full screen and RT that seem to have issues Dmesg: Feb 15 21:16:13 axion.fireburn.co.uk kernel: [drm] PCIE GART of 512M enabled (table at 0x00000082FEB00000). Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: PSP is resuming... 
Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: reserve 0xa00000 from 0x82fd000000 for PSP TMR Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: RAS: optional ras ta ucode is not available Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: SECUREDISPLAY: optional securedisplay ta ucode is not available Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: SMU is resuming... Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: smu driver if version = 0x0000000e, smu fw if version = 0x00000012, smu fw program = 0, version = 0x00413f00 (65.63.0) Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: SMU driver if version not matched Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: Setting new power limit is not supported! Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: SMU is resumed successfully! Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: kiq ring mec 2 pipe 1 q 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: [drm] DMUB hardware initialized: version=0x02020021 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm] Cannot find any crtc or sizes Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring gfx_0.0.0 uses VM inv eng 0 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring gfx_0.1.0 uses VM inv eng 1 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring comp_1.0.0 uses VM inv eng 4 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring comp_1.1.0 uses VM inv eng 5 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring comp_1.2.0 uses VM inv eng 6 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring comp_1.3.0 uses VM 
inv eng 7 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring comp_1.0.1 uses VM inv eng 8 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring comp_1.1.1 uses VM inv eng 9 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring comp_1.2.1 uses VM inv eng 10 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring comp_1.3.1 uses VM inv eng 11 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring kiq_0.2.1.0 uses VM inv eng 12 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring sdma0 uses VM inv eng 13 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring sdma1 uses VM inv eng 14 on hub 0 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring vcn_dec_0 uses VM inv eng 0 on hub 8 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring vcn_enc_0.0 uses VM inv eng 1 on hub 8 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring vcn_enc_0.1 uses VM inv eng 4 on hub 8 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring jpeg_dec uses VM inv eng 5 on hub 8 Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm] Cannot find any crtc or sizes Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: Dumping IP State Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: Dumping IP State Completed Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: [drm] AMDGPU device coredump file has been created Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: [drm] Check your /sys/class/drm/card0/device/devcoredump/data Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: ring gfx_0.0.0 timeout, signaled seq=99, emitted seq=100 Feb 15 
21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: Process GravityMark.x64 pid 1794 thread GravityMark.x64 pid 1794 Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: Starting gfx_0.0.0 ring reset Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: Ring gfx_0.0.0 reset succeeded Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm] device wedged, but recovered through reset I got things working with this patch: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c7f44422939f..5a3f02a26192 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4206,7 +4206,7 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) /* By default timeout for all queues is 2 sec */ adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = - adev->video_timeout = msecs_to_jiffies(2000); + adev->video_timeout = msecs_to_jiffies(5000); if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) return 0;
