> 2021年9月6日 17:04,Christian König <[email protected]> 写道:
>
>
>
> Am 06.09.21 um 03:12 schrieb xinhui pan:
>> A long time ago, someone reported that the system hung during a memory test.
>> In recent days, I am trying to look for or understand the potential
>> deadlock in ttm/amdgpu code.
>>
>> This patchset aims to fix the deadlock during ttm populate.
>>
>> TTM has a parameter called pages_limit, when allocated GTT memory
>> reaches this limit, swapout would be triggered. As ttm_bo_swapout does
>> not return the correct retval, populate might hang.
>>
>> UVD ib test uses GTT which might be insufficient. So a gpu recovery
>> would hang if populate hangs.
>
> Ah, now I understand what you are trying to do.
>
> Problem is that won't work either. Allocating VRAM can easily land you inside
> the same deadlock.
>
> We need to avoid the allocation altogether for this to work correctly.
Looks like we need to reserve some pages at SW init.
>
>>
>> I have made one drm test which alloc two GTT BOs, submit gfx copy
>> commands and free these BOs without waiting fence. What's more, these
>> gfx copy commands will cause gfx ring hang. So gpu recovery would be
>> triggered.
>
> Mhm, that should never be possible. It is perfectly valid for an application
> to terminate without waiting for the GFX submission to be completed.
gfx ring hangs because the command is illegal.
the packet is COMMAND [30:21] | BYTE_COUNT [20:0]
I use 0xFF << 20 to hang the ring on purpose.
>
> Going to push patch #1 to drm-misc-fixes or drm-misc-next-fixes in a moment.
>
> Thanks,
> Christian.
>
>>
>> Now here is one possible deadlock case.
>> gpu_recovery
>> -> stop drm scheduler
>> -> asic reset
>> -> ib test
>> -> tt populate (uvd ib test)
>> -> ttm_bo_swapout (BO A) // this always fails as the fence of
>> BO A would not be signaled by the scheduler or HW. Hit deadlock.
>>
>> I paste the drm test patch below.
>> #modprobe ttm pages_limit=65536
>> #amdgpu_test -s 1 -t 4
>> ---
>> tests/amdgpu/basic_tests.c | 32 ++++++++++++++------------------
>> 1 file changed, 14 insertions(+), 18 deletions(-)
>>
>> diff --git a/tests/amdgpu/basic_tests.c b/tests/amdgpu/basic_tests.c
>> index dbf02fee..f85ed340 100644
>> --- a/tests/amdgpu/basic_tests.c
>> +++ b/tests/amdgpu/basic_tests.c
>> @@ -65,13 +65,16 @@ static void amdgpu_direct_gma_test(void);
>> static void amdgpu_command_submission_write_linear_helper(unsigned ip_type);
>> static void amdgpu_command_submission_const_fill_helper(unsigned ip_type);
>> static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type);
>> -static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>> +static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle
>> context_handle,
>> unsigned ip_type,
>> int instance, int pm4_dw, uint32_t
>> *pm4_src,
>> int res_cnt, amdgpu_bo_handle *resources,
>> struct amdgpu_cs_ib_info *ib_info,
>> - struct amdgpu_cs_request *ibs_request);
>> + struct amdgpu_cs_request *ibs_request,
>> int sync, int repeat);
>> +#define amdgpu_test_exec_cs_helper(...) \
>> + _amdgpu_test_exec_cs_helper(__VA_ARGS__, 1, 1)
>> +
>> CU_TestInfo basic_tests[] = {
>> { "Query Info Test", amdgpu_query_info_test },
>> { "Userptr Test", amdgpu_userptr_test },
>> @@ -1341,12 +1344,12 @@ static void amdgpu_command_submission_compute(void)
>> * pm4_src, resources, ib_info, and ibs_request
>> * submit command stream described in ibs_request and wait for this IB
>> accomplished
>> */
>> -static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>> +static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle
>> context_handle,
>> unsigned ip_type,
>> int instance, int pm4_dw, uint32_t
>> *pm4_src,
>> int res_cnt, amdgpu_bo_handle *resources,
>> struct amdgpu_cs_ib_info *ib_info,
>> - struct amdgpu_cs_request *ibs_request)
>> + struct amdgpu_cs_request *ibs_request,
>> int sync, int repeat)
>> {
>> int r;
>> uint32_t expired;
>> @@ -1395,12 +1398,15 @@ static void
>> amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>> CU_ASSERT_NOT_EQUAL(ibs_request, NULL);
>> /* submit CS */
>> - r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
>> + while (repeat--)
>> + r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
>> CU_ASSERT_EQUAL(r, 0);
>> r = amdgpu_bo_list_destroy(ibs_request->resources);
>> CU_ASSERT_EQUAL(r, 0);
>> + if (!sync)
>> + return;
>> fence_status.ip_type = ip_type;
>> fence_status.ip_instance = 0;
>> fence_status.ring = ibs_request->ring;
>> @@ -1667,7 +1673,7 @@ static void
>> amdgpu_command_submission_sdma_const_fill(void)
>> static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>> {
>> - const int sdma_write_length = 1024;
>> + const int sdma_write_length = (255) << 20;
>> const int pm4_dw = 256;
>> amdgpu_context_handle context_handle;
>> amdgpu_bo_handle bo1, bo2;
>> @@ -1715,8 +1721,6 @@ static void
>> amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>> &bo1_va_handle);
>> CU_ASSERT_EQUAL(r, 0);
>> - /* set bo1 */
>> - memset((void*)bo1_cpu, 0xaa, sdma_write_length);
>> /* allocate UC bo2 for sDMA use */
>> r = amdgpu_bo_alloc_and_map(device_handle,
>> @@ -1727,8 +1731,6 @@ static void
>> amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>> &bo2_va_handle);
>> CU_ASSERT_EQUAL(r, 0);
>> - /* clear bo2 */
>> - memset((void*)bo2_cpu, 0, sdma_write_length);
>> resources[0] = bo1;
>> resources[1] = bo2;
>> @@ -1785,17 +1787,11 @@ static void
>> amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>> }
>> }
>> - amdgpu_test_exec_cs_helper(context_handle,
>> + _amdgpu_test_exec_cs_helper(context_handle,
>> ip_type, ring_id,
>> i, pm4,
>> 2, resources,
>> - ib_info,
>> ibs_request);
>> -
>> - /* verify if SDMA test result meets with
>> expected */
>> - i = 0;
>> - while(i < sdma_write_length) {
>> - CU_ASSERT_EQUAL(bo2_cpu[i++], 0xaa);
>> - }
>> + ib_info,
>> ibs_request, 0, 100);
>> r = amdgpu_bo_unmap_and_free(bo1,
>> bo1_va_handle, bo1_mc,
>> sdma_write_length);
>> CU_ASSERT_EQUAL(r, 0);
>