drm/amdgpu: Fix amdgpu_device_reset_sriov retry logic
The retry loop for SRIOV reset has refcount and memory leak issues. Depending on which function call fails, it can potentially call amdgpu_amdkfd_pre/post_reset a different number of times, which causes the kfd_locked count to be wrong. This will block all future attempts at opening /dev/kfd. The retry loop also leaks resources by calling amdgpu_virt_init_data_exchange multiple times without calling the corresponding fini function. Align with the bare-metal reset path, which doesn't have these issues. This means taking the amdgpu_amdkfd_pre/post_reset functions out of the reset loop and calling amdgpu_device_pre_asic_reset on each retry, which properly frees the resources from the previous try by calling amdgpu_virt_fini_data_exchange. Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com> Reviewed-by: Emily Deng <Emily.Deng@amd.com> Reviewed-by: Zhigang Luo <zhigang.luo@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
a5b843269a
commit
6e4aa08fa9
@@ -5065,10 +5065,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
|
||||
{
|
||||
int r;
|
||||
struct amdgpu_hive_info *hive = NULL;
|
||||
int retry_limit = 0;
|
||||
|
||||
retry:
|
||||
amdgpu_amdkfd_pre_reset(adev);
|
||||
|
||||
if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
|
||||
clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
|
||||
@@ -5088,7 +5084,7 @@ retry:
|
||||
/* Resume IP prior to SMC */
|
||||
r = amdgpu_device_ip_reinit_early_sriov(adev);
|
||||
if (r)
|
||||
goto error;
|
||||
return r;
|
||||
|
||||
amdgpu_virt_init_data_exchange(adev);
|
||||
|
||||
@@ -5099,38 +5095,35 @@ retry:
|
||||
/* now we are okay to resume SMC/CP/SDMA */
|
||||
r = amdgpu_device_ip_reinit_late_sriov(adev);
|
||||
if (r)
|
||||
goto error;
|
||||
return r;
|
||||
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
/* Update PSP FW topology after reset */
|
||||
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
|
||||
r = amdgpu_xgmi_update_topology(hive, adev);
|
||||
|
||||
if (hive)
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
if (!r) {
|
||||
r = amdgpu_ib_ring_tests(adev);
|
||||
r = amdgpu_ib_ring_tests(adev);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
amdgpu_amdkfd_post_reset(adev);
|
||||
}
|
||||
|
||||
error:
|
||||
if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
|
||||
if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
|
||||
amdgpu_inc_vram_lost(adev);
|
||||
r = amdgpu_device_recover_vram(adev);
|
||||
}
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
/* need to be called during full access so we can't do it later like
|
||||
* bare-metal does.
|
||||
*/
|
||||
amdgpu_amdkfd_post_reset(adev);
|
||||
amdgpu_virt_release_full_gpu(adev, true);
|
||||
|
||||
if (AMDGPU_RETRY_SRIOV_RESET(r)) {
|
||||
if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
|
||||
retry_limit++;
|
||||
goto retry;
|
||||
} else
|
||||
DRM_ERROR("GPU reset retry is beyond the retry limit\n");
|
||||
}
|
||||
|
||||
return r;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -5689,6 +5682,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
int i, r = 0;
|
||||
bool need_emergency_restart = false;
|
||||
bool audio_suspended = false;
|
||||
int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
|
||||
|
||||
/*
|
||||
* Special case: RAS triggered and full reset isn't supported
|
||||
@@ -5770,8 +5764,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
|
||||
cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
|
||||
|
||||
if (!amdgpu_sriov_vf(tmp_adev))
|
||||
amdgpu_amdkfd_pre_reset(tmp_adev);
|
||||
amdgpu_amdkfd_pre_reset(tmp_adev);
|
||||
|
||||
/*
|
||||
* Mark these ASICs to be reseted as untracked first
|
||||
@@ -5830,6 +5823,10 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
||||
/* Host driver will handle XGMI hive reset for SRIOV */
|
||||
if (amdgpu_sriov_vf(adev)) {
|
||||
r = amdgpu_device_reset_sriov(adev, reset_context);
|
||||
if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
|
||||
amdgpu_virt_release_full_gpu(adev, true);
|
||||
goto retry;
|
||||
}
|
||||
if (r)
|
||||
adev->asic_reset_res = r;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user