Commit 1a6fc071 authored by Tao Zhou's avatar Tao Zhou Committed by Alex Deucher
Browse files

drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place



RAS recovery_init should be called after TTM init, and bad page
reservation should be done before GPU reset, since i2c may be
unstable during GPU reset.
Also add error cleanup to recovery_init and a guard to recovery_fini
for the case where recovery_init failed.

v2: add more comments and a warning print.
    remove cancel_work_sync from recovery_init.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 87d2b92f
Loading
Loading
Loading
Loading
+0 −5
Original line number Diff line number Diff line
@@ -3630,11 +3630,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
						break;
				}
			}

			list_for_each_entry(tmp_adev, device_list_handle,
					gmc.xgmi.head) {
				amdgpu_ras_reserve_bad_pages(tmp_adev);
			}
		}
	}

+26 −13
Original line number Diff line number Diff line
@@ -1493,16 +1493,17 @@ out:
	return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;
	int ret;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
	if (!*data)
		return -ENOMEM;
	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
	if (!*data) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
@@ -1511,18 +1512,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)

	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
	if (ret)
		return ret;
		goto free;

	if (adev->psp.ras.ras->eeprom_control.num_recs) {
		ret = amdgpu_ras_load_bad_pages(adev);
		if (ret)
			return ret;
			goto free;
		ret = amdgpu_ras_reserve_bad_pages(adev);
		if (ret)
			return ret;
			goto release;
	}

	return 0;

release:
	amdgpu_ras_release_bad_pages(adev);
free:
	con->eh_data = NULL;
	kfree((*data)->bps);
	kfree((*data)->bps_bo);
	kfree(*data);
out:
	DRM_WARN("Failed to initialize ras recovery!\n");

	return ret;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
@@ -1530,12 +1543,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	/* recovery_init failed to init it, fini is useless */
	if (!data)
		return 0;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data->bps_bo);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

@@ -1627,9 +1645,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
			return r;
	}

	if (amdgpu_ras_recovery_init(adev))
		goto recovery_out;

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
@@ -1644,8 +1659,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_recovery_fini(adev);
recovery_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

+5 −0
Original line number Diff line number Diff line
@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
	return ras && (ras->supported & (1 << block));
}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
		unsigned int block);

@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	/* save bad pages to eeprom before gpu reset, since
	 * i2c may be unstable during gpu reset
	 */
	amdgpu_ras_reserve_bad_pages(adev);
	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
		schedule_work(&ras->recovery_work);
	return 0;
+12 −0
Original line number Diff line number Diff line
@@ -54,6 +54,7 @@
#include "amdgpu_trace.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_sdma.h"
#include "amdgpu_ras.h"
#include "bif/bif_4_1_d.h"

static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
@@ -1777,6 +1778,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
						adev->gmc.visible_vram_size);
#endif

	/*
	 * Retired pages are loaded from eeprom and reserved here.
	 * This must be called after ttm init since new bos may be created.
	 * recovery_init may fail, but it frees all resources it allocated
	 * itself, and its failure should not stop the amdgpu init process.
	 *
	 * Note: theoretically, this should be called before all vram
	 * allocations so that retired pages are never handed out for use.
	 */
	amdgpu_ras_recovery_init(adev);

	/*
	 *The reserved vram for firmware must be pinned to the specified
	 *place on the VRAM, so reserve it early.