Commit 61380faa authored by John Clements, committed by Alex Deucher
Browse files

drm/amdgpu: disable ras query and inject during gpu reset



Added a flag to the RAS context to indicate whether RAS query functionality is ready.

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 66399248
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -4168,6 +4168,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	need_full_reset = job_signaled = false;
	INIT_LIST_HEAD(&device_list);

	amdgpu_ras_set_error_query_ready(adev, false);

	dev_info(adev->dev, "GPU %s begin!\n",
		(in_ras_intr && !use_baco) ? "jobs stop":"reset");

@@ -4224,6 +4226,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (tmp_adev != adev) {
			amdgpu_ras_set_error_query_ready(tmp_adev, false);
			amdgpu_device_lock_adev(tmp_adev, false);
			if (!amdgpu_sriov_vf(tmp_adev))
			                amdgpu_amdkfd_pre_reset(tmp_adev);
+21 −3
Original line number Diff line number Diff line
@@ -80,6 +80,20 @@ atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);

void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
{
	if (adev)
		amdgpu_ras_get_context(adev)->error_query_ready = ready;
}

bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
{
	if (adev)
		return amdgpu_ras_get_context(adev)->error_query_ready;

	return false;
}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
@@ -281,7 +295,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
	struct ras_debug_if data;
	int ret = 0;

	if (amdgpu_ras_intr_triggered()) {
	if (!amdgpu_ras_get_error_query_ready(adev)) {
		DRM_WARN("RAS WARN: error injection currently inaccessible\n");
		return size;
	}
@@ -399,7 +413,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		.head = obj->head,
	};

	if (amdgpu_ras_intr_triggered())
	if (!amdgpu_ras_get_error_query_ready(obj->adev))
		return snprintf(buf, PAGE_SIZE,
				"Query currently inaccessible\n");

@@ -1886,8 +1900,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
	}

	/* in resume phase, no need to create ras fs node */
	if (adev->in_suspend || adev->in_gpu_reset)
	if (adev->in_suspend || adev->in_gpu_reset) {
		amdgpu_ras_set_error_query_ready(adev, true);
		return 0;
	}

	if (ih_info->cb) {
		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
@@ -1899,6 +1915,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
	if (r)
		goto sysfs;

	amdgpu_ras_set_error_query_ready(adev, true);

	return 0;
cleanup:
	amdgpu_ras_sysfs_remove(adev, ras_block);
+4 −0
Original line number Diff line number Diff line
@@ -334,6 +334,8 @@ struct amdgpu_ras {
	uint32_t flags;
	bool reboot;
	struct amdgpu_ras_eeprom_control eeprom_control;

	bool error_query_ready;
};

struct ras_fs_data {
@@ -629,4 +631,6 @@ static inline void amdgpu_ras_intr_cleared(void)

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);

void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);

#endif