Commit 43c4d576 authored by John Clements's avatar John Clements Committed by Alex Deucher
Browse files

drm/amdgpu: protect RAS sysfs during GPU reset



MMHub EDC becomes dirty after BACO reset

EDC registers should be cleared early on in reset phase

Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarJohn Clements <john.clements@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent cb7adfd6
Loading
Loading
Loading
Loading
+11 −1
Original line number Diff line number Diff line
@@ -2742,6 +2742,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

		if (adev->asic_reset_res)
			goto fail;

		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
			adev->mmhub.funcs->reset_ras_error_count(adev);
	} else {

		task_barrier_full(&hive->tb);
@@ -3910,8 +3913,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
		}
	}

	if (!r && amdgpu_ras_intr_triggered())
	if (!r && amdgpu_ras_intr_triggered()) {
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
			if (tmp_adev->mmhub.funcs &&
			    tmp_adev->mmhub.funcs->reset_ras_error_count)
				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
		}

		amdgpu_ras_intr_cleared();
	}

	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (need_full_reset) {
+9 −0
Original line number Diff line number Diff line
@@ -281,6 +281,11 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
	struct ras_debug_if data;
	int ret = 0;

	if (amdgpu_ras_intr_triggered()) {
		DRM_WARN("RAS WARN: error injection currently inaccessible\n");
		return size;
	}

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;
@@ -394,6 +399,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		.head = obj->head,
	};

	if (amdgpu_ras_intr_triggered())
		return snprintf(buf, PAGE_SIZE,
				"Query currently inaccessible\n");

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;