Commit 83b0582c authored by Dennis Li, committed by Alex Deucher
Browse files

drm/amdgpu: support gfx ras error injection and err_cnt query



Check the gfx error count in both the ras query function and
the ras interrupt handler.

gfx ras is still disabled by default due to a known stability
issue found in gpu reset.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 2c960ea0
Loading
Loading
Loading
Loading
+16 −3
Original line number Diff line number Diff line
@@ -602,6 +602,10 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
		if (adev->umc.funcs->query_ras_error_count)
			adev->umc.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_count)
			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
		break;
	default:
		break;
	}
@@ -639,13 +643,22 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
	if (!obj)
		return -EINVAL;

	if (block_info.block_id != TA_RAS_BLOCK__UMC) {
	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->ras_error_inject)
			ret = adev->gfx.funcs->ras_error_inject(adev, info);
		else
			ret = -EINVAL;
		break;
	case AMDGPU_RAS_BLOCK__UMC:
		ret = psp_ras_trigger_error(&adev->psp, &block_info);
		break;
	default:
		DRM_INFO("%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
		return -EINVAL;
		ret = -EINVAL;
	}

	ret = psp_ras_trigger_error(&adev->psp, &block_info);
	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
+2 −0
Original line number Diff line number Diff line
@@ -5611,6 +5611,8 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
{
	/* TODO ue will trigger an interrupt. */
	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	if (adev->gfx.funcs->query_ras_error_count)
		adev->gfx.funcs->query_ras_error_count(adev, err_data);
	amdgpu_ras_reset_gpu(adev, 0);
	return AMDGPU_RAS_UE;
}