Commit 279375c3 authored by Hawking Zhang, committed by Alex Deucher
Browse files

drm/amdgpu: add reset_ras_error_count function for GFX



GFX RAS error counters are dirty after a cold reboot.
A read operation is needed to reset them to 0.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent fe5211f1
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -206,6 +206,7 @@ struct amdgpu_gfx_funcs {
				 u32 queue, u32 vmid);
	int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
	int (*query_ras_error_count) (struct amdgpu_device *adev, void *ras_error_status);
	void (*reset_ras_error_count) (struct amdgpu_device *adev);
};

struct sq_work {
+10 −16
Original line number Diff line number Diff line
@@ -738,9 +738,9 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
					  void *ras_error_status);
static void gfx_v9_0_clear_ras_edc_counter(struct amdgpu_device *adev);
static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
				     void *inject_if);
static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev);

static void gfx_v9_0_kiq_set_resources(struct amdgpu_ring *kiq_ring,
				uint64_t queue_mask)
@@ -1997,7 +1997,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
	.read_wave_vgprs = &gfx_v9_0_read_wave_vgprs,
	.select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
	.ras_error_inject = &gfx_v9_0_ras_error_inject,
	.query_ras_error_count = &gfx_v9_0_query_ras_error_count
	.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
	.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
};

static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = {
@@ -2008,7 +2009,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = {
	.read_wave_vgprs = &gfx_v9_0_read_wave_vgprs,
	.select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
	.ras_error_inject = &gfx_v9_4_ras_error_inject,
	.query_ras_error_count = &gfx_v9_4_query_ras_error_count
	.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
	.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
};

static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -4395,18 +4397,6 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
		goto fail;
	}

	switch (adev->asic_type)
	{
	case CHIP_VEGA20:
		gfx_v9_0_clear_ras_edc_counter(adev);
		break;
	case CHIP_ARCTURUS:
		gfx_v9_4_clear_ras_edc_counter(adev);
		break;
	default:
		break;
	}

fail:
	amdgpu_ib_free(adev, &ib, NULL);
	dma_fence_put(f);
@@ -4454,6 +4444,10 @@ static int gfx_v9_0_ecc_late_init(void *handle)
	if (r)
		return r;

	if (adev->gfx.funcs &&
	    adev->gfx.funcs->reset_ras_error_count)
		adev->gfx.funcs->reset_ras_error_count(adev);

	r = amdgpu_gfx_ras_late_init(adev);
	if (r)
		return r;
@@ -6388,7 +6382,7 @@ static int gfx_v9_0_ras_error_count(const struct soc15_reg_entry *reg,
	return 0;
}

static void gfx_v9_0_clear_ras_edc_counter(struct amdgpu_device *adev)
static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev)
{
	int i, j, k;

+1 −1
Original line number Diff line number Diff line
@@ -893,7 +893,7 @@ int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
	return 0;
}

void gfx_v9_4_clear_ras_edc_counter(struct amdgpu_device *adev)
void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
{
	int i, j, k;

+2 −0
Original line number Diff line number Diff line
@@ -32,4 +32,6 @@ int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
				     void *inject_if);

void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev);

#endif /* __GFX_V9_4_H__ */