Commit 89041940 authored by Gavin Wan's avatar Gavin Wan Committed by Alex Deucher
Browse files

drm/amdgpu: Support passing amdgpu critical error to host via GPU Mailbox.



This feature works for SRIOV enviroment. For non-SRIOV enviroment, the
trans_error function does nothing.

The error information includes error_code (16bit), error_flags(16bit)
and error_data(64bit). Since there are not many errors, we keep the
errors in an array and transfer all errors to Host before amdgpu
initialization function (amdgpu_device_init) exit.

Signed-off-by: default avatarGavin Wan <Gavin.Wan@amd.com>
Reviewed-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 8e1b90cc
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -25,7 +25,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
	amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \
	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
	amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
	amdgpu_queue_mgr.o
	amdgpu_queue_mgr.o amdgpu_vf_error.o

# add asic specific block
amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
+18 −3
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
@@ -2134,6 +2135,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	r = amdgpu_atombios_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_atombios_init failed\n");
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
		goto failed;
	}

@@ -2144,6 +2146,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	if (amdgpu_vpost_needed(adev)) {
		if (!adev->bios) {
			dev_err(adev->dev, "no vBIOS found\n");
			amdgpu_vf_error_put(AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
			r = -EINVAL;
			goto failed;
		}
@@ -2151,6 +2154,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
		if (r) {
			dev_err(adev->dev, "gpu post error!\n");
			amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0);
			goto failed;
		}
	} else {
@@ -2162,7 +2166,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
		r = amdgpu_atombios_get_clock_info(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
			return r;
			amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
			goto failed;
		}
		/* init i2c buses */
		amdgpu_atombios_i2c_init(adev);
@@ -2172,6 +2177,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	r = amdgpu_fence_driver_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
		goto failed;
	}

@@ -2181,6 +2187,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	r = amdgpu_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_init failed\n");
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
		amdgpu_fini(adev);
		goto failed;
	}
@@ -2200,6 +2207,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto failed;
	}

@@ -2244,12 +2252,14 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	r = amdgpu_late_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_late_init failed\n");
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
		goto failed;
	}

	return 0;

failed:
	amdgpu_vf_error_trans_all(adev);
	if (runtime)
		vga_switcheroo_fini_domain_pm_ops(adev->dev);
	return r;
@@ -2937,6 +2947,7 @@ out:
		}
	} else {
		dev_err(adev->dev, "asic resume failed (%d).\n", r);
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r);
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			if (adev->rings[i] && adev->rings[i]->sched.thread) {
				kthread_unpark(adev->rings[i]->sched.thread);
@@ -2947,12 +2958,16 @@ out:
	drm_helper_resume_force_mode(adev->ddev);

	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
	if (r)
	if (r) {
		/* bad news, how to tell it to userspace ? */
		dev_info(adev->dev, "GPU reset failed\n");
	else
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
	}
	else {
		dev_info(adev->dev, "GPU reset successed!\n");
	}

	amdgpu_vf_error_trans_all(adev);
	return r;
}

+85 −0
Original line number Diff line number Diff line
/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "amdgpu.h"
#include "amdgpu_vf_error.h"
#include "mxgpu_ai.h"

#define AMDGPU_VF_ERROR_ENTRY_SIZE    16 

/* struct error_entry - amdgpu VF error information. */
struct amdgpu_vf_error_buffer {
	int read_count;
	int write_count;
	uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE];
	uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE];
	uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE];
};

struct amdgpu_vf_error_buffer admgpu_vf_errors;


void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data)
{
	int index;
	uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code);

	index = admgpu_vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
	admgpu_vf_errors.code [index] = error_code;
	admgpu_vf_errors.flags [index] = error_flags;
	admgpu_vf_errors.data [index] = error_data;
	admgpu_vf_errors.write_count ++;
}


void amdgpu_vf_error_trans_all(struct amdgpu_device *adev)
{
	/* u32 pf2vf_flags = 0; */
	u32 data1, data2, data3;
	int index;

	if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) {
		return;
	}
/*
 	TODO: Enable these code when pv2vf_info is merged
	AMDGPU_FW_VRAM_PF2VF_READ (adev, feature_flags, &pf2vf_flags);
	if (!(pf2vf_flags & AMDGIM_FEATURE_ERROR_LOG_COLLECT)) {
		return;
	}
*/
	/* The errors are overlay of array, correct read_count as full. */
	if (admgpu_vf_errors.write_count - admgpu_vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) {
		admgpu_vf_errors.read_count = admgpu_vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE;
	}

	while (admgpu_vf_errors.read_count < admgpu_vf_errors.write_count) {
		index =admgpu_vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
		data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX (admgpu_vf_errors.code[index], admgpu_vf_errors.flags[index]);
		data2 = admgpu_vf_errors.data[index] & 0xFFFFFFFF;
		data3 = (admgpu_vf_errors.data[index] >> 32) & 0xFFFFFFFF;

		adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3);
		admgpu_vf_errors.read_count ++;
	}
}
+62 −0
Original line number Diff line number Diff line
/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#ifndef __VF_ERROR_H__
#define __VF_ERROR_H__

#define AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX(c,f)    (((c & 0xFFFF) << 16) | (f & 0xFFFF))
#define AMDGIM_ERROR_CODE(t,c)       (((t&0xF)<<12)|(c&0xFFF))

/* Please keep enum same as AMD GIM driver */
enum AMDGIM_ERROR_VF {
	AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL = 0,
	AMDGIM_ERROR_VF_NO_VBIOS,
	AMDGIM_ERROR_VF_GPU_POST_ERROR,
	AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL,
	AMDGIM_ERROR_VF_FENCE_INIT_FAIL,

	AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL,
	AMDGIM_ERROR_VF_IB_INIT_FAIL,
	AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL,
	AMDGIM_ERROR_VF_ASIC_RESUME_FAIL,
	AMDGIM_ERROR_VF_GPU_RESET_FAIL,

	AMDGIM_ERROR_VF_TEST,
	AMDGIM_ERROR_VF_MAX
};

enum AMDGIM_ERROR_CATEGORY {
	AMDGIM_ERROR_CATEGORY_NON_USED = 0,
	AMDGIM_ERROR_CATEGORY_GIM,
	AMDGIM_ERROR_CATEGORY_PF,
	AMDGIM_ERROR_CATEGORY_VF,
	AMDGIM_ERROR_CATEGORY_VBIOS,
	AMDGIM_ERROR_CATEGORY_MONITOR,

	AMDGIM_ERROR_CATEGORY_MAX
};

void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data);
void amdgpu_vf_error_trans_all (struct amdgpu_device *adev);

#endif /* __VF_ERROR_H__ */
+1 −0
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@ struct amdgpu_virt_ops {
	int (*req_full_gpu)(struct amdgpu_device *adev, bool init);
	int (*rel_full_gpu)(struct amdgpu_device *adev, bool init);
	int (*reset_gpu)(struct amdgpu_device *adev);
	void (*trans_msg)(struct amdgpu_device *adev, u32 req, u32 data1, u32 data2, u32 data3);
};

/* GPU virtualization */
Loading