Commit e3480271 authored by Chen Yucong, committed by Tony Luck
Browse files

x86, mce, severity: Extend the mce_severity mechanism to handle UCNA/DEFERRED error



Until now, the mce_severity mechanism could only classify a UCNA error
as MCE_KEEP_SEVERITY. It was also unable to filter out DEFERRED errors
on AMD platforms.

This patch extends the mce_severity mechanism to handle UCNA/DEFERRED
errors. To do this, it introduces a new severity level, available under
two names with the same value: MCE_UCNA_SEVERITY and MCE_DEFERRED_SEVERITY.

In addition, mce_severity is specific to the machine check exception
and checks the MCIP/EIPV/RIPV bits. To allow the mce_severity mechanism
to be used in non-exception context, the patch also adds a new argument
(is_excp) to mce_severity. `is_excp' explicitly specifies the calling
context of mce_severity.

Reviewed-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Signed-off-by: Chen Yucong <slaoub@gmail.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
parent 8dcf32ea
Loading
Loading
Loading
Loading
+4 −0
Original line number Original line Diff line number Diff line
@@ -34,6 +34,10 @@
#define MCI_STATUS_S	 (1ULL<<56)  /* Signaled machine check */
#define MCI_STATUS_S	 (1ULL<<56)  /* Signaled machine check */
#define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */
#define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */


/* AMD-specific bits */
#define MCI_STATUS_DEFERRED	(1ULL<<44)  /* declare an uncorrected error */
#define MCI_STATUS_POISON	(1ULL<<43)  /* access poisonous data */

/*
/*
 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
 * bits 15:0.  But bit 12 is the 'F' bit, defined for corrected
 * bits 15:0.  But bit 12 is the 'F' bit, defined for corrected
+3 −1
Original line number Original line Diff line number Diff line
@@ -3,6 +3,8 @@


enum severity_level {
enum severity_level {
	MCE_NO_SEVERITY,
	MCE_NO_SEVERITY,
	MCE_DEFERRED_SEVERITY,
	MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
	MCE_KEEP_SEVERITY,
	MCE_KEEP_SEVERITY,
	MCE_SOME_SEVERITY,
	MCE_SOME_SEVERITY,
	MCE_AO_SEVERITY,
	MCE_AO_SEVERITY,
@@ -21,7 +23,7 @@ struct mce_bank {
	char			attrname[ATTR_LEN];	/* attribute name */
	char			attrname[ATTR_LEN];	/* attribute name */
};
};


int mce_severity(struct mce *a, int tolerant, char **msg);
int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
struct dentry *mce_get_debugfs_dir(void);


extern struct mce_bank *mce_banks;
extern struct mce_bank *mce_banks;
+17 −6
Original line number Original line Diff line number Diff line
@@ -31,6 +31,7 @@


enum context { IN_KERNEL = 1, IN_USER = 2 };
enum context { IN_KERNEL = 1, IN_USER = 2 };
enum ser { SER_REQUIRED = 1, NO_SER = 2 };
enum ser { SER_REQUIRED = 1, NO_SER = 2 };
enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };


static struct severity {
static struct severity {
	u64 mask;
	u64 mask;
@@ -40,6 +41,7 @@ static struct severity {
	unsigned char mcgres;
	unsigned char mcgres;
	unsigned char ser;
	unsigned char ser;
	unsigned char context;
	unsigned char context;
	unsigned char excp;
	unsigned char covered;
	unsigned char covered;
	char *msg;
	char *msg;
} severities[] = {
} severities[] = {
@@ -48,6 +50,8 @@ static struct severity {
#define  USER		.context = IN_USER
#define  USER		.context = IN_USER
#define  SER		.ser = SER_REQUIRED
#define  SER		.ser = SER_REQUIRED
#define  NOSER		.ser = NO_SER
#define  NOSER		.ser = NO_SER
#define  EXCP		.excp = EXCP_CONTEXT
#define  NOEXCP		.excp = NO_EXCP
#define  BITCLR(x)	.mask = x, .result = 0
#define  BITCLR(x)	.mask = x, .result = 0
#define  BITSET(x)	.mask = x, .result = x
#define  BITSET(x)	.mask = x, .result = x
#define  MCGMASK(x, y)	.mcgmask = x, .mcgres = y
#define  MCGMASK(x, y)	.mcgmask = x, .mcgres = y
@@ -62,7 +66,7 @@ static struct severity {
		),
		),
	MCESEV(
	MCESEV(
		NO, "Not enabled",
		NO, "Not enabled",
		BITCLR(MCI_STATUS_EN)
		EXCP, BITCLR(MCI_STATUS_EN)
		),
		),
	MCESEV(
	MCESEV(
		PANIC, "Processor context corrupt",
		PANIC, "Processor context corrupt",
@@ -71,16 +75,20 @@ static struct severity {
	/* When MCIP is not set something is very confused */
	/* When MCIP is not set something is very confused */
	MCESEV(
	MCESEV(
		PANIC, "MCIP not set in MCA handler",
		PANIC, "MCIP not set in MCA handler",
		MCGMASK(MCG_STATUS_MCIP, 0)
		EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
		),
		),
	/* Neither return not error IP -- no chance to recover -> PANIC */
	/* Neither return not error IP -- no chance to recover -> PANIC */
	MCESEV(
	MCESEV(
		PANIC, "Neither restart nor error IP",
		PANIC, "Neither restart nor error IP",
		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
		EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
		),
		),
	MCESEV(
	MCESEV(
		PANIC, "In kernel and no restart IP",
		PANIC, "In kernel and no restart IP",
		KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
		),
	MCESEV(
		DEFERRED, "Deferred error",
		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
		),
		),
	MCESEV(
	MCESEV(
		KEEP, "Corrected error",
		KEEP, "Corrected error",
@@ -89,7 +97,7 @@ static struct severity {


	/* ignore OVER for UCNA */
	/* ignore OVER for UCNA */
	MCESEV(
	MCESEV(
		KEEP, "Uncorrected no action required",
		UCNA, "Uncorrected no action required",
		SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
		SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
		),
		),
	MCESEV(
	MCESEV(
@@ -178,8 +186,9 @@ static int error_context(struct mce *m)
	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
}


int mce_severity(struct mce *m, int tolerant, char **msg)
int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
{
{
	enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
	enum context ctx = error_context(m);
	enum context ctx = error_context(m);
	struct severity *s;
	struct severity *s;


@@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg)
			continue;
			continue;
		if (s->context && ctx != s->context)
		if (s->context && ctx != s->context)
			continue;
			continue;
		if (s->excp && excp != s->excp)
			continue;
		if (msg)
		if (msg)
			*msg = s->msg;
			*msg = s->msg;
		s->covered = 1;
		s->covered = 1;
+8 −6
Original line number Original line Diff line number Diff line
@@ -668,7 +668,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			if (quirk_no_way_out)
			if (quirk_no_way_out)
				quirk_no_way_out(i, m, regs);
				quirk_no_way_out(i, m, regs);
		}
		}
		if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
		if (mce_severity(m, mca_cfg.tolerant, msg, true) >=
		    MCE_PANIC_SEVERITY)
			ret = 1;
			ret = 1;
	}
	}
	return ret;
	return ret;
@@ -754,7 +755,7 @@ static void mce_reign(void)
	for_each_possible_cpu(cpu) {
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu),
		int severity = mce_severity(&per_cpu(mces_seen, cpu),
					    mca_cfg.tolerant,
					    mca_cfg.tolerant,
					    &nmsg);
					    &nmsg, true);
		if (severity > global_worst) {
		if (severity > global_worst) {
			msg = nmsg;
			msg = nmsg;
			global_worst = severity;
			global_worst = severity;
@@ -1095,13 +1096,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
		 */
		 */
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);


		severity = mce_severity(&m, cfg->tolerant, NULL);
		severity = mce_severity(&m, cfg->tolerant, NULL, true);


		/*
		/*
		 * When machine check was for corrected handler don't touch,
		 * When machine check was for corrected/deferred handler don't
		 * unless we're panicing.
		 * touch, unless we're panicing.
		 */
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
		if ((severity == MCE_KEEP_SEVERITY ||
		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
			continue;
			continue;
		__set_bit(i, toclear);
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
		if (severity == MCE_NO_SEVERITY) {
+0 −3
Original line number Original line Diff line number Diff line
@@ -32,9 +32,6 @@
#define R4(x)				(((x) >> 4) & 0xf)
#define R4(x)				(((x) >> 4) & 0xf)
#define R4_MSG(x)			((R4(x) < 9) ?  rrrr_msgs[R4(x)] : "Wrong R4!")
#define R4_MSG(x)			((R4(x) < 9) ?  rrrr_msgs[R4(x)] : "Wrong R4!")


#define MCI_STATUS_DEFERRED		BIT_64(44)
#define MCI_STATUS_POISON		BIT_64(43)

extern const char * const pp_msgs[];
extern const char * const pp_msgs[];


enum tt_ids {
enum tt_ids {