Commit 28fcb77b authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Borislav Petkov:

 - Fully reworked thermal throttling notifications, there should be no
   more spamming of dmesg (Srinivas Pandruvada and Benjamin Berg)

 - More enablement for the Intel-compatible CPUs Zhaoxin (Tony W
   Wang-oc)

 - PPIN support for Icelake (Tony Luck)

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce/therm_throt: Optimize notifications of thermal throttle
  x86/mce: Add Xeon Icelake to list of CPUs that support PPIN
  x86/mce: Lower throttling MCE messages' priority to warning
  x86/mce: Add Zhaoxin LMCE support
  x86/mce: Add Zhaoxin CMCI support
  x86/mce: Add Zhaoxin MCE support
  x86/mce/amd: Make disable_err_thresholding() static
parents 63c2291f f6656208
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -583,7 +583,7 @@ bool amd_filter_mce(struct mce *m)
 * - Prevent possible spurious interrupts from the IF bank on Family 0x17
 *   Models 0x10-0x2F due to Erratum #1114.
 */
void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{
	int i, num_msrs;
	u64 hwcr;
+78 −15
Original line number Diff line number Diff line
@@ -488,8 +488,9 @@ int mce_usable_address(struct mce *m)
	if (!(m->status & MCI_STATUS_ADDRV))
		return 0;

	/* Checks after this one are Intel-specific: */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
	/* Checks after this one are Intel/Zhaoxin-specific: */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
	    boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
		return 1;

	if (!(m->status & MCI_STATUS_MISCV))
@@ -507,10 +508,13 @@ EXPORT_SYMBOL_GPL(mce_usable_address);

bool mce_is_memory_error(struct mce *m)
{
	if (m->cpuvendor == X86_VENDOR_AMD ||
	    m->cpuvendor == X86_VENDOR_HYGON) {
	switch (m->cpuvendor) {
	case X86_VENDOR_AMD:
	case X86_VENDOR_HYGON:
		return amd_mce_is_memory_error(m);
	} else if (m->cpuvendor == X86_VENDOR_INTEL) {

	case X86_VENDOR_INTEL:
	case X86_VENDOR_ZHAOXIN:
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
@@ -527,10 +531,11 @@ bool mce_is_memory_error(struct mce *m)
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;
	}

	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

bool mce_is_correctable(struct mce *m)
@@ -1127,6 +1132,12 @@ static bool __mc_check_crashing_cpu(int cpu)
		u64 mcgstatus;

		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);

		if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
			if (mcgstatus & MCG_STATUS_LMCES)
				return false;
		}

		if (mcgstatus & MCG_STATUS_RIPV) {
			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
			return true;
@@ -1277,9 +1288,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)

	/*
	 * Check if this MCE is signaled to only this logical processor,
	 * on Intel only.
	 * on Intel, Zhaoxin only.
	 */
	if (m.cpuvendor == X86_VENDOR_INTEL)
	if (m.cpuvendor == X86_VENDOR_INTEL ||
	    m.cpuvendor == X86_VENDOR_ZHAOXIN)
		lmce = m.mcgstatus & MCG_STATUS_LMCES;

	/*
@@ -1697,6 +1709,18 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
		if (c->x86 == 6 && c->x86_model == 45)
			quirk_no_way_out = quirk_sandybridge_ifu;
	}

	if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
		/*
		 * All newer Zhaoxin CPUs support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
			if (cfg->monarch_timeout < 0)
				cfg->monarch_timeout = USEC_PER_SEC;
		}
	}

	if (cfg->monarch_timeout < 0)
		cfg->monarch_timeout = 0;
	if (cfg->bootlog != 0)
@@ -1760,6 +1784,35 @@ static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
	}
}

static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);

	/*
	 * These CPUs have MCA bank 8 which reports only one error type called
	 * SVAD (System View Address Decoder). The reporting of that error is
	 * controlled by IA32_MC8.CTL.0.
	 *
	 * If enabled, prefetching on these CPUs will cause SVAD MCE when
	 * virtual machines start and result in a system  panic. Always disable
	 * bank 8 SVAD error by default.
	 */
	if ((c->x86 == 7 && c->x86_model == 0x1b) ||
	    (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
		if (this_cpu_read(mce_num_banks) > 8)
			mce_banks[8].ctl = 0;
	}

	intel_init_cmci();
	intel_init_lmce();
	mce_adjust_timer = cmci_intel_adjust_timer;
}

static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
{
	intel_clear_lmce();
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
@@ -1781,6 +1834,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
		mce_centaur_feature_init(c);
		break;

	case X86_VENDOR_ZHAOXIN:
		mce_zhaoxin_feature_init(c);
		break;

	default:
		break;
	}
@@ -1792,6 +1849,11 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
	case X86_VENDOR_INTEL:
		mce_intel_feature_clear(c);
		break;

	case X86_VENDOR_ZHAOXIN:
		mce_zhaoxin_feature_clear(c);
		break;

	default:
		break;
	}
@@ -2014,15 +2076,16 @@ static void mce_disable_error_reporting(void)
static void vendor_disable_error_reporting(void)
{
	/*
	 * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
	 * are socket-wide.
	 * Disabling them for just a single offlined CPU is bad, since it will
	 * inhibit reporting for all shared resources on the socket like the
	 * last level cache (LLC), the integrated memory controller (iMC), etc.
	 * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
	 * MSRs are socket-wide. Disabling them for just a single offlined CPU
	 * is bad, since it will inhibit reporting for all shared resources on
	 * the socket like the last level cache (LLC), the integrated memory
	 * controller (iMC), etc.
	 */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
		return;

	mce_disable_error_reporting();
+7 −4
Original line number Diff line number Diff line
@@ -85,8 +85,10 @@ static int cmci_supported(int *banks)
	 * initialization is vendor keyed and this
	 * makes sure none of the backdoors are entered otherwise.
	 */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
	    boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
		return 0;

	if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
		return 0;
	rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -423,7 +425,7 @@ void cmci_disable_bank(int bank)
	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}

static void intel_init_cmci(void)
void intel_init_cmci(void)
{
	int banks;

@@ -442,7 +444,7 @@ static void intel_init_cmci(void)
	cmci_recheck();
}

static void intel_init_lmce(void)
void intel_init_lmce(void)
{
	u64 val;

@@ -455,7 +457,7 @@ static void intel_init_lmce(void)
		wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}

static void intel_clear_lmce(void)
void intel_clear_lmce(void)
{
	u64 val;

@@ -482,6 +484,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
	case INTEL_FAM6_BROADWELL_D:
	case INTEL_FAM6_BROADWELL_X:
	case INTEL_FAM6_SKYLAKE_X:
	case INTEL_FAM6_ICELAKE_X:
	case INTEL_FAM6_XEON_PHI_KNL:
	case INTEL_FAM6_XEON_PHI_KNM:

+6 −0
Original line number Diff line number Diff line
@@ -45,11 +45,17 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval);
bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
void intel_init_cmci(void);
void intel_init_lmce(void);
void intel_clear_lmce(void);
#else
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
#endif

void mce_timer_kick(unsigned long interval);
+227 −24
Original line number Diff line number Diff line
@@ -40,15 +40,58 @@
#define THERMAL_THROTTLING_EVENT	0
#define POWER_LIMIT_EVENT		1

/*
 * Current thermal event state:
/**
 * struct _thermal_state - Represent the current thermal event state
 * @next_check:			Stores the next timestamp, when it is allowed
 *				to log the next warning message.
 * @last_interrupt_time:	Stores the timestamp for the last threshold
 *				high event.
 * @therm_work:			Delayed workqueue structure
 * @count:			Stores the current running count for thermal
 *				or power threshold interrupts.
 * @last_count:			Stores the previous running count for thermal
 *				or power threshold interrupts.
 * @max_time_ms:		This shows the maximum amount of time CPU was
 *				in throttled state for a single thermal
 *				threshold high to low state.
 * @total_time_ms:		This is a cumulative time during which CPU was
 *				in the throttled state.
 * @rate_control_active:	Set when a throttling message is logged.
 *				This is used for the purpose of rate-control.
 * @new_event:			Stores the last high/low status of the
 *				THERM_STATUS_PROCHOT or
 *				THERM_STATUS_POWER_LIMIT.
 * @level:			Stores whether this _thermal_state instance is
 *				for a CORE level or for PACKAGE level.
 * @sample_index:		Index for storing the next sample in the buffer
 *				temp_samples[].
 * @sample_count:		Total number of samples collected in the buffer
 *				temp_samples[].
 * @average:			The last moving average of temperature samples
 * @baseline_temp:		Temperature at which thermal threshold high
 *				interrupt was generated.
 * @temp_samples:		Storage for temperature samples to calculate
 *				moving average.
 *
 * This structure is used to represent data related to thermal state for a CPU.
 * There is a separate storage for core and package level for each CPU.
 */
struct _thermal_state {
	bool			new_event;
	int			event;
	u64			next_check;
	u64			last_interrupt_time;
	struct delayed_work	therm_work;
	unsigned long		count;
	unsigned long		last_count;
	unsigned long		max_time_ms;
	unsigned long		total_time_ms;
	bool			rate_control_active;
	bool			new_event;
	u8			level;
	u8			sample_index;
	u8			sample_count;
	u8			average;
	u8			baseline_temp;
	u8			temp_samples[3];
};

struct thermal_state {
@@ -121,8 +164,22 @@ define_therm_throt_device_one_ro(package_throttle_count);
define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);

define_therm_throt_device_show_func(core_throttle, max_time_ms);
define_therm_throt_device_one_ro(core_throttle_max_time_ms);

define_therm_throt_device_show_func(package_throttle, max_time_ms);
define_therm_throt_device_one_ro(package_throttle_max_time_ms);

define_therm_throt_device_show_func(core_throttle, total_time_ms);
define_therm_throt_device_one_ro(core_throttle_total_time_ms);

define_therm_throt_device_show_func(package_throttle, total_time_ms);
define_therm_throt_device_one_ro(package_throttle_total_time_ms);

static struct attribute *thermal_throttle_attrs[] = {
	&dev_attr_core_throttle_count.attr,
	&dev_attr_core_throttle_max_time_ms.attr,
	&dev_attr_core_throttle_total_time_ms.attr,
	NULL
};

@@ -135,6 +192,105 @@ static const struct attribute_group thermal_attr_group = {
#define CORE_LEVEL	0
#define PACKAGE_LEVEL	1

#define THERM_THROT_POLL_INTERVAL	HZ
#define THERM_STATUS_PROCHOT_LOG	BIT(1)

static void clear_therm_status_log(int level)
{
	int msr;
	u64 msr_val;

	if (level == CORE_LEVEL)
		msr = MSR_IA32_THERM_STATUS;
	else
		msr = MSR_IA32_PACKAGE_THERM_STATUS;

	rdmsrl(msr, msr_val);
	wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
}

static void get_therm_status(int level, bool *proc_hot, u8 *temp)
{
	int msr;
	u64 msr_val;

	if (level == CORE_LEVEL)
		msr = MSR_IA32_THERM_STATUS;
	else
		msr = MSR_IA32_PACKAGE_THERM_STATUS;

	rdmsrl(msr, msr_val);
	if (msr_val & THERM_STATUS_PROCHOT_LOG)
		*proc_hot = true;
	else
		*proc_hot = false;

	*temp = (msr_val >> 16) & 0x7F;
}

static void throttle_active_work(struct work_struct *work)
{
	struct _thermal_state *state = container_of(to_delayed_work(work),
						struct _thermal_state, therm_work);
	unsigned int i, avg, this_cpu = smp_processor_id();
	u64 now = get_jiffies_64();
	bool hot;
	u8 temp;

	get_therm_status(state->level, &hot, &temp);
	/* temperature value is offset from the max so lesser means hotter */
	if (!hot && temp > state->baseline_temp) {
		if (state->rate_control_active)
			pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
				this_cpu,
				state->level == CORE_LEVEL ? "Core" : "Package",
				state->count);

		state->rate_control_active = false;
		return;
	}

	if (time_before64(now, state->next_check) &&
			  state->rate_control_active)
		goto re_arm;

	state->next_check = now + CHECK_INTERVAL;

	if (state->count != state->last_count) {
		/* There was one new thermal interrupt */
		state->last_count = state->count;
		state->average = 0;
		state->sample_count = 0;
		state->sample_index = 0;
	}

	state->temp_samples[state->sample_index] = temp;
	state->sample_count++;
	state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
	if (state->sample_count < ARRAY_SIZE(state->temp_samples))
		goto re_arm;

	avg = 0;
	for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
		avg += state->temp_samples[i];

	avg /= ARRAY_SIZE(state->temp_samples);

	if (state->average > avg) {
		pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
			this_cpu,
			state->level == CORE_LEVEL ? "Core" : "Package",
			state->count);
		state->rate_control_active = true;
	}

	state->average = avg;

re_arm:
	clear_therm_status_log(state->level);
	schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
}

/***
 * therm_throt_process - Process thermal throttling event from interrupt
 * @curr: Whether the condition is current or not (boolean), since the
@@ -178,27 +334,33 @@ static void therm_throt_process(bool new_event, int event, int level)
	if (new_event)
		state->count++;

	if (time_before64(now, state->next_check) &&
			state->count != state->last_count)
	if (event != THERMAL_THROTTLING_EVENT)
		return;

	state->next_check = now + CHECK_INTERVAL;
	state->last_count = state->count;
	if (new_event && !state->last_interrupt_time) {
		bool hot;
		u8 temp;

	/* if we just entered the thermal event */
	if (new_event) {
		if (event == THERMAL_THROTTLING_EVENT)
			pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
				this_cpu,
				level == CORE_LEVEL ? "Core" : "Package",
				state->count);
		return;
	}
	if (old_event) {
		if (event == THERMAL_THROTTLING_EVENT)
			pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
				level == CORE_LEVEL ? "Core" : "Package");
		get_therm_status(state->level, &hot, &temp);
		/*
		 * Ignore short temperature spike as the system is not close
		 * to PROCHOT. 10C offset is large enough to ignore. It is
		 * already dropped from the high threshold temperature.
		 */
		if (temp > 10)
			return;

		state->baseline_temp = temp;
		state->last_interrupt_time = now;
		schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
	} else if (old_event && state->last_interrupt_time) {
		unsigned long throttle_time;

		throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
		if (throttle_time > state->max_time_ms)
			state->max_time_ms = throttle_time;
		state->total_time_ms += throttle_time;
		state->last_interrupt_time = 0;
	}
}

@@ -244,20 +406,47 @@ static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
	if (err)
		return err;

	if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
	if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_core_power_limit_count.attr,
					      thermal_attr_group.name);
		if (err)
			goto del_group;
	}

	if (cpu_has(c, X86_FEATURE_PTS)) {
		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_package_throttle_count.attr,
					      thermal_attr_group.name);
		if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
		if (err)
			goto del_group;

		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_package_throttle_max_time_ms.attr,
					      thermal_attr_group.name);
		if (err)
			goto del_group;

		err = sysfs_add_file_to_group(&dev->kobj,
					      &dev_attr_package_throttle_total_time_ms.attr,
					      thermal_attr_group.name);
		if (err)
			goto del_group;

		if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
			err = sysfs_add_file_to_group(&dev->kobj,
					&dev_attr_package_power_limit_count.attr,
					thermal_attr_group.name);
			if (err)
				goto del_group;
		}
	}

	return 0;

del_group:
	sysfs_remove_group(&dev->kobj, &thermal_attr_group);

	return err;
}

@@ -269,15 +458,29 @@ static void thermal_throttle_remove_dev(struct device *dev)
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int thermal_throttle_online(unsigned int cpu)
{
	struct thermal_state *state = &per_cpu(thermal_state, cpu);
	struct device *dev = get_cpu_device(cpu);

	state->package_throttle.level = PACKAGE_LEVEL;
	state->core_throttle.level = CORE_LEVEL;

	INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
	INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);

	return thermal_throttle_add_dev(dev, cpu);
}

static int thermal_throttle_offline(unsigned int cpu)
{
	struct thermal_state *state = &per_cpu(thermal_state, cpu);
	struct device *dev = get_cpu_device(cpu);

	cancel_delayed_work(&state->package_throttle.therm_work);
	cancel_delayed_work(&state->core_throttle.therm_work);

	state->package_throttle.rate_control_active = false;
	state->core_throttle.rate_control_active = false;

	thermal_throttle_remove_dev(dev);
	return 0;
}