Commit 954bd994 authored by Oliver O'Halloran's avatar Oliver O'Halloran Committed by Michael Ellerman
Browse files

powerpc/eeh: Add eeh_force_recover to debugfs



This patch adds a debugfs interface to force scheduling a recovery event.
This can be used to recover a specific PE or schedule a "special" recovery
even that checks for errors at the PHB level.
To force a recovery of a normal PE, use:

 echo '<#pe>:<#phb>' > /sys/kernel/debug/powerpc/eeh_force_recover

To force a scan for broken PHBs:

 echo 'hwcheck' > /sys/kernel/debug/powerpc/eeh_force_recover

Signed-off-by: default avatarOliver O'Halloran <oohall@gmail.com>
Signed-off-by: default avatarMichael Ellerman <mpe@ellerman.id.au>
parent 6b493f60
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@ struct eeh_event {

int eeh_event_init(void);
int eeh_send_failure_event(struct eeh_pe *pe);
int __eeh_send_failure_event(struct eeh_pe *pe);
void eeh_remove_event(struct eeh_pe *pe, bool force);
void eeh_handle_normal_event(struct eeh_pe *pe);
void eeh_handle_special_event(void);
+59 −0
Original line number Diff line number Diff line
@@ -1838,6 +1838,62 @@ static int eeh_enable_dbgfs_get(void *data, u64 *val)

DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get,
			 eeh_enable_dbgfs_set, "0x%llx\n");

static ssize_t eeh_force_recover_write(struct file *filp,
				const char __user *user_buf,
				size_t count, loff_t *ppos)
{
	struct pci_controller *hose;
	uint32_t phbid, pe_no;
	struct eeh_pe *pe;
	char buf[20];
	int ret;

	ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
	if (!ret)
		return -EFAULT;

	/*
	 * When PE is NULL the event is a "special" event. Rather than
	 * recovering a specific PE it forces the EEH core to scan for failed
	 * PHBs and recovers each. This needs to be done before any device
	 * recoveries can occur.
	 */
	if (!strncmp(buf, "hwcheck", 7)) {
		__eeh_send_failure_event(NULL);
		return count;
	}

	ret = sscanf(buf, "%x:%x", &phbid, &pe_no);
	if (ret != 2)
		return -EINVAL;

	hose = pci_find_controller_for_domain(phbid);
	if (!hose)
		return -ENODEV;

	/* Retrieve PE */
	pe = eeh_pe_get(hose, pe_no, 0);
	if (!pe)
		return -ENODEV;

	/*
	 * We don't do any state checking here since the detection
	 * process is async to the recovery process. The recovery
	 * thread *should* not break even if we schedule a recovery
	 * from an odd state (e.g. PE removed, or recovery of a
	 * non-isolated PE)
	 */
	__eeh_send_failure_event(pe);

	return ret < 0 ? ret : count;
}

static const struct file_operations eeh_force_recover_fops = {
	.open	= simple_open,
	.llseek	= no_llseek,
	.write	= eeh_force_recover_write,
};
#endif

static int __init eeh_init_proc(void)
@@ -1853,6 +1909,9 @@ static int __init eeh_init_proc(void)
		debugfs_create_bool("eeh_disable_recovery", 0600,
				powerpc_debugfs_root,
				&eeh_debugfs_no_recover);
		debugfs_create_file_unsafe("eeh_force_recover", 0600,
				powerpc_debugfs_root, NULL,
				&eeh_force_recover_fops);
		eeh_cache_debugfs_init();
#endif
	}
+15 −10
Original line number Diff line number Diff line
@@ -121,20 +121,11 @@ int eeh_event_init(void)
 * the actual event will be delivered in a normal context
 * (from a workqueue).
 */
int eeh_send_failure_event(struct eeh_pe *pe)
int __eeh_send_failure_event(struct eeh_pe *pe)
{
	unsigned long flags;
	struct eeh_event *event;

	/*
	 * If we've manually supressed recovery events via debugfs
	 * then just drop it on the floor.
	 */
	if (eeh_debugfs_no_recover) {
		pr_err("EEH: Event dropped due to no_recover setting\n");
		return 0;
	}

	event = kzalloc(sizeof(*event), GFP_ATOMIC);
	if (!event) {
		pr_err("EEH: out of memory, event not handled\n");
@@ -153,6 +144,20 @@ int eeh_send_failure_event(struct eeh_pe *pe)
	return 0;
}

int eeh_send_failure_event(struct eeh_pe *pe)
{
	/*
	 * If we've manually supressed recovery events via debugfs
	 * then just drop it on the floor.
	 */
	if (eeh_debugfs_no_recover) {
		pr_err("EEH: Event dropped due to no_recover setting\n");
		return 0;
	}

	return __eeh_send_failure_event(pe);
}

/**
 * eeh_remove_event - Remove EEH event from the queue
 * @pe: Event binding to the PE