Commit ac325acd authored by Linas Vepstas, committed by Paul Mackerras
Browse files

[PATCH] powerpc/pseries: clear PCI failure counter if no new failures



The current PCI error recovery system keeps track of the number of PCI card
resets, and refuses to bring a card back up if this number is too large.
The goal of doing this was to avoid an infinite loop of resets if a card is
obviously dead.  However, if the failures are rare, but the machine has a
high uptime, this mechanism might still be triggered; this is too harsh.

This patch avoids this problem by decrementing the fail count after an
hour.  Thus, as long as a PCI card BSOD's fewer than 6 times an hour, it
will continue to be reset indefinitely.  If its failure rate is greater
than that, it will be taken off-line permanently.

This patch is larger than it might otherwise be because it changes
indentation by removing a pointless while-loop.  The while loop is not
needed, as the handler is invoked once for each event (by schedule_work());
the loop is leftover cruft from an earlier implementation.

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
parent 4bd174fe
Loading
Loading
Loading
Loading
+7 −6
Original line number Diff line number Diff line
@@ -23,9 +23,8 @@
 *
 */
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/irq.h>
#include <linux/pci.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
@@ -250,7 +249,7 @@ static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
 */
#define MAX_WAIT_FOR_RECOVERY 15

void handle_eeh_events (struct eeh_event *event)
struct pci_dn * handle_eeh_events (struct eeh_event *event)
{
	struct device_node *frozen_dn;
	struct pci_dn *frozen_pdn;
@@ -265,7 +264,7 @@ void handle_eeh_events (struct eeh_event *event)
	if (!frozen_dn) {
		printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n",
		        pci_name(event->dev));
		return;
		return NULL;
	}

	/* There are two different styles for coming up with the PE.
@@ -280,7 +279,7 @@ void handle_eeh_events (struct eeh_event *event)
	if (!frozen_bus) {
		printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n",
		        frozen_dn->full_name);
		return;
		return NULL;
	}

#if 0
@@ -355,7 +354,7 @@ void handle_eeh_events (struct eeh_event *event)
	/* Tell all device drivers that they can resume operations */
	pci_walk_bus(frozen_bus, eeh_report_resume, NULL);

	return;
	return frozen_pdn;
	
excess_failures:
	/*
@@ -384,6 +383,8 @@ perm_error:

	/* Shut down the device drivers for good. */
	pcibios_remove_pci_devices(frozen_bus);

	return NULL;
}

/* ---------- end of file ---------- */
+28 −22
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
 * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
 */

#include <linux/delay.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/pci.h>
@@ -56,10 +57,9 @@ static int eeh_event_handler(void * dummy)
{
	unsigned long flags;
	struct eeh_event	*event;
	struct pci_dn *pdn;

	daemonize ("eehd");

	while (1) {
	set_current_state(TASK_INTERRUPTIBLE);

	spin_lock_irqsave(&eeh_eventlist_lock, flags);
@@ -73,7 +73,7 @@ static int eeh_event_handler(void * dummy)
	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

	if (event == NULL)
			break;
		return 0;

	/* Serialize processing of EEH events */
	mutex_lock(&eeh_event_mutex);
@@ -82,12 +82,18 @@ static int eeh_event_handler(void * dummy)
	printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
	       pci_name(event->dev));

		handle_eeh_events(event);
	pdn = handle_eeh_events(event);

	eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
	pci_dev_put(event->dev);
	kfree(event);
	mutex_unlock(&eeh_event_mutex);

	/* If there are no new errors after an hour, clear the counter. */
	if (pdn && pdn->eeh_freeze_count>0) {
		msleep_interruptible (3600*1000);
		if (pdn->eeh_freeze_count>0)
			pdn->eeh_freeze_count--;
	}

	return 0;
+5 −5
Original line number Diff line number Diff line
@@ -18,8 +18,8 @@
 * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
 */

#ifndef ASM_PPC64_EEH_EVENT_H
#define ASM_PPC64_EEH_EVENT_H
#ifndef ASM_POWERPC_EEH_EVENT_H
#define ASM_POWERPC_EEH_EVENT_H
#ifdef __KERNEL__

/** EEH event -- structure holding pci controller data that describes
@@ -39,7 +39,7 @@ struct eeh_event {
 * @dev pci device
 *
 * This routine builds a PCI error event which will be delivered
 * to all listeners on the peh_notifier_chain.
 * to all listeners on the eeh_notifier_chain.
 *
 * This routine can be called within an interrupt context;
 * the actual event will be delivered in a normal context
@@ -51,7 +51,7 @@ int eeh_send_failure_event (struct device_node *dn,
                            int time_unavail);

/* Main recovery function */
void handle_eeh_events (struct eeh_event *);
struct pci_dn * handle_eeh_events (struct eeh_event *);

#endif /* __KERNEL__ */
#endif /* ASM_PPC64_EEH_EVENT_H */
#endif /* ASM_POWERPC_EEH_EVENT_H */