Commit 4fb6fde7 authored by Aaron Miller, committed by Borislav Petkov
Browse files

EDAC: Expose per-DIMM error counts in sysfs



The old csrowX sysfs directories have per-csrow error counters, but the
new dimmX directories do not currently expose error counts.

EDAC already keeps these counts, add them to sysfs so per-DIMM counts
are still available when CONFIG_EDAC_LEGACY_SYSFS=n.

Signed-off-by: Aaron Miller <aaronmiller@fb.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20161103220153.3997328-1-aaronmiller@fb.com


Signed-off-by: Borislav Petkov <bp@suse.de>
parent 2287c636
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
@@ -138,3 +138,20 @@ Contact: Mauro Carvalho Chehab <m.chehab@samsung.com>
Description:	This attribute file will display what type of memory is
		currently on this csrow. Normally, either buffered or
		unbuffered memory (for example, Unbuffered-DDR3).

What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ce_count
Date:		October 2016
Contact:	linux-edac@vger.kernel.org
Description:	This attribute file displays the total count of correctable
		errors that have occurred on this DIMM. This count is very important
		to examine. CEs provide early indications that a DIMM is beginning
		to fail. This count field should be monitored for non-zero values,
		and any such values should be reported to the system administrator.

What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ue_count
Date:		October 2016
Contact:	linux-edac@vger.kernel.org
Description:	This attribute file displays the total count of uncorrectable
		errors that have occurred on this DIMM. If panic_on_ue is set, this
		counter will not have a chance to increment, since EDAC will panic the
		system.
+20 −0
Original line number Diff line number Diff line
@@ -438,11 +438,13 @@ A typical EDAC system has the following structure under
	│   │   ├── ce_count
	│   │   ├── ce_noinfo_count
	│   │   ├── dimm0
	│   │   │   ├── dimm_ce_count
	│   │   │   ├── dimm_dev_type
	│   │   │   ├── dimm_edac_mode
	│   │   │   ├── dimm_label
	│   │   │   ├── dimm_location
	│   │   │   ├── dimm_mem_type
	│   │   │   ├── dimm_ue_count
	│   │   │   ├── size
	│   │   │   └── uevent
	│   │   ├── max_location
@@ -457,11 +459,13 @@ A typical EDAC system has the following structure under
	│   │   ├── ce_count
	│   │   ├── ce_noinfo_count
	│   │   ├── dimm0
	│   │   │   ├── dimm_ce_count
	│   │   │   ├── dimm_dev_type
	│   │   │   ├── dimm_edac_mode
	│   │   │   ├── dimm_label
	│   │   │   ├── dimm_location
	│   │   │   ├── dimm_mem_type
	│   │   │   ├── dimm_ue_count
	│   │   │   ├── size
	│   │   │   └── uevent
	│   │   ├── max_location
@@ -483,6 +487,22 @@ this ``X`` memory module:
	This attribute file displays, in count of megabytes, the memory
	that this csrow contains.

- ``dimm_ue_count`` - Uncorrectable Errors count attribute file

	This attribute file displays the total count of uncorrectable
	errors that have occurred on this DIMM. If panic_on_ue is set,
	this counter will not have a chance to increment, since EDAC
	will panic the system.

- ``dimm_ce_count`` - Correctable Errors count attribute file

	This attribute file displays the total count of correctable
	errors that have occurred on this DIMM. This count is very
	important to examine. CEs provide early indications that a
	DIMM is beginning to fail. This count field should be
	monitored for non-zero values, and any such values should be
	reported to the system administrator.

- ``dimm_dev_type``  - Device type attribute file

	This attribute file will display what type of DRAM device is
+38 −0
Original line number Diff line number Diff line
@@ -569,6 +569,40 @@ static ssize_t dimmdev_edac_mode_show(struct device *dev,
	return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]);
}

/*
 * sysfs "show" handler backing the dimm_ce_count attribute.
 *
 * Resolves the DIMM behind @dev, turns its location[] coordinates into a
 * flat offset with EDAC_DIMM_OFF(), and reads the correctable-error counter
 * stored at that offset in the last layer's ce_per_layer[] array of the
 * owning memory controller.
 *
 * @dev:   device embedded in the dimm/rank object being queried
 * @mattr: attribute descriptor (unused)
 * @data:  output buffer; receives the count as unsigned decimal plus '\n'
 *
 * Returns whatever sprintf() returns, i.e. the number of characters written.
 */
static ssize_t dimmdev_ce_count_show(struct device *dev,
				      struct device_attribute *mattr,
				      char *data)
{
	struct dimm_info *dimm = to_dimm(dev);
	int idx = EDAC_DIMM_OFF(dimm->mci->layers, dimm->mci->n_layers,
				dimm->location[0], dimm->location[1],
				dimm->location[2]);
	u32 ce_total;

	/* Per-DIMM counters are kept in the last (innermost) layer's array. */
	ce_total = dimm->mci->ce_per_layer[dimm->mci->n_layers - 1][idx];

	return sprintf(data, "%u\n", ce_total);
}

/*
 * sysfs "show" handler backing the dimm_ue_count attribute.
 *
 * Resolves the DIMM behind @dev, turns its location[] coordinates into a
 * flat offset with EDAC_DIMM_OFF(), and reads the uncorrectable-error
 * counter stored at that offset in the last layer's ue_per_layer[] array of
 * the owning memory controller.
 *
 * @dev:   device embedded in the dimm/rank object being queried
 * @mattr: attribute descriptor (unused)
 * @data:  output buffer; receives the count as unsigned decimal plus '\n'
 *
 * Returns whatever sprintf() returns, i.e. the number of characters written.
 */
static ssize_t dimmdev_ue_count_show(struct device *dev,
				      struct device_attribute *mattr,
				      char *data)
{
	struct dimm_info *dimm = to_dimm(dev);
	int idx = EDAC_DIMM_OFF(dimm->mci->layers, dimm->mci->n_layers,
				dimm->location[0], dimm->location[1],
				dimm->location[2]);
	u32 ue_total;

	/* Per-DIMM counters are kept in the last (innermost) layer's array. */
	ue_total = dimm->mci->ue_per_layer[dimm->mci->n_layers - 1][idx];

	return sprintf(data, "%u\n", ue_total);
}

/* dimm/rank attribute files */
static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR,
		   dimmdev_label_show, dimmdev_label_store);
@@ -577,6 +611,8 @@ static DEVICE_ATTR(size, S_IRUGO, dimmdev_size_show, NULL);
static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL);
static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL);
static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL);
static DEVICE_ATTR(dimm_ce_count, S_IRUGO, dimmdev_ce_count_show, NULL);
static DEVICE_ATTR(dimm_ue_count, S_IRUGO, dimmdev_ue_count_show, NULL);

/* attributes of the dimm<id>/rank<id> object */
static struct attribute *dimm_attrs[] = {
@@ -586,6 +622,8 @@ static struct attribute *dimm_attrs[] = {
	&dev_attr_dimm_mem_type.attr,
	&dev_attr_dimm_dev_type.attr,
	&dev_attr_dimm_edac_mode.attr,
	&dev_attr_dimm_ce_count.attr,
	&dev_attr_dimm_ue_count.attr,
	NULL,
};