Commit 345fb0a9 authored by Linus Torvalds
Browse files
Pull EDAC updates from Borislav Petkov:

 - Make amd64_edac still load on a machine with unpopulated nodes +
   cleanups (Yazen Ghannam)

 - Expose per-DIMM error counts in sysfs (Aaron Miller)

 - Add T2080 l2-cache support to mpc85xx (Chris Packham)

 - Random other small improvements/cleanups/fixlets

* tag 'edac_for_4.11' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp:
  EDAC, mce_amd: Print IPID and Syndrome on a separate line
  EDAC, amd64: Bump driver version
  MAINTAINERS, EDAC: Update email for Thor Thayer
  EDAC, fsl_ddr: Make locally used symbols static
  EDAC, mpc85xx: Add T2080 l2-cache support
  EDAC, amd64: Add x86cpuid sanity check during init
  EDAC, amd64: Don't treat ECC disabled as failure
  EDAC: Add routine to check if MC devices list is empty
  EDAC, amd64: Remove unused printing macros
  EDAC, amd64: Rework messages in ecc_enabled()
  EDAC, amd64: Move global code out of instance functions
  EDAC, amd64: Free unused memory when init_one_instance() fails
  EDAC, mce_amd: Give more context to deferred error message
  EDAC, i7300: Test for the second channel properly
  EDAC, sb_edac: Get rid of ->show_interleave_mode()
  EDAC: Expose per-DIMM error counts in sysfs
  EDAC, amd64: Save and return err code from probe_one_instance()
  EDAC, i82975x: Add ioremap_nocache() error handling
  EDAC: Fix typos in enum mem_type comments
  EDAC: Make dev_attr_sdram_scrub_rate static
parents 507b5007 75bf2f64
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
@@ -138,3 +138,20 @@ Contact: Mauro Carvalho Chehab <m.chehab@samsung.com>
Description:	This attribute file will display what type of memory is
		currently on this csrow. Normally, either buffered or
		unbuffered memory (for example, Unbuffered-DDR3).

What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ce_count
Date:		October 2016
Contact:	linux-edac@vger.kernel.org
Description:	This attribute file displays the total count of correctable
		errors that have occurred on this DIMM. This count is very important
		to examine. CEs provide early indications that a DIMM is beginning
		to fail. This count field should be monitored for non-zero values,
		and any such values should be reported to the system administrator.

What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ue_count
Date:		October 2016
Contact:	linux-edac@vger.kernel.org
Description:	This attribute file displays the total count of uncorrectable
		errors that have occurred on this DIMM. If panic_on_ue is set, this
		counter will not have a chance to increment, since EDAC will panic the
		system.
+20 −0
Original line number Diff line number Diff line
@@ -438,11 +438,13 @@ A typical EDAC system has the following structure under
	│   │   ├── ce_count
	│   │   ├── ce_noinfo_count
	│   │   ├── dimm0
	│   │   │   ├── dimm_ce_count
	│   │   │   ├── dimm_dev_type
	│   │   │   ├── dimm_edac_mode
	│   │   │   ├── dimm_label
	│   │   │   ├── dimm_location
	│   │   │   ├── dimm_mem_type
	│   │   │   ├── dimm_ue_count
	│   │   │   ├── size
	│   │   │   └── uevent
	│   │   ├── max_location
@@ -457,11 +459,13 @@ A typical EDAC system has the following structure under
	│   │   ├── ce_count
	│   │   ├── ce_noinfo_count
	│   │   ├── dimm0
	│   │   │   ├── dimm_ce_count
	│   │   │   ├── dimm_dev_type
	│   │   │   ├── dimm_edac_mode
	│   │   │   ├── dimm_label
	│   │   │   ├── dimm_location
	│   │   │   ├── dimm_mem_type
	│   │   │   ├── dimm_ue_count
	│   │   │   ├── size
	│   │   │   └── uevent
	│   │   ├── max_location
@@ -483,6 +487,22 @@ this ``X`` memory module:
	This attribute file displays, in count of megabytes, the memory
	that this csrow contains.

- ``dimm_ue_count`` - Uncorrectable Errors count attribute file

	This attribute file displays the total count of uncorrectable
	errors that have occurred on this DIMM. If panic_on_ue is set,
	this counter will not have a chance to increment, since EDAC
	will panic the system.

- ``dimm_ce_count`` - Correctable Errors count attribute file

	This attribute file displays the total count of correctable
	errors that have occurred on this DIMM. This count is very
	important to examine. CEs provide early indications that a
	DIMM is beginning to fail. This count field should be
	monitored for non-zero values, and any such values should be
	reported to the system administrator.

- ``dimm_dev_type``  - Device type attribute file

	This attribute file will display what type of DRAM device is
+2 −2
Original line number Diff line number Diff line
@@ -643,7 +643,7 @@ S: Maintained
F:	drivers/gpio/gpio-altera.c

ALTERA SYSTEM RESOURCE DRIVER FOR ARRIA10 DEVKIT
M:	Thor Thayer <tthayer@opensource.altera.com>
M:	Thor Thayer <thor.thayer@linux.intel.com>
S:	Maintained
F:	drivers/gpio/gpio-altera-a10sr.c
F:	drivers/mfd/altera-a10sr.c
@@ -1788,7 +1788,7 @@ S: Maintained
F:	drivers/clk/socfpga/

ARM/SOCFPGA EDAC SUPPORT
M:	Thor Thayer <tthayer@opensource.altera.com>
M:	Thor Thayer <thor.thayer@linux.intel.com>
S:	Maintained
F:	drivers/edac/altera_edac.

+1 −0
Original line number Diff line number Diff line
@@ -678,5 +678,6 @@
		compatible = "fsl,t2080-l2-cache-controller";
		reg = <0xc20000 0x40000>;
		next-level-cache = <&cpc>;
		interrupts = <16 2 1 9>;
	};
};
+39 −25
Original line number Diff line number Diff line
@@ -3065,6 +3065,8 @@ static bool ecc_enabled(struct pci_dev *F3, u16 nid)
		/* Check whether at least one UMC is enabled: */
		if (umc_en_mask)
			ecc_en = umc_en_mask == ecc_en_mask;
		else
			edac_dbg(0, "Node %d: No enabled UMCs.\n", nid);

		/* Assume UMC MCA banks are enabled. */
		nb_mce_en = true;
@@ -3075,14 +3077,15 @@ static bool ecc_enabled(struct pci_dev *F3, u16 nid)

		nb_mce_en = nb_mce_bank_enabled_on_node(nid);
		if (!nb_mce_en)
			amd64_notice("NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n",
			edac_dbg(0, "NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n",
				     MSR_IA32_MCG_CTL, nid);
	}

	amd64_info("DRAM ECC %s.\n", (ecc_en ? "enabled" : "disabled"));
	amd64_info("Node %d: DRAM ECC %s.\n",
		   nid, (ecc_en ? "enabled" : "disabled"));

	if (!ecc_en || !nb_mce_en) {
		amd64_notice("%s", ecc_msg);
		amd64_info("%s", ecc_msg);
		return false;
	}
	return true;
@@ -3300,15 +3303,6 @@ static int init_one_instance(unsigned int nid)
		goto err_add_mc;
	}

	/* register stuff with EDAC MCE */
	if (report_gart_errors)
		amd_report_gart_errors(true);

	if (pvt->umc)
		amd_register_ecc_decoder(decode_umc_error);
	else
		amd_register_ecc_decoder(decode_bus_error);

	return 0;

err_add_mc:
@@ -3342,7 +3336,7 @@ static int probe_one_instance(unsigned int nid)
	ecc_stngs[nid] = s;

	if (!ecc_enabled(F3, nid)) {
		ret = -ENODEV;
		ret = 0;

		if (!ecc_enable_override)
			goto err_enable;
@@ -3363,6 +3357,8 @@ static int probe_one_instance(unsigned int nid)

		if (boot_cpu_data.x86 < 0x17)
			restore_ecc_error_reporting(s, nid, F3);

		goto err_enable;
	}

	return ret;
@@ -3396,14 +3392,6 @@ static void remove_one_instance(unsigned int nid)

	free_mc_sibling_devs(pvt);

	/* unregister from EDAC MCE */
	amd_report_gart_errors(false);

	if (pvt->umc)
		amd_unregister_ecc_decoder(decode_umc_error);
	else
		amd_unregister_ecc_decoder(decode_bus_error);

	kfree(ecc_stngs[nid]);
	ecc_stngs[nid] = NULL;

@@ -3452,8 +3440,11 @@ static int __init amd64_edac_init(void)
	int err = -ENODEV;
	int i;

	if (!x86_match_cpu(amd64_cpuids))
		return -ENODEV;

	if (amd_cache_northbridges() < 0)
		goto err_ret;
		return -ENODEV;

	opstate_init();

@@ -3466,14 +3457,30 @@ static int __init amd64_edac_init(void)
	if (!msrs)
		goto err_free;

	for (i = 0; i < amd_nb_num(); i++)
		if (probe_one_instance(i)) {
	for (i = 0; i < amd_nb_num(); i++) {
		err = probe_one_instance(i);
		if (err) {
			/* unwind properly */
			while (--i >= 0)
				remove_one_instance(i);

			goto err_pci;
		}
	}

	if (!edac_has_mcs()) {
		err = -ENODEV;
		goto err_pci;
	}

	/* register stuff with EDAC MCE */
	if (report_gart_errors)
		amd_report_gart_errors(true);

	if (boot_cpu_data.x86 >= 0x17)
		amd_register_ecc_decoder(decode_umc_error);
	else
		amd_register_ecc_decoder(decode_bus_error);

	setup_pci_device();

@@ -3493,7 +3500,6 @@ err_free:
	kfree(ecc_stngs);
	ecc_stngs = NULL;

err_ret:
	return err;
}

@@ -3504,6 +3510,14 @@ static void __exit amd64_edac_exit(void)
	if (pci_ctl)
		edac_pci_release_generic_ctl(pci_ctl);

	/* unregister from EDAC MCE */
	amd_report_gart_errors(false);

	if (boot_cpu_data.x86 >= 0x17)
		amd_unregister_ecc_decoder(decode_umc_error);
	else
		amd_unregister_ecc_decoder(decode_bus_error);

	for (i = 0; i < amd_nb_num(); i++)
		remove_one_instance(i);

Loading