Commit 03b482e2 authored by Bjorn Helgaas's avatar Bjorn Helgaas
Browse files

Merge branch 'remotes/lorenzo/pci/apei'

- Add ACPI APEI notifier chain for unknown (vendor) CPER records (Shiju
  Jose)

- Add handling of HiSilicon HIP PCIe controller errors (Yicong Yang)

* remotes/lorenzo/pci/apei:
  PCI: hip: Add handling of HiSilicon HIP PCIe controller errors
  ACPI / APEI: Add a notifier chain for unknown (vendor) CPER records
parents 8b28a3f3 acb52897
Loading
Loading
Loading
Loading
+63 −0
Original line number Diff line number Diff line
@@ -79,6 +79,12 @@
	((struct acpi_hest_generic_status *)				\
	 ((struct ghes_estatus_node *)(estatus_node) + 1))

#define GHES_VENDOR_ENTRY_LEN(gdata_len)                               \
	(sizeof(struct ghes_vendor_record_entry) + (gdata_len))
#define GHES_GDATA_FROM_VENDOR_ENTRY(vendor_entry)                     \
	((struct acpi_hest_generic_data *)                              \
	((struct ghes_vendor_record_entry *)(vendor_entry) + 1))

/*
 *  NMI-like notifications vary by architecture, before the compiler can prune
 *  unused static functions it needs a value for these enums.
@@ -123,6 +129,12 @@ static DEFINE_MUTEX(ghes_list_mutex);
 */
static DEFINE_SPINLOCK(ghes_notify_lock_irq);

struct ghes_vendor_record_entry {
	struct work_struct work;
	int error_severity;
	char vendor_record[];
};

static struct gen_pool *ghes_estatus_pool;
static unsigned long ghes_estatus_pool_size_request;

@@ -511,6 +523,56 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
#endif
}

static BLOCKING_NOTIFIER_HEAD(vendor_record_notify_list);

int ghes_register_vendor_record_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vendor_record_notify_list, nb);
}
EXPORT_SYMBOL_GPL(ghes_register_vendor_record_notifier);

void ghes_unregister_vendor_record_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&vendor_record_notify_list, nb);
}
EXPORT_SYMBOL_GPL(ghes_unregister_vendor_record_notifier);

static void ghes_vendor_record_work_func(struct work_struct *work)
{
	struct ghes_vendor_record_entry *entry;
	struct acpi_hest_generic_data *gdata;
	u32 len;

	entry = container_of(work, struct ghes_vendor_record_entry, work);
	gdata = GHES_GDATA_FROM_VENDOR_ENTRY(entry);

	blocking_notifier_call_chain(&vendor_record_notify_list,
				     entry->error_severity, gdata);

	len = GHES_VENDOR_ENTRY_LEN(acpi_hest_get_record_size(gdata));
	gen_pool_free(ghes_estatus_pool, (unsigned long)entry, len);
}

static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
					  int sev)
{
	struct acpi_hest_generic_data *copied_gdata;
	struct ghes_vendor_record_entry *entry;
	u32 len;

	len = GHES_VENDOR_ENTRY_LEN(acpi_hest_get_record_size(gdata));
	entry = (void *)gen_pool_alloc(ghes_estatus_pool, len);
	if (!entry)
		return;

	copied_gdata = GHES_GDATA_FROM_VENDOR_ENTRY(entry);
	memcpy(copied_gdata, gdata, acpi_hest_get_record_size(gdata));
	entry->error_severity = sev;

	INIT_WORK(&entry->work, ghes_vendor_record_work_func);
	schedule_work(&entry->work);
}

static bool ghes_do_proc(struct ghes *ghes,
			 const struct acpi_hest_generic_status *estatus)
{
@@ -549,6 +611,7 @@ static bool ghes_do_proc(struct ghes *ghes,
		} else {
			void *err = acpi_hest_get_payload(gdata);

			ghes_defer_non_standard_event(gdata, sev);
			log_non_standard_event(sec_type, fru_id, fru_text,
					       sec_sev, err,
					       gdata->error_data_length);
+7 −0
Original line number Diff line number Diff line
@@ -294,6 +294,13 @@ config PCI_LOONGSON
	  Say Y here if you want to enable PCI controller support on
	  Loongson systems.

config PCIE_HISI_ERR
	depends on ACPI_APEI_GHES && (ARM64 || COMPILE_TEST)
	bool "HiSilicon HIP PCIe controller error handling driver"
	help
	  Say Y here if you want error handling support
	  for the PCIe controller's errors on HiSilicon HIP SoCs

source "drivers/pci/controller/dwc/Kconfig"
source "drivers/pci/controller/mobiveil/Kconfig"
source "drivers/pci/controller/cadence/Kconfig"
+1 −0
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@ obj-$(CONFIG_PCIE_TANGO_SMP8759) += pcie-tango.o
obj-$(CONFIG_VMD) += vmd.o
obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb.o
obj-$(CONFIG_PCI_LOONGSON) += pci-loongson.o
obj-$(CONFIG_PCIE_HISI_ERR) += pcie-hisi-error.o
# pcie-hisi.o quirks are needed even without CONFIG_PCIE_DW
obj-y				+= dwc/
obj-y				+= mobiveil/
+327 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/*
 * Driver for handling the PCIe controller errors on
 * HiSilicon HIP SoCs.
 *
 * Copyright (c) 2020 HiSilicon Limited.
 */

#include <linux/acpi.h>
#include <acpi/ghes.h>
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
#include <linux/kfifo.h>
#include <linux/spinlock.h>

/* HISI PCIe controller error definitions */
#define HISI_PCIE_ERR_MISC_REGS	33

#define HISI_PCIE_LOCAL_VALID_VERSION		BIT(0)
#define HISI_PCIE_LOCAL_VALID_SOC_ID		BIT(1)
#define HISI_PCIE_LOCAL_VALID_SOCKET_ID		BIT(2)
#define HISI_PCIE_LOCAL_VALID_NIMBUS_ID		BIT(3)
#define HISI_PCIE_LOCAL_VALID_SUB_MODULE_ID	BIT(4)
#define HISI_PCIE_LOCAL_VALID_CORE_ID		BIT(5)
#define HISI_PCIE_LOCAL_VALID_PORT_ID		BIT(6)
#define HISI_PCIE_LOCAL_VALID_ERR_TYPE		BIT(7)
#define HISI_PCIE_LOCAL_VALID_ERR_SEVERITY	BIT(8)
#define HISI_PCIE_LOCAL_VALID_ERR_MISC		9

static guid_t hisi_pcie_sec_guid =
	GUID_INIT(0xB2889FC9, 0xE7D7, 0x4F9D,
		  0xA8, 0x67, 0xAF, 0x42, 0xE9, 0x8B, 0xE7, 0x72);

/*
 * Firmware reports the socket port ID where the error occurred.  These
 * macros convert that to the core ID and core port ID required by the
 * ACPI reset method.
 */
#define HISI_PCIE_PORT_ID(core, v)       (((v) >> 1) + ((core) << 3))
#define HISI_PCIE_CORE_ID(v)             ((v) >> 3)
#define HISI_PCIE_CORE_PORT_ID(v)        (((v) & 7) << 1)

struct hisi_pcie_error_data {
	u64	val_bits;
	u8	version;
	u8	soc_id;
	u8	socket_id;
	u8	nimbus_id;
	u8	sub_module_id;
	u8	core_id;
	u8	port_id;
	u8	err_severity;
	u16	err_type;
	u8	reserv[2];
	u32	err_misc[HISI_PCIE_ERR_MISC_REGS];
};

struct hisi_pcie_error_private {
	struct notifier_block	nb;
	struct device *dev;
};

enum hisi_pcie_submodule_id {
	HISI_PCIE_SUB_MODULE_ID_AP,
	HISI_PCIE_SUB_MODULE_ID_TL,
	HISI_PCIE_SUB_MODULE_ID_MAC,
	HISI_PCIE_SUB_MODULE_ID_DL,
	HISI_PCIE_SUB_MODULE_ID_SDI,
};

static const char * const hisi_pcie_sub_module[] = {
	[HISI_PCIE_SUB_MODULE_ID_AP]	= "AP Layer",
	[HISI_PCIE_SUB_MODULE_ID_TL]	= "TL Layer",
	[HISI_PCIE_SUB_MODULE_ID_MAC]	= "MAC Layer",
	[HISI_PCIE_SUB_MODULE_ID_DL]	= "DL Layer",
	[HISI_PCIE_SUB_MODULE_ID_SDI]	= "SDI Layer",
};

enum hisi_pcie_err_severity {
	HISI_PCIE_ERR_SEV_RECOVERABLE,
	HISI_PCIE_ERR_SEV_FATAL,
	HISI_PCIE_ERR_SEV_CORRECTED,
	HISI_PCIE_ERR_SEV_NONE,
};

static const char * const hisi_pcie_error_sev[] = {
	[HISI_PCIE_ERR_SEV_RECOVERABLE]	= "recoverable",
	[HISI_PCIE_ERR_SEV_FATAL]	= "fatal",
	[HISI_PCIE_ERR_SEV_CORRECTED]	= "corrected",
	[HISI_PCIE_ERR_SEV_NONE]	= "none",
};

static const char *hisi_pcie_get_string(const char * const *array,
					size_t n, u32 id)
{
	u32 index;

	for (index = 0; index < n; index++) {
		if (index == id && array[index])
			return array[index];
	}

	return "unknown";
}

static int hisi_pcie_port_reset(struct platform_device *pdev,
				u32 chip_id, u32 port_id)
{
	struct device *dev = &pdev->dev;
	acpi_handle handle = ACPI_HANDLE(dev);
	union acpi_object arg[3];
	struct acpi_object_list arg_list;
	acpi_status s;
	unsigned long long data = 0;

	arg[0].type = ACPI_TYPE_INTEGER;
	arg[0].integer.value = chip_id;
	arg[1].type = ACPI_TYPE_INTEGER;
	arg[1].integer.value = HISI_PCIE_CORE_ID(port_id);
	arg[2].type = ACPI_TYPE_INTEGER;
	arg[2].integer.value = HISI_PCIE_CORE_PORT_ID(port_id);

	arg_list.count = 3;
	arg_list.pointer = arg;

	s = acpi_evaluate_integer(handle, "RST", &arg_list, &data);
	if (ACPI_FAILURE(s)) {
		dev_err(dev, "No RST method\n");
		return -EIO;
	}

	if (data) {
		dev_err(dev, "Failed to Reset\n");
		return -EIO;
	}

	return 0;
}

static int hisi_pcie_port_do_recovery(struct platform_device *dev,
				      u32 chip_id, u32 port_id)
{
	acpi_status s;
	struct device *device = &dev->dev;
	acpi_handle root_handle = ACPI_HANDLE(device);
	struct acpi_pci_root *pci_root;
	struct pci_bus *root_bus;
	struct pci_dev *pdev;
	u32 domain, busnr, devfn;

	s = acpi_get_parent(root_handle, &root_handle);
	if (ACPI_FAILURE(s))
		return -ENODEV;
	pci_root = acpi_pci_find_root(root_handle);
	if (!pci_root)
		return -ENODEV;
	root_bus = pci_root->bus;
	domain = pci_root->segment;

	busnr = root_bus->number;
	devfn = PCI_DEVFN(port_id, 0);
	pdev = pci_get_domain_bus_and_slot(domain, busnr, devfn);
	if (!pdev) {
		dev_info(device, "Fail to get root port %04x:%02x:%02x.%d device\n",
			 domain, busnr, PCI_SLOT(devfn), PCI_FUNC(devfn));
		return -ENODEV;
	}

	pci_stop_and_remove_bus_device_locked(pdev);
	pci_dev_put(pdev);

	if (hisi_pcie_port_reset(dev, chip_id, port_id))
		return -EIO;

	/*
	 * The initialization time of subordinate devices after
	 * hot reset is no more than 1s, which is required by
	 * the PCI spec v5.0 sec 6.6.1. The time will shorten
	 * if Readiness Notifications mechanisms are used. But
	 * wait 1s here to adapt any conditions.
	 */
	ssleep(1UL);

	/* add root port and downstream devices */
	pci_lock_rescan_remove();
	pci_rescan_bus(root_bus);
	pci_unlock_rescan_remove();

	return 0;
}

static void hisi_pcie_handle_error(struct platform_device *pdev,
				   const struct hisi_pcie_error_data *edata)
{
	struct device *dev = &pdev->dev;
	int idx, rc;
	const unsigned long valid_bits[] = {BITMAP_FROM_U64(edata->val_bits)};

	if (edata->val_bits == 0) {
		dev_warn(dev, "%s: no valid error information\n", __func__);
		return;
	}

	dev_info(dev, "\nHISI : HIP : PCIe controller error\n");
	if (edata->val_bits & HISI_PCIE_LOCAL_VALID_SOC_ID)
		dev_info(dev, "Table version = %d\n", edata->version);
	if (edata->val_bits & HISI_PCIE_LOCAL_VALID_SOCKET_ID)
		dev_info(dev, "Socket ID = %d\n", edata->socket_id);
	if (edata->val_bits & HISI_PCIE_LOCAL_VALID_NIMBUS_ID)
		dev_info(dev, "Nimbus ID = %d\n", edata->nimbus_id);
	if (edata->val_bits & HISI_PCIE_LOCAL_VALID_SUB_MODULE_ID)
		dev_info(dev, "Sub Module = %s\n",
			 hisi_pcie_get_string(hisi_pcie_sub_module,
					      ARRAY_SIZE(hisi_pcie_sub_module),
					      edata->sub_module_id));
	if (edata->val_bits & HISI_PCIE_LOCAL_VALID_CORE_ID)
		dev_info(dev, "Core ID = core%d\n", edata->core_id);
	if (edata->val_bits & HISI_PCIE_LOCAL_VALID_PORT_ID)
		dev_info(dev, "Port ID = port%d\n", edata->port_id);
	if (edata->val_bits & HISI_PCIE_LOCAL_VALID_ERR_SEVERITY)
		dev_info(dev, "Error severity = %s\n",
			 hisi_pcie_get_string(hisi_pcie_error_sev,
					      ARRAY_SIZE(hisi_pcie_error_sev),
					      edata->err_severity));
	if (edata->val_bits & HISI_PCIE_LOCAL_VALID_ERR_TYPE)
		dev_info(dev, "Error type = 0x%x\n", edata->err_type);

	dev_info(dev, "Reg Dump:\n");
	idx = HISI_PCIE_LOCAL_VALID_ERR_MISC;
	for_each_set_bit_from(idx, valid_bits,
			      HISI_PCIE_LOCAL_VALID_ERR_MISC + HISI_PCIE_ERR_MISC_REGS)
		dev_info(dev, "ERR_MISC_%d = 0x%x\n", idx - HISI_PCIE_LOCAL_VALID_ERR_MISC,
			 edata->err_misc[idx - HISI_PCIE_LOCAL_VALID_ERR_MISC]);

	if (edata->err_severity != HISI_PCIE_ERR_SEV_RECOVERABLE)
		return;

	/* Recovery for the PCIe controller errors, try reset
	 * PCI port for the error recovery
	 */
	rc = hisi_pcie_port_do_recovery(pdev, edata->socket_id,
			HISI_PCIE_PORT_ID(edata->core_id, edata->port_id));
	if (rc)
		dev_info(dev, "fail to do hisi pcie port reset\n");
}

static int hisi_pcie_notify_error(struct notifier_block *nb,
				  unsigned long event, void *data)
{
	struct acpi_hest_generic_data *gdata = data;
	const struct hisi_pcie_error_data *error_data = acpi_hest_get_payload(gdata);
	struct hisi_pcie_error_private *priv;
	struct device *dev;
	struct platform_device *pdev;
	guid_t err_sec_guid;
	u8 socket;

	import_guid(&err_sec_guid, gdata->section_type);
	if (!guid_equal(&err_sec_guid, &hisi_pcie_sec_guid))
		return NOTIFY_DONE;

	priv = container_of(nb, struct hisi_pcie_error_private, nb);
	dev = priv->dev;

	if (device_property_read_u8(dev, "socket", &socket))
		return NOTIFY_DONE;

	if (error_data->socket_id != socket)
		return NOTIFY_DONE;

	pdev = container_of(dev, struct platform_device, dev);
	hisi_pcie_handle_error(pdev, error_data);

	return NOTIFY_OK;
}

static int hisi_pcie_error_handler_probe(struct platform_device *pdev)
{
	struct hisi_pcie_error_private *priv;
	int ret;

	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	priv->nb.notifier_call = hisi_pcie_notify_error;
	priv->dev = &pdev->dev;
	ret = ghes_register_vendor_record_notifier(&priv->nb);
	if (ret) {
		dev_err(&pdev->dev,
			"Failed to register hisi pcie controller error handler with apei\n");
		return ret;
	}

	platform_set_drvdata(pdev, priv);

	return 0;
}

static int hisi_pcie_error_handler_remove(struct platform_device *pdev)
{
	struct hisi_pcie_error_private *priv = platform_get_drvdata(pdev);

	ghes_unregister_vendor_record_notifier(&priv->nb);

	return 0;
}

static const struct acpi_device_id hisi_pcie_acpi_match[] = {
	{ "HISI0361", 0 },
	{ }
};

static struct platform_driver hisi_pcie_error_handler_driver = {
	.driver = {
		.name	= "hisi-pcie-error-handler",
		.acpi_match_table = hisi_pcie_acpi_match,
	},
	.probe		= hisi_pcie_error_handler_probe,
	.remove		= hisi_pcie_error_handler_remove,
};
module_platform_driver(hisi_pcie_error_handler_driver);

MODULE_DESCRIPTION("HiSilicon HIP PCIe controller error handling driver");
MODULE_LICENSE("GPL v2");
+18 −0
Original line number Diff line number Diff line
@@ -53,6 +53,24 @@ enum {
	GHES_SEV_PANIC = 0x3,
};

#ifdef CONFIG_ACPI_APEI_GHES
/**
 * ghes_register_vendor_record_notifier - register a notifier for vendor
 * records that the kernel would otherwise ignore.
 * @nb: pointer to the notifier_block structure of the event handler.
 *
 * return 0 : SUCCESS, non-zero : FAIL
 */
int ghes_register_vendor_record_notifier(struct notifier_block *nb);

/**
 * ghes_unregister_vendor_record_notifier - unregister the previously
 * registered vendor record notifier.
 * @nb: pointer to the notifier_block structure of the vendor record handler.
 */
void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);
#endif

int ghes_estatus_pool_init(int num_ghes);

/* From drivers/edac/ghes_edac.c */