Commit 8fc5c735 authored by Dan Williams
Browse files

acpi/nfit, device-dax: Identify differentiated memory with a unique numa-node

Persistent memory, as described by the ACPI NFIT (NVDIMM Firmware
Interface Table), is the first known instance of a memory range
described by a unique "target" proximity domain. The split into
"initiator" and "target" proximity domains is the approach that the ACPI
HMAT (Heterogeneous Memory Attributes Table) uses to describe the unique
performance properties of a memory range relative to a given initiator
(e.g. CPU or DMA device).

Currently the numa-node for a /dev/pmemX block-device or /dev/daxX.Y
char-device follows the traditional notion of 'numa-node' where the
attribute conveys the closest online numa-node. That numa-node attribute
is useful for cpu-binding and memory-binding processes *near* the
device. However, when the memory range backing a 'pmem' or 'dax' device
is onlined (memory hot-add), the memory-only-numa-node representing that
address range needs to be differentiated from the set of online nodes.
In other words, the numa-node association of the device depends on
whether you can bind processes *near* the cpu-numa-node in the offline
device-case, or bind processes *on* the memory-range directly after the
backing address range is onlined.

Allow for the case that platform firmware describes persistent memory
with a unique proximity domain, i.e. when it is distinct from the
proximity of DRAM and CPUs that are on the same socket. Plumb the Linux
numa-node translation of that proximity through the libnvdimm region
device to namespaces that are in device-dax mode. With this in place the
proposed kmem driver [1] can optionally discover a unique numa-node
number for the address range as it transitions the memory from an
offline state managed by a device-driver to an online memory range
managed by the core-mm.

[1]: https://lore.kernel.org/lkml/20181022201317.8558C1D8@viggo.jf.intel.com



Reported-by: Fan Du <fan.du@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
parent 730926c3
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -236,6 +236,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
	memset(&ndr_desc, 0, sizeof(ndr_desc));
	ndr_desc.attr_groups = region_attr_groups;
	ndr_desc.numa_node = dev_to_node(&p->pdev->dev);
	ndr_desc.target_node = ndr_desc.numa_node;
	ndr_desc.res = &p->res;
	ndr_desc.of_node = p->dn;
	ndr_desc.provider_data = p;
+6 −2
Original line number Diff line number Diff line
@@ -2869,11 +2869,15 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
	ndr_desc->res = &res;
	ndr_desc->provider_data = nfit_spa;
	ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
	if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
	if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) {
		ndr_desc->numa_node = acpi_map_pxm_to_online_node(
						spa->proximity_domain);
	else
		ndr_desc->target_node = acpi_map_pxm_to_node(
				spa->proximity_domain);
	} else {
		ndr_desc->numa_node = NUMA_NO_NODE;
		ndr_desc->target_node = NUMA_NO_NODE;
	}

	/*
	 * Persistence domain bits are hierarchical, if
+1 −0
Original line number Diff line number Diff line
@@ -84,6 +84,7 @@ int acpi_map_pxm_to_node(int pxm)

	return node;
}
EXPORT_SYMBOL(acpi_map_pxm_to_node);

/**
 * acpi_map_pxm_to_online_node - Map proximity ID to online node
+3 −1
Original line number Diff line number Diff line
@@ -214,7 +214,7 @@ static void dax_region_unregister(void *region)
}

struct dax_region *alloc_dax_region(struct device *parent, int region_id,
		struct resource *res, unsigned int align,
		struct resource *res, int target_node, unsigned int align,
		unsigned long pfn_flags)
{
	struct dax_region *dax_region;
@@ -244,6 +244,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
	dax_region->id = region_id;
	dax_region->align = align;
	dax_region->dev = parent;
	dax_region->target_node = target_node;
	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
		kfree(dax_region);
		return NULL;
@@ -348,6 +349,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,

	dev_dax->dax_dev = dax_dev;
	dev_dax->region = dax_region;
	dev_dax->target_node = dax_region->target_node;
	kref_get(&dax_region->kref);

	inode = dax_inode(dax_dev);
+2 −1
Original line number Diff line number Diff line
@@ -10,7 +10,8 @@ struct dax_device;
struct dax_region;
void dax_region_put(struct dax_region *dax_region);
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
		struct resource *res, unsigned int align, unsigned long flags);
		struct resource *res, int target_node, unsigned int align,
		unsigned long flags);

enum dev_dax_subsys {
	DEV_DAX_BUS,
Loading