Commit fffe3ae0 authored by Linus Torvalds
Pull hmm updates from Jason Gunthorpe:
 "Ralph has been working on nouveau's use of hmm_range_fault() and
  migrate_vma() which resulted in this small series. It adds reporting
  of the page table order from hmm_range_fault() and some optimization
  of migrate_vma():

   - Report the size of the page table mapping out of hmm_range_fault().

     This makes it easier to establish a large/huge/etc mapping in the
     device's page table.

   - Allow devices to ignore the invalidations during migration in cases
     where the migration is not going to change pages.

     For instance, migrating pages to a device does not require the
     device to invalidate pages already in the device.

   - Update nouveau and hmm_tests to use the above"
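For context on the first bullet: the new order reporting is exposed through the
hmm_pfn_to_map_order() helper, which returns the order of the CPU page table
entry that mapped the page. A minimal sketch of how a driver might consume it
after a successful hmm_range_fault(); my_device_map() is a hypothetical driver
helper, not a kernel API:

/* Sketch only: consume the map order hmm_range_fault() now reports.
 * my_device_map() is hypothetical.
 */
#include <linux/hmm.h>

void my_device_map(unsigned long addr, unsigned long size, bool writable);

static void sketch_map_one(struct hmm_range *range)
{
	unsigned long hmm_pfn = range->hmm_pfns[0];
	/* 0 for a 4KiB PTE, 9 for a 2MB PMD on x86-64, and so on. */
	unsigned int order = hmm_pfn_to_map_order(hmm_pfn);
	unsigned long size = PAGE_SIZE << order;
	/* Round down so one large device PTE covers the whole CPU mapping. */
	unsigned long addr = range->start & ~(size - 1);

	my_device_map(addr, size, hmm_pfn & HMM_PFN_WRITE);
}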

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  mm/hmm/test: use the new migration invalidation
  nouveau/svm: use the new migration invalidation
  mm/notifier: add migration invalidation type
  mm/migrate: add a flags parameter to migrate_vma
  nouveau: fix storing invalid ptes
  nouveau/hmm: support mapping large sysmem pages
  nouveau: fix mapping 2MB sysmem pages
  nouveau/hmm: fault one page at a time
  mm/hmm: add tests for hmm_pfn_to_map_order()
  mm/hmm: provide the page mapping order in hmm_range_fault()
parents 8f7be629 7d17e83a
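The second bullet works by tagging migration invalidations with
MMU_NOTIFY_MIGRATE and the pgmap owner, so a driver's notifier callback can
skip invalidations it caused itself; the nouveau_svm.c hunk below does exactly
this. A generic sketch under the same assumptions, where my_dev stands in for
the cookie the driver put in migrate_vma.pgmap_owner:

/* Sketch only: an interval-notifier callback that ignores invalidations
 * raised by the driver's own migrate_vma() calls.
 */
#include <linux/mmu_notifier.h>

extern void *my_dev;	/* hypothetical pgmap owner cookie */

static bool my_invalidate(struct mmu_interval_notifier *mni,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	/* Migration to the device does not change the pages the device
	 * already holds, so there is nothing to invalidate.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->migrate_pgmap_owner == my_dev)
		return true;

	mmu_interval_set_seq(mni, cur_seq);
	/* ... tear down device PTEs covering [range->start, range->end) ... */
	return true;
}

static const struct mmu_interval_notifier_ops my_mni_ops = {
	.invalidate = my_invalidate,
};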
+3 −1
@@ -400,6 +400,7 @@ kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	mig.end = end;
 	mig.src = &src_pfn;
 	mig.dst = &dst_pfn;
+	mig.flags = MIGRATE_VMA_SELECT_SYSTEM;
 
 	/*
 	 * We come here with mmap_lock write lock held just for
@@ -577,7 +578,8 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start,
 	mig.end = end;
 	mig.src = &src_pfn;
 	mig.dst = &dst_pfn;
-	mig.src_owner = &kvmppc_uvmem_pgmap;
+	mig.pgmap_owner = &kvmppc_uvmem_pgmap;
+	mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
 
 	mutex_lock(&kvm->arch.uvmem_lock);
 	/* The requested page is already paged-out, nothing to do */
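As the hunks above show, migrate_vma callers now state explicitly which pages
they are willing to migrate (MIGRATE_VMA_SELECT_SYSTEM for ordinary system
memory, MIGRATE_VMA_SELECT_DEVICE_PRIVATE for device private pages), replacing
the old implicit src_owner test. A condensed sketch of the calling convention
for a single page, with the data copy elided; my_owner is assumed to be the
same cookie used when registering the device-private pagemap:

/* Sketch only: migrate one device-private page back to system memory. */
#include <linux/migrate.h>

static int sketch_migrate_one(struct vm_area_struct *vma, unsigned long addr,
			      void *my_owner)
{
	unsigned long src_pfn = 0, dst_pfn = 0;
	struct migrate_vma args = {
		.vma		= vma,
		.start		= addr,
		.end		= addr + PAGE_SIZE,
		.src		= &src_pfn,
		.dst		= &dst_pfn,
		.pgmap_owner	= my_owner,
		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
	};
	int ret = migrate_vma_setup(&args);

	if (ret)
		return ret;
	if (src_pfn & MIGRATE_PFN_MIGRATE) {
		/* ... allocate a destination page, copy the data, and set
		 * dst_pfn = migrate_pfn(page_to_pfn(dpage)) ...
		 */
		migrate_vma_pages(&args);
		migrate_vma_finalize(&args);
	}
	return 0;
}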
+15 −4
@@ -140,6 +140,7 @@ static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
 {
 	struct device *dev = drm->dev->dev;
 	struct page *dpage, *spage;
+	struct nouveau_svmm *svmm;
 
 	spage = migrate_pfn_to_page(args->src[0]);
 	if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE))
@@ -154,14 +155,19 @@ static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
 	if (dma_mapping_error(dev, *dma_addr))
 		goto error_free_page;
 
+	svmm = spage->zone_device_data;
+	mutex_lock(&svmm->mutex);
+	nouveau_svmm_invalidate(svmm, args->start, args->end);
 	if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr,
 			NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage)))
 		goto error_dma_unmap;
+	mutex_unlock(&svmm->mutex);
 
 	args->dst[0] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
 	return 0;
 
 error_dma_unmap:
+	mutex_unlock(&svmm->mutex);
 	dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
 error_free_page:
 	__free_page(dpage);
@@ -182,7 +188,8 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
 		.end		= vmf->address + PAGE_SIZE,
 		.src		= &src,
 		.dst		= &dst,
-		.src_owner	= drm->dev,
+		.pgmap_owner	= drm->dev,
+		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
 	};
 
 	/*
@@ -530,7 +537,8 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 }
 
 static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
-		unsigned long src, dma_addr_t *dma_addr, u64 *pfn)
+		struct nouveau_svmm *svmm, unsigned long src,
+		dma_addr_t *dma_addr, u64 *pfn)
 {
 	struct device *dev = drm->dev->dev;
 	struct page *dpage, *spage;
@@ -560,6 +568,7 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
 			goto out_free_page;
 	}
 
+	dpage->zone_device_data = svmm;
 	*pfn = NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_VRAM |
 		((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT);
 	if (src & MIGRATE_PFN_WRITE)
@@ -583,8 +592,8 @@ static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm,
 	unsigned long addr = args->start, nr_dma = 0, i;
 
 	for (i = 0; addr < args->end; i++) {
-		args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->src[i],
-				dma_addrs + nr_dma, pfns + i);
+		args->dst[i] = nouveau_dmem_migrate_copy_one(drm, svmm,
+				args->src[i], dma_addrs + nr_dma, pfns + i);
 		if (!dma_mapping_error(drm->dev->dev, dma_addrs[nr_dma]))
 			nr_dma++;
 		addr += PAGE_SIZE;
@@ -615,6 +624,8 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
 	struct migrate_vma args = {
 		.vma		= vma,
 		.start		= start,
+		.pgmap_owner	= drm->dev,
+		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
 	};
 	unsigned long i;
 	u64 *pfns;
+107 −150
@@ -93,17 +93,6 @@ nouveau_ivmm_find(struct nouveau_svm *svm, u64 inst)
 	return NULL;
 }
 
-struct nouveau_svmm {
-	struct mmu_notifier notifier;
-	struct nouveau_vmm *vmm;
-	struct {
-		unsigned long start;
-		unsigned long limit;
-	} unmanaged;
-
-	struct mutex mutex;
-};
-
 #define SVMM_DBG(s,f,a...)                                                     \
 	NV_DEBUG((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a)
 #define SVMM_ERR(s,f,a...)                                                     \
@@ -246,7 +235,7 @@ nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst)
 }
 
 /* Invalidate SVMM address-range on GPU. */
-static void
+void
 nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit)
 {
 	if (limit > start) {
@@ -279,6 +268,14 @@ nouveau_svmm_invalidate_range_start(struct mmu_notifier *mn,
 	if (unlikely(!svmm->vmm))
 		goto out;
 
+	/*
+	 * Ignore invalidation callbacks for device private pages since
+	 * the invalidation is handled as part of the migration process.
+	 */
+	if (update->event == MMU_NOTIFY_MIGRATE &&
+	    update->migrate_pgmap_owner == svmm->vmm->cli->drm->dev)
+		goto out;
+
 	if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) {
 		if (start < svmm->unmanaged.start) {
 			nouveau_svmm_invalidate(svmm, start,
@@ -514,53 +511,68 @@ static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = {
 };
 
 static void nouveau_hmm_convert_pfn(struct nouveau_drm *drm,
-				    struct hmm_range *range, u64 *ioctl_addr)
+				    struct hmm_range *range,
+				    struct nouveau_pfnmap_args *args)
 {
-	unsigned long i, npages;
+	struct page *page;
 
 	/*
-	 * The ioctl_addr prepared here is passed through nvif_object_ioctl()
+	 * The address prepared here is passed through nvif_object_ioctl()
 	 * to an eventual DMA map in something like gp100_vmm_pgt_pfn()
 	 *
 	 * This is all just encoding the internal hmm representation into a
 	 * different nouveau internal representation.
 	 */
-	npages = (range->end - range->start) >> PAGE_SHIFT;
-	for (i = 0; i < npages; ++i) {
-		struct page *page;
-
-		if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) {
-			ioctl_addr[i] = 0;
-			continue;
+	if (!(range->hmm_pfns[0] & HMM_PFN_VALID)) {
+		args->p.phys[0] = 0;
+		return;
+	}
 
-		page = hmm_pfn_to_page(range->hmm_pfns[i]);
+	page = hmm_pfn_to_page(range->hmm_pfns[0]);
+	/*
+	 * Only map compound pages to the GPU if the CPU is also mapping the
+	 * page as a compound page. Otherwise, the PTE protections might not be
+	 * consistent (e.g., CPU only maps part of a compound page).
+	 * Note that the underlying page might still be larger than the
+	 * CPU mapping (e.g., a PUD sized compound page partially mapped with
+	 * a PMD sized page table entry).
+	 */
+	if (hmm_pfn_to_map_order(range->hmm_pfns[0])) {
+		unsigned long addr = args->p.addr;
+
+		args->p.page = hmm_pfn_to_map_order(range->hmm_pfns[0]) +
+				PAGE_SHIFT;
+		args->p.size = 1UL << args->p.page;
+		args->p.addr &= ~(args->p.size - 1);
+		page -= (addr - args->p.addr) >> PAGE_SHIFT;
+	}
 	if (is_device_private_page(page))
-			ioctl_addr[i] = nouveau_dmem_page_addr(page) |
+		args->p.phys[0] = nouveau_dmem_page_addr(page) |
 				NVIF_VMM_PFNMAP_V0_V |
 				NVIF_VMM_PFNMAP_V0_VRAM;
 	else
-			ioctl_addr[i] = page_to_phys(page) |
+		args->p.phys[0] = page_to_phys(page) |
 				NVIF_VMM_PFNMAP_V0_V |
 				NVIF_VMM_PFNMAP_V0_HOST;
-		if (range->hmm_pfns[i] & HMM_PFN_WRITE)
-			ioctl_addr[i] |= NVIF_VMM_PFNMAP_V0_W;
-	}
+	if (range->hmm_pfns[0] & HMM_PFN_WRITE)
+		args->p.phys[0] |= NVIF_VMM_PFNMAP_V0_W;
 }
 
 static int nouveau_range_fault(struct nouveau_svmm *svmm,
-			       struct nouveau_drm *drm, void *data, u32 size,
-			       unsigned long hmm_pfns[], u64 *ioctl_addr,
+			       struct nouveau_drm *drm,
+			       struct nouveau_pfnmap_args *args, u32 size,
+			       unsigned long hmm_flags,
 			       struct svm_notifier *notifier)
 {
 	unsigned long timeout =
 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
 	/* Have HMM fault pages within the fault window to the GPU. */
+	unsigned long hmm_pfns[1];
 	struct hmm_range range = {
 		.notifier = &notifier->notifier,
 		.start = notifier->notifier.interval_tree.start,
 		.end = notifier->notifier.interval_tree.last + 1,
-		.pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
+		.default_flags = hmm_flags,
 		.hmm_pfns = hmm_pfns,
 		.dev_private_owner = drm->dev,
 	};
@@ -576,11 +588,6 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
 		ret = hmm_range_fault(&range);
 		mmap_read_unlock(mm);
 		if (ret) {
-			/*
-			 * FIXME: the input PFN_REQ flags are destroyed on
-			 * -EBUSY, we need to regenerate them, also for the
-			 * other continue below
-			 */
 			if (ret == -EBUSY)
 				continue;
 			return ret;
@@ -595,10 +602,10 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
 		break;
 	}
 
-	nouveau_hmm_convert_pfn(drm, &range, ioctl_addr);
+	nouveau_hmm_convert_pfn(drm, &range, args);
 
 	svmm->vmm->vmm.object.client->super = true;
-	ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL);
+	ret = nvif_object_ioctl(&svmm->vmm->vmm.object, args, size, NULL);
 	svmm->vmm->vmm.object.client->super = false;
 	mutex_unlock(&svmm->mutex);
 
@@ -615,17 +622,12 @@ nouveau_svm_fault(struct nvif_notify *notify)
 	struct nvif_object *device = &svm->drm->client.device.object;
 	struct nouveau_svmm *svmm;
 	struct {
-		struct {
-			struct nvif_ioctl_v0 i;
-			struct nvif_ioctl_mthd_v0 m;
-			struct nvif_vmm_pfnmap_v0 p;
-		} i;
-		u64 phys[16];
+		struct nouveau_pfnmap_args i;
+		u64 phys[1];
 	} args;
-	unsigned long hmm_pfns[ARRAY_SIZE(args.phys)];
-	struct vm_area_struct *vma;
+	unsigned long hmm_flags;
 	u64 inst, start, limit;
-	int fi, fn, pi, fill;
+	int fi, fn;
 	int replay = 0, ret;
 
 	/* Parse available fault buffer entries into a cache, and update
@@ -692,129 +694,84 @@ nouveau_svm_fault(struct nvif_notify *notify)
 		 * window into a single update.
 		 */
 		start = buffer->fault[fi]->addr;
-		limit = start + (ARRAY_SIZE(args.phys) << PAGE_SHIFT);
+		limit = start + PAGE_SIZE;
 		if (start < svmm->unmanaged.limit)
 			limit = min_t(u64, limit, svmm->unmanaged.start);
 		SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
 
-		mm = svmm->notifier.mm;
-		if (!mmget_not_zero(mm)) {
-			nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
-			continue;
-		}
-
-		/* Intersect fault window with the CPU VMA, cancelling
-		 * the fault if the address is invalid.
-		 */
-		mmap_read_lock(mm);
-		vma = find_vma_intersection(mm, start, limit);
-		if (!vma) {
-			SVMM_ERR(svmm, "wndw %016llx-%016llx", start, limit);
-			mmap_read_unlock(mm);
-			mmput(mm);
-			nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
-			continue;
-		}
-		start = max_t(u64, start, vma->vm_start);
-		limit = min_t(u64, limit, vma->vm_end);
-		mmap_read_unlock(mm);
-		SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
-
-		if (buffer->fault[fi]->addr != start) {
-			SVMM_ERR(svmm, "addr %016llx", buffer->fault[fi]->addr);
-			mmput(mm);
-			nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
-			continue;
-		}
 
-		/* Prepare the GPU-side update of all pages within the
+		/*
+		 * Prepare the GPU-side update of all pages within the
 		 * fault window, determining required pages and access
 		 * permissions based on pending faults.
 		 */
-		args.i.p.page = PAGE_SHIFT;
 		args.i.p.addr = start;
-		for (fn = fi, pi = 0;;) {
-			/* Determine required permissions based on GPU fault
+		args.i.p.page = PAGE_SHIFT;
+		args.i.p.size = PAGE_SIZE;
+		/*
+		 * Determine required permissions based on GPU fault
 		 * access flags.
 		 * XXX: atomic?
 		 */
-			switch (buffer->fault[fn]->access) {
+		switch (buffer->fault[fi]->access) {
 		case 0: /* READ. */
-				hmm_pfns[pi++] = HMM_PFN_REQ_FAULT;
+			hmm_flags = HMM_PFN_REQ_FAULT;
 			break;
 		case 3: /* PREFETCH. */
-				hmm_pfns[pi++] = 0;
+			hmm_flags = 0;
 			break;
 		default:
-				hmm_pfns[pi++] = HMM_PFN_REQ_FAULT |
-						 HMM_PFN_REQ_WRITE;
+			hmm_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
 			break;
 		}
-			args.i.p.size = pi << PAGE_SHIFT;
 
-			/* It's okay to skip over duplicate addresses from the
-			 * same SVMM as faults are ordered by access type such
-			 * that only the first one needs to be handled.
-			 *
-			 * ie. WRITE faults appear first, thus any handling of
-			 * pending READ faults will already be satisfied.
-			 */
-			while (++fn < buffer->fault_nr &&
-			       buffer->fault[fn]->svmm == svmm &&
-			       buffer->fault[fn    ]->addr ==
-			       buffer->fault[fn - 1]->addr);
-
-			/* If the next fault is outside the window, or all GPU
-			 * faults have been dealt with, we're done here.
-			 */
-			if (fn >= buffer->fault_nr ||
-			    buffer->fault[fn]->svmm != svmm ||
-			    buffer->fault[fn]->addr >= limit)
-				break;
-
-			/* Fill in the gap between this fault and the next. */
-			fill = (buffer->fault[fn    ]->addr -
-				buffer->fault[fn - 1]->addr) >> PAGE_SHIFT;
-			while (--fill)
-				hmm_pfns[pi++] = 0;
+		mm = svmm->notifier.mm;
+		if (!mmget_not_zero(mm)) {
+			nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
+			continue;
 		}
 
-		SVMM_DBG(svmm, "wndw %016llx-%016llx covering %d fault(s)",
-			 args.i.p.addr,
-			 args.i.p.addr + args.i.p.size, fn - fi);
-
 		notifier.svmm = svmm;
-		ret = mmu_interval_notifier_insert(&notifier.notifier,
-						   svmm->notifier.mm,
+		ret = mmu_interval_notifier_insert(&notifier.notifier, mm,
 						   args.i.p.addr, args.i.p.size,
 						   &nouveau_svm_mni_ops);
 		if (!ret) {
-			ret = nouveau_range_fault(
-				svmm, svm->drm, &args,
-				sizeof(args.i) + pi * sizeof(args.phys[0]),
-				hmm_pfns, args.phys, &notifier);
+			ret = nouveau_range_fault(svmm, svm->drm, &args.i,
+				sizeof(args), hmm_flags, &notifier);
 			mmu_interval_notifier_remove(&notifier.notifier);
 		}
 		mmput(mm);
 
-		/* Cancel any faults in the window whose pages didn't manage
-		 * to keep their valid bit, or stay writeable when required.
+		limit = args.i.p.addr + args.i.p.size;
+		for (fn = fi; ++fn < buffer->fault_nr; ) {
+			/* It's okay to skip over duplicate addresses from the
+			 * same SVMM as faults are ordered by access type such
+			 * that only the first one needs to be handled.
+			 *
-		 * If handling failed completely, cancel all faults.
+			 * ie. WRITE faults appear first, thus any handling of
+			 * pending READ faults will already be satisfied.
+			 * But if a large page is mapped, make sure subsequent
+			 * fault addresses have sufficient access permission.
 			 */
+			if (buffer->fault[fn]->svmm != svmm ||
+			    buffer->fault[fn]->addr >= limit ||
+			    (buffer->fault[fi]->access == 0 /* READ. */ &&
+			     !(args.phys[0] & NVIF_VMM_PFNMAP_V0_V)) ||
+			    (buffer->fault[fi]->access != 0 /* READ. */ &&
+			     buffer->fault[fi]->access != 3 /* PREFETCH. */ &&
+			     !(args.phys[0] & NVIF_VMM_PFNMAP_V0_W)))
+				break;
+		}
+
+		/* If handling failed completely, cancel all faults. */
+		if (ret) {
 			while (fi < fn) {
-			struct nouveau_svm_fault *fault = buffer->fault[fi++];
-			pi = (fault->addr - args.i.p.addr) >> PAGE_SHIFT;
-			if (ret ||
-			     !(args.phys[pi] & NVIF_VMM_PFNMAP_V0_V) ||
-			    (!(args.phys[pi] & NVIF_VMM_PFNMAP_V0_W) &&
-			     fault->access != 0 && fault->access != 3)) {
+				struct nouveau_svm_fault *fault =
+					buffer->fault[fi++];
+
 				nouveau_svm_fault_cancel_fault(svm, fault);
-				continue;
 			}
-			replay++;
-		}
+		} else
+			replay++;
 	}
 
 	/* Issue fault replay to the GPU. */
 	if (replay)
+12 −1
 #ifndef __NOUVEAU_SVM_H__
 #define __NOUVEAU_SVM_H__
 #include <nvif/os.h>
+#include <linux/mmu_notifier.h>
 struct drm_device;
 struct drm_file;
 struct nouveau_drm;
 
-struct nouveau_svmm;
+struct nouveau_svmm {
+	struct mmu_notifier notifier;
+	struct nouveau_vmm *vmm;
+	struct {
+		unsigned long start;
+		unsigned long limit;
+	} unmanaged;
+
+	struct mutex mutex;
+};
 
 #if IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM)
 void nouveau_svm_init(struct nouveau_drm *);
@@ -19,6 +29,7 @@ int nouveau_svmm_join(struct nouveau_svmm *, u64 inst);
 void nouveau_svmm_part(struct nouveau_svmm *, u64 inst);
 int nouveau_svmm_bind(struct drm_device *, void *, struct drm_file *);
 
+void nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit);
 u64 *nouveau_pfns_alloc(unsigned long npages);
 void nouveau_pfns_free(u64 *pfns);
 void nouveau_pfns_map(struct nouveau_svmm *svmm, struct mm_struct *mm,
+2 −3
@@ -1204,7 +1204,6 @@ nvkm_vmm_pfn_unmap(struct nvkm_vmm *vmm, u64 addr, u64 size)
 /*TODO:
  * - Avoid PT readback (for dma_unmap etc), this might end up being dealt
  *   with inside HMM, which would be a lot nicer for us to deal with.
- * - Multiple page sizes (particularly for huge page support).
  * - Support for systems without a 4KiB page size.
  */
 int
@@ -1220,8 +1219,8 @@ nvkm_vmm_pfn_map(struct nvkm_vmm *vmm, u8 shift, u64 addr, u64 size, u64 *pfn)
 	/* Only support mapping where the page size of the incoming page
 	 * array matches a page size available for direct mapping.
 	 */
-	while (page->shift && page->shift != shift &&
-	       page->desc->func->pfn == NULL)
+	while (page->shift && (page->shift != shift ||
+	       page->desc->func->pfn == NULL))
 		page++;
 
 	if (!page->shift || !IS_ALIGNED(addr, 1ULL << shift) ||