Commit cd758a9b authored by Paul Mackerras

KVM: PPC: Book3S HV: Use __gfn_to_pfn_memslot in HPT page fault handler



This makes the same changes in the page fault handler for HPT guests
that commits 31c8b0d0 ("KVM: PPC: Book3S HV: Use __gfn_to_pfn_memslot()
in page fault handler", 2018-03-01), 71d29f43 ("KVM: PPC: Book3S HV:
Don't use compound_order to determine host mapping size", 2018-09-11)
and 6579804c ("KVM: PPC: Book3S HV: Avoid crash from THP collapse
during radix page fault", 2018-10-04) made for the page fault handler
for radix guests.

In summary, where we used to call get_user_pages_fast() and then do
special handling for VM_PFNMAP vmas, we now call __get_user_pages_fast()
and then __gfn_to_pfn_memslot() if that fails, followed by reading the
Linux PTE to get the host PFN, host page size and mapping attributes.
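
In outline, the new flow is (a condensed sketch of the code added
below; error handling and surrounding context are elided, so see the
diff for the exact code):

	/* Fast path: try to grab the page without taking mmap_sem */
	if (__get_user_pages_fast(hva, 1, 1, &page) == 1) {
		write_ok = true;
	} else {
		/* Slow path: generic KVM code resolves the gfn to a pfn */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
					   writing, &write_ok);
	}
	/* then, with interrupts off, read the Linux PTE to get the
	   host PFN, host page size and cache attributes */
	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);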

This also brings in the change from SetPageDirty() to set_page_dirty_lock()
which was done for the radix page fault handler in commit c3856aeb
("KVM: PPC: Book3S HV: Fix handling of large pages in radix page fault
handler", 2018-02-23).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
parent 1c482452
+57 −62
@@ -485,18 +485,18 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	__be64 *hptep;
 	unsigned long mmu_seq, psize, pte_size;
 	unsigned long gpa_base, gfn_base;
-	unsigned long gpa, gfn, hva, pfn;
+	unsigned long gpa, gfn, hva, pfn, hpa;
 	struct kvm_memory_slot *memslot;
 	unsigned long *rmap;
 	struct revmap_entry *rev;
-	struct page *page, *pages[1];
-	long index, ret, npages;
+	struct page *page;
+	long index, ret;
 	bool is_ci;
-	unsigned int writing, write_ok;
-	struct vm_area_struct *vma;
+	bool writing, write_ok;
+	unsigned int shift;
 	unsigned long rcbits;
 	long mmio_update;
-	struct mm_struct *mm;
+	pte_t pte, *ptep;
 
 	if (kvm_is_radix(kvm))
 		return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
@@ -570,59 +570,62 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	smp_rmb();
 
 	ret = -EFAULT;
-	is_ci = false;
-	pfn = 0;
 	page = NULL;
-	mm = kvm->mm;
-	pte_size = PAGE_SIZE;
 	writing = (dsisr & DSISR_ISSTORE) != 0;
 	/* If writing != 0, then the HPTE must allow writing, if we get here */
 	write_ok = writing;
 	hva = gfn_to_hva_memslot(memslot, gfn);
-	npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages);
-	if (npages < 1) {
-		/* Check if it's an I/O mapping */
-		down_read(&mm->mmap_sem);
-		vma = find_vma(mm, hva);
-		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
-		    (vma->vm_flags & VM_PFNMAP)) {
-			pfn = vma->vm_pgoff +
-				((hva - vma->vm_start) >> PAGE_SHIFT);
-			pte_size = psize;
-			is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
-			write_ok = vma->vm_flags & VM_WRITE;
-		}
-		up_read(&mm->mmap_sem);
-		if (!pfn)
-			goto out_put;
+
+	/*
+	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
+	 * do it with !atomic && !async, which is how we call it.
+	 * We always ask for write permission since the common case
+	 * is that the page is writable.
+	 */
+	if (__get_user_pages_fast(hva, 1, 1, &page) == 1) {
+		write_ok = true;
 	} else {
-		page = pages[0];
-		pfn = page_to_pfn(page);
-		if (PageHuge(page)) {
-			page = compound_head(page);
-			pte_size <<= compound_order(page);
-		}
-		/* if the guest wants write access, see if that is OK */
-		if (!writing && hpte_is_writable(r)) {
-			pte_t *ptep, pte;
-			unsigned long flags;
-			/*
-			 * We need to protect against page table destruction
-			 * hugepage split and collapse.
-			 */
-			local_irq_save(flags);
-			ptep = find_current_mm_pte(mm->pgd, hva, NULL, NULL);
-			if (ptep) {
-				pte = kvmppc_read_update_linux_pte(ptep, 1);
-				if (__pte_write(pte))
-					write_ok = 1;
-			}
-			local_irq_restore(flags);
+		/* Call KVM generic code to do the slow-path check */
+		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+					   writing, &write_ok);
+		if (is_error_noslot_pfn(pfn))
+			return -EFAULT;
+		page = NULL;
+		if (pfn_valid(pfn)) {
+			page = pfn_to_page(pfn);
+			if (PageReserved(page))
+				page = NULL;
 		}
 	}
 
+	/*
+	 * Read the PTE from the process' radix tree and use that
+	 * so we get the shift and attribute bits.
+	 */
+	local_irq_disable();
+	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+	/*
+	 * If the PTE disappeared temporarily due to a THP
+	 * collapse, just return and let the guest try again.
+	 */
+	if (!ptep) {
+		local_irq_enable();
+		if (page)
+			put_page(page);
+		return RESUME_GUEST;
+	}
+	pte = *ptep;
+	local_irq_enable();
+	hpa = pte_pfn(pte) << PAGE_SHIFT;
+	pte_size = PAGE_SIZE;
+	if (shift)
+		pte_size = 1ul << shift;
+	is_ci = pte_ci(pte);
+
 	if (psize > pte_size)
 		goto out_put;
+	if (pte_size > psize)
+		hpa |= hva & (pte_size - psize);
 
 	/* Check WIMG vs. the actual page we're accessing */
 	if (!hpte_cache_flags_ok(r, is_ci)) {
@@ -636,14 +639,13 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	}
 
 	/*
-	 * Set the HPTE to point to pfn.
-	 * Since the pfn is at PAGE_SIZE granularity, make sure we
+	 * Set the HPTE to point to hpa.
+	 * Since the hpa is at PAGE_SIZE granularity, make sure we
 	 * don't mask out lower-order bits if psize < PAGE_SIZE.
 	 */
 	if (psize < PAGE_SIZE)
 		psize = PAGE_SIZE;
-	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
-					((pfn << PAGE_SHIFT) & ~(psize - 1));
+	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa;
 	if (hpte_is_writable(r) && !write_ok)
 		r = hpte_make_readonly(r);
 	ret = RESUME_GUEST;
@@ -708,20 +710,13 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	asm volatile("ptesync" : : : "memory");
 	preempt_enable();
 	if (page && hpte_is_writable(r))
-		SetPageDirty(page);
+		set_page_dirty_lock(page);
 
  out_put:
 	trace_kvm_page_fault_exit(vcpu, hpte, ret);
 
-	if (page) {
-		/*
-		 * We drop pages[0] here, not page because page might
-		 * have been set to the head page of a compound, but
-		 * we have to drop the reference on the correct tail
-		 * page to match the get inside gup()
-		 */
-		put_page(pages[0]);
-	}
+	if (page)
+		put_page(page);
 	return ret;
 
  out_unlock: