Commit fd10be25 authored by Suraj Jitindar Singh's avatar Suraj Jitindar Singh Committed by Michael Ellerman
Browse files

KVM: PPC: Book3S HV: Handle page fault for a nested guest



Consider a normal (L1) guest running under the main hypervisor (L0),
and then a nested guest (L2) running under the L1 guest which is acting
as a nested hypervisor. L0 has page tables to map the address space for
L1 providing the translation from L1 real address -> L0 real address;

	L1
	|
	| (L1 -> L0)
	|
	----> L0

There are also page tables in L1 used to map the address space for L2
providing the translation from L2 real address -> L1 read address. Since
the hardware can only walk a single level of page table, we need to
maintain in L0 a "shadow_pgtable" for L2 which provides the translation
from L2 real address -> L0 real address. Which looks like;

	L2				L2
	|				|
	| (L2 -> L1)			|
	|				|
	----> L1			| (L2 -> L0)
	      |				|
	      | (L1 -> L0)		|
	      |				|
	      ----> L0			--------> L0

When a page fault occurs while running a nested (L2) guest we need to
insert a pte into this "shadow_pgtable" for the L2 -> L0 mapping. To
do this we need to:

1. Walk the pgtable in L1 memory to find the L2 -> L1 mapping, and
   provide a page fault to L1 if this mapping doesn't exist.
2. Use our L1 -> L0 pgtable to convert this L1 address to an L0 address,
   or try to insert a pte for that mapping if it doesn't exist.
3. Now we have a L2 -> L0 mapping, insert this into our shadow_pgtable

Once this mapping exists we can take rc faults when hardware is unable
to automatically set the reference and change bits in the pte. On these
we need to:

1. Check the rc bits on the L2 -> L1 pte match, and otherwise reflect
   the fault down to L1.
2. Set the rc bits in the L1 -> L0 pte which corresponds to the same
   host page.
3. Set the rc bits in the L2 -> L0 pte.

As we reuse a large number of functions in book3s_64_mmu_radix.c for
this we also needed to refactor a number of these functions to take
an lpid parameter so that the correct lpid is used for tlb invalidations.
The functionality however has remained the same.

Reviewed-by: default avatarDavid Gibson <david@gibson.dropbear.id.au>
Signed-off-by: default avatarSuraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: default avatarPaul Mackerras <paulus@ozlabs.org>
Signed-off-by: default avatarMichael Ellerman <mpe@ellerman.id.au>
parent 4bad7779
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
					unsigned long addr,
					unsigned long page_size);
extern void radix__flush_pwc_lpid(unsigned int lpid);
extern void radix__flush_tlb_lpid(unsigned int lpid);
extern void radix__local_flush_tlb_lpid(unsigned int lpid);
extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);

+17 −0
Original line number Diff line number Diff line
@@ -188,17 +188,34 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
			struct kvm_vcpu *vcpu,
			unsigned long ea, unsigned long dsisr);
extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
				      struct kvmppc_pte *gpte, u64 root,
				      u64 *pte_ret_p);
extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
			struct kvmppc_pte *gpte, u64 table,
			int table_index, u64 *pte_ret_p);
extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			struct kvmppc_pte *gpte, bool data, bool iswrite);
extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
				    bool writing, unsigned long gpa,
				    unsigned int lpid);
extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				unsigned long gpa,
				struct kvm_memory_slot *memslot,
				bool writing, bool kvm_ro,
				pte_t *inserted_pte, unsigned int *levelp);
extern int kvmppc_init_vm_radix(struct kvm *kvm);
extern void kvmppc_free_radix(struct kvm *kvm);
extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
				      unsigned int lpid);
extern int kvmppc_radix_init(void);
extern void kvmppc_radix_exit(void);
extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
			unsigned long gfn);
extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
			     unsigned long gpa, unsigned int shift,
			     struct kvm_memory_slot *memslot,
			     unsigned int lpid);
extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
			unsigned long gfn);
extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+4 −0
Original line number Diff line number Diff line
@@ -549,6 +549,10 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
}
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */

extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
			     unsigned long gpa, unsigned int level,
			     unsigned long mmu_seq, unsigned int lpid);

#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */

#endif /* __ASM_KVM_BOOK3S_64_H__ */
+2 −0
Original line number Diff line number Diff line
@@ -367,7 +367,9 @@ struct kvmppc_pte {
	bool may_write		: 1;
	bool may_execute	: 1;
	unsigned long wimg;
	unsigned long rc;
	u8 page_size;		/* MMU_PAGE_xxx */
	u8 page_shift;
};

struct kvmppc_mmu {
+111 −83
Original line number Diff line number Diff line
@@ -29,43 +29,16 @@
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long ptbl, root;
	unsigned long rts, bits, offset;
	unsigned long size, index;
	struct prtb_entry entry;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);
	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
@@ -79,6 +52,7 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
@@ -90,10 +64,13 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		ret = kvm_read_guest(kvm, base + (index * sizeof(rpte)),
				     &rpte, sizeof(rpte));
		if (ret)
		addr = base + (index * sizeof(rpte));
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
@@ -119,6 +96,7 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;
@@ -128,12 +106,51 @@ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
@@ -181,7 +198,7 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
}

static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
				    unsigned int pshift)
				    unsigned int pshift, unsigned int lpid)
{
	unsigned long psize = PAGE_SIZE;

@@ -189,12 +206,12 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
		psize = 1UL << pshift;

	addr &= ~(psize - 1);
	radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
	radix__flush_tlb_lpid_page(lpid, addr, psize);
}

static void kvmppc_radix_flush_pwc(struct kvm *kvm)
static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
	radix__flush_pwc_lpid(kvm->arch.lpid);
	radix__flush_pwc_lpid(lpid);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
@@ -239,16 +256,17 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
		      unsigned long gpa, unsigned int shift,
			     struct kvm_memory_slot *memslot)
		      struct kvm_memory_slot *memslot,
		      unsigned int lpid)

{
	unsigned long old;

	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift);
	if (old & _PAGE_DIRTY) {
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
	if ((old & _PAGE_DIRTY) && (lpid == kvm->arch.lpid)) {
		unsigned long gfn = gpa >> PAGE_SHIFT;
		unsigned long page_size = PAGE_SIZE;

@@ -271,7 +289,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
 * and emit a warning if encountered, but there may already be data
 * corruption due to the unexpected mappings.
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  unsigned int lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
@@ -285,14 +304,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
			WARN_ON_ONCE(1);
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL);
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  unsigned int lpid)
{
	unsigned long im;
	pmd_t *p = pmd;
@@ -307,20 +327,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
					 PMD_SHIFT, NULL);
					 PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_map(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  unsigned int lpid)
{
	unsigned long iu;
	pud_t *p = pud;
@@ -334,36 +355,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_radix(struct kvm *kvm)
void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
	unsigned long ig;
	pgd_t *pgd;

	if (!kvm->arch.pgtable)
		return;
	pgd = kvm->arch.pgtable;
	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		pud_t *pud;

		if (!pgd_present(*pgd))
			continue;
		pud = pud_offset(pgd, 0);
		kvmppc_unmap_free_pud(kvm, pud);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		pgd_clear(pgd);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
					      unsigned long gpa)
					unsigned long gpa, unsigned int lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

@@ -373,13 +398,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
	 * flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false);
	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
					unsigned long gpa)
					unsigned long gpa, unsigned int lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

@@ -389,9 +414,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
	 * so can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false);
	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
@@ -403,9 +428,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))

static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
			     unsigned long mmu_seq)
		      unsigned long mmu_seq, unsigned int lpid)
{
	pgd_t *pgd;
	pud_t *pud, *new_pud = NULL;
@@ -471,7 +496,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL);
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
@@ -480,7 +506,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		ret = 0;
@@ -520,7 +546,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL);
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
@@ -529,7 +556,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		ret = 0;
@@ -569,8 +596,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
	return ret;
}

static bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
				    bool writing, unsigned long gpa)
bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
			     unsigned long gpa, unsigned int lpid)
{
	unsigned long pgflags;
	unsigned int shift;
@@ -597,7 +624,7 @@ static bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
	return false;
}

static int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing, bool kvm_ro,
@@ -683,7 +710,7 @@ static int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq);
				mmu_seq, kvm->arch.lpid);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
@@ -758,7 +785,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
					    writing, gpa))
					    writing, gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

@@ -786,7 +813,8 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,

	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot);
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
	return 0;				
}

@@ -841,7 +869,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
			ret = 1 << (shift - PAGE_SHIFT);
		kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
	}
	return ret;
}
Loading