Commit e28f7faf authored by David Gibson, committed by Paul Mackerras
Browse files

[PATCH] Four level pagetables for ppc64



Implement 4-level pagetables for ppc64

This patch implements full four-level page tables for ppc64, thereby
extending the usable user address range to 44 bits (16T).

The patch uses a full page for the tables at the bottom and top level,
and a quarter page for the intermediate levels.  It uses full 64-bit
pointers at every level, thus also increasing the addressable range of
physical memory.  This patch also tweaks the VSID allocation to allow a
matching range for user addresses (this halves the number of available
contexts) and adds some #if and BUILD_BUG sanity checks.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
parent decd300b
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -302,7 +302,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
	int local = 0;
	cpumask_t tmp;

	if ((ea & ~REGION_MASK) > EADDR_MASK)
	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
		return 1;

 	switch (REGION_ID(ea)) {
+59 −128
Original line number Diff line number Diff line
@@ -27,124 +27,91 @@

#include <linux/sysctl.h>

#define	HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))

#define HUGEPTE_INDEX_SIZE	9
#define HUGEPGD_INDEX_SIZE	10

#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)

static inline int hugepgd_index(unsigned long addr)
{
	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
}

static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	int index;
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	if (! mm->context.huge_pgdir)
		return NULL;
	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	index = hugepgd_index(addr);
	BUG_ON(index >= PTRS_PER_HUGEPGD);
	return (pud_t *)(mm->context.huge_pgdir + index);
	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = pud_offset(pg, addr);
		if (!pud_none(*pu)) {
			pm = pmd_offset(pu, addr);
			pt = (pte_t *)pm;
			BUG_ON(!pmd_none(*pm)
			       && !(pte_present(*pt) && pte_huge(*pt)));
			return pt;
		}
	}

static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
{
	int index;

	if (pud_none(*dir))
	return NULL;

	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
	return (pte_t *)pud_page(*dir) + index;
}

static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	if (! mm->context.huge_pgdir) {
		pgd_t *new;
		spin_unlock(&mm->page_table_lock);
		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
		spin_lock(&mm->page_table_lock);
	addr &= HPAGE_MASK;

		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (mm->context.huge_pgdir)
			pgd_free(new);
		else
			mm->context.huge_pgdir = new;
	pg = pgd_offset(mm, addr);
	pu = pud_alloc(mm, pg, addr);

	if (pu) {
		pm = pmd_alloc(mm, pu, addr);
		if (pm) {
			pt = (pte_t *)pm;
			BUG_ON(!pmd_none(*pm)
			       && !(pte_present(*pt) && pte_huge(*pt)));
			return pt;
		}
	return hugepgd_offset(mm, addr);
	}

static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
{
	if (! pud_present(*dir)) {
		pte_t *new;

		spin_unlock(&mm->page_table_lock);
		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
		spin_lock(&mm->page_table_lock);
		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (pud_present(*dir)) {
			if (new)
				kmem_cache_free(zero_cache, new);
		} else {
			struct page *ptepage;

			if (! new)
	return NULL;
			ptepage = virt_to_page(new);
			ptepage->mapping = (void *) mm;
			ptepage->index = addr & HUGEPGDIR_MASK;
			pud_populate(mm, dir, new);
		}
}

	return hugepte_offset(dir, addr);
}
#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	pud_t *pud;

	BUG_ON(! in_hugepage_area(mm->context, addr));
	int i;

	pud = hugepgd_offset(mm, addr);
	if (! pud)
		return NULL;
	if (pte_present(*ptep)) {
		pte_clear(mm, addr, ptep);
		flush_tlb_pending();
	}

	return hugepte_offset(pud, addr);
	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
		ptep++;
	}
}

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	pud_t *pud;
	unsigned long old = pte_update(ptep, ~0UL);
	int i;

	BUG_ON(! in_hugepage_area(mm->context, addr));
	if (old & _PAGE_HASHPTE)
		hpte_update(mm, addr, old, 0);

	pud = hugepgd_alloc(mm, addr);
	if (! pud)
		return NULL;
	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
		ptep[i] = __pte(0);

	return hugepte_alloc(mm, pud, addr);
	return __pte(old);
}

/*
@@ -541,42 +508,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
	}
}

/*
 * Tear down and free this mm's huge-page page directory
 * (mm->context.huge_pgdir) along with any hugepte pages still hanging
 * off it, returning both to zero_cache.  Safe to call when no huge
 * pagetables were ever allocated (huge_pgdir == NULL).
 */
void hugetlb_mm_free_pgd(struct mm_struct *mm)
{
	int i;
	pgd_t *pgdir;

	/* serialize against concurrent pagetable modification */
	spin_lock(&mm->page_table_lock);

	pgdir = mm->context.huge_pgdir;
	if (! pgdir)
		goto out;	/* nothing was ever allocated */

	/* detach first so the mm no longer references what we free below */
	mm->context.huge_pgdir = NULL;

	/* cleanup any hugepte pages leftover */
	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
		pud_t *pud = (pud_t *)(pgdir + i);

		if (! pud_none(*pud)) {
			pte_t *pte = (pte_t *)pud_page(*pud);
			struct page *ptepage = virt_to_page(pte);

			/* break the back-pointer set up at allocation time */
			ptepage->mapping = NULL;

			/* zero_cache pages must be returned fully zeroed:
			 * all ptes are expected to be cleared by now */
			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
			kmem_cache_free(zero_cache, pte);
		}
		pud_clear(pud);
	}

	/* likewise the pgdir page itself must be all-zero before freeing */
	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
	kmem_cache_free(zero_cache, pgdir);

 out:
	spin_unlock(&mm->page_table_lock);
}

int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local)
{
+1 −1
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
			break;
		if ((unsigned long)tmp->addr >= ioremap_bot)
			addr = tmp->size + (unsigned long) tmp->addr;
		if (addr > IMALLOC_END-size) 
		if (addr >= IMALLOC_END-size)
			return 1;
	}
	*im_addr = addr;
+41 −21
Original line number Diff line number Diff line
@@ -66,6 +66,14 @@
#include <asm/vdso.h>
#include <asm/imalloc.h>

#if PGTABLE_RANGE > USER_VSID_RANGE
#warning Limited user VSID range means pagetable space is wasted
#endif

#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
#warning TASK_SIZE is smaller than it needs to be.
#endif

int mem_init_done;
unsigned long ioremap_bot = IMALLOC_BASE;
static unsigned long phbs_io_bot = PHBS_IO_BASE;
@@ -226,7 +234,7 @@ void __iomem * __ioremap(unsigned long addr, unsigned long size,
	 * Before that, we map using addresses going
	 * up from ioremap_bot.  imalloc will use
	 * the addresses from ioremap_bot through
	 * IMALLOC_END (0xE000001fffffffff)
	 * IMALLOC_END
	 * 
	 */
	pa = addr & PAGE_MASK;
@@ -417,12 +425,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
	int index;
	int err;

#ifdef CONFIG_HUGETLB_PAGE
	/* We leave htlb_segs as it was, but for a fork, we need to
	 * clear the huge_pgdir. */
	mm->context.huge_pgdir = NULL;
#endif

again:
	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
		return -ENOMEM;
@@ -453,8 +455,6 @@ void destroy_context(struct mm_struct *mm)
	spin_unlock(&mmu_context_lock);

	mm->context.id = NO_CONTEXT;

	hugetlb_mm_free_pgd(mm);
}

/*
@@ -833,23 +833,43 @@ void __iomem * reserve_phb_iospace(unsigned long size)
	return virt_addr;
}

kmem_cache_t *zero_cache;

static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
{
	memset(pte, 0, PAGE_SIZE);
	memset(addr, 0, kmem_cache_size(cache));
}

static const int pgtable_cache_size[2] = {
	PTE_TABLE_SIZE, PMD_TABLE_SIZE
};
static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
	"pgd_pte_cache", "pud_pmd_cache",
};

kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];

void pgtable_cache_init(void)
{
	zero_cache = kmem_cache_create("zero",
				PAGE_SIZE,
				0,
				SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
	int i;

	BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
	BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
	BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
	BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);

	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
		int size = pgtable_cache_size[i];
		const char *name = pgtable_cache_name[i];

		pgtable_cache[i] = kmem_cache_create(name,
						     size, size,
						     SLAB_HWCACHE_ALIGN
						     | SLAB_MUST_HWCACHE_ALIGN,
						     zero_ctor,
						     NULL);
	if (!zero_cache)
		panic("pgtable_cache_init(): could not create zero_cache!\n");
		if (! pgtable_cache[i])
			panic("pgtable_cache_init(): could not create %s!\n",
			      name);
	}
}

pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
+1 −1
Original line number Diff line number Diff line
@@ -91,7 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
0:	/* user address: proto-VSID = context<<15 | ESID */
	li	r11,SLB_VSID_USER

	srdi.	r9,r3,13
	srdi.	r9,r3,USER_ESID_BITS
	bne-	8f			/* invalid ea bits set */

#ifdef CONFIG_HUGETLB_PAGE
Loading