Commit 1c134b19 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - A PAT series from Davidlohr Bueso, which simplifies the memtype
     rbtree by using the interval tree helpers. (There's more cleanups
     in this area queued up, but they didn't make the merge window.)

   - Also flip over CONFIG_X86_5LEVEL to default-y. This might draw in a
     few more testers, as all the major distros are going to have
     5-level paging enabled by default in their next iterations.

   - Misc cleanups"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/pat: Rename pat_rbtree.c to pat_interval.c
  x86/mm/pat: Drop the rbt_ prefix from external memtype calls
  x86/mm/pat: Do not pass 'rb_root' down the memtype tree helper functions
  x86/mm/pat: Convert the PAT tree to a generic interval tree
  x86/mm: Clean up the pmd_read_atomic() comments
  x86/mm: Fix function name typo in pmd_read_atomic() comment
  x86/cpu: Clean up intel_tlb_table[]
  x86/mm: Enable 5-level paging support by default
parents 24ee25a6 7f264dab
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -1462,6 +1462,7 @@ config X86_PAE

config X86_5LEVEL
	bool "Enable 5-level page tables support"
	default y
	select DYNAMIC_MEMORY_LAYOUT
	select SPARSEMEM_VMEMMAP
	depends on X86_64
+24 −22
Original line number Diff line number Diff line
@@ -36,39 +36,41 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)

#define pmd_read_atomic pmd_read_atomic
/*
 * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
 * a "*pmdp" dereference done by gcc. Problem is, in certain places
 * where pte_offset_map_lock is called, concurrent page faults are
 * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
 * a "*pmdp" dereference done by GCC. Problem is, in certain places
 * where pte_offset_map_lock() is called, concurrent page faults are
 * allowed, if the mmap_sem is hold for reading. An example is mincore
 * vs page faults vs MADV_DONTNEED. On the page fault side
 * pmd_populate rightfully does a set_64bit, but if we're reading the
 * pmd_populate() rightfully does a set_64bit(), but if we're reading the
 * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
 * because gcc will not read the 64bit of the pmd atomically. To fix
 * this all places running pmd_offset_map_lock() while holding the
 * because GCC will not read the 64-bit value of the pmd atomically.
 *
 * To fix this all places running pte_offset_map_lock() while holding the
 * mmap_sem in read mode, shall read the pmdp pointer using this
 * function to know if the pmd is null nor not, and in turn to know if
 * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
 * function to know if the pmd is null or not, and in turn to know if
 * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
 * operations.
 *
 * Without THP if the mmap_sem is hold for reading, the pmd can only
 * transition from null to not null while pmd_read_atomic runs. So
 * Without THP if the mmap_sem is held for reading, the pmd can only
 * transition from null to not null while pmd_read_atomic() runs. So
 * we can always return atomic pmd values with this function.
 *
 * With THP if the mmap_sem is hold for reading, the pmd can become
 * With THP if the mmap_sem is held for reading, the pmd can become
 * trans_huge or none or point to a pte (and in turn become "stable")
 * at any time under pmd_read_atomic. We could read it really
 * atomically here with a atomic64_read for the THP enabled case (and
 * at any time under pmd_read_atomic(). We could read it truly
 * atomically here with an atomic64_read() for the THP enabled case (and
 * it would be a whole lot simpler), but to avoid using cmpxchg8b we
 * only return an atomic pmdval if the low part of the pmdval is later
 * found stable (i.e. pointing to a pte). And we're returning a none
 * pmdval if the low part of the pmd is none. In some cases the high
 * and low part of the pmdval returned may not be consistent if THP is
 * enabled (the low part may point to previously mapped hugepage,
 * while the high part may point to a more recently mapped hugepage),
 * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
 * of the pmd to be read atomically to decide if the pmd is unstable
 * or not, with the only exception of when the low part of the pmd is
 * zero in which case we return a none pmd.
 * found to be stable (i.e. pointing to a pte). We are also returning a
 * 'none' (zero) pmdval if the low part of the pmd is zero.
 *
 * In some cases the high and low part of the pmdval returned may not be
 * consistent if THP is enabled (the low part may point to previously
 * mapped hugepage, while the high part may point to a more recently
 * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
 * needs the low part of the pmd to be read atomically to decide if the
 * pmd is unstable or not, with the only exception when the low part
 * of the pmd is zero, in which case we return a 'none' pmd.
 */
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
+4 −4
Original line number Diff line number Diff line
@@ -819,7 +819,7 @@ static const struct _tlb_table intel_tlb_table[] = {
	{ 0x04, TLB_DATA_4M,		8,	" TLB_DATA 4 MByte pages, 4-way set associative" },
	{ 0x05, TLB_DATA_4M,		32,	" TLB_DATA 4 MByte pages, 4-way set associative" },
	{ 0x0b, TLB_INST_4M,		4,	" TLB_INST 4 MByte pages, 4-way set associative" },
	{ 0x4f, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages */" },
	{ 0x4f, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages" },
	{ 0x50, TLB_INST_ALL,		64,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
	{ 0x51, TLB_INST_ALL,		128,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
	{ 0x52, TLB_INST_ALL,		256,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
@@ -847,7 +847,7 @@ static const struct _tlb_table intel_tlb_table[] = {
	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" },
	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
	{ 0xc1, STLB_4K_2M,		1024,	" STLB 4 KByte and 2 MByte pages, 8-way associative" },
	{ 0xc2, TLB_DATA_2M_4M,		16,	" DTLB 2 MByte/4MByte pages, 4-way associative" },
	{ 0xc2, TLB_DATA_2M_4M,		16,	" TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
	{ 0xca, STLB_4K,		512,	" STLB 4 KByte pages, 4-way associative" },
	{ 0x00, 0, 0 }
};
@@ -859,7 +859,7 @@ static void intel_tlb_lookup(const unsigned char desc)
		return;

	/* look up this descriptor in the table */
	for (k = 0; intel_tlb_table[k].descriptor != desc && \
	for (k = 0; intel_tlb_table[k].descriptor != desc &&
	     intel_tlb_table[k].descriptor != 0; k++)
		;

+1 −1
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ CFLAGS_mem_encrypt_identity.o := $(nostackp)

CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace

obj-$(CONFIG_X86_PAT)		+= pat_rbtree.o
obj-$(CONFIG_X86_PAT)		+= pat_interval.o

obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o

+4 −4
Original line number Diff line number Diff line
@@ -603,7 +603,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,

	spin_lock(&memtype_lock);

	err = rbt_memtype_check_insert(new, new_type);
	err = memtype_check_insert(new, new_type);
	if (err) {
		pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
			start, end - 1,
@@ -650,7 +650,7 @@ int free_memtype(u64 start, u64 end)
	}

	spin_lock(&memtype_lock);
	entry = rbt_memtype_erase(start, end);
	entry = memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (IS_ERR(entry)) {
@@ -693,7 +693,7 @@ static enum page_cache_mode lookup_memtype(u64 paddr)

	spin_lock(&memtype_lock);

	entry = rbt_memtype_lookup(paddr);
	entry = memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
@@ -1109,7 +1109,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
		return NULL;

	spin_lock(&memtype_lock);
	ret = rbt_memtype_copy_nth_element(print_entry, pos);
	ret = memtype_copy_nth_element(print_entry, pos);
	spin_unlock(&memtype_lock);

	if (!ret) {
Loading