Commit 145f573b authored by Rik van Riel, committed by Peter Zijlstra

x86/mm/tlb: Make lazy TLB mode lazier



Lazy TLB mode can result in an idle CPU being woken up by a TLB flush,
when all it really needs to do is reload %CR3 at the next context switch,
assuming no page table pages got freed.

Memory ordering is used to prevent race conditions between switch_mm_irqs_off,
which checks whether .tlb_gen changed, and the TLB invalidation code, which
increments .tlb_gen whenever page table entries get invalidated.

The atomic increment in inc_mm_tlb_gen() is its own barrier; the context
switch code adds an explicit barrier between reading tlbstate.is_lazy and
reading next->context.tlb_gen.
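
A userspace C11 sketch of that pairing (illustrative only, not the kernel
code; tlb_gen, cpu_is_lazy and local_tlb_gen are stand-ins for
mm->context.tlb_gen, cpu_tlbstate.is_lazy and the per-CPU
ctxs[prev_asid].tlb_gen copy):

    /* Illustrative userspace analogue of the ordering described above. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    static _Atomic uint64_t tlb_gen;       /* stand-in for mm->context.tlb_gen     */
    static _Atomic bool     cpu_is_lazy;   /* stand-in for cpu_tlbstate.is_lazy    */
    static uint64_t         local_tlb_gen; /* stand-in for ctxs[prev_asid].tlb_gen */

    /*
     * Flush side: bump the generation, then decide whether an IPI is
     * needed.  The seq_cst read-modify-write is a full barrier, like the
     * atomic increment in inc_mm_tlb_gen().
     */
    static bool flusher_needs_ipi(void)
    {
            atomic_fetch_add(&tlb_gen, 1);
            return !atomic_load_explicit(&cpu_is_lazy, memory_order_relaxed);
    }

    /*
     * Context-switch side: skip the flush unless the generation moved on.
     * The full fence between the two reads pairs with the increment above,
     * mirroring the smp_mb() added to switch_mm_irqs_off().
     */
    static bool switcher_needs_flush(void)
    {
            bool was_lazy = atomic_load_explicit(&cpu_is_lazy, memory_order_relaxed);

            if (!was_lazy)
                    return false;   /* thread-to-thread switch, nothing to do */

            atomic_thread_fence(memory_order_seq_cst);
            return atomic_load_explicit(&tlb_gen, memory_order_relaxed) != local_tlb_gen;
    }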

CPUs in lazy TLB mode remain part of the mm_cpumask(mm), both because that
allows TLB flush IPIs to be sent at page table freeing time, and because the
cache line bouncing caused by clearing and re-setting the bit in
mm_cpumask(mm) was responsible for about half the CPU use in
switch_mm_irqs_off().
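
A rough sketch of the resulting IPI policy, using hypothetical helpers
(in_mm_cpumask, is_lazy, send_flush_ipi) rather than the kernel's cpumask
and on_each_cpu_cond_mask() machinery; the real logic is the
native_flush_tlb_others() hunk in the diff below:

    /* Illustrative only: which CPUs get a flush IPI. */
    #include <stdbool.h>

    #define NR_CPUS 8

    static bool in_mm_cpumask[NR_CPUS]; /* stand-in for mm_cpumask(mm)       */
    static bool is_lazy[NR_CPUS];       /* stand-in for cpu_tlbstate.is_lazy */

    static void send_flush_ipi(int cpu)
    {
            (void)cpu;                  /* hypothetical IPI, elided here */
    }

    static void flush_others(bool freed_tables)
    {
            for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                    if (!in_mm_cpumask[cpu])
                            continue;
                    /*
                     * Lazy CPUs can be skipped unless page tables were
                     * freed; they notice the new tlb_gen at their next
                     * context switch.
                     */
                    if (freed_tables || !is_lazy[cpu])
                            send_flush_ipi(cpu);
            }
    }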

We can change native_flush_tlb_others() without touching other
(paravirt) implementations of flush_tlb_others() because we'll be
flushing less. The existing implementations flush more and are
therefore still correct.

Cc: npiggin@gmail.com
Cc: mingo@kernel.org
Cc: will.deacon@arm.com
Cc: kernel-team@fb.com
Cc: luto@kernel.org
Cc: hpa@zytor.com
Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20180926035844.1420-8-riel@surriel.com
parent 97807813
+58 −9
@@ -185,6 +185,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;
	bool need_flush;
@@ -242,17 +243,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			   next->context.ctx_id);

		/*
		 * We don't currently support having a real mm loaded without
		 * our cpu set in mm_cpumask().  We have all the bookkeeping
		 * in place to figure out whether we would need to flush
		 * if our cpu were cleared in mm_cpumask(), but we don't
		 * currently use it.
		 * Even in lazy TLB mode, the CPU should stay set in the
		 * mm_cpumask. The TLB shootdown code can figure out from
		 * cpu_tlbstate.is_lazy whether or not to send an IPI.
		 */
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		 * If the CPU is not in lazy TLB mode, we are just switching
		 * from one thread in a process to another thread in the same
		 * process. No TLB flush required.
		 */
		if (!was_lazy)
			return;

		/*
		 * Read the tlb_gen to check whether a flush is needed.
		 * If the TLB is up to date, just use it.
		 * The barrier synchronizes with the tlb_gen increment in
		 * the TLB shootdown code.
		 */
		smp_mb();
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
				next_tlb_gen)
			return;

		/*
		 * TLB contents went out of date while we were in lazy
		 * mode. Fall through to the TLB switching code below.
		 */
		new_asid = prev_asid;
		need_flush = true;
	} else {
		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);

@@ -346,9 +370,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
	this_cpu_write(cpu_tlbstate.loaded_mm, next);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);

	if (next != real_prev) {
		load_mm_cr4(next);
		switch_ldt(real_prev, next);
	}
}

/*
 * Please ignore the name of this function.  It should be called
@@ -455,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
		 * paging-structure cache to avoid speculatively reading
		 * garbage into our TLB.  Since switching to init_mm is barely
		 * slower than a minimal flush, just switch to init_mm.
		 *
		 * This should be rare, with native_flush_tlb_others skipping
		 * IPIs to lazy TLB mode CPUs.
		 */
		switch_mm_irqs_off(NULL, &init_mm, NULL);
		return;
@@ -557,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

static bool tlb_is_not_lazy(int cpu, void *data)
{
	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}

void native_flush_tlb_others(const struct cpumask *cpumask,
			     const struct flush_tlb_info *info)
{
@@ -592,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
					       (void *)info, 1);
		return;
	}

	/*
	 * If no page tables were freed, we can skip sending IPIs to
	 * CPUs in lazy TLB mode. They will flush the TLB themselves
	 * at the next context switch.
	 *
	 * However, if page tables are getting freed, we need to send the
	 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
	 * up on the new contents of what used to be page tables, while
	 * doing a speculative memory access.
	 */
	if (info->freed_tables)
		smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
	else
		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
				(void *)info, 1, GFP_ATOMIC, cpumask);
}

/*