Commit 6168d0da authored by Alex Shi, committed by Linus Torvalds

mm/lru: replace pgdat lru_lock with lruvec lock

This patch moves the per-node lru_lock into the lruvec, thus bringing one
lru_lock for each memcg per node.  So on a large machine, memcgs no longer
have to suffer from contention on the per-node pgdat->lru_lock; each can go
fast with its own lru_lock.

Now that the memcg charge is performed before LRU insertion, page isolation
can serialize a page's memcg, so the per-memcg lruvec lock is stable and can
replace the per-node lru lock.
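
In outline, the stabilized locking sequence looks like this (an illustrative
sketch with a made-up function name, not code from the patch; the real users
are the TestClearPageLRU() call sites and lock_page_lruvec*() helpers below):

/*
 * Sketch: clearing PageLRU atomically pins the page's memcg association,
 * so the lruvec looked up afterwards -- and its lru_lock -- stay valid
 * for this page.
 */
static struct lruvec *isolate_and_lock_sketch(struct page *page)
{
	struct lruvec *lruvec;

	if (!TestClearPageLRU(page))	/* lost the race; skip this page */
		return NULL;

	rcu_read_lock();		/* memcg lookup is RCU protected */
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	spin_lock(&lruvec->lru_lock);
	rcu_read_unlock();

	return lruvec;			/* caller pairs with spin_unlock() */
}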

In isolate_migratepages_block(), compact_unlock_should_abort() and
lock_page_lruvec_irqsave() are open-coded to work with compact_control.  A
debug function is also added to the locking paths; it may give some clues if
something gets out of hand.

Daniel Jordan's testing shows a 62% improvement on a modified readtwice case
on his 2P * 10 core * 2 HT Broadwell box:
https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/

Hugh Dickins helped polish the patch, thanks!

[alex.shi@linux.alibaba.com: fix comment typo]
  Link: https://lkml.kernel.org/r/5b085715-292a-4b43-50b3-d73dc90d1de5@linux.alibaba.com
[alex.shi@linux.alibaba.com: use page_memcg()]
  Link: https://lkml.kernel.org/r/5a4c2b72-7ee8-2478-fc0e-85eb83aafec4@linux.alibaba.com

Link: https://lkml.kernel.org/r/1604566549-62481-18-git-send-email-alex.shi@linux.alibaba.com


Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rong Chen <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mika Penttilä <mika.penttila@nextfour.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent fc574c23
include/linux/memcontrol.h  +58 −0
@@ -491,6 +491,19 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
 
+struct lruvec *lock_page_lruvec(struct page *page);
+struct lruvec *lock_page_lruvec_irq(struct page *page);
+struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+						unsigned long *flags);
+
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+#else
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+#endif
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
 	return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -996,6 +1009,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
 
+static inline struct lruvec *lock_page_lruvec(struct page *page)
+{
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	spin_lock(&pgdat->__lruvec.lru_lock);
+	return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	spin_lock_irq(&pgdat->__lruvec.lru_lock);
+	return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+		unsigned long *flagsp)
+{
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
+	return &pgdat->__lruvec;
+}
+
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
 		struct mem_cgroup *prev,
@@ -1215,6 +1253,10 @@ static inline
 void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -1296,6 +1338,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
 	return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
 }
 
+static inline void unlock_page_lruvec(struct lruvec *lruvec)
+{
+	spin_unlock(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+{
+	spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+		unsigned long flags)
+{
+	spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+}
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
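
As a usage note, here is a minimal sketch of how the new lock/unlock helpers
are meant to pair up (the function is illustrative and not part of the patch;
del_page_from_lru_list() is a pre-existing helper):

/* Illustrative only: take one page off its LRU list under the new lock. */
static void example_lru_del(struct page *page)
{
	unsigned long flags;
	struct lruvec *lruvec;

	/* Look up the page's memcg lruvec and take its lru_lock. */
	lruvec = lock_page_lruvec_irqsave(page, &flags);
	del_page_from_lru_list(page, lruvec, page_lru(page));
	unlock_page_lruvec_irqrestore(lruvec, flags);
}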
include/linux/mmzone.h  +2 −1
@@ -276,6 +276,8 @@ enum lruvec_flags {
 
 struct lruvec {
 	struct list_head		lists[NR_LRU_LISTS];
+	/* per lruvec lru_lock for memcg */
+	spinlock_t			lru_lock;
 	/*
 	 * These track the cost of reclaiming one LRU - file or anon -
 	 * over the other. As the observed cost of reclaiming one LRU
@@ -782,7 +784,6 @@ typedef struct pglist_data {
 
 	/* Write-intensive fields used by page reclaim */
 	ZONE_PADDING(_pad1_)
-	spinlock_t		lru_lock;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 	/*
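
This one-line move is what multiplies the locks: with memcg enabled there is
now one lruvec, and hence one lru_lock, per memcg per node, while
pgdat->__lruvec remains the fallback.  A condensed illustration (hypothetical
helper; the real lookup is mem_cgroup_lruvec(), and the nodeinfo layout is
assumed from the 5.10-era struct mem_cgroup):

/* Condensed: where a given memcg/node pair's lru_lock now lives. */
static spinlock_t *lru_lock_of(struct mem_cgroup *memcg,
			       struct pglist_data *pgdat)
{
	struct lruvec *lruvec;

	if (mem_cgroup_disabled() || !memcg)
		lruvec = &pgdat->__lruvec;	/* one lock per node */
	else
		lruvec = &memcg->nodeinfo[pgdat->node_id]->lruvec;

	return &lruvec->lru_lock;
}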
mm/compaction.c  +36 −20
@@ -804,7 +804,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct lruvec *lruvec;
 	unsigned long flags = 0;
-	bool locked = false;
+	struct lruvec *locked = NULL;
 	struct page *page = NULL, *valid_page = NULL;
 	unsigned long start_pfn = low_pfn;
 	bool skip_on_failure = false;
@@ -868,13 +868,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		 * contention, to give chance to IRQs. Abort completely if
 		 * a fatal signal is pending.
 		 */
-		if (!(low_pfn % SWAP_CLUSTER_MAX)
-		    && compact_unlock_should_abort(&pgdat->lru_lock,
-					    flags, &locked, cc)) {
-			low_pfn = 0;
-			goto fatal_pending;
+		if (!(low_pfn % SWAP_CLUSTER_MAX)) {
+			if (locked) {
+				unlock_page_lruvec_irqrestore(locked, flags);
+				locked = NULL;
+			}
+
+			if (fatal_signal_pending(current)) {
+				cc->contended = true;
+
+				low_pfn = 0;
+				goto fatal_pending;
+			}
+
+			cond_resched();
 		}
 
 		if (!pfn_valid_within(low_pfn))
 			goto isolate_fail;
 		nr_scanned++;
@@ -944,9 +953,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			if (unlikely(__PageMovable(page)) &&
 					!PageIsolated(page)) {
 				if (locked) {
-					spin_unlock_irqrestore(&pgdat->lru_lock,
-									flags);
-					locked = false;
+					unlock_page_lruvec_irqrestore(locked, flags);
+					locked = NULL;
 				}
 
 				if (!isolate_movable_page(page, isolate_mode))
@@ -987,10 +995,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (!TestClearPageLRU(page))
 			goto isolate_fail_put;
 
+		rcu_read_lock();
+		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
 		/* If we already hold the lock, we can skip some rechecking */
-		if (!locked) {
-			locked = compact_lock_irqsave(&pgdat->lru_lock,
-								&flags, cc);
+		if (lruvec != locked) {
+			if (locked)
+				unlock_page_lruvec_irqrestore(locked, flags);
+
+			compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+			locked = lruvec;
+			rcu_read_unlock();
+
+			lruvec_memcg_debug(lruvec, page);
 
 			/* Try get exclusive access under lock */
 			if (!skip_updated) {
@@ -1009,9 +1026,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 				SetPageLRU(page);
 				goto isolate_fail_put;
 			}
-		}
-
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+		} else
+			rcu_read_unlock();
 
 		/* The whole page is taken off the LRU; skip the tail pages. */
 		if (PageCompound(page))
@@ -1045,8 +1061,8 @@ isolate_success:
 isolate_fail_put:
 		/* Avoid potential deadlock in freeing page under lru_lock */
 		if (locked) {
-			spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-			locked = false;
+			unlock_page_lruvec_irqrestore(locked, flags);
+			locked = NULL;
 		}
 		put_page(page);
 
@@ -1061,8 +1077,8 @@ isolate_fail:
 		 */
 		if (nr_isolated) {
 			if (locked) {
-				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-				locked = false;
+				unlock_page_lruvec_irqrestore(locked, flags);
+				locked = NULL;
 			}
 			putback_movable_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
@@ -1090,7 +1106,7 @@ isolate_fail:
 
 isolate_abort:
 	if (locked)
-		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+		unlock_page_lruvec_irqrestore(locked, flags);
 	if (page) {
 		SetPageLRU(page);
 		put_page(page);
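
The heart of this change is the relock check: consecutive pages in a
pageblock may belong to different memcgs, so the scanner compares each page's
lruvec with the one it already holds and only cycles the lock on a change.
Distilled into a standalone sketch (hypothetical name; the in-tree code
open-codes this with compact_lock_irqsave() so it can honor cc->contended):

/* Distilled relock pattern: keep the lock across same-lruvec pages. */
static struct lruvec *relock_sketch(struct page *page, struct lruvec *locked,
				    unsigned long *flags)
{
	struct lruvec *lruvec;

	rcu_read_lock();
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	if (lruvec != locked) {
		if (locked)
			unlock_page_lruvec_irqrestore(locked, *flags);
		spin_lock_irqsave(&lruvec->lru_lock, *flags);
		locked = lruvec;
	}
	rcu_read_unlock();

	return locked;
}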
mm/huge_memory.c  +4 −7
@@ -2365,7 +2365,7 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
 	VM_BUG_ON_PAGE(!PageHead(head), head);
 	VM_BUG_ON_PAGE(PageCompound(tail), head);
 	VM_BUG_ON_PAGE(PageLRU(tail), head);
-	lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
+	lockdep_assert_held(&lruvec->lru_lock);
 
 	if (list) {
 		/* page reclaim is reclaiming a huge page */
@@ -2449,7 +2449,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 		pgoff_t end)
 {
 	struct page *head = compound_head(page);
-	pg_data_t *pgdat = page_pgdat(head);
 	struct lruvec *lruvec;
 	struct address_space *swap_cache = NULL;
 	unsigned long offset = 0;
@@ -2467,10 +2466,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 		xa_lock(&swap_cache->i_pages);
 	}
 
-	/* prevent PageLRU to go away from under us, and freeze lru stats */
-	spin_lock(&pgdat->lru_lock);
-
-	lruvec = mem_cgroup_page_lruvec(head, pgdat);
+	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+	lruvec = lock_page_lruvec(head);
 
 	for (i = nr - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
@@ -2491,7 +2488,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	}
 
 	ClearPageCompound(head);
-	spin_unlock(&pgdat->lru_lock);
+	unlock_page_lruvec(lruvec);
 	/* Caller disabled irqs, so they are still disabled here */
 
 	split_page_owner(head, nr);
mm/memcontrol.c  +75 −3
@@ -20,6 +20,9 @@
  * Lockless page tracking & accounting
  * Unified hierarchy configuration model
  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ *
+ * Per memcg lru locking
+ * Copyright (C) 2020 Alibaba, Inc, Alex Shi
  */
 
 #include <linux/page_counter.h>
@@ -1330,6 +1333,23 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 	return ret;
 }
 
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	memcg = page_memcg(page);
+
+	if (!memcg)
+		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+	else
+		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
+}
+#endif
+
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
@@ -1370,6 +1390,60 @@ out:
 	return lruvec;
 }
 
+/**
+ * lock_page_lruvec - lock and return lruvec for a given page.
+ * @page: the page
+ *
+ * These functions should be used when one of the following holds:
+ * PageLRU is cleared or unset
+ * or page->_refcount is zero
+ * or the page is locked.
+ */
+struct lruvec *lock_page_lruvec(struct page *page)
+{
+	struct lruvec *lruvec;
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	spin_lock(&lruvec->lru_lock);
+	rcu_read_unlock();
+
+	lruvec_memcg_debug(lruvec, page);
+
+	return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+	struct lruvec *lruvec;
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	spin_lock_irq(&lruvec->lru_lock);
+	rcu_read_unlock();
+
+	lruvec_memcg_debug(lruvec, page);
+
+	return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+{
+	struct lruvec *lruvec;
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	spin_lock_irqsave(&lruvec->lru_lock, *flags);
+	rcu_read_unlock();
+
+	lruvec_memcg_debug(lruvec, page);
+
+	return lruvec;
+}
+
 /**
  * mem_cgroup_update_lru_size - account for adding or removing an lru page
  * @lruvec: mem_cgroup per zone lru vector
@@ -3281,10 +3355,8 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
 /*
- * Because tail pages are not marked as "used", set it. We're under
- * pgdat->lru_lock and migration entries setup in all page mappings.
+ * Because page_memcg(head) is not set on compound tails, set it now.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {