Commit 9855609b authored by Roman Gushchin, committed by Linus Torvalds

mm: memcg/slab: use a single set of kmem_caches for all accounted allocations



This is a fairly big but mostly red patch, which makes all accounted slab
allocations use a single set of kmem_caches instead of creating a separate
set for each memory cgroup.

Because the number of non-root kmem_caches is now capped by the number of
root kmem_caches, there is no need to shrink or destroy them prematurely.
They can simply be destroyed together with their root counterparts.
This allows the management of non-root kmem_caches to be dramatically
simplified and a ton of code to be deleted.

This patch performs the following changes:
1) introduces a memcg_params.memcg_cache pointer to represent the
   kmem_cache which will be used for all non-root allocations
2) reuses the existing memcg kmem_cache creation mechanism
   to create the memcg kmem_cache on the first allocation attempt
3) memcg kmem_caches are named <kmemcache_name>-memcg,
   e.g. dentry-memcg
4) simplifies memcg_kmem_get_cache() to just return the memcg kmem_cache
   or schedule its creation and return the root cache
5) removes almost all non-root kmem_cache management code
   (separate refcounter, reparenting, shrinking, etc)
6) makes slab debugfs display the root_mem_cgroup css id and never
   show the :dead and :deact flags in the memcg_slabinfo attribute.

Subsequent patches in the series will simplify kmem_cache creation.
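As a reading aid, here is a condensed sketch of the lookup path after this
patch. It is paraphrased from the memcg_kmem_get_cache() hunk further down
in this diff and is illustrative only, not a verbatim copy of the kernel
source:

struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
{
	struct kmem_cache *memcg_cachep;

	/* The single memcg cache now hangs directly off the root cache. */
	memcg_cachep = READ_ONCE(cachep->memcg_params.memcg_cache);
	if (unlikely(!memcg_cachep)) {
		/*
		 * Not created yet: schedule asynchronous creation and let
		 * this allocation proceed with the root cache.
		 */
		memcg_schedule_kmem_cache_create(cachep);
		return cachep;
	}

	return memcg_cachep;
}

Compare this with the pre-patch version removed below, which had to look up
the per-memcg cache array under RCU and juggle percpu refcounts.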

Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20200623174037.3951353-13-guro@fb.com


Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 0f876e4d
+1 −4
@@ -317,7 +317,6 @@ struct mem_cgroup {
        /* Index in the kmem_cache->memcg_params.memcg_caches array */
	int kmemcg_id;
	enum memcg_kmem_state kmem_state;
	struct list_head kmem_caches;
	struct obj_cgroup __rcu *objcg;
	struct list_head objcg_list; /* list of inherited objcgs */
#endif
@@ -1404,9 +1403,7 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
}
#endif

struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep,
					struct obj_cgroup **objcgp);
void memcg_kmem_put_cache(struct kmem_cache *cachep);
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);

#ifdef CONFIG_MEMCG_KMEM
int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+1 −4
@@ -155,8 +155,7 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
void kmem_cache_destroy(struct kmem_cache *);
int kmem_cache_shrink(struct kmem_cache *);

void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *);
void memcg_create_kmem_cache(struct kmem_cache *cachep);

/*
 * Please use this macro to create slab caches. Simply specify the
@@ -580,8 +579,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
	return __kmalloc_node(size, flags, node);
}

int memcg_update_all_caches(int num_memcgs);

/**
 * kmalloc_array - allocate memory for an array.
 * @n: number of elements.
+32 −131
@@ -350,7 +350,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
}

/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * This will be used as a shrinker list's index.
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -569,9 +569,6 @@ ino_t page_cgroup_ino(struct page *page)
	unsigned long ino = 0;

	rcu_read_lock();
	if (PageSlab(page) && !PageTail(page)) {
		memcg = memcg_from_slab_page(page);
	} else {
	memcg = page->mem_cgroup;

	/*
@@ -582,7 +579,6 @@ ino_t page_cgroup_ino(struct page *page)
	 */
	if ((unsigned long) memcg & 0x1UL)
		memcg = NULL;
	}

	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
@@ -2822,12 +2818,18 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
	page = virt_to_head_page(p);

	/*
	 * Slab pages don't have page->mem_cgroup set because corresponding
	 * kmem caches can be reparented during the lifetime. That's why
	 * memcg_from_slab_page() should be used instead.
	 * Slab objects are accounted individually, not per-page.
	 * Memcg membership data for each individual object is saved in
	 * the page->obj_cgroups.
	 */
	if (PageSlab(page))
		return memcg_from_slab_page(page);
	if (page_has_obj_cgroups(page)) {
		struct obj_cgroup *objcg;
		unsigned int off;

		off = obj_to_index(page->slab_cache, page, p);
		objcg = page_obj_cgroups(page)[off];
		return obj_cgroup_memcg(objcg);
	}

	/* All other pages use page->mem_cgroup */
	return page->mem_cgroup;
@@ -2882,8 +2884,6 @@ static int memcg_alloc_cache_id(void)
	else if (size > MEMCG_CACHES_MAX_SIZE)
		size = MEMCG_CACHES_MAX_SIZE;

	err = memcg_update_all_caches(size);
	if (!err)
	err = memcg_update_all_list_lrus(size);
	if (!err)
		memcg_nr_cache_ids = size;
@@ -2903,7 +2903,6 @@ static void memcg_free_cache_id(int id)
}

struct memcg_kmem_cache_create_work {
	struct mem_cgroup *memcg;
	struct kmem_cache *cachep;
	struct work_struct work;
};
@@ -2912,33 +2911,24 @@ static void memcg_kmem_cache_create_func(struct work_struct *w)
{
	struct memcg_kmem_cache_create_work *cw =
		container_of(w, struct memcg_kmem_cache_create_work, work);
	struct mem_cgroup *memcg = cw->memcg;
	struct kmem_cache *cachep = cw->cachep;

	memcg_create_kmem_cache(memcg, cachep);
	memcg_create_kmem_cache(cachep);

	css_put(&memcg->css);
	kfree(cw);
}

/*
 * Enqueue the creation of a per-memcg kmem_cache.
 */
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
					       struct kmem_cache *cachep)
static void memcg_schedule_kmem_cache_create(struct kmem_cache *cachep)
{
	struct memcg_kmem_cache_create_work *cw;

	if (!css_tryget_online(&memcg->css))
		return;

	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
	if (!cw) {
		css_put(&memcg->css);
	if (!cw)
		return;
	}

	cw->memcg = memcg;
	cw->cachep = cachep;
	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);

@@ -2946,102 +2936,26 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
}

/**
 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
 * memcg_kmem_get_cache: select memcg or root cache for allocation
 * @cachep: the original global kmem cache
 *
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, if we are the first user of it, we
 * create it asynchronously in a workqueue and let the current allocation
 * go through with the original cache.
 *
 * This function takes a reference to the cache it returns to assure it
 * won't get destroyed while we are working with it. Once the caller is
 * done with it, memcg_kmem_put_cache() must be called to release the
 * reference.
 */
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep,
					struct obj_cgroup **objcgp)
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
{
	struct mem_cgroup *memcg;
	struct kmem_cache *memcg_cachep;
	struct memcg_cache_array *arr;
	int kmemcg_id;

	VM_BUG_ON(!is_root_cache(cachep));

	if (memcg_kmem_bypass())
		return cachep;

	rcu_read_lock();

	if (unlikely(current->active_memcg))
		memcg = current->active_memcg;
	else
		memcg = mem_cgroup_from_task(current);

	if (!memcg || memcg == root_mem_cgroup)
		goto out_unlock;

	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
	if (kmemcg_id < 0)
		goto out_unlock;

	arr = rcu_dereference(cachep->memcg_params.memcg_caches);

	/*
	 * Make sure we will access the up-to-date value. The code updating
	 * memcg_caches issues a write barrier to match the data dependency
	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
	 */
	memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);

	/*
	 * If we are in a safe context (can wait, and not in interrupt
	 * context), we could be predictable and return right away.
	 * This would guarantee that the allocation being performed
	 * already belongs in the new cache.
	 *
	 * However, there are some clashes that can arrive from locking.
	 * For instance, because we acquire the slab_mutex while doing
	 * memcg_create_kmem_cache, this means no further allocation
	 * could happen with the slab_mutex held. So it's better to
	 * defer everything.
	 *
	 * If the memcg is dying or memcg_cache is about to be released,
	 * don't bother creating new kmem_caches. Because memcg_cachep
	 * is ZEROed as the first step of kmem offlining, we don't need
	 * percpu_ref_tryget_live() here. css_tryget_online() check in
	 * memcg_schedule_kmem_cache_create() will prevent us from
	 * creation of a new kmem_cache.
	 */
	if (unlikely(!memcg_cachep))
		memcg_schedule_kmem_cache_create(memcg, cachep);
	else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt)) {
		struct obj_cgroup *objcg = rcu_dereference(memcg->objcg);

		if (!objcg || !obj_cgroup_tryget(objcg)) {
			percpu_ref_put(&memcg_cachep->memcg_params.refcnt);
			goto out_unlock;
		}

		*objcgp = objcg;
		cachep = memcg_cachep;
	}
out_unlock:
	rcu_read_unlock();
	memcg_cachep = READ_ONCE(cachep->memcg_params.memcg_cache);
	if (unlikely(!memcg_cachep)) {
		memcg_schedule_kmem_cache_create(cachep);
		return cachep;
	}

/**
 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
 * @cachep: the cache returned by memcg_kmem_get_cache
 */
void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
	if (!is_root_cache(cachep))
		percpu_ref_put(&cachep->memcg_params.refcnt);
	return memcg_cachep;
}

/**
@@ -3731,7 +3645,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
	 */
	memcg->kmemcg_id = memcg_id;
	memcg->kmem_state = KMEM_ONLINE;
	INIT_LIST_HEAD(&memcg->kmem_caches);

	return 0;
}
@@ -3744,22 +3657,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)

	if (memcg->kmem_state != KMEM_ONLINE)
		return;
	/*
	 * Clear the online state before clearing memcg_caches array
	 * entries. The slab_mutex in memcg_deactivate_kmem_caches()
	 * guarantees that no cache will be created for this cgroup
	 * after we are done (see memcg_create_kmem_cache()).
	 */

	memcg->kmem_state = KMEM_ALLOCATED;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/*
	 * Deactivate and reparent kmem_caches and objcgs.
	 */
	memcg_deactivate_kmem_caches(memcg, parent);
	memcg_reparent_objcgs(memcg, parent);

	kmemcg_id = memcg->kmemcg_id;
@@ -5384,9 +5288,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)

	/* The following stuff does not apply to the root */
	if (!parent) {
#ifdef CONFIG_MEMCG_KMEM
		INIT_LIST_HEAD(&memcg->kmem_caches);
#endif
		root_mem_cgroup = memcg;
		return &memcg->css;
	}
+3 −13
@@ -1249,7 +1249,7 @@ void __init kmem_cache_init(void)
				  nr_node_ids * sizeof(struct kmem_cache_node *),
				  SLAB_HWCACHE_ALIGN, 0, 0);
	list_add(&kmem_cache->list, &slab_caches);
	memcg_link_cache(kmem_cache, NULL);
	memcg_link_cache(kmem_cache);
	slab_state = PARTIAL;

	/*
@@ -2253,17 +2253,6 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
	return (ret ? 1 : 0);
}

#ifdef CONFIG_MEMCG
void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
{
	__kmem_cache_shrink(cachep);
}

void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
}
#endif

int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
	return __kmem_cache_shrink(cachep);
@@ -3872,7 +3861,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
		return ret;

	lockdep_assert_held(&slab_mutex);
	for_each_memcg_cache(c, cachep) {
	c = memcg_cache(cachep);
	if (c) {
		/* return value determined by the root cache only */
		__do_tune_cpucache(c, limit, batchcount, shared, gfp);
	}
+45 −101
@@ -32,66 +32,25 @@ struct kmem_cache {

#else /* !CONFIG_SLOB */

struct memcg_cache_array {
	struct rcu_head rcu;
	struct kmem_cache *entries[0];
};

/*
 * This is the main placeholder for memcg-related information in kmem caches.
 * Both the root cache and the child caches will have it. For the root cache,
 * this will hold a dynamically allocated array large enough to hold
 * information about the currently limited memcgs in the system. To allow the
 * array to be accessed without taking any locks, on relocation we free the old
 * version only after a grace period.
 *
 * Root and child caches hold different metadata.
 * Both the root cache and the child cache will have it. Some fields are used
 * in both cases, other are specific to root caches.
 *
 * @root_cache:	Common to root and child caches.  NULL for root, pointer to
 *		the root cache for children.
 *
 * The following fields are specific to root caches.
 *
 * @memcg_caches: kmemcg ID indexed table of child caches.  This table is
 *		used to index child caches during allocation and cleared
 *		early during shutdown.
 *
 * @root_caches_node: List node for slab_root_caches list.
 *
 * @children:	List of all child caches.  While the child caches are also
 *		reachable through @memcg_caches, a child cache remains on
 *		this list until it is actually destroyed.
 *
 * The following fields are specific to child caches.
 *
 * @memcg:	Pointer to the memcg this cache belongs to.
 *
 * @children_node: List node for @root_cache->children list.
 *
 * @kmem_caches_node: List node for @memcg->kmem_caches list.
 * @memcg_cache: pointer to memcg kmem cache, used by all non-root memory
 *		cgroups.
 * @root_caches_node: list node for slab_root_caches list.
 */
struct memcg_cache_params {
	struct kmem_cache *root_cache;
	union {
		struct {
			struct memcg_cache_array __rcu *memcg_caches;

	struct kmem_cache *memcg_cache;
	struct list_head __root_caches_node;
			struct list_head children;
			bool dying;
		};
		struct {
			struct mem_cgroup *memcg;
			struct list_head children_node;
			struct list_head kmem_caches_node;
			struct percpu_ref refcnt;

			void (*work_fn)(struct kmem_cache *);
			union {
				struct rcu_head rcu_head;
				struct work_struct work;
			};
		};
	};
};
#endif /* CONFIG_SLOB */

@@ -236,8 +195,6 @@ bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
void __kmemcg_cache_deactivate(struct kmem_cache *s);
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
void kmem_cache_shrink_all(struct kmem_cache *s);

@@ -311,14 +268,6 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
extern struct list_head		slab_root_caches;
#define root_caches_node	memcg_params.__root_caches_node

/*
 * Iterate over all memcg caches of the given root cache. The caller must hold
 * slab_mutex.
 */
#define for_each_memcg_cache(iter, root) \
	list_for_each_entry(iter, &(root)->memcg_params.children, \
			    memcg_params.children_node)

static inline bool is_root_cache(struct kmem_cache *s)
{
	return !s->memcg_params.root_cache;
@@ -349,6 +298,13 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
	return s->memcg_params.root_cache;
}

static inline struct kmem_cache *memcg_cache(struct kmem_cache *s)
{
	if (is_root_cache(s))
		return s->memcg_params.memcg_cache;
	return NULL;
}

static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
{
	/*
@@ -361,25 +317,9 @@ static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
		((unsigned long)page->obj_cgroups & ~0x1UL);
}

/*
 * Expects a pointer to a slab page. Please note, that PageSlab() check
 * isn't sufficient, as it returns true also for tail compound slab pages,
 * which do not have slab_cache pointer set.
 * So this function assumes that the page can pass PageSlab() && !PageTail()
 * check.
 *
 * The kmem_cache can be reparented asynchronously. The caller must ensure
 * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
 */
static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
static inline bool page_has_obj_cgroups(struct page *page)
{
	struct kmem_cache *s;

	s = READ_ONCE(page->slab_cache);
	if (s && !is_root_cache(s))
		return READ_ONCE(s->memcg_params.memcg);

	return NULL;
	return ((unsigned long)page->obj_cgroups & 0x1UL);
}

static inline int memcg_alloc_page_obj_cgroups(struct page *page,
@@ -418,17 +358,25 @@ static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
						size_t objects, gfp_t flags)
{
	struct kmem_cache *cachep;
	struct obj_cgroup *objcg;

	cachep = memcg_kmem_get_cache(s, objcgp);
	if (memcg_kmem_bypass())
		return s;

	cachep = memcg_kmem_get_cache(s);
	if (is_root_cache(cachep))
		return s;

	if (obj_cgroup_charge(*objcgp, flags, objects * obj_full_size(s))) {
		obj_cgroup_put(*objcgp);
		memcg_kmem_put_cache(cachep);
	objcg = get_obj_cgroup_from_current();
	if (!objcg)
		return s;

	if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
		obj_cgroup_put(objcg);
		cachep = NULL;
	}

	*objcgp = objcg;
	return cachep;
}

@@ -467,7 +415,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
		}
	}
	obj_cgroup_put(objcg);
	memcg_kmem_put_cache(s);
}

static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
@@ -491,7 +438,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
}

extern void slab_init_memcg_params(struct kmem_cache *);
extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg);
extern void memcg_link_cache(struct kmem_cache *s);

#else /* CONFIG_MEMCG_KMEM */

@@ -499,9 +446,6 @@ extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg);
#define slab_root_caches	slab_caches
#define root_caches_node	list

#define for_each_memcg_cache(iter, root) \
	for ((void)(iter), (void)(root); 0; )

static inline bool is_root_cache(struct kmem_cache *s)
{
	return true;
@@ -523,7 +467,17 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
	return s;
}

static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
static inline struct kmem_cache *memcg_cache(struct kmem_cache *s)
{
	return NULL;
}

static inline bool page_has_obj_cgroups(struct page *page)
{
	return false;
}

static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
{
	return NULL;
}
@@ -560,8 +514,7 @@ static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}

static inline void memcg_link_cache(struct kmem_cache *s,
				    struct mem_cgroup *memcg)
static inline void memcg_link_cache(struct kmem_cache *s)
{
}

@@ -582,17 +535,14 @@ static __always_inline int charge_slab_page(struct page *page,
					    gfp_t gfp, int order,
					    struct kmem_cache *s)
{
#ifdef CONFIG_MEMCG_KMEM
	if (memcg_kmem_enabled() && !is_root_cache(s)) {
		int ret;

		ret = memcg_alloc_page_obj_cgroups(page, s, gfp);
		if (ret)
			return ret;

		percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
	}
#endif

	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
			    PAGE_SIZE << order);
	return 0;
@@ -601,12 +551,9 @@ static __always_inline int charge_slab_page(struct page *page,
static __always_inline void uncharge_slab_page(struct page *page, int order,
					       struct kmem_cache *s)
{
#ifdef CONFIG_MEMCG_KMEM
	if (memcg_kmem_enabled() && !is_root_cache(s)) {
	if (memcg_kmem_enabled() && !is_root_cache(s))
		memcg_free_page_obj_cgroups(page);
		percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
	}
#endif

	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
			    -(PAGE_SIZE << order));
}
@@ -749,9 +696,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
void *slab_start(struct seq_file *m, loff_t *pos);
void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
void *memcg_slab_start(struct seq_file *m, loff_t *pos);
void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
void memcg_slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);

#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)