Commit 71a2c112 authored by Kirill A. Shutemov's avatar Kirill A. Shutemov Committed by Linus Torvalds
Browse files

khugepaged: introduce 'max_ptes_shared' tunable

'max_ptes_shared' specifies how many pages can be shared across multiple
processes.  Exceeding the number would block the collapse::

	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared

A higher value may increase memory footprint for some workloads.

By default, at least half of pages has to be not shared.

[colin.king@canonical.com: fix several spelling mistakes]
  Link: http://lkml.kernel.org/r/20200420084241.65433-1-colin.king@canonical.com


Signed-off-by: default avatarKirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: default avatarColin Ian King <colin.king@canonical.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Tested-by: default avatarZi Yan <ziy@nvidia.com>
Reviewed-by: default avatarWilliam Kucharski <william.kucharski@oracle.com>
Reviewed-by: default avatarZi Yan <ziy@nvidia.com>
Acked-by: default avatarYang Shi <yang.shi@linux.alibaba.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Link: http://lkml.kernel.org/r/20200416160026.16538-9-kirill.shutemov@linux.intel.com


Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 3917c802
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being
collapsed, resulting fewer pages being collapsed into
THPs, and lower memory access performance.

``max_ptes_shared`` specifies how many pages can be shared across multiple
processes. Exceeding the number would block the collapse::

	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared

A higher value may increase memory footprint for some workloads.

Boot parameter
==============

+2 −1
Original line number Diff line number Diff line
@@ -12,6 +12,8 @@
	EM( SCAN_SUCCEED,		"succeeded")			\
	EM( SCAN_PMD_NULL,		"pmd_null")			\
	EM( SCAN_EXCEED_NONE_PTE,	"exceed_none_pte")		\
	EM( SCAN_EXCEED_SWAP_PTE,	"exceed_swap_pte")		\
	EM( SCAN_EXCEED_SHARED_PTE,	"exceed_shared_pte")		\
	EM( SCAN_PTE_NON_PRESENT,	"pte_non_present")		\
	EM( SCAN_PTE_UFFD_WP,		"pte_uffd_wp")			\
	EM( SCAN_PAGE_RO,		"no_writable_page")		\
@@ -31,7 +33,6 @@
	EM( SCAN_DEL_PAGE_LRU,		"could_not_delete_page_from_lru")\
	EM( SCAN_ALLOC_HUGE_PAGE_FAIL,	"alloc_huge_page_failed")	\
	EM( SCAN_CGROUP_CHARGE_FAIL,	"ccgroup_charge_failed")	\
	EM( SCAN_EXCEED_SWAP_PTE,	"exceed_swap_pte")		\
	EM( SCAN_TRUNCATED,		"truncated")			\
	EMe(SCAN_PAGE_HAS_PRIVATE,	"page_has_private")		\

+48 −4
Original line number Diff line number Diff line
@@ -28,6 +28,8 @@ enum scan_result {
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_EXCEED_NONE_PTE,
	SCAN_EXCEED_SWAP_PTE,
	SCAN_EXCEED_SHARED_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PTE_UFFD_WP,
	SCAN_PAGE_RO,
@@ -47,7 +49,6 @@ enum scan_result {
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL,
	SCAN_EXCEED_SWAP_PTE,
	SCAN_TRUNCATED,
	SCAN_PAGE_HAS_PRIVATE,
};
@@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
	__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
	       khugepaged_max_ptes_swap_store);

static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
}

static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_shared;

	err  = kstrtoul(buf, 10, &max_ptes_shared);
	if (err || max_ptes_shared > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_shared = max_ptes_shared;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_shared_attr =
	__ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
	       khugepaged_max_ptes_shared_store);

static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	&khugepaged_max_ptes_shared_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	NULL,
};

@@ -359,6 +389,7 @@ int __init khugepaged_init(void)
	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;

	return 0;
}
@@ -557,7 +588,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
{
	struct page *page = NULL;
	pte_t *_pte;
	int none_or_zero = 0, result = 0, referenced = 0;
	int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
	bool writable = false;

	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
@@ -585,6 +616,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,

		VM_BUG_ON_PAGE(!PageAnon(page), page);

		if (page_mapcount(page) > 1 &&
				++shared > khugepaged_max_ptes_shared) {
			result = SCAN_EXCEED_SHARED_PTE;
			goto out;
		}

		if (PageCompound(page)) {
			struct page *p;
			page = compound_head(page);
@@ -1168,7 +1205,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
	pmd_t *pmd;
	pte_t *pte, *_pte;
	int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
	int ret = 0, result = 0, referenced = 0;
	int none_or_zero = 0, shared = 0;
	struct page *page = NULL;
	unsigned long _address;
	spinlock_t *ptl;
@@ -1240,6 +1278,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
			goto out_unmap;
		}

		if (page_mapcount(page) > 1 &&
				++shared > khugepaged_max_ptes_shared) {
			result = SCAN_EXCEED_SHARED_PTE;
			goto out_unmap;
		}

		page = compound_head(page);

		/*
+83 −0
Original line number Diff line number Diff line
@@ -78,6 +78,7 @@ struct khugepaged_settings {
	unsigned int scan_sleep_millisecs;
	unsigned int max_ptes_none;
	unsigned int max_ptes_swap;
	unsigned int max_ptes_shared;
	unsigned long pages_to_scan;
};

@@ -277,6 +278,7 @@ static void write_settings(struct settings *settings)
			khugepaged->scan_sleep_millisecs);
	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
	write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
}

@@ -313,6 +315,7 @@ static void save_settings(void)
			read_num("khugepaged/scan_sleep_millisecs"),
		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
		.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
	};
	success("OK");
@@ -896,12 +899,90 @@ static void collapse_fork_compound(void)
			fail("Fail");
		fill_memory(p, 0, page_size);

		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
		if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
			fail("Timeout");
		else if (check_huge(p))
			success("OK");
		else
			fail("Fail");
		write_num("khugepaged/max_ptes_shared",
				default_settings.khugepaged.max_ptes_shared);

		validate_memory(p, 0, hpage_pmd_size);
		munmap(p, hpage_pmd_size);
		exit(exit_status);
	}

	wait(&wstatus);
	exit_status += WEXITSTATUS(wstatus);

	printf("Check if parent still has huge page...");
	if (check_huge(p))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, hpage_pmd_size);
	munmap(p, hpage_pmd_size);
}

static void collapse_max_ptes_shared()
{
	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
	int wstatus;
	void *p;

	p = alloc_mapping();

	printf("Allocate huge page...");
	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
	fill_memory(p, 0, hpage_pmd_size);
	if (check_huge(p))
		success("OK");
	else
		fail("Fail");

	printf("Share huge page over fork()...");
	if (!fork()) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (check_huge(p))
			success("OK");
		else
			fail("Fail");

		printf("Trigger CoW on page %d of %d...",
				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
		if (!check_huge(p))
			success("OK");
		else
			fail("Fail");

		if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p))
			fail("Timeout");
		else if (!check_huge(p))
			success("OK");
		else
			fail("Fail");

		printf("Trigger CoW on page %d of %d...",
				hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
		if (!check_huge(p))
			success("OK");
		else
			fail("Fail");


		if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
			fail("Timeout");
		else if (check_huge(p))
			success("OK");
		else
			fail("Fail");

		validate_memory(p, 0, hpage_pmd_size);
		munmap(p, hpage_pmd_size);
@@ -930,6 +1011,7 @@ int main(void)

	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;

	save_settings();
@@ -947,6 +1029,7 @@ int main(void)
	collapse_compound_extreme();
	collapse_fork();
	collapse_fork_compound();
	collapse_max_ptes_shared();

	restore_settings(0);
}