
Commit 86fce8b

yuzhaogoogle authored and unifreq committed
BACKPORT: mm: multi-gen LRU: kill switch
Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
  0x0001: the multi-gen LRU core
  0x0002: walking page table, when arch_has_hw_pte_young() returns true
  0x0004: clearing the accessed bit in non-leaf PMD entries, when
          CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
  [yYnN]: apply to all the components above

E.g.,
  echo y >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0007
  echo 5 >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0005

NB: the page table walks happen on the scale of seconds under heavy
memory pressure, in which case the mmap_lock contention is a lesser
concern, compared with the LRU lock contention and the I/O congestion.
So far the only well-known case of the mmap_lock contention happens on
Android, due to Scudo [1] which allocates several thousand VMAs for
merely a few hundred MBs. The SPF and the Maple Tree also have provided
their own assessments [2][3]. However, if walking page tables does
worsen the mmap_lock contention, the kill switch can be used to disable
it. In this case the multi-gen LRU will suffer a minor performance
degradation, as shown previously.

Clearing the accessed bit in non-leaf PMD entries can also be disabled,
since this behavior was not tested on x86 varieties other than Intel
and AMD.

[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/

Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com
Change-Id: If3116e6698cc6967b6992c2017962fac6c2d3a11
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit 354ed597442952fb680c9cafc7e4eb8a76f9514c)
Bug: 249601646
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
1 parent 90b3a4c commit 86fce8b

6 files changed (+266 −9)
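For reference, the capability bits in the commit message compose as a plain bitmask. The sketch below is not part of the commit; it is a runnable userspace program, with the enum values copied from the patch, that reproduces the 0x0007 and 0x0005 values shown above.

#include <stdio.h>

enum {
	LRU_GEN_CORE,		/* bit 0 -> 0x0001 */
	LRU_GEN_MM_WALK,	/* bit 1 -> 0x0002 */
	LRU_GEN_NONLEAF_YOUNG,	/* bit 2 -> 0x0004 */
};

int main(void)
{
	unsigned int all = (1u << LRU_GEN_CORE) |
			   (1u << LRU_GEN_MM_WALK) |
			   (1u << LRU_GEN_NONLEAF_YOUNG);

	printf("echo y -> 0x%04x\n", all);				/* 0x0007 */
	printf("echo 5 -> 0x%04x\n", all & ~(1u << LRU_GEN_MM_WALK));	/* 0x0005 */
	return 0;
}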

include/linux/cgroup.h

+14 −1

@@ -437,6 +437,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
 	css_put(&cgrp->self);
 }
 
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+	mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+	mutex_unlock(&cgroup_mutex);
+}
+
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
@@ -451,7 +463,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c) \
 	rcu_dereference_check((task)->cgroups, \
@@ -711,6 +722,8 @@ struct cgroup;
 static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; }
 static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
 					 struct task_struct *t) { return 0; }
 static inline int cgroupstats_build(struct cgroupstats *stats,
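Moving the cgroup_mutex declaration out of #ifdef CONFIG_PROVE_RCU and adding these wrappers lets code outside kernel/cgroup/ freeze the cgroup hierarchy. A minimal pairing sketch, illustrative only; the actual caller introduced by this patch is lru_gen_change_state() in mm/vmscan.c below:

/* Illustrative fragment -- mirrors how lru_gen_change_state() below
 * uses the new wrappers to walk every memcg without racing against
 * cgroup creation or removal. */
cgroup_lock();
/* ... iterate memcgs via mem_cgroup_iter() safely here ... */
cgroup_unlock();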

include/linux/mm_inline.h

+13 −2

@@ -95,10 +95,21 @@ static __always_inline enum lru_list page_lru(struct page *page)
 
 #ifdef CONFIG_LRU_GEN
 
+#ifdef CONFIG_LRU_GEN_ENABLED
 static inline bool lru_gen_enabled(void)
 {
-	return true;
+	DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+	return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#else
+static inline bool lru_gen_enabled(void)
+{
+	DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+	return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
 }
+#endif
 
 static inline bool lru_gen_in_fault(void)
 {
@@ -211,7 +222,7 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
 
 	VM_WARN_ON_ONCE_PAGE(gen != -1, page);
 
-	if (PageUnevictable(page))
+	if (PageUnevictable(page) || !lrugen->enabled)
 		return false;
 	/*
 	 * There are three common cases for this page:
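A note on the pattern: DECLARE_STATIC_KEY_TRUE() at function scope declares (as extern) the key array defined in mm/vmscan.c, keeping it out of the header's global namespace. Static keys make the enabled check effectively free. The sketch below is a hypothetical, self-contained illustration of the same kernel API; the key name is made up and it is not code from this patch.

/* Hypothetical illustration of the static-key API used above. */
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_TRUE(demo_key);	/* defined once; initially on */

static bool demo_enabled(void)
{
	/* Compiles to a patchable jump rather than a load-and-test;
	 * flipping the key with static_branch_{enable,disable}()
	 * rewrites the branch at runtime. */
	return static_branch_likely(&demo_key);
}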

include/linux/mmzone.h

+9 −0

@@ -347,6 +347,13 @@ enum {
 	LRU_GEN_FILE,
 };
 
+enum {
+	LRU_GEN_CORE,
+	LRU_GEN_MM_WALK,
+	LRU_GEN_NONLEAF_YOUNG,
+	NR_LRU_GEN_CAPS
+};
+
 #define MIN_LRU_BATCH		BITS_PER_LONG
 #define MAX_LRU_BATCH		(MIN_LRU_BATCH * 64)
 
@@ -388,6 +395,8 @@ struct lru_gen_struct {
 	/* can be modified without holding the LRU lock */
 	atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+	/* whether the multi-gen LRU is enabled */
+	bool enabled;
 };
 
 enum {

kernel/cgroup/cgroup-internal.h

−1

@@ -165,7 +165,6 @@ struct cgroup_mgctx {
 #define DEFINE_CGROUP_MGCTX(name) \
 	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
 
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;

mm/Kconfig

+6 −0

@@ -921,6 +921,12 @@ config LRU_GEN
 	help
 	  A high performance LRU implementation to overcommit memory.
 
+config LRU_GEN_ENABLED
+	bool "Enable by default"
+	depends on LRU_GEN
+	help
+	  This option enables the multi-gen LRU by default.
+
 config LRU_GEN_STATS
 	bool "Full stats for debugging"
 	depends on LRU_GEN
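With this option, the default becomes a build-time choice; writes to the sysfs file still override it at runtime. A hypothetical config fragment enabling MGLRU by default:

CONFIG_LRU_GEN=y
CONFIG_LRU_GEN_ENABLED=y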

mm/vmscan.c

+224 −5

@@ -53,6 +53,7 @@
 #include <linux/psi.h>
 #include <linux/pagewalk.h>
 #include <linux/shmem_fs.h>
+#include <linux/ctype.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -2629,6 +2630,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 
 #ifdef CONFIG_LRU_GEN
 
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap)	static_branch_likely(&lru_gen_caps[cap])
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap)	static_branch_unlikely(&lru_gen_caps[cap])
+#endif
+
 /******************************************************************************
  *                          shorthand helpers
  ******************************************************************************/
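get_cap() is plain static-key indexing. Hand-expanded for illustration, assuming CONFIG_LRU_GEN_ENABLED=y:

/* get_cap(LRU_GEN_MM_WALK) expands, via the macro above, to: */
static_branch_likely(&lru_gen_caps[LRU_GEN_MM_WALK])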
@@ -3507,7 +3516,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
 			goto next;
 
 		if (!pmd_trans_huge(pmd[i])) {
-			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+			    get_cap(LRU_GEN_NONLEAF_YOUNG))
 				pmdp_test_and_clear_young(vma, addr, pmd + i);
 			goto next;
 		}
@@ -3605,10 +3615,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, struct mm_walk *args)
 		walk->mm_stats[MM_NONLEAF_TOTAL]++;
 
 #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
-		if (!pmd_young(val))
-			continue;
+		if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+			if (!pmd_young(val))
+				continue;
 
-		walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+		}
 #endif
 		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
 			continue;
@@ -3872,7 +3884,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	 * handful of PTEs. Spreading the work out over a period of time usually
 	 * is less efficient, but it avoids bursty page faults.
 	 */
-	if (!arch_has_hw_pte_young()) {
+	if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
 		success = iterate_mm_list_nowalk(lruvec, max_seq);
 		goto done;
 	}
@@ -4630,6 +4642,209 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	blk_finish_plug(&plug);
 }
 
+/******************************************************************************
+ *                          state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+	if (lrugen->enabled) {
+		enum lru_list lru;
+
+		for_each_evictable_lru(lru) {
+			if (!list_empty(&lruvec->lists[lru]))
+				return false;
+		}
+	} else {
+		int gen, type, zone;
+
+		for_each_gen_type_zone(gen, type, zone) {
+			if (!list_empty(&lrugen->lists[gen][type][zone]))
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+	enum lru_list lru;
+	int remaining = MAX_LRU_BATCH;
+
+	for_each_evictable_lru(lru) {
+		int type = is_file_lru(lru);
+		bool active = is_active_lru(lru);
+		struct list_head *head = &lruvec->lists[lru];
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
+			VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page);
+			VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
+			VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page);
+
+			del_page_from_lru_list(page, lruvec);
+			success = lru_gen_add_page(lruvec, page, false);
+			VM_WARN_ON_ONCE(!success);
+
+			if (!--remaining)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+	int gen, type, zone;
+	int remaining = MAX_LRU_BATCH;
+
+	for_each_gen_type_zone(gen, type, zone) {
+		struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
+			VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
+			VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
+			VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
+
+			success = lru_gen_del_page(lruvec, page, false);
+			VM_WARN_ON_ONCE(!success);
+			add_page_to_lru_list(page, lruvec);
+
+			if (!--remaining)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static void lru_gen_change_state(bool enabled)
+{
+	static DEFINE_MUTEX(state_mutex);
+
+	struct mem_cgroup *memcg;
+
+	cgroup_lock();
+	cpus_read_lock();
+	get_online_mems();
+	mutex_lock(&state_mutex);
+
+	if (enabled == lru_gen_enabled())
+		goto unlock;
+
+	if (enabled)
+		static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+	else
+		static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		int nid;
+
+		for_each_node(nid) {
+			struct pglist_data *pgdat = NODE_DATA(nid);
+			struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+			if (!lruvec)
+				continue;
+
+			spin_lock_irq(&pgdat->lru_lock);
+
+			VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+			VM_WARN_ON_ONCE(!state_is_valid(lruvec));
+
+			lruvec->lrugen.enabled = enabled;
+
+			while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+				spin_unlock_irq(&pgdat->lru_lock);
+				cond_resched();
+				spin_lock_irq(&pgdat->lru_lock);
+			}
+
+			spin_unlock_irq(&pgdat->lru_lock);
+		}
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+	mutex_unlock(&state_mutex);
+	put_online_mems();
+	cpus_read_unlock();
+	cgroup_unlock();
+}
+
+/******************************************************************************
+ *                          sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	unsigned int caps = 0;
+
+	if (get_cap(LRU_GEN_CORE))
+		caps |= BIT(LRU_GEN_CORE);
+
+	if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+		caps |= BIT(LRU_GEN_MM_WALK);
+
+	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+	return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+}
+
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+			     const char *buf, size_t len)
+{
+	int i;
+	unsigned int caps;
+
+	if (tolower(*buf) == 'n')
+		caps = 0;
+	else if (tolower(*buf) == 'y')
+		caps = -1;
+	else if (kstrtouint(buf, 0, &caps))
+		return -EINVAL;
+
+	for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+		bool enabled = caps & BIT(i);
+
+		if (i == LRU_GEN_CORE)
+			lru_gen_change_state(enabled);
+		else if (enabled)
+			static_branch_enable(&lru_gen_caps[i]);
+		else
+			static_branch_disable(&lru_gen_caps[i]);
+	}
+
+	return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+	enabled, 0644, show_enabled, store_enabled
+);
+
+static struct attribute *lru_gen_attrs[] = {
+	&lru_gen_enabled_attr.attr,
+	NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+	.name = "lru_gen",
+	.attrs = lru_gen_attrs,
+};
+
 /******************************************************************************
  *                          initialization
  ******************************************************************************/
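For clarity, a hedged userspace sketch (not kernel code) that mimics how store_enabled() above interprets input: 'y'/'n' toggle every cap, anything else is parsed as a number in any base kstrtouint() accepts. Note that a numeric write is absolute: every cap is set or cleared according to its bit.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_LRU_GEN_CAPS 3	/* LRU_GEN_CORE, LRU_GEN_MM_WALK, LRU_GEN_NONLEAF_YOUNG */

/* Returns 0 on success, -1 on a parse error (the kernel returns -EINVAL). */
static int parse_enabled(const char *buf, unsigned int *caps)
{
	char *end;

	if (tolower((unsigned char)buf[0]) == 'n')
		*caps = 0;		/* "n"/"N": clear every cap */
	else if (tolower((unsigned char)buf[0]) == 'y')
		*caps = -1;		/* "y"/"Y": set every cap */
	else {
		*caps = (unsigned int)strtoul(buf, &end, 0); /* base 0: "5" or "0x5" */
		if (end == buf)
			return -1;	/* kstrtouint() is stricter about trailing junk */
	}
	return 0;
}

int main(void)
{
	const char *inputs[] = { "y", "n", "5", "0x5" };
	unsigned int caps, mask = (1u << NR_LRU_GEN_CAPS) - 1;

	for (int i = 0; i < 4; i++)
		if (parse_enabled(inputs[i], &caps) == 0)
			printf("%-4s -> 0x%04x\n", inputs[i], caps & mask);
	return 0;
}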
@@ -4640,6 +4855,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 
 	lrugen->max_seq = MIN_NR_GENS + 1;
+	lrugen->enabled = lru_gen_enabled();
 
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -4679,6 +4895,9 @@ static int __init init_lru_gen(void)
 	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
 
+	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+		pr_err("lru_gen: failed to create sysfs group\n");
+
 	return 0;
 };
 late_initcall(init_lru_gen);
