
Commit b5dc3ef

yuzhaogoogle authored and gregkh committed
mm/hugetlb_vmemmap: batch HVO work when demoting
commit c0f398c upstream.

Batch the HVO work, including de-HVO of the source and HVO of the
destination hugeTLB folios, to speed up demotion.

After commit bd22553 ("mm/hugetlb_vmemmap: fix race with speculative PFN
walkers"), each request of HVO or de-HVO, batched or not, invokes
synchronize_rcu() once. For example, when not batched, demoting one 1GB
hugeTLB folio to 512 2MB hugeTLB folios invokes synchronize_rcu() 513 times
(1 de-HVO plus 512 HVO requests), whereas when batched, only twice (1 de-HVO
plus 1 HVO request). And the performance difference between the two cases is
significant, e.g.,

  echo 2048kB >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size
  time echo 100 >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote

Before this patch:
  real     8m58.158s
  user     0m0.009s
  sys      0m5.900s

After this patch:
  real     0m0.900s
  user     0m0.000s
  sys      0m0.851s

Note that this patch changes the behavior of the `demote` interface when
de-HVO fails. Before, the interface aborts immediately upon failure; now, it
tries to finish an entire batch, meaning it can make extra progress if the
rest of the batch contains folios that do not need to de-HVO.

Link: https://lkml.kernel.org/r/20240812224823.3914837-1-yuzhao@google.com
Fixes: bd22553 ("mm/hugetlb_vmemmap: fix race with speculative PFN walkers")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
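To make the grace-period arithmetic above concrete, the following standalone
userspace C sketch (not kernel code; it only encodes the simplified model
stated in the commit message, where every HVO or de-HVO request costs exactly
one synchronize_rcu()) computes the counts for demoting one 1GB folio into
2MB folios:

#include <stdio.h>

/*
 * Simplified model from the commit message: every HVO or de-HVO request,
 * batched or not, invokes synchronize_rcu() exactly once.
 */
int main(void)
{
        long src_size = 1024L * 1024 * 1024;    /* 1GB source folio */
        long dst_size = 2L * 1024 * 1024;       /* 2MB destination folios */
        long nr_dst = src_size / dst_size;      /* 512 destination folios */

        /* Unbatched: 1 de-HVO of the source plus one HVO per destination. */
        long unbatched = 1 + nr_dst;
        /* Batched: 1 de-HVO request plus 1 HVO request for the whole list. */
        long batched = 1 + 1;

        printf("unbatched: %ld grace periods\n", unbatched);   /* 513 */
        printf("batched:   %ld grace periods\n", batched);     /* 2 */
        return 0;
}

Under this model, batching collapses the 512 per-folio HVO requests into a
single request over the whole destination list, which is where the wall-clock
improvement shown above comes from.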
1 parent 331ed2c commit b5dc3ef

File tree

1 file changed: +92 -64 lines


mm/hugetlb.c

+92 -64
@@ -3921,100 +3921,124 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	return 0;
 }
 
-static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
+static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
+				       struct list_head *src_list)
 {
-	int i, nid = folio_nid(folio);
-	struct hstate *target_hstate;
-	struct page *subpage;
-	struct folio *inner_folio;
-	int rc = 0;
-
-	target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
+	long rc;
+	struct folio *folio, *next;
+	LIST_HEAD(dst_list);
+	LIST_HEAD(ret_list);
 
-	remove_hugetlb_folio(h, folio, false);
-	spin_unlock_irq(&hugetlb_lock);
-
-	/*
-	 * If vmemmap already existed for folio, the remove routine above would
-	 * have cleared the hugetlb folio flag. Hence the folio is technically
-	 * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be
-	 * passed hugetlb folios and will BUG otherwise.
-	 */
-	if (folio_test_hugetlb(folio)) {
-		rc = hugetlb_vmemmap_restore_folio(h, folio);
-		if (rc) {
-			/* Allocation of vmemmmap failed, we can not demote folio */
-			spin_lock_irq(&hugetlb_lock);
-			add_hugetlb_folio(h, folio, false);
-			return rc;
-		}
-	}
-
-	/*
-	 * Use destroy_compound_hugetlb_folio_for_demote for all huge page
-	 * sizes as it will not ref count folios.
-	 */
-	destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
+	rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list);
+	list_splice_init(&ret_list, src_list);
 
 	/*
 	 * Taking target hstate mutex synchronizes with set_max_huge_pages.
 	 * Without the mutex, pages added to target hstate could be marked
 	 * as surplus.
 	 *
-	 * Note that we already hold h->resize_lock. To prevent deadlock,
+	 * Note that we already hold src->resize_lock. To prevent deadlock,
 	 * use the convention of always taking larger size hstate mutex first.
 	 */
-	mutex_lock(&target_hstate->resize_lock);
-	for (i = 0; i < pages_per_huge_page(h);
-				i += pages_per_huge_page(target_hstate)) {
-		subpage = folio_page(folio, i);
-		inner_folio = page_folio(subpage);
-		if (hstate_is_gigantic(target_hstate))
-			prep_compound_gigantic_folio_for_demote(inner_folio,
-							target_hstate->order);
-		else
-			prep_compound_page(subpage, target_hstate->order);
-		folio_change_private(inner_folio, NULL);
-		prep_new_hugetlb_folio(target_hstate, inner_folio, nid);
-		free_huge_folio(inner_folio);
+	mutex_lock(&dst->resize_lock);
+
+	list_for_each_entry_safe(folio, next, src_list, lru) {
+		int i;
+
+		if (folio_test_hugetlb_vmemmap_optimized(folio))
+			continue;
+
+		list_del(&folio->lru);
+		/*
+		 * Use destroy_compound_hugetlb_folio_for_demote for all huge page
+		 * sizes as it will not ref count folios.
+		 */
+		destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(src));
+
+		for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
+			struct page *page = folio_page(folio, i);
+
+			if (hstate_is_gigantic(dst))
+				prep_compound_gigantic_folio_for_demote(page_folio(page),
+									dst->order);
+			else
+				prep_compound_page(page, dst->order);
+			set_page_private(page, 0);
+
+			init_new_hugetlb_folio(dst, page_folio(page));
+			list_add(&page->lru, &dst_list);
+		}
 	}
-	mutex_unlock(&target_hstate->resize_lock);
 
-	spin_lock_irq(&hugetlb_lock);
+	prep_and_add_allocated_folios(dst, &dst_list);
 
-	/*
-	 * Not absolutely necessary, but for consistency update max_huge_pages
-	 * based on pool changes for the demoted page.
-	 */
-	h->max_huge_pages--;
-	target_hstate->max_huge_pages +=
-		pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
+	mutex_unlock(&dst->resize_lock);
 
 	return rc;
 }
 
-static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
+				  unsigned long nr_to_demote)
 	__must_hold(&hugetlb_lock)
 {
 	int nr_nodes, node;
-	struct folio *folio;
+	struct hstate *dst;
+	long rc = 0;
+	long nr_demoted = 0;
 
 	lockdep_assert_held(&hugetlb_lock);
 
 	/* We should never get here if no demote order */
-	if (!h->demote_order) {
+	if (!src->demote_order) {
 		pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
 		return -EINVAL;		/* internal error */
 	}
+	dst = size_to_hstate(PAGE_SIZE << src->demote_order);
 
-	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
-		list_for_each_entry(folio, &h->hugepage_freelists[node], lru) {
+	for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) {
+		LIST_HEAD(list);
+		struct folio *folio, *next;
+
+		list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) {
 			if (folio_test_hwpoison(folio))
 				continue;
-			return demote_free_hugetlb_folio(h, folio);
+
+			remove_hugetlb_folio(src, folio, false);
+			list_add(&folio->lru, &list);
+
+			if (++nr_demoted == nr_to_demote)
+				break;
 		}
+
+		spin_unlock_irq(&hugetlb_lock);
+
+		rc = demote_free_hugetlb_folios(src, dst, &list);
+
+		spin_lock_irq(&hugetlb_lock);
+
+		list_for_each_entry_safe(folio, next, &list, lru) {
+			list_del(&folio->lru);
+			add_hugetlb_folio(src, folio, false);
+
+			nr_demoted--;
+		}
+
+		if (rc < 0 || nr_demoted == nr_to_demote)
+			break;
 	}
 
+	/*
+	 * Not absolutely necessary, but for consistency update max_huge_pages
+	 * based on pool changes for the demoted page.
+	 */
+	src->max_huge_pages -= nr_demoted;
+	dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst));
+
+	if (rc < 0)
+		return rc;
+
+	if (nr_demoted)
+		return nr_demoted;
 	/*
 	 * Only way to get here is if all pages on free lists are poisoned.
 	 * Return -EBUSY so that caller will not retry.
@@ -4249,6 +4273,8 @@ static ssize_t demote_store(struct kobject *kobj,
 	spin_lock_irq(&hugetlb_lock);
 
 	while (nr_demote) {
+		long rc;
+
 		/*
 		 * Check for available pages to demote each time thorough the
 		 * loop as demote_pool_huge_page will drop hugetlb_lock.
@@ -4261,11 +4287,13 @@ static ssize_t demote_store(struct kobject *kobj,
 		if (!nr_available)
 			break;
 
-		err = demote_pool_huge_page(h, n_mask);
-		if (err)
+		rc = demote_pool_huge_page(h, n_mask, nr_demote);
+		if (rc < 0) {
+			err = rc;
 			break;
+		}
 
-		nr_demote--;
+		nr_demote -= rc;
 	}
 
 	spin_unlock_irq(&hugetlb_lock);
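As a reading aid for the demote_store() change above, here is a small
standalone sketch of the new caller contract: demote_pool_huge_page() now
returns a long that is either a negative error or the number of folios
demoted in one batch, and the caller subtracts that count instead of
decrementing by one. The helper name demote_batch() and the batch size are
hypothetical; this is a userspace illustration, not kernel code.

#include <stdio.h>

/*
 * Hypothetical stand-in for demote_pool_huge_page(): demotes up to
 * nr_to_demote folios in one batch and returns how many it handled,
 * or a negative errno on failure.
 */
static long demote_batch(unsigned long nr_to_demote)
{
        unsigned long batch = nr_to_demote < 8 ? nr_to_demote : 8;

        return (long)batch;     /* pretend the whole batch succeeded */
}

int main(void)
{
        unsigned long nr_demote = 20;   /* analogous to the demote_store() request */
        int err = 0;

        while (nr_demote) {
                long rc = demote_batch(nr_demote);

                if (rc < 0) {           /* abort on error, as in the patched loop */
                        err = (int)rc;
                        break;
                }
                nr_demote -= rc;        /* credit the whole batch, not one folio */
        }
        printf("err=%d remaining=%lu\n", err, nr_demote);
        return 0;
}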
