
Commit 290408d

Naoya Horiguchi authored and Andi Kleen committed
hugetlb: hugepage migration core
This patch extends the page migration code to support hugepage migration. One of the potential users of this feature is soft offlining, which is triggered by corrected memory errors (added by the next patch).

Todo:
- There are other users of page migration such as memory policy, memory hotplug and memory compaction. They are not ready for hugepage support for now.

ChangeLog since v4:
- define migrate_huge_pages()
- remove changes on isolation/putback_lru_page()

ChangeLog since v2:
- refactor isolate/putback_lru_page() to handle hugepage
- add comment about race on unmap_and_move_huge_page()

ChangeLog since v1:
- divide migration code path for hugepage
- define routine checking migration swap entry for hugetlb
- replace "goto" with "if/else" in remove_migration_pte()

Signed-off-by: Naoya Horiguchi <[email protected]>
Signed-off-by: Jun'ichi Nomura <[email protected]>
Acked-by: Mel Gorman <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
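For orientation, below is a minimal sketch of how a user such as the soft-offline path could drive the new interface: place the hugepage head on a private list and hand it to migrate_huge_pages() together with an allocator callback. The allocator (new_hugepage) is hypothetical; the real callback only arrives with the follow-up soft-offline patch, and reference handling is simplified here.

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/list.h>

/*
 * Hypothetical allocator matching new_page_t; a real user must return a
 * freshly allocated destination hugepage (provided by the next patch).
 */
static struct page *new_hugepage(struct page *page, unsigned long private,
				 int **result)
{
	return NULL;	/* placeholder */
}

/* Sketch: isolate one hugepage head and ask for it to be migrated. */
static int migrate_one_hugepage(struct page *hpage)
{
	LIST_HEAD(pagelist);

	get_page(hpage);			/* hold a reference across migration */
	list_add(&hpage->lru, &pagelist);	/* migrate_huge_pages() walks ->lru */

	/*
	 * Returns 0 on success, a negative errno on hard failure, otherwise
	 * the number of hugepages that could not be migrated.
	 */
	return migrate_huge_pages(&pagelist, new_hugepage, 0, 0);
}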
1 parent 0ebabb4 commit 290408d

4 files changed: 262 additions, 19 deletions

fs/hugetlbfs/inode.c

Lines changed: 15 additions & 0 deletions
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>

 #include <asm/uaccess.h>

@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }

+static int hugetlbfs_migrate_page(struct address_space *mapping,
+				struct page *newpage, struct page *page)
+{
+	int rc;
+
+	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+	if (rc)
+		return rc;
+	migrate_page_copy(newpage, page);
+
+	return 0;
+}
+
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 	.write_begin	= hugetlbfs_write_begin,
 	.write_end	= hugetlbfs_write_end,
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
+	.migratepage    = hugetlbfs_migrate_page,
 };

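The new .migratepage method is reached through move_to_new_page() in mm/migrate.c once a hugepage has been unmapped. The following is a condensed paraphrase of that dispatch, for orientation only; it approximates the 2.6.36-era code and is not part of this diff.

/* Condensed paraphrase of the dispatch inside move_to_new_page(). */
static int dispatch_migratepage(struct page *newpage, struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (!mapping)
		return migrate_page(mapping, newpage, page);
	if (mapping->a_ops->migratepage)
		/* hugetlbfs pages now take this branch via hugetlbfs_migrate_page() */
		return mapping->a_ops->migratepage(mapping, newpage, page);
	return fallback_migrate_page(mapping, newpage, page);	/* generic buffer path */
}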

include/linux/migrate.h

Lines changed: 16 additions & 0 deletions
@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
 extern int migrate_pages(struct list_head *l, new_page_t x,
 			unsigned long private, int offlining);
+extern int migrate_huge_pages(struct list_head *l, new_page_t x,
+			unsigned long private, int offlining);

 extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
@@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
 extern int migrate_vmas(struct mm_struct *mm,
 		const nodemask_t *from, const nodemask_t *to,
 		unsigned long flags);
+extern void migrate_page_copy(struct page *newpage, struct page *page);
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+				struct page *newpage, struct page *page);
 #else
 #define PAGE_MIGRATION 0

 static inline void putback_lru_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private, int offlining) { return -ENOSYS; }
+static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
+		unsigned long private, int offlining) { return -ENOSYS; }

 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
 	return -ENOSYS;
 }

+static inline void migrate_page_copy(struct page *newpage,
+				     struct page *page) {}
+
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+				struct page *newpage, struct page *page)
+{
+	return -ENOSYS;
+}
+
 /* Possible settings for the migrate_page() method in address_operations */
 #define migrate_page NULL
 #define fail_migrate_page NULL

mm/hugetlb.c

Lines changed: 17 additions & 1 deletion
@@ -2217,6 +2217,19 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	return -ENOMEM;
 }

+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp)) {
+		return 1;
+	} else
+		return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
@@ -2648,7 +2661,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+		if (unlikely(is_hugetlb_entry_migration(entry))) {
+			migration_entry_wait(mm, (pmd_t *)ptep, address);
+			return 0;
+		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON;
 	}
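The hugetlb_fault() hunk above relies on migration swap entries: while a page is under migration, its PTE is replaced by a non-present, swap-format entry that records the old page, and faulting threads wait on it via migration_entry_wait(). A small illustrative sketch using the generic swapops helpers follows; it is not part of this patch and is simplified to base-page PTE helpers.

#include <linux/swapops.h>
#include <linux/mm.h>

/* Sketch: how a migration entry is encoded and later recognised. */
static void migration_entry_sketch(struct page *page, pte_t pte)
{
	/* encode: try_to_unmap() installs the equivalent of this entry */
	swp_entry_t entry = make_migration_entry(page, 0);	/* 0 == read-only */
	pte_t migration_pte = swp_entry_to_pte(entry);

	/* decode: a faulting thread can tell which page is being migrated */
	if (is_swap_pte(pte)) {
		swp_entry_t e = pte_to_swp_entry(pte);

		if (non_swap_entry(e) && is_migration_entry(e))
			page = migration_entry_to_page(e);
	}
	(void)migration_pte;
	(void)page;
}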

mm/migrate.c

Lines changed: 214 additions & 18 deletions
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/hugetlb.h>
 #include <linux/gfp.h>

 #include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte_t *ptep, pte;
 	spinlock_t *ptl;

-	pgd = pgd_offset(mm, addr);
-	if (!pgd_present(*pgd))
-		goto out;
+	if (unlikely(PageHuge(new))) {
+		ptep = huge_pte_offset(mm, addr);
+		if (!ptep)
+			goto out;
+		ptl = &mm->page_table_lock;
+	} else {
+		pgd = pgd_offset(mm, addr);
+		if (!pgd_present(*pgd))
+			goto out;

-	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud))
-		goto out;
+		pud = pud_offset(pgd, addr);
+		if (!pud_present(*pud))
+			goto out;

-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
-		goto out;
+		pmd = pmd_offset(pud, addr);
+		if (!pmd_present(*pmd))
+			goto out;

-	ptep = pte_offset_map(pmd, addr);
+		ptep = pte_offset_map(pmd, addr);

-	if (!is_swap_pte(*ptep)) {
-		pte_unmap(ptep);
-		goto out;
-	}
+		if (!is_swap_pte(*ptep)) {
+			pte_unmap(ptep);
+			goto out;
+		}
+
+		ptl = pte_lockptr(mm, pmd);
+	}

-	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
 	if (!is_swap_pte(pte))
@@ -130,10 +139,17 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
+	if (PageHuge(new))
+		pte = pte_mkhuge(pte);
 	flush_cache_page(vma, addr, pte_pfn(pte));
 	set_pte_at(mm, addr, ptep, pte);

-	if (PageAnon(new))
+	if (PageHuge(new)) {
+		if (PageAnon(new))
+			hugepage_add_anon_rmap(new, vma, addr);
+		else
+			page_dup_rmap(new);
+	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
 		page_add_file_rmap(new);
@@ -275,12 +291,60 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	return 0;
 }

+/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+				   struct page *newpage, struct page *page)
+{
+	int expected_count;
+	void **pslot;
+
+	if (!mapping) {
+		if (page_count(page) != 1)
+			return -EAGAIN;
+		return 0;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
+
+	expected_count = 2 + page_has_private(page);
+	if (page_count(page) != expected_count ||
+	    (struct page *)radix_tree_deref_slot(pslot) != page) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	if (!page_freeze_refs(page, expected_count)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	get_page(newpage);
+
+	radix_tree_replace_slot(pslot, newpage);
+
+	page_unfreeze_refs(page, expected_count);
+
+	__put_page(page);
+
+	spin_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
 /*
  * Copy the page to its new location
  */
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	copy_highpage(newpage, page);
+	if (PageHuge(page))
+		copy_huge_page(newpage, page);
+	else
+		copy_highpage(newpage, page);

 	if (PageError(page))
 		SetPageError(newpage);
@@ -723,6 +787,92 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	return rc;
 }

+/*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				unsigned long private, struct page *hpage,
+				int force, int offlining)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *new_hpage = get_new_page(hpage, private, &result);
+	int rcu_locked = 0;
+	struct anon_vma *anon_vma = NULL;
+
+	if (!new_hpage)
+		return -ENOMEM;
+
+	rc = -EAGAIN;
+
+	if (!trylock_page(hpage)) {
+		if (!force)
+			goto out;
+		lock_page(hpage);
+	}
+
+	if (PageAnon(hpage)) {
+		rcu_read_lock();
+		rcu_locked = 1;
+
+		if (page_mapped(hpage)) {
+			anon_vma = page_anon_vma(hpage);
+			atomic_inc(&anon_vma->external_refcount);
+		}
+	}
+
+	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+	if (!page_mapped(hpage))
+		rc = move_to_new_page(new_hpage, hpage, 1);
+
+	if (rc)
+		remove_migration_ptes(hpage, hpage);
+
+	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+					    &anon_vma->lock)) {
+		int empty = list_empty(&anon_vma->head);
+		spin_unlock(&anon_vma->lock);
+		if (empty)
+			anon_vma_free(anon_vma);
+	}
+
+	if (rcu_locked)
+		rcu_read_unlock();
+out:
+	unlock_page(hpage);
+
+	if (rc != -EAGAIN) {
+		list_del(&hpage->lru);
+		put_page(hpage);
+	}
+
+	put_page(new_hpage);
+
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(new_hpage);
+	}
+	return rc;
+}
+
 /*
  * migrate_pages
  *
@@ -788,6 +938,52 @@ int migrate_pages(struct list_head *from,
 	return nr_failed + retry;
 }

+int migrate_huge_pages(struct list_head *from,
+		new_page_t get_new_page, unsigned long private, int offlining)
+{
+	int retry = 1;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int rc;
+
+	for (pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			cond_resched();
+
+			rc = unmap_and_move_huge_page(get_new_page,
+					private, page, pass > 2, offlining);
+
+			switch(rc) {
+			case -ENOMEM:
+				goto out;
+			case -EAGAIN:
+				retry++;
+				break;
+			case 0:
+				break;
+			default:
+				/* Permanent failure */
+				nr_failed++;
+				break;
+			}
+		}
+	}
+	rc = 0;
+out:
+
+	list_for_each_entry_safe(page, page2, from, lru)
+		put_page(page);
+
+	if (rc)
+		return rc;
+
+	return nr_failed + retry;
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Move a list of individual pages
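To make the reference-count reasoning in the comment above unmap_and_move_huge_page() concrete, the arithmetic is spelled out below. It assumes 4KB base pages and 2MB hugepages; the numbers are illustrative and not taken from this patch.

/*
 * subpages per 2MB hugepage         = 2MB / 4KB = 512
 * references held by direct I/O     = up to ~512, all accounted on the
 *                                     head page ("512 and a bit more")
 * expected_count in
 * migrate_huge_page_move_mapping()  = 2 + page_has_private(page)
 *                                     (page cache ref + the migrator's ref)
 *
 * A hugepage with direct I/O in flight therefore carries far more references
 * than expected_count, the page_count()/page_freeze_refs() checks fail, and
 * the migration attempt returns -EAGAIN instead of racing with the I/O.
 */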
