Skip to content

Commit 15995a3

Browse files
Sourav Panda authored and akpm00
committed
mm: report per-page metadata information
Today, we do not have any observability of per-page metadata and how much it takes away from the machine capacity. Thus, we want to describe the amount of memory that is going towards per-page metadata, which can vary depending on build configuration, machine architecture, and system use. This patch adds 2 fields to /proc/vmstat that can be used as shown below: Accounting per-page metadata allocated by boot-allocator: /proc/vmstat:nr_memmap_boot * PAGE_SIZE Accounting per-page metadata allocated by buddy-allocator: /proc/vmstat:nr_memmap * PAGE_SIZE Accounting total per-page metadata allocated on the machine: (/proc/vmstat:nr_memmap_boot + /proc/vmstat:nr_memmap) * PAGE_SIZE Utility for userspace: Observability: Describe the amount of memory overhead that is going to per-page metadata on the system at any given time since this overhead is not currently observable. Debugging: Tracking the changes or absolute value in struct pages can help detect anomalies as they can be correlated with other metrics in the machine (e.g., memtotal, number of huge pages, etc.). page_ext overheads: Some kernel features such as page_owner and page_table_check that use page_ext can be optionally enabled via kernel parameters. Having the total per-page metadata information helps users precisely measure impact. Furthermore, page-metadata metrics will reflect the amount of struct pages relinquished (or overhead reduced) when hugetlbfs pages are reserved, which will vary depending on whether hugetlb vmemmap optimization is enabled or not. 
For background and results see: lore.kernel.org/all/[email protected] Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Sourav Panda <[email protected]> Acked-by: David Rientjes <[email protected]> Reviewed-by: Pasha Tatashin <[email protected]> Cc: Alexey Dobriyan <[email protected]> Cc: Bjorn Helgaas <[email protected]> Cc: Chen Linxuan <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Greg Kroah-Hartman <[email protected]> Cc: Ivan Babrou <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: Jonathan Corbet <[email protected]> Cc: Kefeng Wang <[email protected]> Cc: Kirill A. Shutemov <[email protected]> Cc: Liam R. Howlett <[email protected]> Cc: Mike Kravetz <[email protected]> Cc: Mike Rapoport (IBM) <[email protected]> Cc: Muchun Song <[email protected]> Cc: "Rafael J. Wysocki" <[email protected]> Cc: Randy Dunlap <[email protected]> Cc: Shakeel Butt <[email protected]> Cc: Suren Baghdasaryan <[email protected]> Cc: Tomas Mudrunka <[email protected]> Cc: Vlastimil Babka <[email protected]> Cc: Wei Xu <[email protected]> Cc: Yang Yang <[email protected]> Cc: Yosry Ahmed <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent 8192bc0 commit 15995a3

File tree

9 files changed

+85
-15
lines changed

9 files changed

+85
-15
lines changed

include/linux/mmzone.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,8 @@ enum node_stat_item {
220220
PGDEMOTE_KSWAPD,
221221
PGDEMOTE_DIRECT,
222222
PGDEMOTE_KHUGEPAGED,
223+
NR_MEMMAP, /* page metadata allocated through buddy allocator */
224+
NR_MEMMAP_BOOT, /* page metadata allocated through boot allocator */
223225
NR_VM_NODE_STAT_ITEMS
224226
};
225227

include/linux/vmstat.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,4 +624,8 @@ static inline void lruvec_stat_sub_folio(struct folio *folio,
624624
{
625625
lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
626626
}
627+
628+
void __meminit mod_node_early_perpage_metadata(int nid, long delta);
629+
void __meminit store_early_perpage_metadata(void);
630+
627631
#endif /* _LINUX_VMSTAT_H */

mm/hugetlb_vmemmap.c

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -184,10 +184,13 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
184184
*/
185185
static inline void free_vmemmap_page(struct page *page)
186186
{
187-
if (PageReserved(page))
187+
if (PageReserved(page)) {
188188
free_bootmem_page(page);
189-
else
189+
mod_node_page_state(page_pgdat(page), NR_MEMMAP_BOOT, -1);
190+
} else {
190191
__free_page(page);
192+
mod_node_page_state(page_pgdat(page), NR_MEMMAP, -1);
193+
}
191194
}
192195

193196
/* Free a list of the vmemmap pages */
@@ -338,6 +341,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
338341
copy_page(page_to_virt(walk.reuse_page),
339342
(void *)walk.reuse_addr);
340343
list_add(&walk.reuse_page->lru, vmemmap_pages);
344+
mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, 1);
341345
}
342346

343347
/*
@@ -384,14 +388,19 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
384388
unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
385389
int nid = page_to_nid((struct page *)start);
386390
struct page *page, *next;
391+
int i;
387392

388-
while (nr_pages--) {
393+
for (i = 0; i < nr_pages; i++) {
389394
page = alloc_pages_node(nid, gfp_mask, 0);
390-
if (!page)
395+
if (!page) {
396+
mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, i);
391397
goto out;
398+
}
392399
list_add(&page->lru, list);
393400
}
394401

402+
mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages);
403+
395404
return 0;
396405
out:
397406
list_for_each_entry_safe(page, next, list, lru)

mm/mm_init.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <linux/cma.h>
3030
#include <linux/crash_dump.h>
3131
#include <linux/execmem.h>
32+
#include <linux/vmstat.h>
3233
#include "internal.h"
3334
#include "slab.h"
3435
#include "shuffle.h"
@@ -1618,6 +1619,8 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
16181619
panic("Failed to allocate %ld bytes for node %d memory map\n",
16191620
size, pgdat->node_id);
16201621
pgdat->node_mem_map = map + offset;
1622+
mod_node_early_perpage_metadata(pgdat->node_id,
1623+
DIV_ROUND_UP(size, PAGE_SIZE));
16211624
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
16221625
__func__, pgdat->node_id, (unsigned long)pgdat,
16231626
(unsigned long)pgdat->node_mem_map);

mm/page_alloc.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5738,6 +5738,7 @@ void __init setup_per_cpu_pageset(void)
57385738
for_each_online_pgdat(pgdat)
57395739
pgdat->per_cpu_nodestats =
57405740
alloc_percpu(struct per_cpu_nodestat);
5741+
store_early_perpage_metadata();
57415742
}
57425743

57435744
__meminit void zone_pcp_init(struct zone *zone)

mm/page_ext.c

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,8 @@ static int __init alloc_node_page_ext(int nid)
214214
return -ENOMEM;
215215
NODE_DATA(nid)->node_page_ext = base;
216216
total_usage += table_size;
217+
mod_node_page_state(NODE_DATA(nid), NR_MEMMAP_BOOT,
218+
DIV_ROUND_UP(table_size, PAGE_SIZE));
217219
return 0;
218220
}
219221

@@ -268,12 +270,15 @@ static void *__meminit alloc_page_ext(size_t size, int nid)
268270
void *addr = NULL;
269271

270272
addr = alloc_pages_exact_nid(nid, size, flags);
271-
if (addr) {
273+
if (addr)
272274
kmemleak_alloc(addr, size, 1, flags);
273-
return addr;
274-
}
275+
else
276+
addr = vzalloc_node(size, nid);
275277

276-
addr = vzalloc_node(size, nid);
278+
if (addr) {
279+
mod_node_page_state(NODE_DATA(nid), NR_MEMMAP,
280+
DIV_ROUND_UP(size, PAGE_SIZE));
281+
}
277282

278283
return addr;
279284
}
@@ -316,18 +321,27 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
316321

317322
static void free_page_ext(void *addr)
318323
{
324+
size_t table_size;
325+
struct page *page;
326+
struct pglist_data *pgdat;
327+
328+
table_size = page_ext_size * PAGES_PER_SECTION;
329+
319330
if (is_vmalloc_addr(addr)) {
331+
page = vmalloc_to_page(addr);
332+
pgdat = page_pgdat(page);
320333
vfree(addr);
321334
} else {
322-
struct page *page = virt_to_page(addr);
323-
size_t table_size;
324-
325-
table_size = page_ext_size * PAGES_PER_SECTION;
326-
335+
page = virt_to_page(addr);
336+
pgdat = page_pgdat(page);
327337
BUG_ON(PageReserved(page));
328338
kmemleak_free(addr);
329339
free_pages_exact(addr, table_size);
330340
}
341+
342+
mod_node_page_state(pgdat, NR_MEMMAP,
343+
-1L * (DIV_ROUND_UP(table_size, PAGE_SIZE)));
344+
331345
}
332346

333347
static void __free_page_ext(unsigned long pfn)

mm/sparse-vmemmap.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,5 +469,13 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
469469
if (r < 0)
470470
return NULL;
471471

472+
if (system_state == SYSTEM_BOOTING) {
473+
mod_node_early_perpage_metadata(nid, DIV_ROUND_UP(end - start,
474+
PAGE_SIZE));
475+
} else {
476+
mod_node_page_state(NODE_DATA(nid), NR_MEMMAP,
477+
DIV_ROUND_UP(end - start, PAGE_SIZE));
478+
}
479+
472480
return pfn_to_page(pfn);
473481
}

mm/sparse.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
#include <linux/swap.h>
1515
#include <linux/swapops.h>
1616
#include <linux/bootmem_info.h>
17-
17+
#include <linux/vmstat.h>
1818
#include "internal.h"
1919
#include <asm/dma.h>
2020

@@ -465,6 +465,9 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
465465
*/
466466
sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
467467
sparsemap_buf_end = sparsemap_buf + size;
468+
#ifndef CONFIG_SPARSEMEM_VMEMMAP
469+
mod_node_early_perpage_metadata(nid, DIV_ROUND_UP(size, PAGE_SIZE));
470+
#endif
468471
}
469472

470473
static void __init sparse_buffer_fini(void)
@@ -643,6 +646,8 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
643646
unsigned long start = (unsigned long) pfn_to_page(pfn);
644647
unsigned long end = start + nr_pages * sizeof(struct page);
645648

649+
mod_node_page_state(page_pgdat(pfn_to_page(pfn)), NR_MEMMAP,
650+
-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE)));
646651
vmemmap_free(start, end, altmap);
647652
}
648653
static void free_map_bootmem(struct page *memmap)

mm/vmstat.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1255,7 +1255,8 @@ const char * const vmstat_text[] = {
12551255
"pgdemote_kswapd",
12561256
"pgdemote_direct",
12571257
"pgdemote_khugepaged",
1258-
1258+
"nr_memmap",
1259+
"nr_memmap_boot",
12591260
/* enum writeback_stat_item counters */
12601261
"nr_dirty_threshold",
12611262
"nr_dirty_background_threshold",
@@ -2282,4 +2283,27 @@ static int __init extfrag_debug_init(void)
22822283
}
22832284

22842285
module_init(extfrag_debug_init);
2286+
22852287
#endif
2288+
2289+
/*
2290+
* Page metadata size (struct page and page_ext) in pages
2291+
*/
2292+
static unsigned long early_perpage_metadata[MAX_NUMNODES] __meminitdata;
2293+
2294+
void __meminit mod_node_early_perpage_metadata(int nid, long delta)
2295+
{
2296+
early_perpage_metadata[nid] += delta;
2297+
}
2298+
2299+
void __meminit store_early_perpage_metadata(void)
2300+
{
2301+
int nid;
2302+
struct pglist_data *pgdat;
2303+
2304+
for_each_online_pgdat(pgdat) {
2305+
nid = pgdat->node_id;
2306+
mod_node_page_state(NODE_DATA(nid), NR_MEMMAP_BOOT,
2307+
early_perpage_metadata[nid]);
2308+
}
2309+
}

0 commit comments

Comments
 (0)