From c44fa9355bb8a5f0315c49ed56bc44799e1fd84f Mon Sep 17 00:00:00 2001 From: Jay Wang Date: Fri, 19 Apr 2024 12:16:41 +0000 Subject: [PATCH] vppinfra: fix huge page alloc error on 5.19+ kernel Running VPP on a NUMA system with a 5.19+ kernel outputs the following error messages. The 'show physmem' command confirms that VPP falls back to using normal 4K pages instead of the preallocated 1G huge pages. The root cause is that VPP uses move_pages()[1] to get the huge page node information. However, this misbehaves on the 5.19+ kernel due to changes introduced in its implementation[2]. Our proposed fix is to retry obtaining NUMA node info with get_mempolicy()[3] only if we see -ENOENT returned in status from move_pages() and huge pages are used. Additionally, we use mincore()[4] to check if pages are allocated and in memory to avoid the possibility of get_mempolicy() falsely allocating a new page. buffer [warn ]: numa[1] falling back to non-hugepage backed buffer pool () vpp# show physmem used-pages 2 reserved-pages 16 default-page-size 1G lookup-page-size 4K arena 'buffers-numa-0' pages 1 subpage-size 1G numa-node 0 shared fd 5 arena 'buffers-numa-1' pages 1 subpage-size 4K numa-node 1 shared fd 6 [1] https://man7.org/linux/man-pages/man2/move_pages.2.html [2] https://lore.kernel.org/linux-mm/91da2c3b-96f1-bb03-8fff-4c38f31cb9be@huawei.com/ [3] https://man7.org/linux/man-pages/man2/get_mempolicy.2.html [4] https://man7.org/linux/man-pages/man2/mincore.2.html Type: fix Signed-off-by: Jay Wang Change-Id: Ia423745423bb080404292333ef95455a4950ce0a --- src/vppinfra/linux/mem.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/vppinfra/linux/mem.c b/src/vppinfra/linux/mem.c index 734f5c4788c..21aaa55fc00 100644 --- a/src/vppinfra/linux/mem.c +++ b/src/vppinfra/linux/mem.c @@ -530,6 +530,7 @@ clib_mem_get_page_stats (void *start, clib_mem_page_sz_t log2_page_size, { int i, *status = 0; void **ptr = 0; + unsigned char incore; log2_page_size = 
clib_mem_log2_page_size_validate (log2_page_size); @@ -551,6 +552,19 @@ clib_mem_get_page_stats (void *start, clib_mem_page_sz_t log2_page_size, for (i = 0; i < n_pages; i++) { + /* move_pages() returns -ENOENT in status for huge pages on 5.19+ kernel. + * Retry with get_mempolicy() to obtain NUMA node info only if the pages + * are allocated and in memory, which is checked by mincore(). */ + if (status[i] == -ENOENT && + syscall (__NR_mincore, ptr[i], 1, &incore) == 0 && (incore & 1) != 0) + { + if (syscall (__NR_get_mempolicy, &status[i], 0, 0, ptr[i], + MPOL_F_NODE | MPOL_F_ADDR) != 0) + { + /* if get_mempolicy fails, keep the original value in status. NOTE(review): the original value was -ENOENT, but -ENONET is stored here; looks like a typo, confirm */ + status[i] = -ENONET; + } + } if (status[i] >= 0 && status[i] < CLIB_MAX_NUMAS) { stats->mapped++;