[dpdk-dev] [PATCH v9 1/2] mem: balanced allocation of hugepages
Ilya Maximets
i.maximets at samsung.com
Thu Jun 29 07:48:28 CEST 2017
On 29.06.2017 08:32, Hemant Agrawal wrote:
> On 6/27/2017 3:54 PM, Ilya Maximets wrote:
>> Currently EAL allocates hugepages one by one, not paying attention
>> to which NUMA node each allocation was done from.
>>
>> Such behaviour leads to allocation failure if the number of available
>> hugepages for the application is limited by cgroups or hugetlbfs and
>> memory is requested not only from the first socket.
>>
>> Example:
>> # 90 x 1GB hugepages available in a system
>>
>> cgcreate -g hugetlb:/test
>> # Limit to 32GB of hugepages
>> cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test
>> # Request 4GB from each of 2 sockets
>> cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ...
>>
>> EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB
>> EAL: 32 not 90 hugepages of size 1024 MB allocated
>> EAL: Not enough memory available on socket 1!
>> Requested: 4096MB, available: 0MB
>> PANIC in rte_eal_init():
>> Cannot init memory
>>
>> This happens because all allocated pages are
>> on socket 0.
>>
>> Fix this issue by setting mempolicy MPOL_PREFERRED for each hugepage
>> to one of the requested nodes using the following scheme:
>>
>> 1) Allocate essential hugepages:
>> 1.1) Allocate only as many hugepages from numa N as
>> needed to fit the requested memory for this numa.
>> 1.2) repeat 1.1 for all numa nodes.
>> 2) Try to map all remaining free hugepages in a round-robin
>> fashion.
>> 3) Sort pages and choose the most suitable.
>>
>> In this case all essential memory will be allocated and all remaining
>> pages will be fairly distributed between all requested nodes.
>>
>> New config option RTE_EAL_NUMA_AWARE_HUGEPAGES is introduced and
>> enabled by default for linuxapp except armv7 and dpaa2.
>> Enabling this option adds libnuma as a dependency for EAL.
>>
>> Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages")
>>
>> Signed-off-by: Ilya Maximets <i.maximets at samsung.com>
>> ---
>> config/common_base | 1 +
>> config/common_linuxapp | 1 +
>> config/defconfig_arm-armv7a-linuxapp-gcc | 3 +
>> config/defconfig_arm64-dpaa2-linuxapp-gcc | 3 +
>> lib/librte_eal/linuxapp/eal/Makefile | 3 +
>> lib/librte_eal/linuxapp/eal/eal_memory.c | 120 ++++++++++++++++++++++++++++--
>> mk/rte.app.mk | 3 +
>> 7 files changed, 126 insertions(+), 8 deletions(-)
>>
>> diff --git a/config/common_base b/config/common_base
>> index f6aafd1..660588a 100644
>> --- a/config/common_base
>> +++ b/config/common_base
>> @@ -103,6 +103,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
>> CONFIG_RTE_EAL_IGB_UIO=n
>> CONFIG_RTE_EAL_VFIO=n
>> CONFIG_RTE_MALLOC_DEBUG=n
>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>>
>> #
>> # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
>> diff --git a/config/common_linuxapp b/config/common_linuxapp
>> index b3cf41b..64bef87 100644
>> --- a/config/common_linuxapp
>> +++ b/config/common_linuxapp
>> @@ -35,6 +35,7 @@
>> CONFIG_RTE_EXEC_ENV="linuxapp"
>> CONFIG_RTE_EXEC_ENV_LINUXAPP=y
>>
>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
>> CONFIG_RTE_EAL_IGB_UIO=y
>> CONFIG_RTE_EAL_VFIO=y
>> CONFIG_RTE_KNI_KMOD=y
>> diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc b/config/defconfig_arm-armv7a-linuxapp-gcc
>> index 19607eb..e06b1d4 100644
>> --- a/config/defconfig_arm-armv7a-linuxapp-gcc
>> +++ b/config/defconfig_arm-armv7a-linuxapp-gcc
>> @@ -47,6 +47,9 @@ CONFIG_RTE_ARCH_STRICT_ALIGN=y
>> CONFIG_RTE_TOOLCHAIN="gcc"
>> CONFIG_RTE_TOOLCHAIN_GCC=y
>>
>> +# NUMA is not supported on ARM
>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>> +
>> # ARM doesn't have support for vmware TSC map
>> CONFIG_RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT=n
>>
>> diff --git a/config/defconfig_arm64-dpaa2-linuxapp-gcc b/config/defconfig_arm64-dpaa2-linuxapp-gcc
>> index 2304ab6..f78449d 100644
>> --- a/config/defconfig_arm64-dpaa2-linuxapp-gcc
>> +++ b/config/defconfig_arm64-dpaa2-linuxapp-gcc
>> @@ -45,6 +45,9 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
>>
>> CONFIG_RTE_PKTMBUF_HEADROOM=256
>>
>> +# Doesn't support NUMA
>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
>> +
>
> DPAA2 does not support NUMA so,
> CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
Oh, sorry. Just a typo.
Thanks for catching this.
Sergio, I'll send v10 with only this change and will keep your
acked-by because the change is trivial.
>> #
>> # Compile Support Libraries for DPAA2
>> #
>> diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
>> index 640afd0..8651e27 100644
>> --- a/lib/librte_eal/linuxapp/eal/Makefile
>> +++ b/lib/librte_eal/linuxapp/eal/Makefile
>> @@ -50,6 +50,9 @@ LDLIBS += -ldl
>> LDLIBS += -lpthread
>> LDLIBS += -lgcc_s
>> LDLIBS += -lrt
>> +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y)
>> +LDLIBS += -lnuma
>> +endif
>>
>> # specific to linuxapp exec-env
>> SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c
>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> index e17c9cb..647d89c 100644
>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> @@ -54,6 +54,10 @@
>> #include <sys/time.h>
>> #include <signal.h>
>> #include <setjmp.h>
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> +#include <numa.h>
>> +#include <numaif.h>
>> +#endif
>>
>> #include <rte_log.h>
>> #include <rte_memory.h>
>> @@ -348,6 +352,14 @@ static int huge_wrap_sigsetjmp(void)
>> return sigsetjmp(huge_jmpenv, 1);
>> }
>>
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> +/* Callback for numa library. */
>> +void numa_error(char *where)
>> +{
>> + RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno));
>> +}
>> +#endif
>> +
>> /*
>> * Mmap all hugepages of hugepage table: it first open a file in
>> * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
>> @@ -356,18 +368,78 @@ static int huge_wrap_sigsetjmp(void)
>> * map continguous physical blocks in contiguous virtual blocks.
>> */
>> static unsigned
>> -map_all_hugepages(struct hugepage_file *hugepg_tbl,
>> - struct hugepage_info *hpi, int orig)
>> +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
>> + uint64_t *essential_memory __rte_unused, int orig)
>> {
>> int fd;
>> unsigned i;
>> void *virtaddr;
>> void *vma_addr = NULL;
>> size_t vma_len = 0;
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> + int node_id = -1;
>> + int essential_prev = 0;
>> + int oldpolicy;
>> + struct bitmask *oldmask = numa_allocate_nodemask();
>> + bool have_numa = true;
>> + unsigned long maxnode = 0;
>> +
>> + /* Check if kernel supports NUMA. */
>> + if (numa_available() != 0) {
>> + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
>> + have_numa = false;
>> + }
>> +
>> + if (orig && have_numa) {
>> + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
>> + if (get_mempolicy(&oldpolicy, oldmask->maskp,
>> + oldmask->size + 1, 0, 0) < 0) {
>> + RTE_LOG(ERR, EAL,
>> + "Failed to get current mempolicy: %s. "
>> + "Assuming MPOL_DEFAULT.\n", strerror(errno));
>> + oldpolicy = MPOL_DEFAULT;
>> + }
>> + for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
>> + if (internal_config.socket_mem[i])
>> + maxnode = i + 1;
>> + }
>> +#endif
>>
>> for (i = 0; i < hpi->num_pages[0]; i++) {
>> uint64_t hugepage_sz = hpi->hugepage_sz;
>>
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> + if (maxnode) {
>> + unsigned int j;
>> +
>> + for (j = 0; j < maxnode; j++)
>> + if (essential_memory[j])
>> + break;
>> +
>> + if (j == maxnode) {
>> + node_id = (node_id + 1) % maxnode;
>> + while (!internal_config.socket_mem[node_id]) {
>> + node_id++;
>> + node_id %= maxnode;
>> + }
>> + essential_prev = 0;
>> + } else {
>> + node_id = j;
>> + essential_prev = essential_memory[j];
>> +
>> + if (essential_memory[j] < hugepage_sz)
>> + essential_memory[j] = 0;
>> + else
>> + essential_memory[j] -= hugepage_sz;
>> + }
>> +
>> + RTE_LOG(DEBUG, EAL,
>> + "Setting policy MPOL_PREFERRED for socket %d\n",
>> + node_id);
>> + numa_set_preferred(node_id);
>> + }
>> +#endif
>> +
>> if (orig) {
>> hugepg_tbl[i].file_id = i;
>> hugepg_tbl[i].size = hugepage_sz;
>> @@ -422,7 +494,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>> if (fd < 0) {
>> RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
>> strerror(errno));
>> - return i;
>> + goto out;
>> }
>>
>> /* map the segment, and populate page tables,
>> @@ -433,7 +505,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>> RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
>> strerror(errno));
>> close(fd);
>> - return i;
>> + goto out;
>> }
>>
>> if (orig) {
>> @@ -458,7 +530,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>> munmap(virtaddr, hugepage_sz);
>> close(fd);
>> unlink(hugepg_tbl[i].filepath);
>> - return i;
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> + if (maxnode)
>> + essential_memory[node_id] =
>> + essential_prev;
>> +#endif
>> + goto out;
>> }
>> *(int *)virtaddr = 0;
>> }
>> @@ -469,7 +546,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>> RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
>> __func__, strerror(errno));
>> close(fd);
>> - return i;
>> + goto out;
>> }
>>
>> close(fd);
>> @@ -478,6 +555,22 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>> vma_len -= hugepage_sz;
>> }
>>
>> +out:
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> + if (maxnode) {
>> + RTE_LOG(DEBUG, EAL,
>> + "Restoring previous memory policy: %d\n", oldpolicy);
>> + if (oldpolicy == MPOL_DEFAULT) {
>> + numa_set_localalloc();
>> + } else if (set_mempolicy(oldpolicy, oldmask->maskp,
>> + oldmask->size + 1) < 0) {
>> + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
>> + strerror(errno));
>> + numa_set_localalloc();
>> + }
>> + }
>> + numa_free_cpumask(oldmask);
>> +#endif
>> return i;
>> }
>>
>> @@ -562,6 +655,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
>> if (hugepg_tbl[i].orig_va == va) {
>> hugepg_tbl[i].socket_id = socket_id;
>> hp_count++;
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> + RTE_LOG(DEBUG, EAL,
>> + "Hugepage %s is on socket %d\n",
>> + hugepg_tbl[i].filepath, socket_id);
>> +#endif
>> }
>> }
>> }
>> @@ -1000,6 +1098,11 @@ rte_eal_hugepage_init(void)
>>
>> huge_register_sigbus();
>>
>> + /* make a copy of socket_mem, needed for balanced allocation. */
>> + for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
>> + memory[i] = internal_config.socket_mem[i];
>> +
>> +
>> /* map all hugepages and sort them */
>> for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
>> unsigned pages_old, pages_new;
>> @@ -1017,7 +1120,8 @@ rte_eal_hugepage_init(void)
>>
>> /* map all hugepages available */
>> pages_old = hpi->num_pages[0];
>> - pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
>> + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi,
>> + memory, 1);
>> if (pages_new < pages_old) {
>> RTE_LOG(DEBUG, EAL,
>> "%d not %d hugepages of size %u MB allocated\n",
>> @@ -1060,7 +1164,7 @@ rte_eal_hugepage_init(void)
>> sizeof(struct hugepage_file), cmp_physaddr);
>>
>> /* remap all hugepages */
>> - if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
>> + if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) !=
>> hpi->num_pages[0]) {
>> RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
>> (unsigned)(hpi->hugepage_sz / 0x100000));
>> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
>> index bcaf1b3..4fe22d1 100644
>> --- a/mk/rte.app.mk
>> +++ b/mk/rte.app.mk
>> @@ -186,6 +186,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
>> # The static libraries do not know their dependencies.
>> # So linking with static library requires explicit dependencies.
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lrt
>> +ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP)$(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),yy)
>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lnuma
>> +endif
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lm
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrt
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lm
>>
>
>
>
>
>
More information about the dev
mailing list