[dpdk-dev] [PATCH v9 1/2] mem: balanced allocation of hugepages

Ilya Maximets i.maximets at samsung.com
Thu Jun 29 07:48:28 CEST 2017


On 29.06.2017 08:32, Hemant Agrawal wrote:
> On 6/27/2017 3:54 PM, Ilya Maximets wrote:
>> Currently EAL allocates hugepages one by one, not paying attention
>> to which NUMA node the allocation was done from.
>>
>> Such behaviour leads to allocation failure if the number of available
>> hugepages for the application is limited by cgroups or hugetlbfs and
>> memory is requested not only from the first socket.
>>
>> Example:
>>     # 90 x 1GB hugepages available in a system
>>
>>     cgcreate -g hugetlb:/test
>>     # Limit to 32GB of hugepages
>>     cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test
>>     # Request 4GB from each of 2 sockets
>>     cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ...
>>
>>     EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB
>>     EAL: 32 not 90 hugepages of size 1024 MB allocated
>>     EAL: Not enough memory available on socket 1!
>>          Requested: 4096MB, available: 0MB
>>     PANIC in rte_eal_init():
>>     Cannot init memory
>>
>>     This happens because all allocated pages are
>>     on socket 0.
>>
>> Fix this issue by setting mempolicy MPOL_PREFERRED for each hugepage
>> to one of the requested nodes using the following scheme:
>>
>>     1) Allocate essential hugepages:
>>         1.1) Allocate only as many hugepages from numa N as are
>>              needed to fit the requested memory for this numa.
>>         1.2) repeat 1.1 for all numa nodes.
>>     2) Try to map all remaining free hugepages in a round-robin
>>        fashion.
>>     3) Sort pages and choose the most suitable.
>>
>> In this case all essential memory will be allocated and all remaining
>> pages will be fairly distributed between all requested nodes.
>>
>> New config option RTE_EAL_NUMA_AWARE_HUGEPAGES is introduced and
>> enabled by default for linuxapp except armv7 and dpaa2.
>> Enabling this option adds libnuma as a dependency for EAL.
>>
>> Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages")
>>
>> Signed-off-by: Ilya Maximets <i.maximets at samsung.com>
>> ---
>>  config/common_base                        |   1 +
>>  config/common_linuxapp                    |   1 +
>>  config/defconfig_arm-armv7a-linuxapp-gcc  |   3 +
>>  config/defconfig_arm64-dpaa2-linuxapp-gcc |   3 +
>>  lib/librte_eal/linuxapp/eal/Makefile      |   3 +
>>  lib/librte_eal/linuxapp/eal/eal_memory.c  | 120 ++++++++++++++++++++++++++++--
>>  mk/rte.app.mk                             |   3 +
>>  7 files changed, 126 insertions(+), 8 deletions(-)
>>
>> diff --git a/config/common_base b/config/common_base
>> index f6aafd1..660588a 100644
>> --- a/config/common_base
>> +++ b/config/common_base
>> @@ -103,6 +103,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
>>  CONFIG_RTE_EAL_IGB_UIO=n
>>  CONFIG_RTE_EAL_VFIO=n
>>  CONFIG_RTE_MALLOC_DEBUG=n
>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>>
>>  #
>>  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
>> diff --git a/config/common_linuxapp b/config/common_linuxapp
>> index b3cf41b..64bef87 100644
>> --- a/config/common_linuxapp
>> +++ b/config/common_linuxapp
>> @@ -35,6 +35,7 @@
>>  CONFIG_RTE_EXEC_ENV="linuxapp"
>>  CONFIG_RTE_EXEC_ENV_LINUXAPP=y
>>
>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
>>  CONFIG_RTE_EAL_IGB_UIO=y
>>  CONFIG_RTE_EAL_VFIO=y
>>  CONFIG_RTE_KNI_KMOD=y
>> diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc b/config/defconfig_arm-armv7a-linuxapp-gcc
>> index 19607eb..e06b1d4 100644
>> --- a/config/defconfig_arm-armv7a-linuxapp-gcc
>> +++ b/config/defconfig_arm-armv7a-linuxapp-gcc
>> @@ -47,6 +47,9 @@ CONFIG_RTE_ARCH_STRICT_ALIGN=y
>>  CONFIG_RTE_TOOLCHAIN="gcc"
>>  CONFIG_RTE_TOOLCHAIN_GCC=y
>>
>> +# NUMA is not supported on ARM
>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>> +
>>  # ARM doesn't have support for vmware TSC map
>>  CONFIG_RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT=n
>>
>> diff --git a/config/defconfig_arm64-dpaa2-linuxapp-gcc b/config/defconfig_arm64-dpaa2-linuxapp-gcc
>> index 2304ab6..f78449d 100644
>> --- a/config/defconfig_arm64-dpaa2-linuxapp-gcc
>> +++ b/config/defconfig_arm64-dpaa2-linuxapp-gcc
>> @@ -45,6 +45,9 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
>>
>>  CONFIG_RTE_PKTMBUF_HEADROOM=256
>>
>> +# Doesn't support NUMA
>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
>> +
> 
> DPAA2 does not support NUMA so,
> CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n

Oh, sorry. Just a typo.
Thanks for catching this.


Sergio, I'll send v10 with only this change and will keep your
acked-by because the change is trivial.

>>  #
>>  # Compile Support Libraries for DPAA2
>>  #
>> diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
>> index 640afd0..8651e27 100644
>> --- a/lib/librte_eal/linuxapp/eal/Makefile
>> +++ b/lib/librte_eal/linuxapp/eal/Makefile
>> @@ -50,6 +50,9 @@ LDLIBS += -ldl
>>  LDLIBS += -lpthread
>>  LDLIBS += -lgcc_s
>>  LDLIBS += -lrt
>> +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y)
>> +LDLIBS += -lnuma
>> +endif
>>
>>  # specific to linuxapp exec-env
>>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c
>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> index e17c9cb..647d89c 100644
>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> @@ -54,6 +54,10 @@
>>  #include <sys/time.h>
>>  #include <signal.h>
>>  #include <setjmp.h>
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> +#include <numa.h>
>> +#include <numaif.h>
>> +#endif
>>
>>  #include <rte_log.h>
>>  #include <rte_memory.h>
>> @@ -348,6 +352,14 @@ static int huge_wrap_sigsetjmp(void)
>>      return sigsetjmp(huge_jmpenv, 1);
>>  }
>>
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> +/* Callback for numa library. */
>> +void numa_error(char *where)
>> +{
>> +    RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno));
>> +}
>> +#endif
>> +
>>  /*
>>   * Mmap all hugepages of hugepage table: it first open a file in
>>   * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
>> @@ -356,18 +368,78 @@ static int huge_wrap_sigsetjmp(void)
>>   * map continguous physical blocks in contiguous virtual blocks.
>>   */
>>  static unsigned
>> -map_all_hugepages(struct hugepage_file *hugepg_tbl,
>> -        struct hugepage_info *hpi, int orig)
>> +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
>> +          uint64_t *essential_memory __rte_unused, int orig)
>>  {
>>      int fd;
>>      unsigned i;
>>      void *virtaddr;
>>      void *vma_addr = NULL;
>>      size_t vma_len = 0;
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> +    int node_id = -1;
>> +    int essential_prev = 0;
>> +    int oldpolicy;
>> +    struct bitmask *oldmask = numa_allocate_nodemask();
>> +    bool have_numa = true;
>> +    unsigned long maxnode = 0;
>> +
>> +    /* Check if kernel supports NUMA. */
>> +    if (numa_available() != 0) {
>> +        RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
>> +        have_numa = false;
>> +    }
>> +
>> +    if (orig && have_numa) {
>> +        RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
>> +        if (get_mempolicy(&oldpolicy, oldmask->maskp,
>> +                  oldmask->size + 1, 0, 0) < 0) {
>> +            RTE_LOG(ERR, EAL,
>> +                "Failed to get current mempolicy: %s. "
>> +                "Assuming MPOL_DEFAULT.\n", strerror(errno));
>> +            oldpolicy = MPOL_DEFAULT;
>> +        }
>> +        for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
>> +            if (internal_config.socket_mem[i])
>> +                maxnode = i + 1;
>> +    }
>> +#endif
>>
>>      for (i = 0; i < hpi->num_pages[0]; i++) {
>>          uint64_t hugepage_sz = hpi->hugepage_sz;
>>
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> +        if (maxnode) {
>> +            unsigned int j;
>> +
>> +            for (j = 0; j < maxnode; j++)
>> +                if (essential_memory[j])
>> +                    break;
>> +
>> +            if (j == maxnode) {
>> +                node_id = (node_id + 1) % maxnode;
>> +                while (!internal_config.socket_mem[node_id]) {
>> +                    node_id++;
>> +                    node_id %= maxnode;
>> +                }
>> +                essential_prev = 0;
>> +            } else {
>> +                node_id = j;
>> +                essential_prev = essential_memory[j];
>> +
>> +                if (essential_memory[j] < hugepage_sz)
>> +                    essential_memory[j] = 0;
>> +                else
>> +                    essential_memory[j] -= hugepage_sz;
>> +            }
>> +
>> +            RTE_LOG(DEBUG, EAL,
>> +                "Setting policy MPOL_PREFERRED for socket %d\n",
>> +                node_id);
>> +            numa_set_preferred(node_id);
>> +        }
>> +#endif
>> +
>>          if (orig) {
>>              hugepg_tbl[i].file_id = i;
>>              hugepg_tbl[i].size = hugepage_sz;
>> @@ -422,7 +494,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>          if (fd < 0) {
>>              RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
>>                      strerror(errno));
>> -            return i;
>> +            goto out;
>>          }
>>
>>          /* map the segment, and populate page tables,
>> @@ -433,7 +505,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>              RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
>>                      strerror(errno));
>>              close(fd);
>> -            return i;
>> +            goto out;
>>          }
>>
>>          if (orig) {
>> @@ -458,7 +530,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>                  munmap(virtaddr, hugepage_sz);
>>                  close(fd);
>>                  unlink(hugepg_tbl[i].filepath);
>> -                return i;
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> +                if (maxnode)
>> +                    essential_memory[node_id] =
>> +                        essential_prev;
>> +#endif
>> +                goto out;
>>              }
>>              *(int *)virtaddr = 0;
>>          }
>> @@ -469,7 +546,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>              RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
>>                  __func__, strerror(errno));
>>              close(fd);
>> -            return i;
>> +            goto out;
>>          }
>>
>>          close(fd);
>> @@ -478,6 +555,22 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>          vma_len -= hugepage_sz;
>>      }
>>
>> +out:
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> +    if (maxnode) {
>> +        RTE_LOG(DEBUG, EAL,
>> +            "Restoring previous memory policy: %d\n", oldpolicy);
>> +        if (oldpolicy == MPOL_DEFAULT) {
>> +            numa_set_localalloc();
>> +        } else if (set_mempolicy(oldpolicy, oldmask->maskp,
>> +                     oldmask->size + 1) < 0) {
>> +            RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
>> +                strerror(errno));
>> +            numa_set_localalloc();
>> +        }
>> +    }
>> +    numa_free_cpumask(oldmask);
>> +#endif
>>      return i;
>>  }
>>
>> @@ -562,6 +655,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
>>              if (hugepg_tbl[i].orig_va == va) {
>>                  hugepg_tbl[i].socket_id = socket_id;
>>                  hp_count++;
>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>> +                RTE_LOG(DEBUG, EAL,
>> +                    "Hugepage %s is on socket %d\n",
>> +                    hugepg_tbl[i].filepath, socket_id);
>> +#endif
>>              }
>>          }
>>      }
>> @@ -1000,6 +1098,11 @@ rte_eal_hugepage_init(void)
>>
>>      huge_register_sigbus();
>>
>> +    /* make a copy of socket_mem, needed for balanced allocation. */
>> +    for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
>> +        memory[i] = internal_config.socket_mem[i];
>> +
>> +
>>      /* map all hugepages and sort them */
>>      for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
>>          unsigned pages_old, pages_new;
>> @@ -1017,7 +1120,8 @@ rte_eal_hugepage_init(void)
>>
>>          /* map all hugepages available */
>>          pages_old = hpi->num_pages[0];
>> -        pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
>> +        pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi,
>> +                          memory, 1);
>>          if (pages_new < pages_old) {
>>              RTE_LOG(DEBUG, EAL,
>>                  "%d not %d hugepages of size %u MB allocated\n",
>> @@ -1060,7 +1164,7 @@ rte_eal_hugepage_init(void)
>>                sizeof(struct hugepage_file), cmp_physaddr);
>>
>>          /* remap all hugepages */
>> -        if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
>> +        if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) !=
>>              hpi->num_pages[0]) {
>>              RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
>>                      (unsigned)(hpi->hugepage_sz / 0x100000));
>> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
>> index bcaf1b3..4fe22d1 100644
>> --- a/mk/rte.app.mk
>> +++ b/mk/rte.app.mk
>> @@ -186,6 +186,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
>>  # The static libraries do not know their dependencies.
>>  # So linking with static library requires explicit dependencies.
>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lrt
>> +ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP)$(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),yy)
>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lnuma
>> +endif
>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lm
>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lrt
>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lm
>>
> 
> 
> 
> 
> 


More information about the dev mailing list