[dpdk-dev] [PATCH v9 1/2] mem: balanced allocation of hugepages

Ilya Maximets i.maximets at samsung.com
Thu Jun 29 08:08:35 CEST 2017


On 29.06.2017 08:48, Ilya Maximets wrote:
> On 29.06.2017 08:32, Hemant Agrawal wrote:
>> On 6/27/2017 3:54 PM, Ilya Maximets wrote:
>>> Currently EAL allocates hugepages one by one, paying no attention
>>> to which NUMA node each allocation comes from.
>>>
>>> Such behaviour leads to allocation failure if the number of available
>>> hugepages for the application is limited by cgroups or hugetlbfs and
>>> memory is requested not only from the first socket.
>>>
>>> Example:
>>>     # 90 x 1GB hugepages available in a system
>>>
>>>     cgcreate -g hugetlb:/test
>>>     # Limit to 32GB of hugepages
>>>     cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test
>>>     # Request 4GB from each of 2 sockets
>>>     cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ...
>>>
>>>     EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB
>>>     EAL: 32 not 90 hugepages of size 1024 MB allocated
>>>     EAL: Not enough memory available on socket 1!
>>>          Requested: 4096MB, available: 0MB
>>>     PANIC in rte_eal_init():
>>>     Cannot init memory
>>>
>>>     This happens because all allocated pages are
>>>     on socket 0.
>>>
>>> Fix this issue by setting mempolicy MPOL_PREFERRED for each hugepage
>>> to one of the requested nodes using the following scheme:
>>>
>>>     1) Allocate essential hugepages:
>>>         1.1) Allocate only as many hugepages from NUMA node N as
>>>              are needed to fit the memory requested for this node.
>>>         1.2) repeat 1.1 for all numa nodes.
>>>     2) Try to map all remaining free hugepages in a round-robin
>>>        fashion.
>>>     3) Sort pages and choose the most suitable.
>>>
>>> In this case all essential memory will be allocated and all remaining
>>> pages will be fairly distributed between all requested nodes.
>>>
>>> New config option RTE_EAL_NUMA_AWARE_HUGEPAGES is introduced and
>>> enabled by default for linuxapp except armv7 and dpaa2.
>>> Enabling this option adds libnuma as a dependency for EAL.
>>>
>>> Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages")
>>>
>>> Signed-off-by: Ilya Maximets <i.maximets at samsung.com>
>>> ---
>>>  config/common_base                        |   1 +
>>>  config/common_linuxapp                    |   1 +
>>>  config/defconfig_arm-armv7a-linuxapp-gcc  |   3 +
>>>  config/defconfig_arm64-dpaa2-linuxapp-gcc |   3 +
>>>  lib/librte_eal/linuxapp/eal/Makefile      |   3 +
>>>  lib/librte_eal/linuxapp/eal/eal_memory.c  | 120 ++++++++++++++++++++++++++++--
>>>  mk/rte.app.mk                             |   3 +
>>>  7 files changed, 126 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/config/common_base b/config/common_base
>>> index f6aafd1..660588a 100644
>>> --- a/config/common_base
>>> +++ b/config/common_base
>>> @@ -103,6 +103,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
>>>  CONFIG_RTE_EAL_IGB_UIO=n
>>>  CONFIG_RTE_EAL_VFIO=n
>>>  CONFIG_RTE_MALLOC_DEBUG=n
>>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>>>
>>>  #
>>>  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
>>> diff --git a/config/common_linuxapp b/config/common_linuxapp
>>> index b3cf41b..64bef87 100644
>>> --- a/config/common_linuxapp
>>> +++ b/config/common_linuxapp
>>> @@ -35,6 +35,7 @@
>>>  CONFIG_RTE_EXEC_ENV="linuxapp"
>>>  CONFIG_RTE_EXEC_ENV_LINUXAPP=y
>>>
>>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
>>>  CONFIG_RTE_EAL_IGB_UIO=y
>>>  CONFIG_RTE_EAL_VFIO=y
>>>  CONFIG_RTE_KNI_KMOD=y
>>> diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc b/config/defconfig_arm-armv7a-linuxapp-gcc
>>> index 19607eb..e06b1d4 100644
>>> --- a/config/defconfig_arm-armv7a-linuxapp-gcc
>>> +++ b/config/defconfig_arm-armv7a-linuxapp-gcc
>>> @@ -47,6 +47,9 @@ CONFIG_RTE_ARCH_STRICT_ALIGN=y
>>>  CONFIG_RTE_TOOLCHAIN="gcc"
>>>  CONFIG_RTE_TOOLCHAIN_GCC=y
>>>
>>> +# NUMA is not supported on ARM
>>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>>> +
>>>  # ARM doesn't have support for vmware TSC map
>>>  CONFIG_RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT=n
>>>
>>> diff --git a/config/defconfig_arm64-dpaa2-linuxapp-gcc b/config/defconfig_arm64-dpaa2-linuxapp-gcc
>>> index 2304ab6..f78449d 100644
>>> --- a/config/defconfig_arm64-dpaa2-linuxapp-gcc
>>> +++ b/config/defconfig_arm64-dpaa2-linuxapp-gcc
>>> @@ -45,6 +45,9 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
>>>
>>>  CONFIG_RTE_PKTMBUF_HEADROOM=256
>>>
>>> +# Doesn't support NUMA
>>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
>>> +
>>
>> DPAA2 does not support NUMA so,
>> CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> 
> Oh, sorry. Just a typo.
> Thanks for catching this.

Fixed. Hemant, please, check the new version (v10).

> Sergio, I'll send v10 with only this change and will keep your
> acked-by because the change is trivial.
> 
>>>  #
>>>  # Compile Support Libraries for DPAA2
>>>  #
>>> diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
>>> index 640afd0..8651e27 100644
>>> --- a/lib/librte_eal/linuxapp/eal/Makefile
>>> +++ b/lib/librte_eal/linuxapp/eal/Makefile
>>> @@ -50,6 +50,9 @@ LDLIBS += -ldl
>>>  LDLIBS += -lpthread
>>>  LDLIBS += -lgcc_s
>>>  LDLIBS += -lrt
>>> +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y)
>>> +LDLIBS += -lnuma
>>> +endif
>>>
>>>  # specific to linuxapp exec-env
>>>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c
>>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> index e17c9cb..647d89c 100644
>>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> @@ -54,6 +54,10 @@
>>>  #include <sys/time.h>
>>>  #include <signal.h>
>>>  #include <setjmp.h>
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> +#include <numa.h>
>>> +#include <numaif.h>
>>> +#endif
>>>
>>>  #include <rte_log.h>
>>>  #include <rte_memory.h>
>>> @@ -348,6 +352,14 @@ static int huge_wrap_sigsetjmp(void)
>>>      return sigsetjmp(huge_jmpenv, 1);
>>>  }
>>>
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> +/* Callback for numa library. */
>>> +void numa_error(char *where)
>>> +{
>>> +    RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno));
>>> +}
>>> +#endif
>>> +
>>>  /*
>>>   * Mmap all hugepages of hugepage table: it first open a file in
>>>   * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
>>> @@ -356,18 +368,78 @@ static int huge_wrap_sigsetjmp(void)
>>>   * map continguous physical blocks in contiguous virtual blocks.
>>>   */
>>>  static unsigned
>>> -map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>> -        struct hugepage_info *hpi, int orig)
>>> +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
>>> +          uint64_t *essential_memory __rte_unused, int orig)
>>>  {
>>>      int fd;
>>>      unsigned i;
>>>      void *virtaddr;
>>>      void *vma_addr = NULL;
>>>      size_t vma_len = 0;
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> +    int node_id = -1;
>>> +    int essential_prev = 0;
>>> +    int oldpolicy;
>>> +    struct bitmask *oldmask = numa_allocate_nodemask();
>>> +    bool have_numa = true;
>>> +    unsigned long maxnode = 0;
>>> +
>>> +    /* Check if kernel supports NUMA. */
>>> +    if (numa_available() != 0) {
>>> +        RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
>>> +        have_numa = false;
>>> +    }
>>> +
>>> +    if (orig && have_numa) {
>>> +        RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
>>> +        if (get_mempolicy(&oldpolicy, oldmask->maskp,
>>> +                  oldmask->size + 1, 0, 0) < 0) {
>>> +            RTE_LOG(ERR, EAL,
>>> +                "Failed to get current mempolicy: %s. "
>>> +                "Assuming MPOL_DEFAULT.\n", strerror(errno));
>>> +            oldpolicy = MPOL_DEFAULT;
>>> +        }
>>> +        for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
>>> +            if (internal_config.socket_mem[i])
>>> +                maxnode = i + 1;
>>> +    }
>>> +#endif
>>>
>>>      for (i = 0; i < hpi->num_pages[0]; i++) {
>>>          uint64_t hugepage_sz = hpi->hugepage_sz;
>>>
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> +        if (maxnode) {
>>> +            unsigned int j;
>>> +
>>> +            for (j = 0; j < maxnode; j++)
>>> +                if (essential_memory[j])
>>> +                    break;
>>> +
>>> +            if (j == maxnode) {
>>> +                node_id = (node_id + 1) % maxnode;
>>> +                while (!internal_config.socket_mem[node_id]) {
>>> +                    node_id++;
>>> +                    node_id %= maxnode;
>>> +                }
>>> +                essential_prev = 0;
>>> +            } else {
>>> +                node_id = j;
>>> +                essential_prev = essential_memory[j];
>>> +
>>> +                if (essential_memory[j] < hugepage_sz)
>>> +                    essential_memory[j] = 0;
>>> +                else
>>> +                    essential_memory[j] -= hugepage_sz;
>>> +            }
>>> +
>>> +            RTE_LOG(DEBUG, EAL,
>>> +                "Setting policy MPOL_PREFERRED for socket %d\n",
>>> +                node_id);
>>> +            numa_set_preferred(node_id);
>>> +        }
>>> +#endif
>>> +
>>>          if (orig) {
>>>              hugepg_tbl[i].file_id = i;
>>>              hugepg_tbl[i].size = hugepage_sz;
>>> @@ -422,7 +494,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>>          if (fd < 0) {
>>>              RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
>>>                      strerror(errno));
>>> -            return i;
>>> +            goto out;
>>>          }
>>>
>>>          /* map the segment, and populate page tables,
>>> @@ -433,7 +505,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>>              RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
>>>                      strerror(errno));
>>>              close(fd);
>>> -            return i;
>>> +            goto out;
>>>          }
>>>
>>>          if (orig) {
>>> @@ -458,7 +530,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>>                  munmap(virtaddr, hugepage_sz);
>>>                  close(fd);
>>>                  unlink(hugepg_tbl[i].filepath);
>>> -                return i;
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> +                if (maxnode)
>>> +                    essential_memory[node_id] =
>>> +                        essential_prev;
>>> +#endif
>>> +                goto out;
>>>              }
>>>              *(int *)virtaddr = 0;
>>>          }
>>> @@ -469,7 +546,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>>              RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
>>>                  __func__, strerror(errno));
>>>              close(fd);
>>> -            return i;
>>> +            goto out;
>>>          }
>>>
>>>          close(fd);
>>> @@ -478,6 +555,22 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>>          vma_len -= hugepage_sz;
>>>      }
>>>
>>> +out:
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> +    if (maxnode) {
>>> +        RTE_LOG(DEBUG, EAL,
>>> +            "Restoring previous memory policy: %d\n", oldpolicy);
>>> +        if (oldpolicy == MPOL_DEFAULT) {
>>> +            numa_set_localalloc();
>>> +        } else if (set_mempolicy(oldpolicy, oldmask->maskp,
>>> +                     oldmask->size + 1) < 0) {
>>> +            RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
>>> +                strerror(errno));
>>> +            numa_set_localalloc();
>>> +        }
>>> +    }
>>> +    numa_free_cpumask(oldmask);
>>> +#endif
>>>      return i;
>>>  }
>>>
>>> @@ -562,6 +655,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
>>>              if (hugepg_tbl[i].orig_va == va) {
>>>                  hugepg_tbl[i].socket_id = socket_id;
>>>                  hp_count++;
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> +                RTE_LOG(DEBUG, EAL,
>>> +                    "Hugepage %s is on socket %d\n",
>>> +                    hugepg_tbl[i].filepath, socket_id);
>>> +#endif
>>>              }
>>>          }
>>>      }
>>> @@ -1000,6 +1098,11 @@ rte_eal_hugepage_init(void)
>>>
>>>      huge_register_sigbus();
>>>
>>> +    /* make a copy of socket_mem, needed for balanced allocation. */
>>> +    for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
>>> +        memory[i] = internal_config.socket_mem[i];
>>> +
>>> +
>>>      /* map all hugepages and sort them */
>>>      for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
>>>          unsigned pages_old, pages_new;
>>> @@ -1017,7 +1120,8 @@ rte_eal_hugepage_init(void)
>>>
>>>          /* map all hugepages available */
>>>          pages_old = hpi->num_pages[0];
>>> -        pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
>>> +        pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi,
>>> +                          memory, 1);
>>>          if (pages_new < pages_old) {
>>>              RTE_LOG(DEBUG, EAL,
>>>                  "%d not %d hugepages of size %u MB allocated\n",
>>> @@ -1060,7 +1164,7 @@ rte_eal_hugepage_init(void)
>>>                sizeof(struct hugepage_file), cmp_physaddr);
>>>
>>>          /* remap all hugepages */
>>> -        if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
>>> +        if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) !=
>>>              hpi->num_pages[0]) {
>>>              RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
>>>                      (unsigned)(hpi->hugepage_sz / 0x100000));
>>> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
>>> index bcaf1b3..4fe22d1 100644
>>> --- a/mk/rte.app.mk
>>> +++ b/mk/rte.app.mk
>>> @@ -186,6 +186,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
>>>  # The static libraries do not know their dependencies.
>>>  # So linking with static library requires explicit dependencies.
>>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lrt
>>> +ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP)$(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),yy)
>>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lnuma
>>> +endif
>>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lm
>>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lrt
>>>  _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lm
>>>
>>
>>
>>
>>
>>


More information about the dev mailing list