[dpdk-dev] [PATCH v7 19/21] app/testpmd: add support for external memory

Anatoly Burakov anatoly.burakov at intel.com
Mon Oct 1 13:05:08 CEST 2018


Currently, mempools can only be allocated using either native
DPDK memory or anonymous memory. This patch adds two new methods
of allocating mempools from external memory (regular or hugepage
memory), and adds documentation for them to the testpmd user
guide.

It adds a new flag, "--mp-alloc", with four possible values:
native (use the regular DPDK allocator), anon (create the mempool
in DPDK memory, but populate it from anonymous memory), xmem (use
an externally allocated memory area), and xmemhuge (use an
externally allocated hugepage memory area). The old "--mp-anon"
flag is kept for backward compatibility.

All external memory is allocated on the same external heap, but
each mempool creation allocates and adds its own new memory area
to that heap. An example invocation is shown below.
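
For example, to populate mempools from externally allocated
anonymous hugepage memory (the EAL options here are illustrative):

  ./testpmd -l 0-3 -n 4 -- --mp-alloc=xmemhuge -i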

Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
Suggested-by: Konstantin Ananyev <konstantin.ananyev at intel.com>
---
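
Note for reviewers: the xmem/xmemhuge paths below are a thin wrapper
around the external-heaps API added earlier in this series. A minimal
sketch of that flow, assuming addr/len describe an already-mmap()'d
area and iova_table holds one rte_mem_virt2iova() result per page
(all names here are placeholders, error reporting trimmed):

  #include <rte_malloc.h>
  #include <rte_memory.h>
  #include <rte_mbuf.h>

  static struct rte_mempool *
  pool_from_extmem(void *addr, size_t len, rte_iova_t *iova_table,
          unsigned int n_pages, size_t page_sz)
  {
      int socket_id;

      /* create a named heap and hand our memory over to it */
      if (rte_malloc_heap_create("extmem") < 0)
          return NULL;
      if (rte_malloc_heap_memory_add("extmem", addr, len,
              iova_table, n_pages, page_sz) < 0)
          return NULL;
      /* the heap's "socket ID" routes allocations to external memory */
      socket_id = rte_malloc_heap_get_socket("extmem");
      if (socket_id < 0)
          return NULL;
      return rte_pktmbuf_pool_create("extpool", 4096, 256, 0,
              RTE_MBUF_DEFAULT_BUF_SIZE, socket_id);
  }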
 app/test-pmd/config.c                 |  21 +-
 app/test-pmd/parameters.c             |  23 +-
 app/test-pmd/testpmd.c                | 318 ++++++++++++++++++++++++--
 app/test-pmd/testpmd.h                |  13 +-
 doc/guides/testpmd_app_ug/run_app.rst |  12 +
 5 files changed, 362 insertions(+), 25 deletions(-)
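
The HUGE_FLAG/HUGE_SHIFT defines added to testpmd.c below follow the
mmap(2) convention: the hugepage size is passed as log2(page size)
shifted by MAP_HUGE_SHIFT. A standalone Linux illustration, assuming
2 MiB hugepages have been reserved (the fallback define mirrors the
one in the patch, for older kernel headers):

  #include <stdio.h>
  #include <sys/mman.h>

  #ifndef MAP_HUGE_SHIFT
  #define MAP_HUGE_SHIFT 26
  #endif

  int main(void)
  {
      size_t len = 2 << 20;   /* one 2 MiB page */
      int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB |
              (21 << MAP_HUGE_SHIFT);   /* log2(2 MiB) = 21 */
      void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, flags, -1, 0);

      if (p == MAP_FAILED) {
          perror("mmap");
          return 1;
      }
      munmap(p, len);
      return 0;
  }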

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index a0f934932..4789910b3 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -2413,6 +2413,23 @@ fwd_config_setup(void)
 		simple_fwd_config_setup();
 }
 
+static const char *
+mp_alloc_to_str(uint8_t mode)
+{
+	switch (mode) {
+	case MP_ALLOC_NATIVE:
+		return "native";
+	case MP_ALLOC_ANON:
+		return "anon";
+	case MP_ALLOC_XMEM:
+		return "xmem";
+	case MP_ALLOC_XMEM_HUGE:
+		return "xmemhuge";
+	default:
+		return "invalid";
+	}
+}
+
 void
 pkt_fwd_config_display(struct fwd_config *cfg)
 {
@@ -2421,12 +2438,12 @@ pkt_fwd_config_display(struct fwd_config *cfg)
 	streamid_t sm_id;
 
 	printf("%s packet forwarding%s - ports=%d - cores=%d - streams=%d - "
-		"NUMA support %s, MP over anonymous pages %s\n",
+		"NUMA support %s, MP allocation mode: %s\n",
 		cfg->fwd_eng->fwd_mode_name,
 		retry_enabled == 0 ? "" : " with retry",
 		cfg->nb_fwd_ports, cfg->nb_fwd_lcores, cfg->nb_fwd_streams,
 		numa_support == 1 ? "enabled" : "disabled",
-		mp_anon != 0 ? "enabled" : "disabled");
+		mp_alloc_to_str(mp_alloc_type));
 
 	if (retry_enabled)
 		printf("TX retry num: %u, delay between TX retries: %uus\n",
diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
index 9220e1c1b..b4016668c 100644
--- a/app/test-pmd/parameters.c
+++ b/app/test-pmd/parameters.c
@@ -190,6 +190,11 @@ usage(char* progname)
 	printf("  --vxlan-gpe-port=N: UPD port of tunnel VXLAN-GPE\n");
 	printf("  --mlockall: lock all memory\n");
 	printf("  --no-mlockall: do not lock all memory\n");
+	printf("  --mp-alloc <native|anon|xmem|xmemhuge>: mempool allocation method.\n"
+	       "    native: use regular DPDK memory to create and populate mempool\n"
+	       "    anon: use regular DPDK memory to create and anonymous memory to populate mempool\n"
+	       "    xmem: use anonymous memory to create and populate mempool\n"
+	       "    xmemhuge: use anonymous hugepage memory to create and populate mempool\n");
 }
 
 #ifdef RTE_LIBRTE_CMDLINE
@@ -625,6 +630,7 @@ launch_args_parse(int argc, char** argv)
 		{ "vxlan-gpe-port",		1, 0, 0 },
 		{ "mlockall",			0, 0, 0 },
 		{ "no-mlockall",		0, 0, 0 },
+		{ "mp-alloc",			1, 0, 0 },
 		{ 0, 0, 0, 0 },
 	};
 
@@ -743,7 +749,22 @@ launch_args_parse(int argc, char** argv)
 			if (!strcmp(lgopts[opt_idx].name, "numa"))
 				numa_support = 1;
 			if (!strcmp(lgopts[opt_idx].name, "mp-anon")) {
-				mp_anon = 1;
+				mp_alloc_type = MP_ALLOC_ANON;
+			}
+			if (!strcmp(lgopts[opt_idx].name, "mp-alloc")) {
+				if (!strcmp(optarg, "native"))
+					mp_alloc_type = MP_ALLOC_NATIVE;
+				else if (!strcmp(optarg, "anon"))
+					mp_alloc_type = MP_ALLOC_ANON;
+				else if (!strcmp(optarg, "xmem"))
+					mp_alloc_type = MP_ALLOC_XMEM;
+				else if (!strcmp(optarg, "xmemhuge"))
+					mp_alloc_type = MP_ALLOC_XMEM_HUGE;
+				else
+					rte_exit(EXIT_FAILURE,
+						"mp-alloc %s invalid - must be: "
+						"native, anon, xmem or xmemhuge\n",
+						optarg);
 			}
 			if (!strcmp(lgopts[opt_idx].name, "port-numa-config")) {
 				if (parse_portnuma_config(optarg))
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 001f0e552..7f4bd62ac 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -27,6 +27,7 @@
 #include <rte_log.h>
 #include <rte_debug.h>
 #include <rte_cycles.h>
+#include <rte_malloc_heap.h>
 #include <rte_memory.h>
 #include <rte_memcpy.h>
 #include <rte_launch.h>
@@ -63,6 +64,22 @@
 
 #include "testpmd.h"
 
+#ifndef MAP_HUGETLB
+/* FreeBSD may not have MAP_HUGETLB (in fact, it probably doesn't) */
+#define HUGE_FLAG (0x40000)
+#else
+#define HUGE_FLAG MAP_HUGETLB
+#endif
+
+#ifndef MAP_HUGE_SHIFT
+/* older kernels (or FreeBSD) will not have this define */
+#define HUGE_SHIFT (26)
+#else
+#define HUGE_SHIFT MAP_HUGE_SHIFT
+#endif
+
+#define EXTMEM_HEAP_NAME "extmem"
+
 uint16_t verbose_level = 0; /**< Silent by default. */
 int testpmd_logtype; /**< Log type for testpmd logs */
 
@@ -88,9 +105,13 @@ uint8_t numa_support = 1; /**< numa enabled by default */
 uint8_t socket_num = UMA_NO_CONFIG;
 
 /*
- * Use ANONYMOUS mapped memory (might be not physically continuous) for mbufs.
+ * Select mempool allocation type:
+ * - native: use regular DPDK memory
+ * - anon: use regular DPDK memory to create mempool, but populate using
+ *         anonymous memory (may not be IOVA-contiguous)
+ * - xmem/xmemhuge: use externally allocated memory (regular/hugepage)
  */
-uint8_t mp_anon = 0;
+uint8_t mp_alloc_type = MP_ALLOC_NATIVE;
 
 /*
  * Store specified sockets on which memory pool to be used by ports
@@ -527,6 +548,229 @@ set_def_fwd_config(void)
 	set_default_fwd_ports_config();
 }
 
+/* extremely pessimistic estimation of memory required to create a mempool */
+static int
+calc_mem_size(uint32_t nb_mbufs, uint32_t mbuf_sz, size_t pgsz, size_t *out)
+{
+	unsigned int n_pages, mbuf_per_pg, leftover;
+	uint64_t total_mem, mbuf_mem, obj_sz;
+
+	/* there is no good way to predict how much space the mempool will
+	 * occupy because it will allocate chunks on the fly, and some of those
+	 * will come from default DPDK memory while some will come from our
+	 * external memory, so just assume 128MB will be enough for everyone.
+	 */
+	uint64_t hdr_mem = 128 << 20;
+
+	/* account for possible non-contiguousness */
+	obj_sz = rte_mempool_calc_obj_size(mbuf_sz, 0, NULL);
+	if (obj_sz > pgsz) {
+		TESTPMD_LOG(ERR, "Object size is bigger than page size\n");
+		return -1;
+	}
+
+	mbuf_per_pg = pgsz / obj_sz;
+	leftover = (nb_mbufs % mbuf_per_pg) > 0;
+	n_pages = (nb_mbufs / mbuf_per_pg) + leftover;
+
+	mbuf_mem = n_pages * pgsz;
+
+	total_mem = RTE_ALIGN(hdr_mem + mbuf_mem, pgsz);
+
+	if (total_mem > SIZE_MAX) {
+		TESTPMD_LOG(ERR, "Memory size too big\n");
+		return -1;
+	}
+	*out = (size_t)total_mem;
+
+	return 0;
+}
+
+static inline uint32_t
+bsf64(uint64_t v)
+{
+	return (uint32_t)__builtin_ctzll(v);
+}
+
+static inline uint32_t
+log2_u64(uint64_t v)
+{
+	if (v == 0)
+		return 0;
+	v = rte_align64pow2(v);
+	return bsf64(v);
+}
+
+static int
+pagesz_flags(uint64_t page_sz)
+{
+	/* as per mmap() manpage, all page sizes are log2 of page size
+	 * shifted by MAP_HUGE_SHIFT
+	 */
+	int log2 = log2_u64(page_sz);
+	return (log2 << HUGE_SHIFT);
+}
+
+static void *
+alloc_mem(size_t memsz, size_t pgsz, bool huge)
+{
+	void *addr;
+	int flags;
+
+	/* allocate anonymous hugepages */
+	flags = MAP_ANONYMOUS | MAP_PRIVATE;
+	if (huge)
+		flags |= HUGE_FLAG | pagesz_flags(pgsz);
+
+	addr = mmap(NULL, memsz, PROT_READ | PROT_WRITE, flags, -1, 0);
+	if (addr == MAP_FAILED)
+		return NULL;
+
+	return addr;
+}
+
+struct extmem_param {
+	void *addr;
+	size_t len;
+	size_t pgsz;
+	rte_iova_t *iova_table;
+	unsigned int iova_table_len;
+};
+
+static int
+create_extmem(uint32_t nb_mbufs, uint32_t mbuf_sz, struct extmem_param *param,
+		bool huge)
+{
+	uint64_t pgsizes[] = {RTE_PGSIZE_2M, RTE_PGSIZE_1G, /* x86_64, ARM */
+			RTE_PGSIZE_16M, RTE_PGSIZE_16G};    /* POWER */
+	unsigned int cur_page, n_pages, pgsz_idx;
+	size_t mem_sz, cur_pgsz;
+	rte_iova_t *iovas = NULL;
+	void *addr;
+	int ret;
+
+	for (pgsz_idx = 0; pgsz_idx < RTE_DIM(pgsizes); pgsz_idx++) {
+		/* skip anything that is too big */
+		if (pgsizes[pgsz_idx] > SIZE_MAX)
+			continue;
+
+		cur_pgsz = pgsizes[pgsz_idx];
+
+		/* if we were told not to allocate hugepages, override */
+		if (!huge)
+			cur_pgsz = sysconf(_SC_PAGESIZE);
+
+		ret = calc_mem_size(nb_mbufs, mbuf_sz, cur_pgsz, &mem_sz);
+		if (ret < 0) {
+			TESTPMD_LOG(ERR, "Cannot calculate memory size\n");
+			return -1;
+		}
+
+		/* allocate our memory */
+		addr = alloc_mem(mem_sz, cur_pgsz, huge);
+
+		/* if we couldn't allocate memory with a specified page size,
+		 * that doesn't mean we can't do it with other page sizes, so
+		 * try another one.
+		 */
+		if (addr == NULL)
+			continue;
+
+		/* store IOVA addresses for every page in this memory area */
+		n_pages = mem_sz / cur_pgsz;
+
+		iovas = malloc(sizeof(*iovas) * n_pages);
+
+		if (iovas == NULL) {
+			TESTPMD_LOG(ERR, "Cannot allocate memory for iova addresses\n");
+			goto fail;
+		}
+		/* lock memory if it's not huge pages */
+		if (!huge)
+			mlock(addr, mem_sz);
+
+		/* populate IOVA addresses */
+		for (cur_page = 0; cur_page < n_pages; cur_page++) {
+			rte_iova_t iova;
+			size_t offset;
+			void *cur;
+
+			offset = cur_pgsz * cur_page;
+			cur = RTE_PTR_ADD(addr, offset);
+			iova = rte_mem_virt2iova(cur);
+
+			iovas[cur_page] = iova;
+		}
+
+		break;
+	}
+	/* if we couldn't allocate anything */
+	if (iovas == NULL)
+		return -1;
+
+	param->addr = addr;
+	param->len = mem_sz;
+	param->pgsz = cur_pgsz;
+	param->iova_table = iovas;
+	param->iova_table_len = n_pages;
+
+	return 0;
+fail:
+	if (iovas)
+		free(iovas);
+	if (addr)
+		munmap(addr, mem_sz);
+
+	return -1;
+}
+
+static int
+setup_extmem(uint32_t nb_mbufs, uint32_t mbuf_sz, bool huge)
+{
+	struct extmem_param param = {};
+	int socket_id, ret;
+
+	/* check if our heap exists */
+	socket_id = rte_malloc_heap_get_socket(EXTMEM_HEAP_NAME);
+	if (socket_id < 0) {
+		/* create our heap */
+		ret = rte_malloc_heap_create(EXTMEM_HEAP_NAME);
+		if (ret < 0) {
+			TESTPMD_LOG(ERR, "Cannot create heap\n");
+			return -1;
+		}
+	}
+
+	ret = create_extmem(nb_mbufs, mbuf_sz, &param, huge);
+	if (ret < 0) {
+		TESTPMD_LOG(ERR, "Cannot create memory area\n");
+		return -1;
+	}
+
+	/* we now have a valid memory area, so add it to heap */
+	ret = rte_malloc_heap_memory_add(EXTMEM_HEAP_NAME,
+			param.addr, param.len, param.iova_table,
+			param.iova_table_len, param.pgsz);
+
+	/* when using VFIO, memory is automatically mapped for DMA by EAL */
+
+	/* not needed any more */
+	free(param.iova_table);
+
+	if (ret < 0) {
+		TESTPMD_LOG(ERR, "Cannot add memory to heap\n");
+		munmap(param.addr, param.len);
+		return -1;
+	}
+
+	/* success */
+
+	TESTPMD_LOG(DEBUG, "Allocated %zuMB of external memory\n",
+			param.len >> 20);
+
+	return 0;
+}
+
 /*
  * Configuration initialisation done once at init time.
  */
@@ -545,27 +789,59 @@ mbuf_pool_create(uint16_t mbuf_seg_size, unsigned nb_mbuf,
 		"create a new mbuf pool <%s>: n=%u, size=%u, socket=%u\n",
 		pool_name, nb_mbuf, mbuf_seg_size, socket_id);
 
-	if (mp_anon != 0) {
-		rte_mp = rte_mempool_create_empty(pool_name, nb_mbuf,
-			mb_size, (unsigned) mb_mempool_cache,
-			sizeof(struct rte_pktmbuf_pool_private),
-			socket_id, 0);
-		if (rte_mp == NULL)
-			goto err;
+	switch (mp_alloc_type) {
+	case MP_ALLOC_NATIVE:
+		{
+			/* wrapper to rte_mempool_create() */
+			TESTPMD_LOG(INFO, "preferred mempool ops selected: %s\n",
+					rte_mbuf_best_mempool_ops());
+			rte_mp = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
+				mb_mempool_cache, 0, mbuf_seg_size, socket_id);
+			break;
+		}
+	case MP_ALLOC_ANON:
+		{
+			rte_mp = rte_mempool_create_empty(pool_name, nb_mbuf,
+				mb_size, (unsigned int) mb_mempool_cache,
+				sizeof(struct rte_pktmbuf_pool_private),
+				socket_id, 0);
+			if (rte_mp == NULL)
+				goto err;
+
+			if (rte_mempool_populate_anon(rte_mp) == 0) {
+				rte_mempool_free(rte_mp);
+				rte_mp = NULL;
+				goto err;
+			}
+			rte_pktmbuf_pool_init(rte_mp, NULL);
+			rte_mempool_obj_iter(rte_mp, rte_pktmbuf_init, NULL);
+			break;
+		}
+	case MP_ALLOC_XMEM:
+	case MP_ALLOC_XMEM_HUGE:
+		{
+			int heap_socket;
+			bool huge = mp_alloc_type == MP_ALLOC_XMEM_HUGE;
 
-		if (rte_mempool_populate_anon(rte_mp) == 0) {
-			rte_mempool_free(rte_mp);
-			rte_mp = NULL;
-			goto err;
+			if (setup_extmem(nb_mbuf, mbuf_seg_size, huge) < 0)
+				rte_exit(EXIT_FAILURE, "Could not create external memory\n");
+
+			heap_socket =
+				rte_malloc_heap_get_socket(EXTMEM_HEAP_NAME);
+			if (heap_socket < 0)
+				rte_exit(EXIT_FAILURE, "Could not get external memory socket ID\n");
+
+			TESTPMD_LOG(INFO, "preferred mempool ops selected: %s\n",
+					rte_mbuf_best_mempool_ops());
+			rte_mp = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
+					mb_mempool_cache, 0, mbuf_seg_size,
+					heap_socket);
+			break;
+		}
+	default:
+		{
+			rte_exit(EXIT_FAILURE, "Invalid mempool creation mode\n");
 		}
-		rte_pktmbuf_pool_init(rte_mp, NULL);
-		rte_mempool_obj_iter(rte_mp, rte_pktmbuf_init, NULL);
-	} else {
-		/* wrapper to rte_mempool_create() */
-		TESTPMD_LOG(INFO, "preferred mempool ops selected: %s\n",
-				rte_mbuf_best_mempool_ops());
-		rte_mp = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
-			mb_mempool_cache, 0, mbuf_seg_size, socket_id);
 	}
 
 err:
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index a1f661472..65e0cec90 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -69,6 +69,16 @@ enum {
 	PORT_TOPOLOGY_LOOP,
 };
 
+enum {
+	MP_ALLOC_NATIVE, /**< allocate and populate mempool natively */
+	MP_ALLOC_ANON,
+	/**< allocate mempool natively, but populate using anonymous memory */
+	MP_ALLOC_XMEM,
+	/**< allocate and populate mempool using anonymous memory */
+	MP_ALLOC_XMEM_HUGE
+	/**< allocate and populate mempool using anonymous hugepage memory */
+};
+
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
 /**
  * The data structure associated with RX and TX packet burst statistics
@@ -304,7 +314,8 @@ extern uint8_t  numa_support; /**< set by "--numa" parameter */
 extern uint16_t port_topology; /**< set by "--port-topology" parameter */
 extern uint8_t no_flush_rx; /**<set by "--no-flush-rx" parameter */
 extern uint8_t flow_isolate_all; /**< set by "--flow-isolate-all */
-extern uint8_t  mp_anon; /**< set by "--mp-anon" parameter */
+extern uint8_t  mp_alloc_type;
+/**< set by "--mp-anon" or "--mp-alloc" parameter */
 extern uint8_t no_link_check; /**<set by "--disable-link-check" parameter */
 extern volatile int test_done; /* stop packet forwarding when set to 1. */
 extern uint8_t lsc_interrupt; /**< disabled by "--no-lsc-interrupt" parameter */
diff --git a/doc/guides/testpmd_app_ug/run_app.rst b/doc/guides/testpmd_app_ug/run_app.rst
index f301c2b6f..67a8532a4 100644
--- a/doc/guides/testpmd_app_ug/run_app.rst
+++ b/doc/guides/testpmd_app_ug/run_app.rst
@@ -498,3 +498,15 @@ The commandline options are:
 *   ``--no-mlockall``
 
     Disable locking all memory.
+
+*   ``--mp-alloc <native|anon|xmem|xmemhuge>``
+
+    Select mempool allocation mode:
+
+    * native: create and populate mempool using native DPDK memory
+    * anon: create mempool using native DPDK memory, but populate it using
+      anonymous memory
+    * xmem: create and populate mempool using an externally and anonymously
+      allocated memory area
+    * xmemhuge: create and populate mempool using an externally and anonymously
+      allocated hugepage memory area
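
As a back-of-the-envelope check of the sizing logic in calc_mem_size()
above (the numbers are purely illustrative): with 2 MiB pages and an
effective object size of, say, 2560 bytes, one page holds
2097152 / 2560 = 819 mbufs, so 262144 mbufs need 321 pages, i.e.
642 MiB of mbuf memory, plus the fixed 128 MiB allowance for mempool
header chunks, with the total rounded up to a page boundary.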
-- 
2.17.1

