[dpdk-dev] [PATCH v3 17/20] examples: add external memory example app
Ananyev, Konstantin
konstantin.ananyev at intel.com
Fri Sep 21 00:47:58 CEST 2018
Hi Anatoly
>
> Introduce an example application demonstrating the use of
> external memory support. This is a simple application based on
> skeleton app, but instead of using internal DPDK memory, it is
> using externally allocated memory.
>
> The RX/TX and init paths are a carbon copy of the skeleton app, with
> no modifications whatsoever. The only difference is an additional
> init stage to allocate memory and create a heap for it, and the
> socket ID supplied to the mempool initialization function. The
> memory used by this app is hugepage memory allocated anonymously.
>
> Anonymous hugepage memory will not be allocated in a NUMA-aware
> fashion, so there is a chance of performance degradation when
> using this app, but given that kernel usually gives hugepages on
> local socket first, this should not be a problem in most cases.
Do we need a new sample app just for that?
Couldn't it be added to testpmd instead, the same way we now have 'mp-anon'
to use a mempool over anonymous memory?
Konstantin
>
> Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
> ---
> examples/external_mem/Makefile | 62 ++++
> examples/external_mem/extmem.c | 461 ++++++++++++++++++++++++++++++
> examples/external_mem/meson.build | 12 +
> 3 files changed, 535 insertions(+)
> create mode 100644 examples/external_mem/Makefile
> create mode 100644 examples/external_mem/extmem.c
> create mode 100644 examples/external_mem/meson.build
>
> diff --git a/examples/external_mem/Makefile b/examples/external_mem/Makefile
> new file mode 100644
> index 000000000..3b6ab3b2f
> --- /dev/null
> +++ b/examples/external_mem/Makefile
> @@ -0,0 +1,62 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2010-2018 Intel Corporation
> +
> +# Builds the external-memory example either against an installed DPDK
> +# (located through pkg-config) or, failing that, through the legacy
> +# RTE_SDK/RTE_TARGET make system.
> +
> +# binary name
> +APP = extmem
> +
> +# all source are stored in SRCS-y
> +SRCS-y := extmem.c
> +
> +# Build using pkg-config variables if possible
> +$(shell pkg-config --exists libdpdk)
> +ifeq ($(.SHELLSTATUS),0)
> +
> +all: shared
> +.PHONY: shared static
> +shared: build/$(APP)-shared
> +	ln -sf $(APP)-shared build/$(APP)
> +static: build/$(APP)-static
> +	ln -sf $(APP)-static build/$(APP)
> +
> +PC_FILE := $(shell pkg-config --path libdpdk)
> +CFLAGS += -O3 $(shell pkg-config --cflags libdpdk)
> +# the example relies on APIs that are still marked experimental
> +CFLAGS += -DALLOW_EXPERIMENTAL_API
> +LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk)
> +LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk)
> +
> +build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> +	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> +
> +build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> +	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> +
> +build:
> +	@mkdir -p $@
> +
> +.PHONY: clean
> +clean:
> +	rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
> +	rmdir --ignore-fail-on-non-empty build
> +
> +else # Build using legacy build system
> +
> +ifeq ($(RTE_SDK),)
> +$(error "Please define RTE_SDK environment variable")
> +endif
> +
> +# Default target, can be overridden by command line or environment
> +RTE_TARGET ?= x86_64-native-linuxapp-gcc
> +
> +include $(RTE_SDK)/mk/rte.vars.mk
> +
> +CFLAGS += $(WERROR_FLAGS)
> +CFLAGS += -DALLOW_EXPERIMENTAL_API
> +
> +# workaround for a gcc bug with noreturn attribute
> +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
> +# The per-object variable must name the object built from SRCS-y
> +# (extmem.o); there is no main.o in this example.
> +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
> +CFLAGS_extmem.o += -Wno-return-type
> +endif
> +
> +include $(RTE_SDK)/mk/rte.extapp.mk
> +endif
> diff --git a/examples/external_mem/extmem.c b/examples/external_mem/extmem.c
> new file mode 100644
> index 000000000..818a02171
> --- /dev/null
> +++ b/examples/external_mem/extmem.c
> @@ -0,0 +1,461 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2018 Intel Corporation
> + */
> +
> +#include <stdint.h>
> +#include <inttypes.h>
> +#include <stdbool.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
> +
> +#include <rte_eal.h>
> +#include <rte_errno.h>
> +#include <rte_ethdev.h>
> +#include <rte_cycles.h>
> +#include <rte_lcore.h>
> +#include <rte_mbuf.h>
> +#include <rte_malloc.h>
> +#include <rte_memory.h>
> +#include <rte_vfio.h>
> +
> +#define RX_RING_SIZE 1024 /* initial RX descriptor count requested in port_init() */
> +#define TX_RING_SIZE 1024 /* initial TX descriptor count requested in port_init() */
> +
> +#define NUM_MBUFS 8191 /* mbufs accounted per port when sizing memory and the pool */
> +#define MBUF_CACHE_SIZE 250 /* cache size argument given to rte_pktmbuf_pool_create() */
> +#define BURST_SIZE 32 /* max packets requested per rte_eth_rx_burst() call */
> +#define EXTMEM_HEAP_NAME "extmem" /* name of the malloc heap backed by external memory */
> +
> +static const struct rte_eth_conf port_conf_default = { /* template copied per port in port_init() */
> + .rxmode = {
> + .max_rx_pkt_len = ETHER_MAX_LEN, /* NOTE(review): presumably the standard max Ethernet frame length - confirm */
> + },
> +};
> +
> +/* extmem.c: Basic DPDK skeleton forwarding example using external memory. */
> +
> +/*
> + * Bring up one Ethernet port with a single RX queue (buffers taken from
> + * mbuf_pool) and a single TX queue, start it, print its MAC address and
> + * switch it to promiscuous mode.
> + *
> + * Returns 0 on success, -1 when the port id is not valid, otherwise the
> + * error value reported by the ethdev call that failed.
> + */
> +static inline int
> +port_init(uint16_t port, struct rte_mempool *mbuf_pool)
> +{
> +	const uint16_t rx_rings = 1, tx_rings = 1;
> +	struct rte_eth_conf conf = port_conf_default;
> +	struct rte_eth_dev_info info;
> +	struct rte_eth_txconf tx_cfg;
> +	struct ether_addr mac;
> +	const uint8_t *b;
> +	uint16_t rxd_cnt = RX_RING_SIZE;
> +	uint16_t txd_cnt = TX_RING_SIZE;
> +	uint16_t qid;
> +	int rc;
> +
> +	if (!rte_eth_dev_is_valid_port(port))
> +		return -1;
> +
> +	/* Turn on fast mbuf free only when the device advertises it. */
> +	rte_eth_dev_info_get(port, &info);
> +	if ((info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) != 0)
> +		conf.txmode.offloads |= DEV_TX_OFFLOAD_MBUF_FAST_FREE;
> +
> +	/* Device-level configuration. */
> +	rc = rte_eth_dev_configure(port, rx_rings, tx_rings, &conf);
> +	if (rc != 0)
> +		return rc;
> +
> +	/* Let the driver adjust the requested descriptor counts. */
> +	rc = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rxd_cnt, &txd_cnt);
> +	if (rc != 0)
> +		return rc;
> +
> +	/* One RX queue per port, fed from the caller's mempool. */
> +	for (qid = 0; qid < rx_rings; qid++) {
> +		rc = rte_eth_rx_queue_setup(port, qid, rxd_cnt,
> +				rte_eth_dev_socket_id(port), NULL, mbuf_pool);
> +		if (rc < 0)
> +			return rc;
> +	}
> +
> +	/* One TX queue per port, using the offloads selected above. */
> +	tx_cfg = info.default_txconf;
> +	tx_cfg.offloads = conf.txmode.offloads;
> +	for (qid = 0; qid < tx_rings; qid++) {
> +		rc = rte_eth_tx_queue_setup(port, qid, txd_cnt,
> +				rte_eth_dev_socket_id(port), &tx_cfg);
> +		if (rc < 0)
> +			return rc;
> +	}
> +
> +	rc = rte_eth_dev_start(port);
> +	if (rc < 0)
> +		return rc;
> +
> +	/* Report the MAC address of the port that was just started. */
> +	rte_eth_macaddr_get(port, &mac);
> +	b = mac.addr_bytes;
> +	printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
> +			" %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
> +			port, b[0], b[1], b[2], b[3], b[4], b[5]);
> +
> +	rte_eth_promiscuous_enable(port);
> +
> +	return 0;
> +}
> +
> +/*
> + * The lcore main. This is the main thread that does the work, reading from
> + * an input port and writing to an output port. It never returns.
> + *
> + * Before entering the forwarding loop it warns about every port whose NUMA
> + * node is known and differs from the node of the polling thread.
> + */
> +static __attribute__((noreturn)) void
> +lcore_main(void)
> +{
> +	uint16_t port;
> +
> +	/*
> +	 * Check that the port is on the same NUMA node as the polling thread
> +	 * for best performance. Socket 0 is a valid node, so only negative
> +	 * values (node unknown) are excluded from the comparison.
> +	 */
> +	RTE_ETH_FOREACH_DEV(port) {
> +		const int port_socket = rte_eth_dev_socket_id(port);
> +
> +		if (port_socket >= 0 && port_socket != (int)rte_socket_id())
> +			printf("WARNING, port %u is on remote NUMA node to "
> +					"polling thread.\n\tPerformance will "
> +					"not be optimal.\n", port);
> +	}
> +
> +	printf("\nCore %u forwarding packets. [Ctrl+C to quit]\n",
> +			rte_lcore_id());
> +
> +	/* Run until the application is quit or killed. */
> +	for (;;) {
> +		/*
> +		 * Receive packets on a port and forward them on the paired
> +		 * port. The mapping is 0 -> 1, 1 -> 0, 2 -> 3, 3 -> 2, etc.
> +		 */
> +		RTE_ETH_FOREACH_DEV(port) {
> +
> +			/* Get burst of RX packets, from first port of pair. */
> +			struct rte_mbuf *bufs[BURST_SIZE];
> +			const uint16_t nb_rx = rte_eth_rx_burst(port, 0,
> +					bufs, BURST_SIZE);
> +
> +			if (unlikely(nb_rx == 0))
> +				continue;
> +
> +			/* Send burst of TX packets, to second port of pair. */
> +			const uint16_t nb_tx = rte_eth_tx_burst(port ^ 1, 0,
> +					bufs, nb_rx);
> +
> +			/* Free any unsent packets. */
> +			if (unlikely(nb_tx < nb_rx)) {
> +				uint16_t buf;
> +
> +				for (buf = nb_tx; buf < nb_rx; buf++)
> +					rte_pktmbuf_free(bufs[buf]);
> +			}
> +		}
> +	}
> +}
> +
> +/*
> + * Extremely pessimistic estimation of memory required to create a mempool.
> + *
> + * nb_ports * nb_mbufs_per_port objects of mbuf_sz data bytes are accounted
> + * for, plus a fixed 16MB allowance, and the total is aligned to pgsz.
> + *
> + * On success stores the size in *out and returns 0. Returns -1 when a
> + * single object does not fit in a page (non-VA IOVA mode only) or when the
> + * total does not fit in size_t.
> + */
> +static int
> +calc_mem_size(uint32_t nb_ports, uint32_t nb_mbufs_per_port,
> +		uint32_t mbuf_sz, size_t pgsz, size_t *out)
> +{
> +	/* widen before multiplying so the mbuf count cannot wrap at 32 bits */
> +	uint64_t nb_mbufs = (uint64_t)nb_ports * nb_mbufs_per_port;
> +	uint64_t total_mem, mbuf_mem, obj_sz;
> +
> +	/* there is no good way to predict how much space the mempool will
> +	 * occupy because it will allocate chunks on the fly, and some of those
> +	 * will come from default DPDK memory while some will come from our
> +	 * external memory, so just assume 16MB will be enough for everyone.
> +	 */
> +	uint64_t hdr_mem = 16 << 20;
> +
> +	obj_sz = rte_mempool_calc_obj_size(mbuf_sz, 0, NULL);
> +	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
> +		/* contiguous - no need to account for page boundaries */
> +		mbuf_mem = nb_mbufs * obj_sz;
> +	} else {
> +		/* account for possible non-contiguousness */
> +		uint64_t n_pages, mbuf_per_pg, leftover;
> +
> +		mbuf_per_pg = pgsz / obj_sz;
> +		/* an object larger than a page would make us divide by zero */
> +		if (mbuf_per_pg == 0) {
> +			printf("Object size is bigger than page size\n");
> +			return -1;
> +		}
> +		leftover = (nb_mbufs % mbuf_per_pg) > 0;
> +		n_pages = (nb_mbufs / mbuf_per_pg) + leftover;
> +
> +		mbuf_mem = n_pages * pgsz;
> +	}
> +
> +	total_mem = RTE_ALIGN(hdr_mem + mbuf_mem, pgsz);
> +
> +	if (total_mem > SIZE_MAX) {
> +		printf("Memory size too big\n");
> +		return -1;
> +	}
> +	*out = (size_t)total_mem;
> +
> +	return 0;
> +}
> +
> +/* Index of the least significant set bit of v, as reported by
> + * __builtin_ctzll(); callers are expected to pass a non-zero value.
> + */
> +static inline uint32_t
> +bsf64(uint64_t v)
> +{
> +	const int lowest_set = __builtin_ctzll(v);
> +
> +	return (uint32_t)lowest_set;
> +}
> +
> +/* Bit index of v after rounding it with rte_align64pow2(); yields 0
> + * for an input of 0.
> + */
> +static inline uint32_t
> +log2_u64(uint64_t v)
> +{
> +	return v == 0 ? 0 : bsf64(rte_align64pow2(v));
> +}
> +
> +/* Bit position of the page size field in mmap() flags; fall back to 26
> + * when the system headers do not provide MAP_HUGE_SHIFT.
> + */
> +#ifndef MAP_HUGE_SHIFT
> +#define HUGE_SHIFT 26
> +#else
> +#define HUGE_SHIFT MAP_HUGE_SHIFT
> +#endif
> +
> +/*
> + * Build the mmap() flag bits that select a huge page size of page_sz bytes.
> + */
> +static int
> +pagesz_flags(uint64_t page_sz)
> +{
> +	/* as per mmap() manpage, all page sizes are log2 of page size
> +	 * shifted by MAP_HUGE_SHIFT
> +	 */
> +	unsigned int pgsz_log2 = log2_u64(page_sz);
> +
> +	/* shift as unsigned: for 16G pages (log2 == 34) the result reaches
> +	 * bit 31, which would overflow a signed int shift.
> +	 */
> +	return (int)(pgsz_log2 << HUGE_SHIFT);
> +}
> +
> +/*
> + * Map memsz bytes of anonymous, private huge page memory using pages of
> + * pgsz bytes. Returns the mapping, or NULL when mmap() fails.
> + */
> +static void *
> +alloc_mem(size_t memsz, size_t pgsz)
> +{
> +	const int prot = PROT_READ | PROT_WRITE;
> +	/* anonymous hugepages of the requested size */
> +	const int map_flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB |
> +			pagesz_flags(pgsz);
> +	void *region = mmap(NULL, memsz, prot, map_flags, -1, 0);
> +
> +	return region == MAP_FAILED ? NULL : region;
> +}
> +
> +struct extmem_param { /* description of one external memory area, filled in by create_extmem() */
> + void *addr; /* start of the mmap()ed area */
> + size_t len; /* length of the area in bytes */
> + size_t pgsz; /* size of the pages backing the area */
> + rte_iova_t *iova_table; /* malloc()ed array holding one IOVA per page */
> + unsigned int iova_table_len; /* number of entries in iova_table (pages in the area) */
> +};
> +
> +/*
> + * Allocate an anonymous hugepage area large enough for the mbufs of all
> + * ports and describe it in *param.
> + *
> + * The supported page sizes are tried in turn until one can be mapped. For
> + * the resulting area an IOVA is recorded for every page and, while VFIO
> + * reports that it can do so, every page is mapped for DMA.
> + *
> + * Returns 0 on success; the caller then owns param->iova_table (release
> + * with free()) and param->addr (release with munmap()). Returns -1 on
> + * failure, in which case nothing is left allocated.
> + */
> +static int
> +create_extmem(uint32_t nb_ports, uint32_t nb_mbufs_per_port, uint32_t mbuf_sz,
> +		struct extmem_param *param)
> +{
> +	uint64_t pgsizes[] = {RTE_PGSIZE_2M, RTE_PGSIZE_1G, /* x86_64, ARM */
> +			RTE_PGSIZE_16M, RTE_PGSIZE_16G};    /* POWER */
> +	unsigned int n_pages, cur_page, pgsz_idx;
> +	size_t mem_sz, offset, cur_pgsz;
> +	bool vfio_supported = true;
> +	rte_iova_t *iovas = NULL;
> +	void *addr = NULL;
> +	int ret;
> +
> +	for (pgsz_idx = 0; pgsz_idx < RTE_DIM(pgsizes); pgsz_idx++) {
> +		/* skip anything that is too big */
> +		if (pgsizes[pgsz_idx] > SIZE_MAX)
> +			continue;
> +
> +		cur_pgsz = pgsizes[pgsz_idx];
> +
> +		ret = calc_mem_size(nb_ports, nb_mbufs_per_port,
> +				mbuf_sz, cur_pgsz, &mem_sz);
> +		if (ret < 0) {
> +			printf("Cannot calculate memory size\n");
> +			return -1;
> +		}
> +
> +		/* allocate our memory */
> +		addr = alloc_mem(mem_sz, cur_pgsz);
> +
> +		/* if we couldn't allocate memory with a specified page size,
> +		 * that doesn't mean we can't do it with other page sizes, so
> +		 * try another one.
> +		 */
> +		if (addr == NULL)
> +			continue;
> +
> +		/* store IOVA addresses for every page in this memory area */
> +		n_pages = mem_sz / cur_pgsz;
> +
> +		iovas = malloc(sizeof(*iovas) * n_pages);
> +
> +		if (iovas == NULL) {
> +			printf("Cannot allocate memory for iova addresses\n");
> +			goto fail;
> +		}
> +
> +		/* populate IOVA table */
> +		for (cur_page = 0; cur_page < n_pages; cur_page++) {
> +			rte_iova_t iova;
> +			void *cur;
> +
> +			offset = cur_pgsz * cur_page;
> +			cur = RTE_PTR_ADD(addr, offset);
> +
> +			/* touch the page so that it is backed by real memory
> +			 * before its IOVA is looked up
> +			 */
> +			*(volatile char *)cur = 0;
> +
> +			iova = rte_mem_virt2iova(cur);
> +
> +			iovas[cur_page] = iova;
> +
> +			if (!vfio_supported)
> +				continue;
> +
> +			/* map this page (not the start of the area) for DMA */
> +			ret = rte_vfio_dma_map((uintptr_t)cur, iova, cur_pgsz);
> +			if (ret >= 0)
> +				continue;
> +
> +			/*
> +			 * ENODEV means VFIO is not initialized
> +			 * ENOTSUP means current IOMMU mode
> +			 * doesn't support mapping
> +			 * both cases are not an error
> +			 */
> +			if (rte_errno == ENOTSUP || rte_errno == ENODEV)
> +				/* VFIO is unsupported, don't try again. */
> +				vfio_supported = false;
> +			else
> +				/* this is an actual error */
> +				goto fail;
> +		}
> +
> +		break;
> +	}
> +	/* if we couldn't allocate anything */
> +	if (iovas == NULL)
> +		return -1;
> +
> +	param->addr = addr;
> +	param->len = mem_sz;
> +	param->pgsz = cur_pgsz;
> +	param->iova_table = iovas;
> +	param->iova_table_len = n_pages;
> +
> +	return 0;
> +fail:
> +	free(iovas);
> +	if (addr)
> +		munmap(addr, mem_sz);
> +
> +	return -1;
> +}
> +
> +/*
> + * Create the EXTMEM_HEAP_NAME malloc heap and back it with a freshly
> + * allocated external memory area sized for the given number of ports and
> + * mbufs.
> + *
> + * Returns 0 on success and -1 on failure; on failure the memory area is
> + * unmapped and the heap is destroyed again.
> + */
> +static int
> +setup_extmem(uint32_t nb_ports, uint32_t nb_mbufs_per_port, uint32_t mbuf_sz)
> +{
> +	struct extmem_param param;
> +	int ret;
> +
> +	/* create our heap */
> +	ret = rte_malloc_heap_create(EXTMEM_HEAP_NAME);
> +	if (ret < 0) {
> +		printf("Cannot create heap\n");
> +		return -1;
> +	}
> +
> +	ret = create_extmem(nb_ports, nb_mbufs_per_port, mbuf_sz, &param);
> +	if (ret < 0) {
> +		printf("Cannot create memory area\n");
> +		goto fail_heap;
> +	}
> +
> +	/* we now have a valid memory area, so add it to heap */
> +	ret = rte_malloc_heap_memory_add(EXTMEM_HEAP_NAME,
> +			param.addr, param.len, param.iova_table,
> +			param.iova_table_len, param.pgsz);
> +
> +	/* not needed any more */
> +	free(param.iova_table);
> +
> +	if (ret < 0) {
> +		printf("Cannot add memory to heap\n");
> +		munmap(param.addr, param.len);
> +		goto fail_heap;
> +	}
> +
> +	printf("Allocated %zuMB of memory\n", param.len >> 20);
> +
> +	/* success */
> +	return 0;
> +
> +fail_heap:
> +	/* best effort: do not leave an empty heap behind */
> +	(void)rte_malloc_heap_destroy(EXTMEM_HEAP_NAME);
> +	return -1;
> +}
> +
> +
> +/*
> + * Program entry point: initializes the EAL, sets up the external memory
> + * heap, creates the mbuf pool on that heap's socket, initializes every
> + * port and then runs the forwarding loop on the calling lcore.
> + */
> +int
> +main(int argc, char *argv[])
> +{
> +	const uint32_t nb_mbufs_per_port = NUM_MBUFS;
> +	const uint32_t mbuf_sz = RTE_MBUF_DEFAULT_BUF_SIZE;
> +	struct rte_mempool *mbuf_pool;
> +	unsigned int nb_ports;
> +	uint16_t portid;
> +	int socket_id;
> +	int ret;
> +
> +	/* Bring up the Environment Abstraction Layer (EAL) first. */
> +	ret = rte_eal_init(argc, argv);
> +	if (ret < 0)
> +		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
> +
> +	/* Step over the arguments consumed by the EAL. */
> +	argc -= ret;
> +	argv += ret;
> +
> +	/* Ports are used in pairs, so an even count of at least 2 is needed. */
> +	nb_ports = rte_eth_dev_count_avail();
> +	if (nb_ports < 2 || (nb_ports & 1))
> +		rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n");
> +
> +	if (setup_extmem(nb_ports, nb_mbufs_per_port, mbuf_sz) < 0)
> +		rte_exit(EXIT_FAILURE, "Error: cannot set up external memory\n");
> +
> +	/* The mbuf pool is created on the socket id of the external heap. */
> +	socket_id = rte_malloc_heap_get_socket(EXTMEM_HEAP_NAME);
> +	if (socket_id < 0)
> +		rte_exit(EXIT_FAILURE, "Invalid socket for external heap\n");
> +
> +	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
> +			nb_mbufs_per_port * nb_ports, MBUF_CACHE_SIZE, 0,
> +			mbuf_sz, socket_id);
> +	if (mbuf_pool == NULL)
> +		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
> +
> +	/* Every available port gets the same pool. */
> +	RTE_ETH_FOREACH_DEV(portid) {
> +		if (port_init(portid, mbuf_pool) != 0)
> +			rte_exit(EXIT_FAILURE, "Cannot init port %"PRIu16 "\n",
> +					portid);
> +	}
> +
> +	if (rte_lcore_count() > 1)
> +		printf("\nWARNING: Too many lcores enabled. Only 1 used.\n");
> +
> +	/* Forward on the calling (master) lcore only; this does not return. */
> +	lcore_main();
> +
> +	return 0;
> +}
> diff --git a/examples/external_mem/meson.build b/examples/external_mem/meson.build
> new file mode 100644
> index 000000000..17a363ad2
> --- /dev/null
> +++ b/examples/external_mem/meson.build
> @@ -0,0 +1,12 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2017 Intel Corporation
> +
> +# meson file, for building this example as part of a main DPDK build.
> +#
> +# To build this example as a standalone application with an already-installed
> +# DPDK instance, use 'make'
> +
> +allow_experimental_apis = true # NOTE(review): presumably the meson counterpart of -DALLOW_EXPERIMENTAL_API in the Makefile - confirm
> +sources = files(
> + 'extmem.c'
> +)
> --
> 2.17.1
More information about the dev
mailing list