[dpdk-dev] [PATCH v5] net/memif: zero-copy slave

Jakub Grajciar jgrajcia at cisco.com
Thu Aug 22 10:18:33 CEST 2019


Zero-copy slave support for memif PMD.
Slave interface exposes DPDK memory to
master interface. Only single file segments
are supported (EAL option --single-file-segments).

Signed-off-by: Jakub Grajciar <jgrajcia at cisco.com>
---
 doc/guides/nics/memif.rst                     |  42 +-
 drivers/net/memif/Makefile                    |   1 +
 drivers/net/memif/memif_socket.c              |  64 +--
 drivers/net/memif/meson.build                 |   1 +
 drivers/net/memif/rte_eth_memif.c             | 449 +++++++++++++++++-
 drivers/net/memif/rte_eth_memif.h             |  11 +-
 lib/librte_eal/common/eal_common_mcfg.c       |   7 +
 .../common/include/rte_eal_memconfig.h        |  10 +
 lib/librte_eal/rte_eal_version.map            |   1 +
 9 files changed, 513 insertions(+), 73 deletions(-)

V2:
- fix coding style

V3:
- fix compilation issues

V4:
- don't move existing code
- add new EAL API rte_mcfg_get_single_file_segments,
  mem_config is now private, this api returns
  single_file_segments parameter value

V5:
- explain single file segments limitation
- add zero-copy slave example

diff --git a/doc/guides/nics/memif.rst b/doc/guides/nics/memif.rst
index de2d481eb..31a10848c 100644
--- a/doc/guides/nics/memif.rst
+++ b/doc/guides/nics/memif.rst
@@ -45,7 +45,7 @@ client.
    "socket=/tmp/memif.sock", "Socket filename", "/tmp/memif.sock", "string len 256"
    "mac=01:23:45:ab:cd:ef", "Mac address", "01:ab:23:cd:45:ef", ""
    "secret=abc123", "Secret is an optional security option, which if specified, must be matched by peer", "", "string len 24"
-   "zero-copy=yes", "Enable/disable zero-copy slave mode", "no", "yes|no"
+   "zero-copy=yes", "Enable/disable zero-copy slave mode. Only relevant to slave, requires '--single-file-segments' eal argument", "no", "yes|no"

 **Connection establishment**

@@ -171,6 +171,42 @@ Files
 - net/memif/memif.h *- descriptor and ring definitions*
 - net/memif/rte_eth_memif.c *- eth_memif_rx() eth_memif_tx()*

+Zero-copy slave
+~~~~~~~~~~~~~~~
+
+Zero-copy slave can be enabled with memif configuration option 'zero-copy=yes'. This option
+is only relevant to slave and requires eal argument '--single-file-segments'.
+This limitation is in place, because it is too expensive to identify memseg
+for each packet buffer, resulting in worse performance than with zero-copy disabled.
+With single file segments we can calculate offset from the beginning of the file
+for each packet buffer.
+
+**Shared memory format**
+
+Region 0 is created by memif driver and contains rings. Slave interface exposes DPDK memory (memseg).
+Instead of using memfd_create() to create new shared file, existing memsegs are used.
+Master interface functions the same as with zero-copy disabled.
+
+region 0:
+
++-----------------------+
+| Rings                 |
++-----------+-----------+
+| S2M rings | M2S rings |
++-----------+-----------+
+
+region n:
+
++-----------------+
+| Buffers         |
++-----------------+
+|memseg           |
++-----------------+
+
+Buffers are dequeued and enqueued as needed. Offset descriptor field is calculated at tx.
+Only single file segments mode (EAL option --single-file-segments) is supported, as calculating
+offset from multiple segments is too expensive.
+
 Example: testpmd
 ----------------------------
 In this example we run two instances of testpmd application and transmit packets over memif.
@@ -183,6 +219,10 @@ Now create ``slave`` interface (master must be already running so the slave will

     #./build/app/testpmd -l 2-3 --proc-type=primary --file-prefix=pmd2 --vdev=net_memif -- -i

+You can also enable ``zero-copy`` on ``slave`` interface::
+
+    #./build/app/testpmd -l 2-3 --proc-type=primary --file-prefix=pmd2 --vdev=net_memif,zero-copy=yes --single-file-segments -- -i
+
 Start forwarding packets::

     Slave:
diff --git a/drivers/net/memif/Makefile b/drivers/net/memif/Makefile
index 3d92b08f2..b50fc1cdb 100644
--- a/drivers/net/memif/Makefile
+++ b/drivers/net/memif/Makefile
@@ -20,6 +20,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 # - rte_mp_action_register
 # - rte_mp_reply
 # - rte_mp_request_sync
+# - rte_mcfg_get_single_file_segments
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool
 LDLIBS += -lrte_ethdev -lrte_kvargs -lrte_net
 LDLIBS += -lrte_hash
diff --git a/drivers/net/memif/memif_socket.c b/drivers/net/memif/memif_socket.c
index 6740e3471..4710e05bd 100644
--- a/drivers/net/memif/memif_socket.c
+++ b/drivers/net/memif/memif_socket.c
@@ -176,8 +176,7 @@ memif_msg_receive_hello(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_name, (char *)h->name, sizeof(pmd->remote_name));

-	MIF_LOG(DEBUG, "%s: Connecting to %s.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_name);
+	MIF_LOG(DEBUG, "Connecting to %s.", pmd->remote_name);

 	return 0;
 }
@@ -339,8 +338,7 @@ memif_msg_receive_connect(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_if_name, (char *)c->if_name,
 		sizeof(pmd->remote_if_name));
-	MIF_LOG(INFO, "%s: Remote interface %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_if_name);
+	MIF_LOG(INFO, "Remote interface %s connected.", pmd->remote_if_name);

 	return 0;
 }
@@ -358,8 +356,7 @@ memif_msg_receive_connected(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_if_name, (char *)c->if_name,
 		sizeof(pmd->remote_if_name));
-	MIF_LOG(INFO, "%s: Remote interface %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_if_name);
+	MIF_LOG(INFO, "Remote interface %s connected.", pmd->remote_if_name);

 	return 0;
 }
@@ -370,14 +367,13 @@ memif_msg_receive_disconnect(struct rte_eth_dev *dev, memif_msg_t *msg)
 	struct pmd_internals *pmd = dev->data->dev_private;
 	memif_msg_disconnect_t *d = &msg->disconnect;

-	memset(pmd->remote_disc_string, 0, ETH_MEMIF_DISC_STRING_SIZE);
+	memset(pmd->remote_disc_string, 0, sizeof(pmd->remote_disc_string));
 	strlcpy(pmd->remote_disc_string, (char *)d->string,
 		sizeof(pmd->remote_disc_string));

-	MIF_LOG(INFO, "%s: Disconnect received: %s",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_disc_string);
+	MIF_LOG(INFO, "Disconnect received: %s", pmd->remote_disc_string);

-	memset(pmd->local_disc_string, 0, ETH_MEMIF_DISC_STRING_SIZE);
+	memset(pmd->local_disc_string, 0, 96);
 	memif_disconnect(dev);
 	return 0;
 }
@@ -473,7 +469,6 @@ memif_msg_enq_connect(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct memif_msg_queue_elt *e = memif_msg_enq(pmd->cc);
-	const char *name = rte_vdev_device_name(pmd->vdev);
 	memif_msg_connect_t *c;

 	if (e == NULL)
@@ -481,7 +476,7 @@ memif_msg_enq_connect(struct rte_eth_dev *dev)

 	c = &e->msg.connect;
 	e->msg.type = MEMIF_MSG_TYPE_CONNECT;
-	strlcpy((char *)c->if_name, name, sizeof(c->if_name));
+	strlcpy((char *)c->if_name, dev->data->name, sizeof(c->if_name));

 	return 0;
 }
@@ -491,7 +486,6 @@ memif_msg_enq_connected(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct memif_msg_queue_elt *e = memif_msg_enq(pmd->cc);
-	const char *name = rte_vdev_device_name(pmd->vdev);
 	memif_msg_connected_t *c;

 	if (e == NULL)
@@ -499,7 +493,7 @@ memif_msg_enq_connected(struct rte_eth_dev *dev)

 	c = &e->msg.connected;
 	e->msg.type = MEMIF_MSG_TYPE_CONNECTED;
-	strlcpy((char *)c->if_name, name, sizeof(c->if_name));
+	strlcpy((char *)c->if_name, dev->data->name, sizeof(c->if_name));

 	return 0;
 }
@@ -525,7 +519,6 @@ void
 memif_disconnect(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
-	struct pmd_process_private *proc_private = dev->process_private;
 	struct memif_msg_queue_elt *elt, *next;
 	struct memif_queue *mq;
 	struct rte_intr_handle *ih;
@@ -615,7 +608,7 @@ memif_disconnect(struct rte_eth_dev *dev)
 		}
 	}

-	memif_free_regions(proc_private);
+	memif_free_regions(dev);

 	/* reset connection configuration */
 	memset(&pmd->run, 0, sizeof(pmd->run));
@@ -662,7 +655,7 @@ memif_msg_receive(struct memif_control_channel *cc)
 			if (cmsg->cmsg_type == SCM_CREDENTIALS)
 				cr = (struct ucred *)CMSG_DATA(cmsg);
 			else if (cmsg->cmsg_type == SCM_RIGHTS)
-				memcpy(&afd, CMSG_DATA(cmsg), sizeof(int));
+				rte_memcpy(&afd, CMSG_DATA(cmsg), sizeof(int));
 		}
 		cmsg = CMSG_NXTHDR(&mh, cmsg);
 	}
@@ -861,7 +854,7 @@ memif_listener_handler(void *arg)
 }

 static struct memif_socket *
-memif_socket_create(struct pmd_internals *pmd, char *key, uint8_t listener)
+memif_socket_create(char *key, uint8_t listener)
 {
 	struct memif_socket *sock;
 	struct sockaddr_un un;
@@ -899,17 +892,15 @@ memif_socket_create(struct pmd_internals *pmd, char *key, uint8_t listener)
 		if (ret < 0)
 			goto error;

-		MIF_LOG(DEBUG, "%s: Memif listener socket %s created.",
-			rte_vdev_device_name(pmd->vdev), sock->filename);
+		MIF_LOG(DEBUG, "Memif listener socket %s created.", sock->filename);

 		sock->intr_handle.fd = sockfd;
 		sock->intr_handle.type = RTE_INTR_HANDLE_EXT;
 		ret = rte_intr_callback_register(&sock->intr_handle,
 						 memif_listener_handler, sock);
 		if (ret < 0) {
-			MIF_LOG(ERR, "%s: Failed to register interrupt "
-				"callback for listener socket",
-				rte_vdev_device_name(pmd->vdev));
+			MIF_LOG(ERR, "Failed to register interrupt "
+				"callback for listener socket");
 			return NULL;
 		}
 	}
@@ -917,9 +908,7 @@ memif_socket_create(struct pmd_internals *pmd, char *key, uint8_t listener)
 	return sock;

  error:
-	MIF_LOG(ERR, "%s: Failed to setup socket %s: %s",
-		rte_vdev_device_name(pmd->vdev) ?
-		rte_vdev_device_name(pmd->vdev) : "NULL", key, strerror(errno));
+	MIF_LOG(ERR, "Failed to setup socket %s: %s", key, strerror(errno));
 	if (sock != NULL)
 		rte_free(sock);
 	if (sockfd >= 0)
@@ -963,9 +952,8 @@ memif_socket_init(struct rte_eth_dev *dev, const char *socket_filename)
 	rte_memcpy(key, socket_filename, strlen(socket_filename));
 	ret = rte_hash_lookup_data(hash, key, (void **)&socket);
 	if (ret < 0) {
-		socket = memif_socket_create(pmd, key,
-					     (pmd->role ==
-					      MEMIF_ROLE_SLAVE) ? 0 : 1);
+		socket = memif_socket_create(key,
+					     (pmd->role == MEMIF_ROLE_SLAVE) ? 0 : 1);
 		if (socket == NULL)
 			return -1;
 		ret = rte_hash_add_key_data(hash, key, socket);
@@ -996,8 +984,7 @@ memif_socket_init(struct rte_eth_dev *dev, const char *socket_filename)

 	elt = rte_malloc("pmd-queue", sizeof(struct memif_socket_dev_list_elt), 0);
 	if (elt == NULL) {
-		MIF_LOG(ERR, "%s: Failed to add device to socket device list.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to add device to socket device list.");
 		return -1;
 	}
 	elt->dev = dev;
@@ -1075,8 +1062,7 @@ memif_connect_slave(struct rte_eth_dev *dev)

 	sockfd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
 	if (sockfd < 0) {
-		MIF_LOG(ERR, "%s: Failed to open socket.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to open socket.");
 		return -1;
 	}

@@ -1087,19 +1073,16 @@ memif_connect_slave(struct rte_eth_dev *dev)
 	ret = connect(sockfd, (struct sockaddr *)&sun,
 		      sizeof(struct sockaddr_un));
 	if (ret < 0) {
-		MIF_LOG(ERR, "%s: Failed to connect socket: %s.",
-			rte_vdev_device_name(pmd->vdev), pmd->socket_filename);
+		MIF_LOG(ERR, "Failed to connect socket: %s.", pmd->socket_filename);
 		goto error;
 	}

-	MIF_LOG(DEBUG, "%s: Memif socket: %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->socket_filename);
+	MIF_LOG(DEBUG, "Memif socket: %s connected.", pmd->socket_filename);

 	pmd->cc = rte_zmalloc("memif-cc",
 			      sizeof(struct memif_control_channel), 0);
 	if (pmd->cc == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate control channel.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to allocate control channel.");
 		goto error;
 	}

@@ -1112,8 +1095,7 @@ memif_connect_slave(struct rte_eth_dev *dev)
 	ret = rte_intr_callback_register(&pmd->cc->intr_handle,
 					 memif_intr_handler, pmd->cc);
 	if (ret < 0) {
-		MIF_LOG(ERR, "%s: Failed to register interrupt callback "
-			"for control fd", rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to register interrupt callback for control fd");
 		goto error;
 	}

diff --git a/drivers/net/memif/meson.build b/drivers/net/memif/meson.build
index bedc97311..6b62255f9 100644
--- a/drivers/net/memif/meson.build
+++ b/drivers/net/memif/meson.build
@@ -14,5 +14,6 @@ allow_experimental_apis = true
 # - rte_mp_action_register
 # - rte_mp_reply
 # - rte_mp_request_sync
+# - rte_mcfg_get_single_file_segments

 deps += ['hash']
diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index a59f80967..47c6ea0f3 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -23,6 +23,10 @@
 #include <rte_kvargs.h>
 #include <rte_bus_vdev.h>
 #include <rte_string_fns.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_eal_memconfig.h>

 #include "rte_eth_memif.h"
 #include "memif_socket.h"
@@ -50,6 +54,10 @@ static const char * const valid_arguments[] = {

 #define MEMIF_MP_SEND_REGION		"memif_mp_send_region"

+
+static int memif_region_init_zc(const struct rte_memseg_list *msl,
+				const struct rte_memseg *ms, void *arg);
+
 const char *
 memif_version(void)
 {
@@ -116,10 +124,14 @@ memif_mp_request_regions(struct rte_eth_dev *dev)
 	struct mp_region_msg *reply_param;
 	struct memif_region *r;
 	struct pmd_process_private *proc_private = dev->process_private;
+	struct pmd_internals *pmd = dev->data->dev_private;
+	/* in case of zero-copy slave, only request region 0 */
+	uint16_t max_region_num = (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) ?
+				   1 : ETH_MEMIF_MAX_REGION_NUM;

 	MIF_LOG(DEBUG, "Requesting memory regions");

-	for (i = 0; i < ETH_MEMIF_MAX_REGION_NUM; i++) {
+	for (i = 0; i < max_region_num; i++) {
 		/* Prepare the message */
 		memset(&msg, 0, sizeof(msg));
 		strlcpy(msg.name, MEMIF_MP_SEND_REGION, sizeof(msg.name));
@@ -161,6 +173,12 @@ memif_mp_request_regions(struct rte_eth_dev *dev)
 		free(reply);
 	}

+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		ret = rte_memseg_walk(memif_region_init_zc, (void *)proc_private);
+		if (ret < 0)
+			return ret;
+	}
+
 	return memif_connect(dev);
 }

@@ -218,6 +236,23 @@ memif_get_buffer(struct pmd_process_private *proc_private, memif_desc_t *d)
 	return ((uint8_t *)proc_private->regions[d->region]->addr + d->offset);
 }

+/* Free mbufs received by master */
+static void
+memif_free_stored_mbufs(struct pmd_process_private *proc_private, struct memif_queue *mq)
+{
+	uint16_t mask = (1 << mq->log2_ring_size) - 1;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+
+	/* FIXME: improve performance */
+	while (mq->last_tail != ring->tail) {
+		RTE_MBUF_PREFETCH_TO_FREE(mq->buffers[(mq->last_tail + 1) & mask]);
+		/* Decrement refcnt and free mbuf. (current segment) */
+		rte_mbuf_refcnt_update(mq->buffers[mq->last_tail & mask], -1);
+		rte_pktmbuf_free_seg(mq->buffers[mq->last_tail & mask]);
+		mq->last_tail++;
+	}
+}
+
 static int
 memif_pktmbuf_chain(struct rte_mbuf *head, struct rte_mbuf *cur_tail,
 		    struct rte_mbuf *tail)
@@ -323,8 +358,8 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;

 			memcpy(rte_pktmbuf_mtod_offset(mbuf, void *, dst_off),
-			       (uint8_t *)memif_get_buffer(proc_private, d0) +
-			       src_off, cp_len);
+			       (uint8_t *)memif_get_buffer(proc_private, d0) + src_off,
+			       cp_len);

 			src_off += cp_len;
 			dst_off += cp_len;
@@ -369,6 +404,120 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_rx_pkts;
 }

+static uint16_t
+eth_memif_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct memif_queue *mq = queue;
+	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
+	struct pmd_process_private *proc_private =
+		rte_eth_devices[mq->in_port].process_private;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+	uint16_t cur_slot, last_slot, n_slots, ring_size, mask, s0, head;
+	uint16_t n_rx_pkts = 0;
+	memif_desc_t *d0;
+	struct rte_mbuf *mbuf, *mbuf_tail;
+	struct rte_mbuf *mbuf_head = NULL;
+	int ret;
+	struct rte_eth_link link;
+
+	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
+		return 0;
+	if (unlikely(ring == NULL)) {
+		/* Secondary process will attempt to request regions. */
+		rte_eth_link_get(mq->in_port, &link);
+		return 0;
+	}
+
+	/* consume interrupt */
+	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
+		uint64_t b;
+		ssize_t size __rte_unused;
+		size = read(mq->intr_handle.fd, &b, sizeof(b));
+	}
+
+	ring_size = 1 << mq->log2_ring_size;
+	mask = ring_size - 1;
+
+	cur_slot = mq->last_tail;
+	last_slot = ring->tail;
+	if (cur_slot == last_slot)
+		goto refill;
+	n_slots = last_slot - cur_slot;
+
+	while (n_slots && n_rx_pkts < nb_pkts) {
+		s0 = cur_slot & mask;
+
+		d0 = &ring->desc[s0];
+		mbuf_head = mq->buffers[s0];
+		mbuf = mbuf_head;
+
+next_slot:
+		/* prefetch next descriptor */
+		if (n_rx_pkts + 1 < nb_pkts)
+			rte_prefetch0(&ring->desc[(cur_slot + 1) & mask]);
+
+		mbuf->port = mq->in_port;
+		rte_pktmbuf_data_len(mbuf) = d0->length;
+		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+
+		mq->n_bytes += rte_pktmbuf_data_len(mbuf);
+
+		cur_slot++;
+		n_slots--;
+		if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
+			mbuf_tail = mbuf;
+			mbuf = mq->buffers[s0];
+			ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+			if (unlikely(ret < 0)) {
+				MIF_LOG(ERR, "number-of-segments-overflow");
+				goto refill;
+			}
+			goto next_slot;
+		}
+
+		*bufs++ = mbuf_head;
+		n_rx_pkts++;
+	}
+
+	mq->last_tail = cur_slot;
+
+/* Supply master with new buffers */
+refill:
+	head = ring->head;
+	n_slots = ring_size - head + mq->last_tail;
+
+	if (n_slots < 32)
+		goto no_free_mbufs;
+
+	ret = rte_pktmbuf_alloc_bulk(mq->mempool, &mq->buffers[head & mask], n_slots);
+	if (unlikely(ret < 0))
+		goto no_free_mbufs;
+
+	while (n_slots--) {
+		s0 = head++ & mask;
+		if (n_slots > 0)
+			rte_prefetch0(mq->buffers[head & mask]);
+		d0 = &ring->desc[s0];
+		/* store buffer header */
+		mbuf = mq->buffers[s0];
+		/* populate descriptor */
+		d0->length = rte_pktmbuf_data_room_size(mq->mempool) -
+				RTE_PKTMBUF_HEADROOM;
+		d0->region = 1;
+		d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
+			(uint8_t *)proc_private->regions[d0->region]->addr;
+	}
+no_free_mbufs:
+	rte_mb();
+	ring->head = head;
+
+	mq->n_pkts += n_rx_pkts;
+
+	return n_rx_pkts;
+}
+
 static uint16_t
 eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
@@ -483,19 +632,173 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }

+
+static int
+memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
+		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
+		uint16_t slot, uint16_t n_free)
+{
+	memif_desc_t *d0;
+	int used_slots = 1;
+
+next_in_chain:
+	/* store pointer to mbuf to free it later */
+	mq->buffers[slot & mask] = mbuf;
+	/* Increment refcnt to make sure the buffer is not freed before master
+	 * receives it. (current segment)
+	 */
+	rte_mbuf_refcnt_update(mbuf, 1);
+	/* populate descriptor */
+	d0 = &ring->desc[slot & mask];
+	d0->length = rte_pktmbuf_data_len(mbuf);
+	/* FIXME: get region index */
+	d0->region = 1;
+	d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
+		(uint8_t *)proc_private->regions[d0->region]->addr;
+	d0->flags = 0;
+
+	/* check if buffer is chained */
+	if (rte_pktmbuf_is_contiguous(mbuf) == 0) {
+		if (n_free < 2)
+			return 0;
+		/* mark buffer as chained */
+		d0->flags |= MEMIF_DESC_FLAG_NEXT;
+		/* advance mbuf */
+		mbuf = mbuf->next;
+		/* update counters */
+		used_slots++;
+		slot++;
+		n_free--;
+		goto next_in_chain;
+	}
+	return used_slots;
+}
+
+static uint16_t
+eth_memif_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct memif_queue *mq = queue;
+	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
+	struct pmd_process_private *proc_private =
+		rte_eth_devices[mq->in_port].process_private;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+	uint16_t slot, n_free, ring_size, mask, n_tx_pkts = 0;
+	memif_ring_type_t type = mq->type;
+	struct rte_eth_link link;
+
+	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
+		return 0;
+	if (unlikely(ring == NULL)) {
+		/* Secondary process will attempt to request regions. */
+		rte_eth_link_get(mq->in_port, &link);
+		return 0;
+	}
+
+	ring_size = 1 << mq->log2_ring_size;
+	mask = ring_size - 1;
+
+	/* free mbufs received by master */
+	memif_free_stored_mbufs(proc_private, mq);
+
+	/* ring type always MEMIF_RING_S2M */
+	slot = ring->head;
+	n_free = ring_size - ring->head + mq->last_tail;
+
+	int used_slots;
+
+	while (n_free && (n_tx_pkts < nb_pkts)) {
+		while ((n_free > 4) && ((nb_pkts - n_tx_pkts) > 4)) {
+			if ((nb_pkts - n_tx_pkts) > 8) {
+				rte_prefetch0(*bufs + 4);
+				rte_prefetch0(*bufs + 5);
+				rte_prefetch0(*bufs + 6);
+				rte_prefetch0(*bufs + 7);
+			}
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+		}
+		used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+			mask, slot, n_free);
+		if (unlikely(used_slots < 1))
+			goto no_free_slots;
+		n_tx_pkts++;
+		slot += used_slots;
+		n_free -= used_slots;
+	}
+
+no_free_slots:
+	rte_mb();
+	/* update ring pointers */
+	if (type == MEMIF_RING_S2M)
+		ring->head = slot;
+	else
+		ring->tail = slot;
+
+	/* Send interrupt, if enabled. */
+	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
+		uint64_t a = 1;
+		ssize_t size = write(mq->intr_handle.fd, &a, sizeof(a));
+		if (unlikely(size < 0)) {
+			MIF_LOG(WARNING,
+				"Failed to send interrupt. %s", strerror(errno));
+		}
+	}
+
+	/* increment queue counters */
+	mq->n_pkts += n_tx_pkts;
+
+	return n_tx_pkts;
+}
+
 void
-memif_free_regions(struct pmd_process_private *proc_private)
+memif_free_regions(struct rte_eth_dev *dev)
 {
+	struct pmd_process_private *proc_private = dev->process_private;
+	struct pmd_internals *pmd = dev->data->dev_private;
 	int i;
 	struct memif_region *r;

-	MIF_LOG(DEBUG, "Free memory regions");
 	/* regions are allocated contiguously, so it's
 	 * enough to loop until 'proc_private->regions_num'
 	 */
 	for (i = 0; i < proc_private->regions_num; i++) {
 		r = proc_private->regions[i];
 		if (r != NULL) {
+			/* This is memzone */
+			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
+				r->addr = NULL;
+				if (r->fd > 0)
+					close(r->fd);
+			}
 			if (r->addr != NULL) {
 				munmap(r->addr, r->region_size);
 				if (r->fd > 0) {
@@ -510,6 +813,45 @@ memif_free_regions(struct pmd_process_private *proc_private)
 	proc_private->regions_num = 0;
 }

+static int
+memif_region_init_zc(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+		     void *arg)
+{
+	struct pmd_process_private *proc_private = (struct pmd_process_private *)arg;
+	struct memif_region *r;
+
+	if (proc_private->regions_num < 1) {
+		MIF_LOG(ERR, "Missing descriptor region");
+		return -1;
+	}
+
+	r = proc_private->regions[proc_private->regions_num - 1];
+
+	if (r->addr != msl->base_va)
+		r = proc_private->regions[++proc_private->regions_num - 1];
+
+	if (r == NULL) {
+		r = rte_zmalloc("region", sizeof(struct memif_region), 0);
+		if (r == NULL) {
+			MIF_LOG(ERR, "Failed to alloc memif region.");
+			return -ENOMEM;
+		}
+
+		r->addr = msl->base_va;
+		r->region_size = ms->len;
+		r->fd = rte_memseg_get_fd(ms);
+		if (r->fd < 0)
+			return -1;
+		r->pkt_buffer_offset = 0;
+
+		proc_private->regions[proc_private->regions_num - 1] = r;
+	} else {
+		r->region_size += ms->len;
+	}
+
+	return 0;
+}
+
 static int
 memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
 {
@@ -590,12 +932,29 @@ memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
 static int
 memif_regions_init(struct rte_eth_dev *dev)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
 	int ret;

-	/* create one buffer region */
-	ret = memif_region_init_shm(dev, /* has buffer */ 1);
-	if (ret < 0)
-		return ret;
+	/*
+	 * Zero-copy exposes dpdk memory.
+	 * Each memseg list will be represented by memif region.
+	 * Zero-copy regions indexing: memseg list idx + 1,
+	 * as we already have region 0 reserved for descriptors.
+	 */
+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		/* create region idx 0 containing descriptors */
+		ret = memif_region_init_shm(dev, 0);
+		if (ret < 0)
+			return ret;
+		ret = rte_memseg_walk(memif_region_init_zc, (void *)dev->process_private);
+		if (ret < 0)
+			return ret;
+	} else {
+		/* create one memory region contaning rings and buffers */
+		ret = memif_region_init_shm(dev, /* has buffers */ 1);
+		if (ret < 0)
+			return ret;
+	}

 	return 0;
 }
@@ -615,6 +974,10 @@ memif_init_rings(struct rte_eth_dev *dev)
 		ring->tail = 0;
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
+
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
+			continue;
+
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
 			slot = i * (1 << pmd->run.log2_ring_size) + j;
 			ring->desc[j].region = 0;
@@ -631,6 +994,10 @@ memif_init_rings(struct rte_eth_dev *dev)
 		ring->tail = 0;
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
+
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
+			continue;
+
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
 			slot = (i + pmd->run.num_s2m_rings) *
 			    (1 << pmd->run.log2_ring_size) + j;
@@ -644,7 +1011,7 @@ memif_init_rings(struct rte_eth_dev *dev)
 }

 /* called only by slave */
-static void
+static int
 memif_init_queues(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
@@ -665,6 +1032,13 @@ memif_init_queues(struct rte_eth_dev *dev)
 				"Failed to create eventfd for tx queue %d: %s.", i,
 				strerror(errno));
 		}
+		mq->buffers = NULL;
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
+						  (1 << mq->log2_ring_size), 0);
+			if (mq->buffers == NULL)
+				return -ENOMEM;
+		}
 	}

 	for (i = 0; i < pmd->run.num_m2s_rings; i++) {
@@ -681,7 +1055,15 @@ memif_init_queues(struct rte_eth_dev *dev)
 				"Failed to create eventfd for rx queue %d: %s.", i,
 				strerror(errno));
 		}
+		mq->buffers = NULL;
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
+						  (1 << mq->log2_ring_size), 0);
+			if (mq->buffers == NULL)
+				return -ENOMEM;
+		}
 	}
+	return 0;
 }

 int
@@ -695,7 +1077,9 @@ memif_init_regions_and_queues(struct rte_eth_dev *dev)

 	memif_init_rings(dev);

-	memif_init_queues(dev);
+	ret = memif_init_queues(dev);
+	if (ret < 0)
+		return ret;

 	return 0;
 }
@@ -719,8 +1103,16 @@ memif_connect(struct rte_eth_dev *dev)
 				mr->addr = mmap(NULL, mr->region_size,
 						PROT_READ | PROT_WRITE,
 						MAP_SHARED, mr->fd, 0);
-				if (mr->addr == NULL)
+				if (mr->addr == MAP_FAILED) {
+					MIF_LOG(ERR, "mmap failed: %s\n",
+						strerror(errno));
 					return -1;
+				}
+			}
+			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
+				/* close memseg file */
+				close(mr->fd);
+				mr->fd = -1;
 			}
 		}
 	}
@@ -781,8 +1173,7 @@ memif_dev_start(struct rte_eth_dev *dev)
 		ret = memif_connect_master(dev);
 		break;
 	default:
-		MIF_LOG(ERR, "%s: Unknown role: %d.",
-			rte_vdev_device_name(pmd->vdev), pmd->role);
+		MIF_LOG(ERR, "Unknown role: %d.", pmd->role);
 		ret = -1;
 		break;
 	}
@@ -847,8 +1238,7 @@ memif_tx_queue_setup(struct rte_eth_dev *dev,

 	mq = rte_zmalloc("tx-queue", sizeof(struct memif_queue), 0);
 	if (mq == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate tx queue id: %u",
-			rte_vdev_device_name(pmd->vdev), qid);
+		MIF_LOG(ERR, "Failed to allocate tx queue id: %u", qid);
 		return -ENOMEM;
 	}

@@ -876,8 +1266,7 @@ memif_rx_queue_setup(struct rte_eth_dev *dev,

 	mq = rte_zmalloc("rx-queue", sizeof(struct memif_queue), 0);
 	if (mq == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate rx queue id: %u",
-			rte_vdev_device_name(pmd->vdev), qid);
+		MIF_LOG(ERR, "Failed to allocate rx queue id: %u", qid);
 		return -ENOMEM;
 	}

@@ -917,7 +1306,7 @@ memif_link_update(struct rte_eth_dev *dev,
 			memif_mp_request_regions(dev);
 		} else if (dev->data->dev_link.link_status == ETH_LINK_DOWN &&
 				proc_private->regions_num > 0) {
-			memif_free_regions(proc_private);
+			memif_free_regions(dev);
 		}
 	}
 	return 0;
@@ -1036,11 +1425,6 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
 	const unsigned int numa_node = vdev->device.numa_node;
 	const char *name = rte_vdev_device_name(vdev);

-	if (flags & ETH_MEMIF_FLAG_ZERO_COPY) {
-		MIF_LOG(ERR, "Zero-copy slave not supported.");
-		return -1;
-	}
-
 	eth_dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
 	if (eth_dev == NULL) {
 		MIF_LOG(ERR, "%s: Unable to allocate device struct.", name);
@@ -1064,6 +1448,9 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
 	pmd->flags = flags;
 	pmd->flags |= ETH_MEMIF_FLAG_DISABLED;
 	pmd->role = role;
+	/* Zero-copy flag irelevant to master. */
+	if (pmd->role == MEMIF_ROLE_MASTER)
+		pmd->flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;

 	ret = memif_socket_init(eth_dev, socket_filename);
 	if (ret < 0)
@@ -1087,8 +1474,14 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,

 	eth_dev->dev_ops = &ops;
 	eth_dev->device = &vdev->device;
-	eth_dev->rx_pkt_burst = eth_memif_rx;
-	eth_dev->tx_pkt_burst = eth_memif_tx;
+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		eth_dev->rx_pkt_burst = eth_memif_rx_zc;
+		eth_dev->tx_pkt_burst = eth_memif_tx_zc;
+	} else {
+		eth_dev->rx_pkt_burst = eth_memif_rx;
+		eth_dev->tx_pkt_burst = eth_memif_tx;
+	}
+

 	eth_dev->data->dev_flags &= RTE_ETH_DEV_CLOSE_REMOVE;

@@ -1120,6 +1513,10 @@ memif_set_zc(const char *key __rte_unused, const char *value, void *extra_args)
 	uint32_t *flags = (uint32_t *)extra_args;

 	if (strstr(value, "yes") != NULL) {
+		if (!rte_mcfg_get_single_file_segments()) {
+			MIF_LOG(ERR, "Zero-copy doesn't support multi-file segments.");
+			return -ENOTSUP;
+		}
 		*flags |= ETH_MEMIF_FLAG_ZERO_COPY;
 	} else if (strstr(value, "no") != NULL) {
 		*flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;
diff --git a/drivers/net/memif/rte_eth_memif.h b/drivers/net/memif/rte_eth_memif.h
index 8269212cb..0d2566392 100644
--- a/drivers/net/memif/rte_eth_memif.h
+++ b/drivers/net/memif/rte_eth_memif.h
@@ -63,12 +63,15 @@ struct memif_queue {
 	uint16_t last_head;			/**< last ring head */
 	uint16_t last_tail;			/**< last ring tail */

+	struct rte_mbuf **buffers;
+	/**< Stored mbufs. Used in zero-copy tx. Slave stores transmitted
+	 * mbufs to free them once master has received them.
+	 */
+
 	/* rx/tx info */
 	uint64_t n_pkts;			/**< number of rx/tx packets */
 	uint64_t n_bytes;			/**< number of rx/tx bytes */

-	memif_ring_t *ring;			/**< pointer to ring */
-
 	struct rte_intr_handle intr_handle;	/**< interrupt handle */

 	memif_log2_ring_size_t log2_ring_size;	/**< log2 of ring size */
@@ -115,8 +118,6 @@ struct pmd_internals {
 	/**< local disconnect reason */
 	char remote_disc_string[ETH_MEMIF_DISC_STRING_SIZE];
 	/**< remote disconnect reason */
-
-	struct rte_vdev_device *vdev;		/**< vdev handle */
 };

 struct pmd_process_private {
@@ -131,7 +132,7 @@ struct pmd_process_private {
  * @param proc_private
  *   device process private data
  */
-void memif_free_regions(struct pmd_process_private *proc_private);
+void memif_free_regions(struct rte_eth_dev *dev);

 /**
  * Finalize connection establishment process. Map shared memory file
diff --git a/lib/librte_eal/common/eal_common_mcfg.c b/lib/librte_eal/common/eal_common_mcfg.c
index 066549432..03d9d472d 100644
--- a/lib/librte_eal/common/eal_common_mcfg.c
+++ b/lib/librte_eal/common/eal_common_mcfg.c
@@ -161,3 +161,10 @@ rte_mcfg_timer_unlock(void)
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	rte_spinlock_unlock(&mcfg->tlock);
 }
+
+uint32_t
+rte_mcfg_get_single_file_segments(void)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	return mcfg->single_file_segments;
+}
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 34b0e44a0..9bb4a57f8 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -109,6 +109,16 @@ __rte_experimental
 void
 rte_mcfg_timer_unlock(void);

+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Get the single_file_segments parameter value from memory configuration.
+ */
+__rte_experimental
+uint32_t
+rte_mcfg_get_single_file_segments(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 7cbf82d37..c2b9d473f 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -418,5 +418,6 @@ EXPERIMENTAL {
 	rte_lcore_to_cpu_id;
 	rte_mcfg_timer_lock;
 	rte_mcfg_timer_unlock;
+	rte_mcfg_get_single_file_segments;
 	rte_rand_max;
 };
--
2.17.1


More information about the dev mailing list