[dpdk-dev] [PATCH v2 3/5] virtio/vdev: add embedded device emulation

Jianfeng Tan jianfeng.tan at intel.com
Fri Feb 5 12:20:26 CET 2016


To implement virtio vdev, we need a way to interact with the vhost backend.
More importantly, we need a way to emulate a device inside DPDK. This
patch provides that embedded device emulation.

The backend is selected by the type of the vhost file: vhost-user is used
if the given path points to a unix socket; vhost-net is used if the given
path points to a char device.

Signed-off-by: Huawei Xie <huawei.xie at intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
---
 config/common_linuxapp              |   5 +
 drivers/net/virtio/Makefile         |   4 +
 drivers/net/virtio/vhost.h          | 194 +++++++++
 drivers/net/virtio/vhost_embedded.c | 809 ++++++++++++++++++++++++++++++++++++
 drivers/net/virtio/virtio_ethdev.h  |   6 +-
 drivers/net/virtio/virtio_pci.h     |  15 +-
 6 files changed, 1031 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/virtio/vhost.h
 create mode 100644 drivers/net/virtio/vhost_embedded.c

diff --git a/config/common_linuxapp b/config/common_linuxapp
index 74bc515..f76e162 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -534,3 +534,8 @@ CONFIG_RTE_APP_TEST=y
 CONFIG_RTE_TEST_PMD=y
 CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=n
 CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=n
+
+#
+# Enable virtio support for container
+#
+CONFIG_RTE_VIRTIO_VDEV=y
diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index 43835ba..ef920f9 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -52,6 +52,10 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
 
+ifeq ($(CONFIG_RTE_VIRTIO_VDEV),y)
+	SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += vhost_embedded.c
+endif
+
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += lib/librte_eal lib/librte_ether
 DEPDIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += lib/librte_mempool lib/librte_mbuf
diff --git a/drivers/net/virtio/vhost.h b/drivers/net/virtio/vhost.h
new file mode 100644
index 0000000..73d4f5c
--- /dev/null
+++ b/drivers/net/virtio/vhost.h
@@ -0,0 +1,194 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+/* (queue index, value) pair: argument type of the VHOST_SET_VRING_NUM and
+ * VHOST_{SET,GET}_VRING_BASE ioctls defined later in this header.
+ */
+struct vhost_vring_state {
+	unsigned int index;
+	unsigned int num;
+};
+
+/* (queue index, file descriptor) pair: argument type of the
+ * VHOST_SET_VRING_KICK/CALL/ERR and VHOST_NET_SET_BACKEND ioctls defined
+ * later in this header.
+ */
+struct vhost_vring_file {
+	unsigned int index;
+	int fd;
+};
+
+/* Addresses of the three parts of one vring (descriptor table, used ring,
+ * available ring) plus the optional log address; argument type of the
+ * VHOST_SET_VRING_ADDR ioctl defined later in this header.
+ */
+struct vhost_vring_addr {
+	unsigned int index;
+	/* Option flags. */
+	unsigned int flags;
+	/* Flag values: */
+	/* Whether log address is valid. If set enables logging. */
+	/* NOTE(review): the value is 0, so this is presumably a bit number
+	 * rather than a mask - confirm against linux/vhost.h before using it.
+	 */
+#define VHOST_VRING_F_LOG 0
+
+	/* Start of array of descriptors (virtually contiguous) */
+	uint64_t desc_user_addr;
+	/* Used structure address. Must be 32 bit aligned */
+	uint64_t used_user_addr;
+	/* Available structure address. Must be 16 bit aligned */
+	uint64_t avail_user_addr;
+	/* Logging support. */
+	/* Log writes to used structure, at offset calculated from specified
+	 * address. Address must be 32 bit aligned.
+	 */
+	uint64_t log_guest_addr;
+};
+
+#define VIRTIO_CONFIG_S_DRIVER_OK   4
+
+/* Request codes stored in vhost_user_msg.request.
+ * The numeric values are spelled out explicitly; presumably they have to
+ * match the vhost-user protocol used by the backend, so do not renumber
+ * them (TODO confirm against the vhost-user specification).
+ * VHOST_USER_MAX is one past the last defined request.
+ */
+enum vhost_user_request {
+	VHOST_USER_NONE = 0,
+	VHOST_USER_GET_FEATURES = 1,
+	VHOST_USER_SET_FEATURES = 2,
+	VHOST_USER_SET_OWNER = 3,
+	VHOST_USER_RESET_OWNER = 4,
+	VHOST_USER_SET_MEM_TABLE = 5,
+	VHOST_USER_SET_LOG_BASE = 6,
+	VHOST_USER_SET_LOG_FD = 7,
+	VHOST_USER_SET_VRING_NUM = 8,
+	VHOST_USER_SET_VRING_ADDR = 9,
+	VHOST_USER_SET_VRING_BASE = 10,
+	VHOST_USER_GET_VRING_BASE = 11,
+	VHOST_USER_SET_VRING_KICK = 12,
+	VHOST_USER_SET_VRING_CALL = 13,
+	VHOST_USER_SET_VRING_ERR = 14,
+	VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+	VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+	VHOST_USER_GET_QUEUE_NUM = 17,
+	VHOST_USER_SET_VRING_ENABLE = 18,
+	VHOST_USER_MAX
+};
+
+/* Description of one memory region handed to the vhost backend; used as
+ * the element type of the region arrays in struct vhost_memory_kernel and
+ * struct vhost_memory.
+ */
+struct vhost_memory_region {
+	uint64_t guest_phys_addr;
+	uint64_t memory_size; /* bytes */
+	uint64_t userspace_addr;
+	uint64_t mmap_offset;
+};
+
+/* Variable-length memory table: argument type of the VHOST_SET_MEM_TABLE
+ * ioctl. regions[] is a zero-length trailing array, so sizeof() covers only
+ * the two header words and the storage for the entries must be allocated
+ * by the user of the structure.
+ */
+struct vhost_memory_kernel {
+	uint32_t nregions;
+	uint32_t padding;
+	struct vhost_memory_region regions[0];
+};
+
+/* Fixed-capacity memory table embedded in the payload of struct
+ * vhost_user_msg; holds at most VHOST_MEMORY_MAX_NREGIONS regions, of which
+ * the first nregions are meaningful.
+ */
+struct vhost_memory {
+	uint32_t nregions;
+	uint32_t padding;
+	struct vhost_memory_region regions[VHOST_MEMORY_MAX_NREGIONS];
+};
+
+/* One vhost-user message: a header (request, flags, size) followed by a
+ * request-specific payload. The struct is packed, and VHOST_USER_HDR_SIZE
+ * (defined right after it) is the offset of the payload, i.e. the header
+ * length.
+ */
+struct vhost_user_msg {
+	enum vhost_user_request request;
+
+	/* Bit layout of flags: low two bits carry the protocol version,
+	 * bit 2 is the reply marker.
+	 */
+#define VHOST_USER_VERSION_MASK     0x3
+#define VHOST_USER_REPLY_MASK       (0x1 << 2)
+	uint32_t flags;
+	uint32_t size; /* the following payload size */
+	union {
+	/* Masks applied to u64: low 8 bits hold a vring index, bit 8 is a
+	 * "no fd" marker.
+	 */
+#define VHOST_USER_VRING_IDX_MASK   0xff
+#define VHOST_USER_VRING_NOFD_MASK  (0x1 << 8)
+		uint64_t u64;
+		struct vhost_vring_state state;
+		struct vhost_vring_addr addr;
+		struct vhost_memory memory;
+	} payload;
+	/* NOTE(review): placed after the payload, so it is outside the
+	 * header-plus-size byte range; presumably scratch space for
+	 * descriptors travelling with the message - confirm with users.
+	 */
+	int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed));
+
+#define VHOST_USER_HDR_SIZE offsetof(struct vhost_user_msg, payload.u64)
+#define VHOST_USER_PAYLOAD_SIZE (sizeof(struct vhost_user_msg) - VHOST_USER_HDR_SIZE)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION    0x1
+
+/* ioctls */
+
+#define VHOST_VIRTIO 0xAF
+
+#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
+#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
+#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel)
+#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
+#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
+#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
+#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
+#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
+#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
+#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
+#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
+#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
+#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
+
+/*****************************************************************************/
+
+/* Ioctl defines */
+#define TUNSETIFF     _IOW('T', 202, int)
+#define TUNGETFEATURES _IOR('T', 207, unsigned int)
+#define TUNSETOFFLOAD  _IOW('T', 208, unsigned int)
+#define TUNGETIFF      _IOR('T', 210, unsigned int)
+#define TUNSETSNDBUF   _IOW('T', 212, int)
+#define TUNGETVNETHDRSZ _IOR('T', 215, int)
+#define TUNSETVNETHDRSZ _IOW('T', 216, int)
+#define TUNSETQUEUE  _IOW('T', 217, int)
+#define TUNSETVNETLE _IOW('T', 220, int)
+#define TUNSETVNETBE _IOW('T', 222, int)
+
+/* TUNSETIFF ifr flags */
+#define IFF_TAP          0x0002
+#define IFF_NO_PI        0x1000
+#define IFF_ONE_QUEUE    0x2000
+#define IFF_VNET_HDR     0x4000
+#define IFF_MULTI_QUEUE  0x0100
+#define IFF_ATTACH_QUEUE 0x0200
+#define IFF_DETACH_QUEUE 0x0400
+
+/* Features for GSO (TUNSETOFFLOAD). */
+#define TUN_F_CSUM	0x01	/* You can hand me unchecksummed packets. */
+#define TUN_F_TSO4	0x02	/* I can handle TSO for IPv4 packets */
+#define TUN_F_TSO6	0x04	/* I can handle TSO for IPv6 packets */
+#define TUN_F_TSO_ECN	0x08	/* I can handle TSO with ECN bits. */
+#define TUN_F_UFO	0x10	/* I can handle UFO packets */
+
+#define PATH_NET_TUN "/dev/net/tun"
+
+#endif
diff --git a/drivers/net/virtio/vhost_embedded.c b/drivers/net/virtio/vhost_embedded.c
new file mode 100644
index 0000000..0073b86
--- /dev/null
+++ b/drivers/net/virtio/vhost_embedded.c
@@ -0,0 +1,809 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+
+#include <rte_mbuf.h>
+#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
+
+#include "virtio_pci.h"
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtqueue.h"
+#include "vhost.h"
+
+/*
+ * Send one message over the vhost-user socket, attaching fd_num file
+ * descriptors to it as SCM_RIGHTS ancillary data.
+ *
+ * fd:     socket to send on
+ * buf:    message bytes
+ * len:    number of bytes of buf to send
+ * fds:    descriptors to attach; the first fd_num entries are used
+ * fd_num: number of descriptors in fds (may be 0)
+ *
+ * Returns the result of sendmsg(): the number of bytes sent, or a negative
+ * value with errno set. EINTR is retried internally.
+ */
+static int
+vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num)
+{
+	int r;
+	struct msghdr msgh;
+	struct iovec iov;
+	size_t fd_size = fd_num * sizeof(int);
+	char control[CMSG_SPACE(fd_size)];
+	struct cmsghdr *cmsg;
+
+	/* memset() rather than the legacy bzero(): bzero() is declared in
+	 * <strings.h>, which this file does not include.
+	 */
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+
+	iov.iov_base = (uint8_t *)buf;
+	iov.iov_len = len;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	/* A single control message carries all descriptors. */
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_len = CMSG_LEN(fd_size);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	memcpy(CMSG_DATA(cmsg), fds, fd_size);
+
+	do {
+		r = sendmsg(fd, &msgh, 0);
+	} while (r < 0 && errno == EINTR);
+
+	return r;
+}
+
+/*
+ * Receive one vhost-user reply into *msg.
+ *
+ * The header is read first; its flags must be exactly "reply | version".
+ * The payload length announced in the header is then validated against
+ * the size of msg->payload before the payload is read, so a misbehaving
+ * backend cannot make recv() write past the message buffer.
+ *
+ * Returns 0 on success, -1 on a short read, bad flags or an oversized
+ * payload.
+ */
+static int
+vhost_user_read(int fd, struct vhost_user_msg *msg)
+{
+	uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
+	int ret, sz_hdr = VHOST_USER_HDR_SIZE, sz_payload;
+
+	ret = recv(fd, (void *)msg, sz_hdr, 0);
+	if (ret < sz_hdr) {
+		PMD_DRV_LOG(ERR, "Failed to recv msg hdr: %d instead of %d.",
+			    ret, sz_hdr);
+		goto fail;
+	}
+
+	/* validate msg flags */
+	if (msg->flags != (valid_flags)) {
+		PMD_DRV_LOG(ERR, "Failed to recv msg: flags %x instead of %x.",
+			    msg->flags, valid_flags);
+		goto fail;
+	}
+
+	/* msg->size comes from the peer: bound it before using it as a
+	 * receive length.
+	 */
+	if (msg->size > sizeof(msg->payload)) {
+		PMD_DRV_LOG(ERR, "Failed to recv msg: payload size %u exceeds %zu.",
+			    (unsigned int)msg->size, sizeof(msg->payload));
+		goto fail;
+	}
+
+	sz_payload = msg->size;
+	if (sz_payload) {
+		ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0);
+		if (ret < sz_payload) {
+			PMD_DRV_LOG(ERR, "Failed to recv msg payload: %d instead of %d.",
+				    ret, sz_payload);
+			goto fail;
+		}
+	}
+
+	return 0;
+
+fail:
+	return -1;
+}
+
+/* Dummy object used only as a sizeof() operand (sizeof(m.payload.<field>))
+ * when filling in message sizes below; hence marked unused.
+ */
+static struct vhost_user_msg m __rte_unused;
+
+/*
+ * Fill the memory-table payload of a VHOST_USER_SET_MEM_TABLE message.
+ *
+ * One region is emitted per backing file reported by
+ * rte_eal_get_backfile_info(); each file is opened and its descriptor is
+ * stored in fds[] at the same index as its region. The caller owns the
+ * opened descriptors and must close them.
+ *
+ * msg: message whose payload.memory is filled in
+ * fds: output array with room for VHOST_MEMORY_MAX_NREGIONS descriptors
+ *
+ * Panics if there are more files than regions, or if a file cannot be
+ * opened (an invalid descriptor must not be handed to the backend).
+ */
+static void
+prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
+{
+	int i, num;
+	struct back_file *huges;
+	struct vhost_memory_region *mr;
+
+	num = rte_eal_get_backfile_info(&huges);
+
+	if (num > VHOST_MEMORY_MAX_NREGIONS)
+		rte_panic("%d files exceed maximum of %d for vhost-user\n",
+			  num, VHOST_MEMORY_MAX_NREGIONS);
+
+	for (i = 0; i < num; ++i) {
+		mr = &msg->payload.memory.regions[i];
+		mr->guest_phys_addr = (uint64_t)huges[i].addr; /* use vaddr! */
+		mr->userspace_addr = (uint64_t)huges[i].addr;
+		mr->memory_size = huges[i].size;
+		mr->mmap_offset = 0;
+		fds[i] = open(huges[i].filepath, O_RDWR);
+		if (fds[i] < 0)
+			rte_panic("open %s failed: %s\n",
+				  huges[i].filepath, strerror(errno));
+	}
+
+	msg->payload.memory.nregions = num;
+	msg->payload.memory.padding = 0;
+	free(huges);
+}
+
+/*
+ * Send one vhost-user request to the backend and, for the GET_* requests,
+ * read and decode the reply.
+ *
+ * hw:  device; hw->vhostfd is the vhost-user socket
+ * req: one of the VHOST_USER_* request codes handled below
+ * arg: request-specific argument; input for SET_* requests, output for
+ *      GET_FEATURES (u64) and in/out for GET_VRING_BASE (vring state).
+ *      Ignored for SET_MEM_TABLE, whose payload is built here.
+ *
+ * Returns 0 on success and -1 on any failure: unknown request, failed
+ * send, failed or malformed reply. A failed send used to be reported as
+ * success, which hid the error from vhost_call().
+ */
+static int
+vhost_user_sock(struct virtio_hw *hw, unsigned long int req, void *arg)
+{
+	struct vhost_user_msg msg;
+	struct vhost_vring_file *file = NULL;
+	int need_reply = 0;
+	int fds[VHOST_MEMORY_MAX_NREGIONS];
+	int fd_num = 0;
+	int i, len, ret, saved_errno;
+
+	msg.request = req;
+	msg.flags = VHOST_USER_VERSION;
+	msg.size = 0;
+
+	switch (req) {
+	case VHOST_USER_GET_FEATURES:
+		need_reply = 1;
+		break;
+
+	case VHOST_USER_SET_FEATURES:
+	case VHOST_USER_SET_LOG_BASE:
+		msg.payload.u64 = *((__u64 *)arg);
+		msg.size = sizeof(m.payload.u64);
+		break;
+
+	case VHOST_USER_SET_OWNER:
+	case VHOST_USER_RESET_OWNER:
+		break;
+
+	case VHOST_USER_SET_MEM_TABLE:
+		/* Opens one fd per region; they are closed after the send. */
+		prepare_vhost_memory_user(&msg, fds);
+		fd_num = msg.payload.memory.nregions;
+		msg.size = sizeof(m.payload.memory.nregions);
+		msg.size += sizeof(m.payload.memory.padding);
+		msg.size += fd_num * sizeof(struct vhost_memory_region);
+		break;
+
+	case VHOST_USER_SET_LOG_FD:
+		fds[fd_num++] = *((int *)arg);
+		break;
+
+	case VHOST_USER_SET_VRING_NUM:
+	case VHOST_USER_SET_VRING_BASE:
+		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
+		msg.size = sizeof(m.payload.state);
+		break;
+
+	case VHOST_USER_GET_VRING_BASE:
+		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
+		msg.size = sizeof(m.payload.state);
+		need_reply = 1;
+		break;
+
+	case VHOST_USER_SET_VRING_ADDR:
+		memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
+		msg.size = sizeof(m.payload.addr);
+		break;
+
+	case VHOST_USER_SET_VRING_KICK:
+	case VHOST_USER_SET_VRING_CALL:
+	case VHOST_USER_SET_VRING_ERR:
+		/* The vring index travels in the payload, the eventfd as
+		 * ancillary data; without an fd the NOFD bit is set instead.
+		 */
+		file = arg;
+		msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
+		msg.size = sizeof(m.payload.u64);
+		if (file->fd > 0)
+			fds[fd_num++] = file->fd;
+		else
+			msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
+		break;
+
+	default:
+		PMD_DRV_LOG(ERR, "vhost-user trying to send unhandled msg type");
+		return -1;
+	}
+
+	len = VHOST_USER_HDR_SIZE + msg.size;
+	ret = vhost_user_write(hw->vhostfd, &msg, len, fds, fd_num);
+	saved_errno = errno;
+
+	/* The region fds were opened only for this message: release them
+	 * whether or not the send succeeded.
+	 */
+	if (req == VHOST_USER_SET_MEM_TABLE)
+		for (i = 0; i < fd_num; ++i)
+			close(fds[i]);
+
+	if (ret < 0) {
+		/* keep the sendmsg() error visible to the caller */
+		errno = saved_errno;
+		return -1;
+	}
+
+	if (need_reply) {
+		if (vhost_user_read(hw->vhostfd, &msg) < 0)
+			return -1;
+
+		if (req != msg.request) {
+			PMD_DRV_LOG(ERR, "Received unexpected msg type.");
+			return -1;
+		}
+
+		switch (req) {
+		case VHOST_USER_GET_FEATURES:
+			if (msg.size != sizeof(m.payload.u64)) {
+				PMD_DRV_LOG(ERR, "Received bad msg size.");
+				return -1;
+			}
+			*((__u64 *)arg) = msg.payload.u64;
+			break;
+		case VHOST_USER_GET_VRING_BASE:
+			if (msg.size != sizeof(m.payload.state)) {
+				PMD_DRV_LOG(ERR, "Received bad msg size.");
+				return -1;
+			}
+			memcpy(arg, &msg.payload.state,
+			       sizeof(struct vhost_vring_state));
+			break;
+		default:
+			PMD_DRV_LOG(ERR, "Received unexpected msg type.");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Issue a vhost-net request: forward req/arg to ioctl() on hw->vhostfd and
+ * hand back whatever ioctl() returns.
+ */
+static int
+vhost_kernel_ioctl(struct virtio_hw *hw, unsigned long int req, void *arg)
+{
+	int rc;
+
+	rc = ioctl(hw->vhostfd, req, arg);
+
+	return rc;
+}
+
+/* Backend-independent request identifiers accepted by vhost_call().
+ * They are used as indices into vhost_msg_strings[] and vhost_req_map[],
+ * so any change here must be mirrored in both tables. VHOST_MSG_MAX is the
+ * number of valid identifiers.
+ */
+enum {
+	VHOST_MSG_SET_OWNER,
+	VHOST_MSG_SET_FEATURES,
+	VHOST_MSG_GET_FEATURES,
+	VHOST_MSG_SET_VRING_CALL,
+	VHOST_MSG_SET_VRING_NUM,
+	VHOST_MSG_SET_VRING_BASE,
+	VHOST_MSG_GET_VRING_BASE,
+	VHOST_MSG_SET_VRING_ADDR,
+	VHOST_MSG_SET_VRING_KICK,
+	VHOST_MSG_SET_MEM_TABLE,
+	VHOST_MSG_MAX,
+};
+
+/* Printable name of each VHOST_MSG_* identifier, indexed by the identifier;
+ * used by vhost_call() for logging and panic messages. The trailing NULL
+ * lands at index VHOST_MSG_MAX.
+ */
+static const char * const vhost_msg_strings[] = {
+	[VHOST_MSG_SET_OWNER] = "VHOST_MSG_SET_OWNER",
+	[VHOST_MSG_SET_FEATURES] = "VHOST_MSG_SET_FEATURES",
+	[VHOST_MSG_GET_FEATURES] = "VHOST_MSG_GET_FEATURES",
+	[VHOST_MSG_SET_VRING_CALL] = "VHOST_MSG_SET_VRING_CALL",
+	[VHOST_MSG_SET_VRING_NUM] = "VHOST_MSG_SET_VRING_NUM",
+	[VHOST_MSG_SET_VRING_BASE] = "VHOST_MSG_SET_VRING_BASE",
+	[VHOST_MSG_GET_VRING_BASE] = "VHOST_MSG_GET_VRING_BASE",
+	[VHOST_MSG_SET_VRING_ADDR] = "VHOST_MSG_SET_VRING_ADDR",
+	[VHOST_MSG_SET_VRING_KICK] = "VHOST_MSG_SET_VRING_KICK",
+	[VHOST_MSG_SET_MEM_TABLE] = "VHOST_MSG_SET_MEM_TABLE",
+	NULL,
+};
+
+static unsigned long int vhost_req_map[][2] = {
+	[VHOST_MSG_SET_OWNER] = {
+		VHOST_SET_OWNER, VHOST_USER_SET_OWNER
+	},
+	[VHOST_MSG_SET_FEATURES] = {
+		VHOST_SET_FEATURES, VHOST_USER_SET_FEATURES
+	},
+	[VHOST_MSG_GET_FEATURES] = {
+		VHOST_GET_FEATURES, VHOST_USER_GET_FEATURES
+	},
+	[VHOST_MSG_SET_VRING_CALL] = {
+		VHOST_SET_VRING_CALL, VHOST_USER_SET_VRING_CALL
+	},
+	[VHOST_MSG_SET_VRING_NUM] = {
+		VHOST_SET_VRING_NUM, VHOST_USER_SET_VRING_NUM
+	},
+	[VHOST_MSG_SET_VRING_BASE] = {
+		VHOST_SET_VRING_BASE, VHOST_USER_SET_VRING_BASE
+	},
+	[VHOST_MSG_GET_VRING_BASE] = {
+		VHOST_GET_VRING_BASE, VHOST_USER_GET_VRING_BASE
+	},
+	[VHOST_MSG_SET_VRING_ADDR] = {
+		VHOST_SET_VRING_ADDR, VHOST_USER_SET_VRING_ADDR
+	},
+	[VHOST_MSG_SET_VRING_KICK] = {
+		VHOST_SET_VRING_KICK, VHOST_USER_SET_VRING_KICK
+	},
+	[VHOST_MSG_SET_MEM_TABLE] = {
+		VHOST_SET_MEM_TABLE, VHOST_USER_SET_MEM_TABLE
+	},
+};
+
+/*
+ * Single entry point for talking to the vhost backend.
+ *
+ * req_orig is a VHOST_MSG_* identifier; it is translated through
+ * vhost_req_map[] to the request of the active backend (hw->type) and sent
+ * either over the vhost-user socket or as a vhost-net ioctl.
+ *
+ * Panics on an out-of-range identifier and on any negative result from
+ * the backend, so callers only ever see a non-negative return value.
+ */
+static int
+vhost_call(struct virtio_hw *hw, unsigned long int req_orig, void *arg)
+{
+	unsigned long int backend_req;
+	const char *req_name;
+	int rc;
+
+	if (req_orig >= VHOST_MSG_MAX)
+		rte_panic("invalid req: %lu\n", req_orig);
+
+	req_name = vhost_msg_strings[req_orig];
+	PMD_DRV_LOG(INFO, "%s\n", req_name);
+
+	backend_req = vhost_req_map[req_orig][hw->type];
+	rc = (hw->type == VHOST_USER) ?
+		vhost_user_sock(hw, backend_req, arg) :
+		vhost_kernel_ioctl(hw, backend_req, arg);
+
+	if (rc < 0)
+		rte_panic("vhost_call %s failed: %s\n",
+			  req_name, strerror(errno));
+
+	return rc;
+}
+
+/*
+ * Announce one virtqueue to the vhost backend.
+ *
+ * Sends, in this order: SET_VRING_CALL, SET_VRING_NUM, SET_VRING_BASE,
+ * SET_VRING_ADDR and SET_VRING_KICK. A fresh eventfd is created for the
+ * call and the kick notification and remembered in hw->callfds[] and
+ * hw->kickfds[] under queue_sel.
+ *
+ * hw:        device owning the queue
+ * vq:        virtqueue whose ring addresses and size are published
+ * queue_sel: index of the queue as seen by the backend
+ *
+ * Panics if an eventfd cannot be created (vhost_call() panics on its own
+ * failures).
+ */
+static void
+kick_one_vq(struct virtio_hw *hw, struct virtqueue *vq, unsigned queue_sel)
+{
+	int callfd, kickfd;
+	struct vhost_vring_file file;
+	struct vhost_vring_state state;
+	struct vhost_vring_addr addr = {
+		.index = queue_sel,
+		.desc_user_addr = (uint64_t)(uintptr_t)vq->vq_ring.desc,
+		.avail_user_addr = (uint64_t)(uintptr_t)vq->vq_ring.avail,
+		.used_user_addr = (uint64_t)(uintptr_t)vq->vq_ring.used,
+		.log_guest_addr = 0,
+		.flags = 0, /* disable log */
+	};
+
+	/* or use invalid flag to disable it, but vhost-dpdk uses this to judge
+	 * if dev is alive. so finally we need two real event_fds.
+	 */
+	/* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_CALL come
+	 * firstly because vhost depends on this msg to allocate virtqueue
+	 * pair.
+	 */
+	/* eventfd() has its own flag names; use them instead of O_*. */
+	callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+	if (callfd < 0)
+		rte_panic("callfd error, %s\n", strerror(errno));
+
+	file.index = queue_sel;
+	file.fd = callfd;
+	vhost_call(hw, VHOST_MSG_SET_VRING_CALL, &file);
+	hw->callfds[queue_sel] = callfd;
+
+	state.index = queue_sel;
+	state.num = vq->vq_ring.num;
+	vhost_call(hw, VHOST_MSG_SET_VRING_NUM, &state);
+
+	state.num = 0; /* no reservation */
+	vhost_call(hw, VHOST_MSG_SET_VRING_BASE, &state);
+
+	vhost_call(hw, VHOST_MSG_SET_VRING_ADDR, &addr);
+
+	/* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_KICK comes
+	 * lastly because vhost depends on this msg to judge if
+	 * virtio_is_ready().
+	 */
+	kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+	if (kickfd < 0)
+		rte_panic("kickfd error, %s\n", strerror(errno));
+
+	file.fd = kickfd;
+	vhost_call(hw, VHOST_MSG_SET_VRING_KICK, &file);
+	hw->kickfds[queue_sel] = kickfd;
+}
+
+/**
+ * Build the memory table for the vhost-net VHOST_SET_MEM_TABLE ioctl.
+ *
+ * Walks the EAL memseg array up to the first entry without an address and
+ * merges virtually adjacent memsegs into one region. The process virtual
+ * address is used both as "guest physical" and as userspace address.
+ *
+ * @param p_vm
+ *   Receives a malloc()ed table with room for RTE_MAX_MEMSEG regions; the
+ *   caller must free() it.
+ *
+ * Panics if the table cannot be allocated.
+ */
+static void
+prepare_vhost_memory_kernel(struct vhost_memory_kernel **p_vm)
+{
+	unsigned i, j, k = 0;
+	struct rte_memseg *seg;
+	struct vhost_memory_region *mr;
+	struct vhost_memory_kernel *vm;
+
+	vm = malloc(sizeof(struct vhost_memory_kernel) +
+		    RTE_MAX_MEMSEG * sizeof(struct vhost_memory_region));
+	if (vm == NULL)
+		rte_panic("failed to allocate vhost memory table\n");
+
+	for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
+		seg = &rte_eal_get_configuration()->mem_config->memseg[i];
+		if (!seg->addr)
+			break;
+
+		int new_region = 1;
+
+		/* Try to glue this memseg onto an already recorded region. */
+		for (j = 0; j < k; ++j) {
+			mr = &vm->regions[j];
+
+			/* memseg starts right where the region ends */
+			if (mr->userspace_addr + mr->memory_size ==
+			    (uint64_t)seg->addr) {
+				mr->memory_size += seg->len;
+				new_region = 0;
+				break;
+			}
+
+			/* memseg ends right where the region starts */
+			if ((uint64_t)seg->addr + seg->len ==
+			    mr->userspace_addr) {
+				mr->guest_phys_addr = (uint64_t)seg->addr;
+				mr->userspace_addr = (uint64_t)seg->addr;
+				mr->memory_size += seg->len;
+				new_region = 0;
+				break;
+			}
+		}
+
+		if (new_region == 0)
+			continue;
+
+		mr = &vm->regions[k++];
+		mr->guest_phys_addr = (uint64_t)seg->addr; /* use vaddr here! */
+		mr->userspace_addr = (uint64_t)seg->addr;
+		mr->memory_size = seg->len;
+		mr->mmap_offset = 0;
+	}
+
+	vm->nregions = k;
+	vm->padding = 0;
+	*p_vm = vm;
+}
+
+/*
+ * Bring the whole device up on the backend side. Called from
+ * vdev_set_status() when the DRIVER_OK bit is set.
+ *
+ * Steps, in order: publish the memory table, announce every rx and tx
+ * queue via kick_one_vq(), re-send the negotiated features (without
+ * VIRTIO_NET_F_MAC) and, for vhost-net only, set the tap vnet header size
+ * and attach the tap fd as backend of every queue.
+ *
+ * Any failure ends in rte_panic(), either here or inside vhost_call().
+ */
+static void kick_all_vq(struct virtio_hw *hw)
+{
+	uint64_t features;
+	unsigned i, queue_sel, nvqs;
+	struct rte_eth_dev_data *data = hw->data;
+
+	if (hw->type == VHOST_KERNEL) {
+		struct vhost_memory_kernel *vm = NULL;
+
+		prepare_vhost_memory_kernel(&vm);
+		vhost_call(hw, VHOST_MSG_SET_MEM_TABLE, vm);
+		free(vm);
+	} else {
+		/* construct vhost_memory inside prepare_vhost_memory_user() */
+		vhost_call(hw, VHOST_MSG_SET_MEM_TABLE, NULL);
+	}
+
+	/* rx and tx queues are interleaved: each pair i occupies the two
+	 * indices starting at 2 * i.
+	 */
+	for (i = 0; i < data->nb_rx_queues; ++i) {
+		queue_sel = 2 * i + VTNET_SQ_RQ_QUEUE_IDX;
+		kick_one_vq(hw, data->rx_queues[i], queue_sel);
+	}
+	for (i = 0; i < data->nb_tx_queues; ++i) {
+		queue_sel = 2 * i + VTNET_SQ_TQ_QUEUE_IDX;
+		kick_one_vq(hw, data->tx_queues[i], queue_sel);
+	}
+
+	/* after setup all virtqueues, we need to set_features again
+	 * so that these features can be set into each virtqueue in
+	 * vhost side.
+	 */
+	features = hw->guest_features;
+	features &= ~(1ull << VIRTIO_NET_F_MAC);
+	vhost_call(hw, VHOST_MSG_SET_FEATURES, &features);
+	if (hw->type == VHOST_KERNEL)
+		if (ioctl(hw->backfd, TUNSETVNETHDRSZ,
+			  &hw->vtnet_hdr_size) == -1)
+			rte_panic("TUNSETVNETHDRSZ failed: %s\n",
+				  strerror(errno));
+	PMD_DRV_LOG(INFO, "set features:%" PRIx64 "\n", features);
+
+	if (hw->type == VHOST_KERNEL) {
+		struct vhost_vring_file file;
+
+		/* the same tap fd backs every queue */
+		file.fd = hw->backfd;
+		nvqs = data->nb_rx_queues + data->nb_tx_queues;
+		for (file.index = 0; file.index < nvqs; ++file.index) {
+			if (vhost_kernel_ioctl(hw, VHOST_NET_SET_BACKEND,
+					       &file) < 0)
+				rte_panic("VHOST_NET_SET_BACKEND failed, %s\n",
+					  strerror(errno));
+		}
+	}
+}
+
+/*
+ * Emulated read of the virtio-net device configuration space.
+ *
+ * Three fields are served from the state kept in hw: the MAC address (only
+ * when exactly ETHER_ADDR_LEN bytes are requested), the status word and
+ * max_virtqueue_pairs (reported as hw->max_tx_queues). A read at any other
+ * offset leaves dst untouched.
+ */
+static void
+vdev_read_dev_config(struct virtio_hw *hw, uint64_t offset,
+		     void *dst, int length)
+{
+	const uint64_t mac_off = offsetof(struct virtio_net_config, mac);
+	const uint64_t status_off = offsetof(struct virtio_net_config, status);
+	const uint64_t pairs_off =
+		offsetof(struct virtio_net_config, max_virtqueue_pairs);
+
+	if (offset == mac_off && length == ETHER_ADDR_LEN) {
+		uint8_t *out = dst;
+		int n = 0;
+
+		while (n < ETHER_ADDR_LEN) {
+			out[n] = hw->mac_addr[n];
+			++n;
+		}
+		return;
+	}
+
+	if (offset == status_off)
+		*(uint16_t *)dst = hw->status;
+
+	if (offset == pairs_off)
+		*(uint16_t *)dst = hw->max_tx_queues;
+}
+
+/*
+ * Emulated write to the virtio-net device configuration space.
+ *
+ * Only a full-length write of the MAC address is supported; it is stored
+ * in hw->mac_addr. Any other offset/length combination panics.
+ */
+static void
+vdev_write_dev_config(struct virtio_hw *hw, uint64_t offset,
+		      const void *src, int length)
+{
+	const uint8_t *bytes = src;
+	int n;
+
+	if (offset != offsetof(struct virtio_net_config, mac) ||
+	    length != ETHER_ADDR_LEN) {
+		rte_panic("offset=%" PRIu64 ", length=%d\n", offset, length);
+	} else {
+		for (n = 0; n < ETHER_ADDR_LEN; ++n)
+			hw->mac_addr[n] = bytes[n];
+	}
+}
+
+/* Record the new device status; when it carries DRIVER_OK, first push the
+ * complete device setup to the backend through kick_all_vq().
+ */
+static void
+vdev_set_status(struct virtio_hw *hw, uint8_t status)
+{
+	const int driver_ok = (status & VIRTIO_CONFIG_S_DRIVER_OK) != 0;
+
+	if (driver_ok)
+		kick_all_vq(hw);
+
+	hw->status = status;
+}
+
+/* Device reset hook of vdev_ops: intentionally a no-op, nothing is sent to
+ * the backend and no state in hw is changed.
+ */
+static void
+vdev_reset(struct virtio_hw *hw __rte_unused)
+{
+	/* do nothing according to qemu vhost user spec */
+}
+
+/* Report the device status last stored by vdev_set_status(). */
+static uint8_t
+vdev_get_status(struct virtio_hw *hw)
+{
+	uint8_t current = hw->status;
+
+	return current;
+}
+
+/*
+ * Compute the feature set offered to the driver.
+ *
+ * Starts from the features reported by the backend, adds
+ * VIRTIO_NET_F_MAC when a MAC address was given at init time, and removes
+ * the control-queue features, which are not supported yet.
+ */
+static uint64_t
+vdev_get_features(struct virtio_hw *hw)
+{
+	/* disable it until we support CQ */
+	const uint64_t no_cq_mask = ~(1ull << VIRTIO_NET_F_CTRL_VQ) &
+				    ~(1ull << VIRTIO_NET_F_CTRL_RX);
+	uint64_t feats;
+
+	vhost_call(hw, VHOST_MSG_GET_FEATURES, &feats);
+
+	if (hw->mac_specified)
+		feats |= (1ull << VIRTIO_NET_F_MAC);
+
+	return feats & no_cq_mask;
+}
+
+/* Forward the negotiated features to the backend, with VIRTIO_NET_F_MAC
+ * masked out (that bit is added locally in vdev_get_features()).
+ */
+static void
+vdev_set_features(struct virtio_hw *hw, uint64_t features)
+{
+	uint64_t for_backend = features & ~(1ull << VIRTIO_NET_F_MAC);
+
+	vhost_call(hw, VHOST_MSG_SET_FEATURES, &for_backend);
+}
+
+/* ISR read hook of vdev_ops: not implemented for the embedded device, so
+ * reaching it is a fatal error. The panic now says which hook was hit
+ * instead of printing an empty message.
+ */
+static uint8_t
+vdev_get_isr(struct virtio_hw *hw __rte_unused)
+{
+	rte_panic("virtio vdev: get_isr is not supported\n");
+}
+
+/* Config-interrupt hook of vdev_ops: not implemented for the embedded
+ * device, so reaching it is a fatal error. The panic now says which hook
+ * was hit instead of printing an empty message.
+ */
+static uint16_t
+vdev_set_config_irq(struct virtio_hw *hw __rte_unused,
+		    uint16_t vec __rte_unused)
+{
+	rte_panic("virtio vdev: set_config_irq is not supported\n");
+}
+
+/* Report the ring size of a queue: every queue uses the single size stored
+ * in hw->queue_num, so queue_id is ignored.
+ */
+static uint16_t
+vdev_get_queue_num(struct virtio_hw *hw,
+		   uint16_t queue_id __rte_unused)
+{
+	uint16_t ring_size = hw->queue_num;
+
+	return ring_size;
+}
+
+/* Queue setup hook of vdev_ops: a no-op here. The queues are announced to
+ * the backend later, from kick_all_vq().
+ */
+static void
+vdev_setup_queue(struct virtio_hw *hw __rte_unused,
+		 struct virtqueue *vq __rte_unused)
+{
+	/* do nothing */
+}
+
+/*
+ * Queue teardown hook of vdev_ops: asks the backend for the ring base of
+ * the queue (VHOST_MSG_GET_VRING_BASE) and logs the returned value.
+ *
+ * hw is used here, so it no longer carries the misleading "unused"
+ * attribute; state.num is unsigned and is printed as such.
+ */
+static void
+vdev_del_queue(struct virtio_hw *hw,
+	       struct virtqueue *vq)
+{
+	struct vhost_vring_state state = {
+		.index = vq->vq_queue_index,
+	};
+
+	vhost_call(hw, VHOST_MSG_GET_VRING_BASE, &state);
+	PMD_DRV_LOG(DEBUG, "state.num = %u\n", state.num);
+}
+
+/* Notify the backend that a queue has new buffers by writing the 64-bit
+ * value 1 to the kick eventfd stored for that queue. Panics if the write
+ * fails.
+ */
+static void
+vdev_notify_queue(struct virtio_hw *hw, struct virtqueue *vq)
+{
+	uint64_t one = 1;
+	int kickfd = hw->kickfds[vq->vq_queue_index];
+
+	if (write(kickfd, &one, sizeof(one)) != -1)
+		return;
+
+	rte_panic("%s\n", strerror(errno));
+}
+
+/* Operation table of the embedded device: every hook of struct
+ * virtio_pci_ops is routed to the vdev_* emulation functions above.
+ * Installed into hw->vtpci_ops by virtio_vdev_init().
+ */
+static const struct virtio_pci_ops vdev_ops = {
+	.read_dev_cfg	= vdev_read_dev_config,
+	.write_dev_cfg	= vdev_write_dev_config,
+	.reset		= vdev_reset,
+	.get_status	= vdev_get_status,
+	.set_status	= vdev_set_status,
+	.get_features	= vdev_get_features,
+	.set_features	= vdev_set_features,
+	.get_isr	= vdev_get_isr,
+	.set_config_irq	= vdev_set_config_irq,
+	.get_queue_num	= vdev_get_queue_num,
+	.setup_queue	= vdev_setup_queue,
+	.del_queue	= vdev_del_queue,
+	.notify_queue	= vdev_notify_queue,
+};
+
+#define TUN_DEF_SNDBUF	(1ull << 20)
+
+/*
+ * Prepare the vhost-net (kernel) backend.
+ *
+ * Creates a tap interface through PATH_NET_TUN with a virtio-net header,
+ * makes it non-blocking, sets its vnet header size and send buffer, and
+ * opens the vhost char device given by hw->path.
+ *
+ * hw:     device; hw->backfd receives the tap fd, hw->vhostfd the vhost fd
+ * ifname: requested tap interface name, or NULL to use the "tap%d"
+ *         template
+ *
+ * Any failure ends in rte_panic().
+ */
+static void
+vhost_kernel_backend_setup(struct virtio_hw *hw, char *ifname)
+{
+	int fd;
+	int len = sizeof(struct virtio_net_hdr);
+	int req_mq = 0;
+	int sndbuf = TUN_DEF_SNDBUF;
+	unsigned int features;
+	struct ifreq ifr;
+
+	/* TODO:
+	 * 1. get/set offload capability, tap_probe_has_ufo, tap_fd_set_offload
+	 * 2. verify we can get/set vnet_hdr_len, tap_probe_vnet_hdr_len
+	 * 3. get number of memory regions from vhost module parameter
+	 * max_mem_regions, supported in newer version linux kernel
+	 */
+
+	fd = open(PATH_NET_TUN, O_RDWR);
+	if (fd < 0)
+		rte_panic("open %s error, %s\n", PATH_NET_TUN, strerror(errno));
+
+	memset(&ifr, 0, sizeof(ifr));
+	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+
+	if (ioctl(fd, TUNGETFEATURES, &features) == -1)
+		rte_panic("TUNGETFEATURES failed: %s", strerror(errno));
+
+	if (features & IFF_ONE_QUEUE)
+		ifr.ifr_flags |= IFF_ONE_QUEUE;
+
+	if (features & IFF_VNET_HDR)
+		ifr.ifr_flags |= IFF_VNET_HDR;
+	else
+		rte_panic("vnet_hdr requested, but kernel does not support\n");
+
+	if (req_mq) {
+		if (features & IFF_MULTI_QUEUE)
+			ifr.ifr_flags |= IFF_MULTI_QUEUE;
+		else
+			rte_panic("multiqueue requested, but kernel does not support\n");
+	}
+
+	/* snprintf() always NUL-terminates; strncpy() with the full buffer
+	 * size left ifr_name unterminated for names of IFNAMSIZ or more
+	 * characters.
+	 */
+	if (ifname)
+		snprintf(ifr.ifr_name, IFNAMSIZ, "%s", ifname);
+	else
+		snprintf(ifr.ifr_name, IFNAMSIZ, "%s", "tap%d");
+	if (ioctl(fd, TUNSETIFF, (void *)&ifr) == -1)
+		rte_panic("TUNSETIFF failed: %s", strerror(errno));
+	fcntl(fd, F_SETFL, O_NONBLOCK);
+
+	if (ioctl(fd, TUNSETVNETHDRSZ, &len) == -1)
+		rte_panic("TUNSETVNETHDRSZ failed: %s\n", strerror(errno));
+
+	if (ioctl(fd, TUNSETSNDBUF, &sndbuf) == -1)
+		rte_panic("TUNSETSNDBUF failed: %s", strerror(errno));
+
+	hw->backfd = fd;
+	hw->vhostfd = open(hw->path, O_RDWR);
+	if (hw->vhostfd < 0)
+		rte_panic("open %s failed: %s\n", hw->path, strerror(errno));
+}
+
+/*
+ * Prepare the vhost-user backend: create a close-on-exec unix stream
+ * socket, connect it to the path stored in hw->path and keep it in
+ * hw->vhostfd. Panics if the socket cannot be created or connected.
+ */
+static void
+vhost_user_backend_setup(struct virtio_hw *hw)
+{
+	struct sockaddr_un addr;
+	int sock;
+	int fdflags;
+
+	sock = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sock < 0)
+		rte_panic("socket error, %s\n", strerror(errno));
+
+	/* add FD_CLOEXEC on top of the current descriptor flags */
+	fdflags = fcntl(sock, F_GETFD);
+	fcntl(sock, F_SETFD, fdflags | FD_CLOEXEC);
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", hw->path);
+
+	if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		PMD_DRV_LOG(ERR, "connect error, %s\n", strerror(errno));
+		rte_panic("connect error, %s\n", strerror(errno));
+	}
+
+	hw->vhostfd = sock;
+}
+
+/*
+ * Initialise the embedded virtio device behind an ethdev.
+ *
+ * Installs the vdev operation table, stores the queue configuration and the
+ * optional MAC address in the virtio_hw, selects the backend from the file
+ * type of path (char device: vhost-net, unix socket: vhost-user), sets the
+ * backend up and claims ownership of it.
+ *
+ * data:      ethdev data whose dev_private is the struct virtio_hw
+ * path:      vhost char device or vhost-user socket path (copied)
+ * nb_rx:     maximum number of rx queues
+ * nb_tx:     maximum number of tx queues
+ * nb_cq:     unused for now (control queue not supported yet)
+ * queue_num: ring size reported for every queue
+ * mac:       optional "xx:xx:xx:xx:xx:xx" MAC address, may be NULL; a
+ *            malformed string is ignored with a warning
+ * ifname:    optional tap interface name for vhost-net, may be NULL
+ *
+ * Panics if the path cannot be copied or stat()ed, has an unsupported file
+ * type, or the backend setup fails.
+ */
+void
+virtio_vdev_init(struct rte_eth_dev_data *data, char *path,
+		 int nb_rx, int nb_tx, int nb_cq __attribute__ ((unused)),
+		 int queue_num, char *mac, char *ifname)
+{
+	int i, r;
+	struct stat s;
+	/* %x stores through unsigned int *, so use exactly that type */
+	unsigned int tmp[ETHER_ADDR_LEN];
+	struct virtio_hw *hw = data->dev_private;
+
+	hw->vtpci_ops = &vdev_ops;
+	hw->io_base  = 0;
+	hw->use_msix = 0;
+	hw->modern   = 0;
+
+	hw->data = data;
+	hw->path = strdup(path);
+	if (hw->path == NULL)
+		rte_panic("strdup %s failed: %s\n", path, strerror(errno));
+	hw->max_rx_queues = nb_rx;
+	hw->max_tx_queues = nb_tx;
+	hw->queue_num = queue_num;
+	hw->mac_specified = 0;
+	if (mac) {
+		r = sscanf(mac, "%x:%x:%x:%x:%x:%x", &tmp[0],
+			   &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5]);
+		if (r == ETHER_ADDR_LEN) {
+			for (i = 0; i < ETHER_ADDR_LEN; ++i)
+				hw->mac_addr[i] = (uint8_t)tmp[i];
+			hw->mac_specified = 1;
+		} else
+			/* the rte_log level is named WARNING, not WARN */
+			PMD_DRV_LOG(WARNING, "wrong format of mac: %s", mac);
+	}
+
+	/* TODO: cq */
+
+	if (stat(hw->path, &s) < 0)
+		rte_panic("stat: %s failed, %s\n", hw->path, strerror(errno));
+
+	switch (s.st_mode & S_IFMT) {
+	case S_IFCHR:
+		hw->type = VHOST_KERNEL;
+		vhost_kernel_backend_setup(hw, ifname);
+		break;
+	case S_IFSOCK:
+		hw->type = VHOST_USER;
+		vhost_user_backend_setup(hw);
+		break;
+	default:
+		rte_panic("unknown file type of %s\n", hw->path);
+	}
+	/* vhost_call() already panics on a negative result; the check is
+	 * kept as a second line of defence.
+	 */
+	if (vhost_call(hw, VHOST_MSG_SET_OWNER, NULL) == -1)
+		rte_panic("vhost set_owner failed: %s\n", strerror(errno));
+}
diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
index fed9571..fde77ca 100644
--- a/drivers/net/virtio/virtio_ethdev.h
+++ b/drivers/net/virtio/virtio_ethdev.h
@@ -123,5 +123,9 @@ uint16_t virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 #define VTNET_LRO_FEATURES (VIRTIO_NET_F_GUEST_TSO4 | \
 			    VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN)
 
-
+#ifdef RTE_VIRTIO_VDEV
+void virtio_vdev_init(struct rte_eth_dev_data *data, char *path, int nb_rx,
+		      int nb_tx, int nb_cq, int queue_num, char *mac,
+		      char *ifname);
+#endif
 #endif /* _VIRTIO_ETHDEV_H_ */
diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
index 0544a07..a8394f8 100644
--- a/drivers/net/virtio/virtio_pci.h
+++ b/drivers/net/virtio/virtio_pci.h
@@ -150,7 +150,6 @@ struct virtqueue;
  * rest are per-device feature bits.
  */
 #define VIRTIO_TRANSPORT_F_START 28
-#define VIRTIO_TRANSPORT_F_END   32
 
 /* The Guest publishes the used index for which it expects an interrupt
  * at the end of the avail ring. Host should ignore the avail->flags field. */
@@ -266,6 +265,20 @@ struct virtio_hw {
 	struct virtio_pci_common_cfg *common_cfg;
 	struct virtio_net_config *dev_cfg;
 	const struct virtio_pci_ops *vtpci_ops;
+#ifdef RTE_VIRTIO_VDEV
+#define VHOST_KERNEL	0
+#define VHOST_USER	1
+	int		type; /* type of backend */
+	uint32_t	queue_num;
+	char		*path;
+	int		mac_specified;
+	int		vhostfd;
+	int		backfd; /* tap device used in vhost-net */
+	int		callfds[VIRTIO_MAX_VIRTQUEUES * 2 + 1];
+	int		kickfds[VIRTIO_MAX_VIRTQUEUES * 2 + 1];
+	uint8_t		status;
+	struct rte_eth_dev_data *data;
+#endif
 };
 
 /*
-- 
2.1.4



More information about the dev mailing list