[dpdk-dev] [PATCH v2 5/7] net/virtio_user: add vhost kernel support

Jason Wang jasowang at redhat.com
Mon Jan 9 05:39:54 CET 2017



On 12/23/2016 15:14, Jianfeng Tan wrote:
> This patch adds support for vhost kernel as the backend for virtio_user.
> Three main hook functions are added:
>    - vhost_kernel_setup() to open the char device; each vq pair needs one
>      vhostfd;
>    - vhost_kernel_ioctl() to communicate control messages with the vhost
>      kernel module;
>    - vhost_kernel_enable_queue_pair() to open the tap device and set it
>      as the backend of the corresponding vhost fd (that is to say, vq pair).
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
> ---
>   drivers/net/virtio/Makefile                      |   1 +
>   drivers/net/virtio/virtio_user/vhost.h           |   2 +
>   drivers/net/virtio/virtio_user/vhost_kernel.c    | 364 +++++++++++++++++++++++
>   drivers/net/virtio/virtio_user/virtio_user_dev.c |  21 +-
>   drivers/net/virtio/virtio_user/virtio_user_dev.h |   4 +
>   5 files changed, 388 insertions(+), 4 deletions(-)
>   create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel.c
>
> diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
> index 97972a6..faeffb2 100644
> --- a/drivers/net/virtio/Makefile
> +++ b/drivers/net/virtio/Makefile
> @@ -60,6 +60,7 @@ endif
>   
>   ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
>   SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
> +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c
>   SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c
>   SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user_ethdev.c
>   endif
> diff --git a/drivers/net/virtio/virtio_user/vhost.h b/drivers/net/virtio/virtio_user/vhost.h
> index bd67133..ffab13a 100644
> --- a/drivers/net/virtio/virtio_user/vhost.h
> +++ b/drivers/net/virtio/virtio_user/vhost.h
> @@ -120,4 +120,6 @@ struct virtio_user_backend_ops {
>   };
>   
>   struct virtio_user_backend_ops ops_user;
> +struct virtio_user_backend_ops ops_kernel;
> +
>   #endif
> diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
> new file mode 100644
> index 0000000..8984c5c
> --- /dev/null
> +++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
> @@ -0,0 +1,364 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <unistd.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <sys/ioctl.h>
> +#include <net/if.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <rte_memory.h>
> +#include <rte_eal_memconfig.h>
> +
> +#include "vhost.h"
> +#include "virtio_user_dev.h"
> +
> +struct vhost_memory_kernel {
> +	uint32_t nregions;
> +	uint32_t padding;
> +	struct vhost_memory_region regions[0];
> +};
> +
> +/* vhost kernel ioctls */
> +#define VHOST_VIRTIO 0xAF
> +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
> +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
> +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel)
> +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
> +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
> +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
> +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
> +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
> +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
> +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
> +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
> +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
> +
> +/* TUN ioctls */
> +#define TUNSETIFF     _IOW('T', 202, int)
> +#define TUNGETFEATURES _IOR('T', 207, unsigned int)
> +#define TUNSETOFFLOAD  _IOW('T', 208, unsigned int)
> +#define TUNGETIFF      _IOR('T', 210, unsigned int)
> +#define TUNSETSNDBUF   _IOW('T', 212, int)
> +#define TUNGETVNETHDRSZ _IOR('T', 215, int)
> +#define TUNSETVNETHDRSZ _IOW('T', 216, int)
> +#define TUNSETQUEUE  _IOW('T', 217, int)
> +#define TUNSETVNETLE _IOW('T', 220, int)
> +#define TUNSETVNETBE _IOW('T', 222, int)
> +
> +/* TUNSETIFF ifr flags */
> +#define IFF_TAP          0x0002
> +#define IFF_NO_PI        0x1000
> +#define IFF_ONE_QUEUE    0x2000
> +#define IFF_VNET_HDR     0x4000
> +#define IFF_MULTI_QUEUE  0x0100
> +#define IFF_ATTACH_QUEUE 0x0200
> +#define IFF_DETACH_QUEUE 0x0400

Do we really want to duplicate these definitions here when they are already
exposed by the kernel uapi headers?
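
Something like the below (untested sketch; assumes the build host's linux
headers are recent enough to carry all of these) would avoid the local copies:

/* Untested sketch: pull the definitions from the kernel uapi headers
 * instead of duplicating them in the PMD.
 */
#include <sys/ioctl.h>
#include <linux/vhost.h>   /* VHOST_* ioctls, struct vhost_memory, ... */
#include <linux/if_tun.h>  /* TUNSETIFF & friends, IFF_TAP, IFF_VNET_HDR */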

> +
> +/* Constants */
> +#define TUN_DEF_SNDBUF	(1ull << 20)
> +#define PATH_NET_TUN	"/dev/net/tun"
> +#define VHOST_KERNEL_MAX_REGIONS	64

Unfortunately this is not a constant any more since kernel commit c9ce42f72fd0
("vhost: add max_mem_regions module parameter").
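
Something along these lines (only a sketch, assuming the usual
/sys/module/vhost/parameters layout) could read the actual limit at init time
instead of hardcoding 64:

/* Sketch: query vhost's real region limit; fall back to 64 on kernels
 * that predate the module parameter. Needs <stdio.h>.
 */
static uint32_t
vhost_kernel_get_max_regions(void)
{
	uint32_t max_regions = 64;
	FILE *f;

	f = fopen("/sys/module/vhost/parameters/max_mem_regions", "r");
	if (f) {
		if (fscanf(f, "%u", &max_regions) != 1)
			max_regions = 64;
		fclose(f);
	}
	return max_regions;
}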

> +
> +static uint64_t vhost_req_user_to_kernel[] = {
> +	[VHOST_USER_SET_OWNER] = VHOST_SET_OWNER,
> +	[VHOST_USER_RESET_OWNER] = VHOST_RESET_OWNER,
> +	[VHOST_USER_SET_FEATURES] = VHOST_SET_FEATURES,
> +	[VHOST_USER_GET_FEATURES] = VHOST_GET_FEATURES,
> +	[VHOST_USER_SET_VRING_CALL] = VHOST_SET_VRING_CALL,
> +	[VHOST_USER_SET_VRING_NUM] = VHOST_SET_VRING_NUM,
> +	[VHOST_USER_SET_VRING_BASE] = VHOST_SET_VRING_BASE,
> +	[VHOST_USER_GET_VRING_BASE] = VHOST_GET_VRING_BASE,
> +	[VHOST_USER_SET_VRING_ADDR] = VHOST_SET_VRING_ADDR,
> +	[VHOST_USER_SET_VRING_KICK] = VHOST_SET_VRING_KICK,
> +	[VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
> +};
> +
> +/* By default, vhost kernel module allows 64 regions, but DPDK allows
> + * 256 segments. As a relief, below function merges those virtually
> + * adjacent memsegs into one region.
> + */
> +static struct vhost_memory_kernel *
> +prepare_vhost_memory_kernel(void)
> +{
> +	uint32_t i, j, k = 0;
> +	struct rte_memseg *seg;
> +	struct vhost_memory_region *mr;
> +	struct vhost_memory_kernel *vm;
> +
> +	vm = malloc(sizeof(struct vhost_memory_kernel) +
> +		    VHOST_KERNEL_MAX_REGIONS *
> +		    sizeof(struct vhost_memory_region));
> +
> +	for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
> +		seg = &rte_eal_get_configuration()->mem_config->memseg[i];
> +		if (!seg->addr)
> +			break;

If we're sure the number of regions is less than 64 (or the module parameter
read from /sys), can we avoid the iteration here?

> +
> +		int new_region = 1;
> +
> +		for (j = 0; j < k; ++j) {
> +			mr = &vm->regions[j];
> +
> +			if (mr->userspace_addr + mr->memory_size ==
> +			    (uint64_t)seg->addr) {
> +				mr->memory_size += seg->len;
> +				new_region = 0;
> +				break;
> +			}
> +
> +			if ((uint64_t)seg->addr + seg->len ==
> +			    mr->userspace_addr) {
> +				mr->guest_phys_addr = (uint64_t)seg->addr;
> +				mr->userspace_addr = (uint64_t)seg->addr;
> +				mr->memory_size += seg->len;
> +				new_region = 0;
> +				break;
> +			}
> +		}
> +
> +		if (new_region == 0)
> +			continue;
> +
> +		mr = &vm->regions[k++];
> +		mr->guest_phys_addr = (uint64_t)seg->addr; /* use vaddr here! */
> +		mr->userspace_addr = (uint64_t)seg->addr;
> +		mr->memory_size = seg->len;
> +		mr->mmap_offset = 0;
> +
> +		if (k >= VHOST_KERNEL_MAX_REGIONS) {
> +			free(vm);
> +			return NULL;
> +		}
> +	}
> +
> +	vm->nregions = k;
> +	vm->padding = 0;
> +	return vm;
> +}
> +
> +static int
> +vhost_kernel_ioctl(struct virtio_user_dev *dev,
> +		   enum vhost_user_request req,
> +		   void *arg)
> +{
> +	int i, ret = -1;
> +	uint64_t req_kernel;
> +	struct vhost_memory_kernel *vm = NULL;
> +
> +	req_kernel = vhost_req_user_to_kernel[req];
> +
> +	if (req_kernel == VHOST_SET_MEM_TABLE) {
> +		vm = prepare_vhost_memory_kernel();
> +		if (!vm)
> +			return -1;
> +		arg = (void *)vm;
> +	}
> +
> +	/* Does not work when VIRTIO_F_IOMMU_PLATFORM now, why? */

I think the reason is that when VIRTIO_F_IOMMU_PLATFORM is negotiated, all
addresses should be IOVAs instead of GPAs.

> +	if (req_kernel == VHOST_SET_FEATURES)
> +		*(uint64_t *)arg &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
> +
> +	for (i = 0; i < VHOST_KERNEL_MAX_QUEUES; ++i) {
> +		if (dev->vhostfds[i] < 0)
> +			continue;
> +
> +		ret = ioctl(dev->vhostfds[i], req_kernel, arg);
> +		if (ret < 0)
> +			break;
> +	}
> +
> +	if (vm)
> +		free(vm);
> +
> +	return ret;
> +}
> +
> +/**
> + * Set up environment to talk with a vhost kernel backend.
> + *
> + * @return
> + *   - (-1) if fail to set up;
> + *   - (>=0) if successful.
> + */
> +static int
> +vhost_kernel_setup(struct virtio_user_dev *dev)
> +{
> +	int vhostfd;
> +	uint32_t q;
> +
> +	for (q = 0; q < dev->max_queue_pairs; ++q) {
> +		vhostfd = open(dev->path, O_RDWR);
> +		if (vhostfd < 0) {
> +			PMD_DRV_LOG(ERR, "fail to open %s, %s",
> +				    dev->path, strerror(errno));
> +			return -1;
> +		}
> +
> +		dev->vhostfds[q] = vhostfd;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +vhost_kernel_set_backend(int vhostfd, int tapfd)
> +{
> +	struct vhost_vring_file f;
> +
> +	f.fd = tapfd;
> +	f.index = 0;
> +	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
> +		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
> +				strerror(errno));
> +		return -1;
> +	}
> +
> +	f.index = 1;
> +	if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
> +		PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
> +				strerror(errno));
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
> +			       uint16_t pair_idx,
> +			       int enable)
> +{
> +	unsigned int features;
> +	int sndbuf = TUN_DEF_SNDBUF;
> +	struct ifreq ifr;
> +	int hdr_size;
> +	int vhostfd;
> +	int tapfd;
> +
> +	vhostfd = dev->vhostfds[pair_idx];
> +
> +	if (!enable) {
> +		if (dev->tapfds[pair_idx]) {
> +			close(dev->tapfds[pair_idx]);
> +			dev->tapfds[pair_idx] = -1;
> +		}
> +		return vhost_kernel_set_backend(vhostfd, -1);

If this is used for things like ethtool -L in the guest, we should use
TUNSETQUEUE here.
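
I.e. keep the tap fd around and just detach/attach the queue, something like
(untested):

/* Untested sketch: detach the queue from the multiqueue tap instead of
 * closing the fd, so a later re-enable can simply re-attach it.
 */
static int
vhost_kernel_tap_set_queue(int tapfd, int attach)
{
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = attach ? IFF_ATTACH_QUEUE : IFF_DETACH_QUEUE;
	return ioctl(tapfd, TUNSETQUEUE, &ifr);
}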

> +	} else if (dev->tapfds[pair_idx] >= 0) {
> +		return 0;
> +	}
> +
> +	if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ||
> +	    (dev->features & (1ULL << VIRTIO_F_VERSION_1)))
> +		hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +	else
> +		hdr_size = sizeof(struct virtio_net_hdr);
> +
> +	/* TODO:
> +	 * 1. verify we can get/set vnet_hdr_len, tap_probe_vnet_hdr_len
> +	 * 2. get number of memory regions from vhost module parameter
> +	 * max_mem_regions, supported in newer version linux kernel
> +	 */
> +	tapfd = open(PATH_NET_TUN, O_RDWR);
> +	if (tapfd < 0) {
> +		PMD_DRV_LOG(ERR, "fail to open %s: %s",
> +			    PATH_NET_TUN, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* Construct ifr */
> +	memset(&ifr, 0, sizeof(ifr));
> +	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
> +
> +	if (ioctl(tapfd, TUNGETFEATURES, &features) == -1) {
> +		PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno));
> +		goto error;
> +	}
> +	if (features & IFF_ONE_QUEUE)
> +		ifr.ifr_flags |= IFF_ONE_QUEUE;
> +
> +	/* Let tap instead of vhost-net handle vnet header, as the latter does
> +	 * not support offloading. And in this case, we should not set feature
> +	 * bit VHOST_NET_F_VIRTIO_NET_HDR.
> +	 */
> +	if (features & IFF_VNET_HDR) {
> +		ifr.ifr_flags |= IFF_VNET_HDR;
> +	} else {
> +		PMD_DRV_LOG(ERR, "TAP does not support IFF_VNET_HDR");
> +		goto error;
> +	}
> +
> +	if (dev->ifname)
> +		strncpy(ifr.ifr_name, dev->ifname, IFNAMSIZ);
> +	else
> +		strncpy(ifr.ifr_name, "tap%d", IFNAMSIZ);
> +	if (ioctl(tapfd, TUNSETIFF, (void *)&ifr) == -1) {
> +		PMD_DRV_LOG(ERR, "TUNSETIFF failed: %s", strerror(errno));
> +		goto error;
> +	}

This requires CAP_NET_ADMIN, so we should really consider accepting a
pre-created fd here.
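
One way (purely illustrative; the devarg and field names below are made up)
would be to let a privileged helper do the open()/TUNSETIFF and pass the
resulting fd in, falling back to the current path only when it is absent:

/* Hypothetical sketch: use a tap fd handed in by a privileged helper
 * (e.g. through a new "tapfd=<n>" devarg -- name invented here) instead
 * of opening /dev/net/tun ourselves, which needs CAP_NET_ADMIN.
 */
if (dev->user_tapfd >= 0) {
	tapfd = dev->user_tapfd;	/* already opened and TUNSETIFF'ed */
} else {
	tapfd = open(PATH_NET_TUN, O_RDWR);
	/* ... existing TUNGETFEATURES/TUNSETIFF sequence ... */
}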

> +
> +	fcntl(tapfd, F_SETFL, O_NONBLOCK);
> +
> +	if (ioctl(tapfd, TUNSETVNETHDRSZ, &hdr_size) < 0) {
> +		PMD_DRV_LOG(ERR, "TUNSETVNETHDRSZ failed: %s", strerror(errno));
> +		goto error;
> +	}
> +
> +	if (ioctl(tapfd, TUNSETSNDBUF, &sndbuf) < 0) {
> +		PMD_DRV_LOG(ERR, "TUNSETSNDBUF failed: %s", strerror(errno));
> +		goto error;
> +	}

Let's use INT_MAX as the default here to survive an evil consumer.
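
E.g.:

/* Sketch: effectively unlimited sndbuf so a misbehaving consumer cannot
 * stall the tap queue; needs <limits.h> for INT_MAX.
 */
int sndbuf = INT_MAX;

if (ioctl(tapfd, TUNSETSNDBUF, &sndbuf) < 0) {
	PMD_DRV_LOG(ERR, "TUNSETSNDBUF failed: %s", strerror(errno));
	goto error;
}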

> +
> +	if (vhost_kernel_set_backend(vhostfd, tapfd) < 0)
> +		goto error;
> +
> +	dev->tapfds[pair_idx] = tapfd;
> +	if (!dev->ifname)
> +		dev->ifname = strdup(ifr.ifr_name);
> +
> +	return 0;
> +error:
> +	return -1;
> +}
> +
> +struct virtio_user_backend_ops ops_kernel = {
> +	.setup = vhost_kernel_setup,
> +	.send_request = vhost_kernel_ioctl,
> +	.enable_qp = vhost_kernel_enable_queue_pair
> +};
> diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> index a818c29..c718b85 100644
> --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
> +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> @@ -219,7 +219,7 @@ is_vhost_user_by_type(const char *path)
>   static int
>   virtio_user_dev_setup(struct virtio_user_dev *dev)
>   {
> -	uint32_t i;
> +	uint32_t i, q;
>   
>   	dev->vhostfd = -1;
>   	for (i = 0; i < VIRTIO_MAX_VIRTQUEUES * 2 + 1; ++i) {
> @@ -227,12 +227,18 @@ virtio_user_dev_setup(struct virtio_user_dev *dev)
>   		dev->callfds[i] = -1;
>   	}
>   
> +	for (q = 0; q < VHOST_KERNEL_MAX_QUEUES; ++q) {
> +		dev->vhostfds[q] = -1;
> +		dev->tapfds[q] = -1;
> +	}
> +
>   	if (is_vhost_user_by_type(dev->path)) {
>   		dev->ops = &ops_user;
> -		return dev->ops->setup(dev);
> +	} else {
> +		dev->ops = &ops_kernel;
>   	}
>   
> -	return -1;
> +	return dev->ops->setup(dev);
>   }
>   
>   int
> @@ -284,7 +290,9 @@ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
>   void
>   virtio_user_dev_uninit(struct virtio_user_dev *dev)
>   {
> -	uint32_t i;
> +	uint32_t i, q;
> +
> +	dev->ops->send_request(dev, VHOST_USER_RESET_OWNER, NULL);
>   
>   	for (i = 0; i < dev->max_queue_pairs * 2; ++i) {
>   		close(dev->callfds[i]);
> @@ -292,6 +300,11 @@ virtio_user_dev_uninit(struct virtio_user_dev *dev)
>   	}
>   
>   	close(dev->vhostfd);
> +
> +	for (q = 0; q < VHOST_KERNEL_MAX_QUEUES; ++q) {
> +		close(dev->vhostfds[q]);
> +		close(dev->tapfds[q]);
> +	}
>   }
>   
>   static uint8_t
> diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.h b/drivers/net/virtio/virtio_user/virtio_user_dev.h
> index 503a496..148b2e6 100644
> --- a/drivers/net/virtio/virtio_user/virtio_user_dev.h
> +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.h
> @@ -44,6 +44,10 @@ struct virtio_user_dev {
>   	int		vhostfd;
>   
>   	/* for vhost_kernel backend */
> +	char		*ifname;
> +#define VHOST_KERNEL_MAX_QUEUES		8
> +	int		vhostfds[VHOST_KERNEL_MAX_QUEUES];
> +	int		tapfds[VHOST_KERNEL_MAX_QUEUES];
>   
>   	/* for both vhost_user and vhost_kernel */
>   	int		callfds[VIRTIO_MAX_VIRTQUEUES * 2 + 1];


