[dpdk-dev] [PATCH 13/28] vhost: move mmap/munmap

Nikos Dragazis ndragazis at arrikto.com
Wed Jun 19 17:14:38 CEST 2019


Mapping the vhost memory regions is a transport-specific operation, so
this patch moves the relevant code into trans_af_unix.c.  The new
.map_mem_table()/.unmap_mem_table() interfaces allow transports to
perform the mapping and unmapping.

In addition, the function vhost_user_set_mem_table(), which performs the
mmapping, contains some code for postcopy live migration. However,
postcopy live migration is an AF_UNIX-bound feature, due to the
userfaultfd mechanism.  The virtio-vhost-user transport, which will be
added in later patches, cannot support it. Therefore, we move this code
into trans_af_unix.c as well.

The vhost_user_set_mem_table() debug logs have also been moved to the
.map_mem_table(). All new .map_mem_table() interfaces have to implement
the debug logs. This is necessary in order to keep the ordering of the
log messages in case of postcopy live migration.

Last but not least, after refactoring vhost_user_set_mem_table(),
read_vhost_message() is no longer being used in vhost_user.c. So, mark
it as static in trans_af_unix.c.

Signed-off-by: Nikos Dragazis <ndragazis at arrikto.com>
Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
---
 lib/librte_vhost/trans_af_unix.c | 185 ++++++++++++++++++++++++++++++++++++++-
 lib/librte_vhost/vhost.h         |  22 +++++
 lib/librte_vhost/vhost_user.c    | 171 ++++--------------------------------
 lib/librte_vhost/vhost_user.h    |   3 +
 4 files changed, 225 insertions(+), 156 deletions(-)

diff --git a/lib/librte_vhost/trans_af_unix.c b/lib/librte_vhost/trans_af_unix.c
index 5f9ef5a..522823f 100644
--- a/lib/librte_vhost/trans_af_unix.c
+++ b/lib/librte_vhost/trans_af_unix.c
@@ -5,7 +5,14 @@
  */
 
 #include <sys/socket.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
 #include <sys/un.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+#include <linux/userfaultfd.h>
+#endif
 #include <fcntl.h>
 
 #include <rte_log.h>
@@ -43,7 +50,7 @@ struct af_unix_socket {
 	struct sockaddr_un un;
 };
 
-int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
+static int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
 static int create_unix_socket(struct vhost_user_socket *vsocket);
 static int vhost_user_start_server(struct vhost_user_socket *vsocket);
 static int vhost_user_start_client(struct vhost_user_socket *vsocket);
@@ -317,7 +324,7 @@ vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
 }
 
 /* return bytes# of read on success or negative val on failure. */
-int
+static int
 read_vhost_message(int sockfd, struct VhostUserMsg *msg)
 {
 	int ret;
@@ -771,6 +778,178 @@ af_unix_vring_call(struct virtio_net *dev __rte_unused,
 	return 0;
 }
 
+static uint64_t
+get_blk_size(int fd)
+{
+	struct stat stat;
+	int ret;
+
+	ret = fstat(fd, &stat);
+	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
+}
+
+static int
+af_unix_map_mem_regions(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+	uint32_t i;
+	struct VhostUserMemory *memory = &msg->payload.memory;
+	struct vhost_user_connection *conn =
+		container_of(dev, struct vhost_user_connection, device);
+
+	for (i = 0; i < dev->mem->nregions; i++) {
+		struct rte_vhost_mem_region *reg = &dev->mem->regions[i];
+		uint64_t mmap_size = reg->mmap_size;
+		uint64_t mmap_offset = mmap_size - reg->size;
+		uint64_t alignment;
+		void *mmap_addr;
+		int populate;
+
+		/* mmap() without flag of MAP_ANONYMOUS, should be called
+		 * with length argument aligned with hugepagesz at older
+		 * longterm version Linux, like 2.6.32 and 3.2.72, or
+		 * mmap() will fail with EINVAL.
+		 *
+		 * to avoid failure, make sure in caller to keep length
+		 * aligned.
+		 */
+		alignment = get_blk_size(reg->fd);
+		if (alignment == (uint64_t)-1) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"couldn't get hugepage size through fstat\n");
+			return -1;
+		}
+		mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+
+		populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
+		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+				 MAP_SHARED | populate, reg->fd, 0);
+
+		if (mmap_addr == MAP_FAILED) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"mmap region %u failed.\n", i);
+			return -1;
+		}
+
+		reg->mmap_addr = mmap_addr;
+		reg->mmap_size = mmap_size;
+		reg->host_user_addr = (uint64_t)(uintptr_t)reg->mmap_addr +
+				      mmap_offset;
+
+		if (dev->dequeue_zero_copy)
+			if (add_guest_pages(dev, reg, alignment) < 0) {
+				RTE_LOG(ERR, VHOST_CONFIG,
+					"adding guest pages to region %u failed.\n",
+					i);
+				return -1;
+			}
+
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"guest memory region %u, size: 0x%" PRIx64 "\n"
+			"\t guest physical addr: 0x%" PRIx64 "\n"
+			"\t guest virtual  addr: 0x%" PRIx64 "\n"
+			"\t host  virtual  addr: 0x%" PRIx64 "\n"
+			"\t mmap addr : 0x%" PRIx64 "\n"
+			"\t mmap size : 0x%" PRIx64 "\n"
+			"\t mmap align: 0x%" PRIx64 "\n"
+			"\t mmap off  : 0x%" PRIx64 "\n",
+			i, reg->size,
+			reg->guest_phys_addr,
+			reg->guest_user_addr,
+			reg->host_user_addr,
+			(uint64_t)(uintptr_t)reg->mmap_addr,
+			reg->mmap_size,
+			alignment,
+			mmap_offset);
+
+		if (dev->postcopy_listening) {
+			/*
+			 * We haven't a better way right now than sharing
+			 * DPDK's virtual address with Qemu, so that Qemu can
+			 * retrieve the region offset when handling userfaults.
+			 */
+			memory->regions[i].userspace_addr =
+				reg->host_user_addr;
+		}
+	}
+
+	if (dev->postcopy_listening) {
+		/* Send the addresses back to qemu */
+		msg->fd_num = 0;
+		/* Send reply */
+		msg->flags &= ~VHOST_USER_VERSION_MASK;
+		msg->flags &= ~VHOST_USER_NEED_REPLY;
+		msg->flags |= VHOST_USER_VERSION;
+		msg->flags |= VHOST_USER_REPLY_MASK;
+		af_unix_send_reply(dev, msg);
+
+		/* Wait for qemu to acknowledge that it has got the addresses;
+		 * we have to wait before we're allowed to generate faults.
+		 */
+		VhostUserMsg ack_msg;
+		if (read_vhost_message(conn->connfd, &ack_msg) <= 0) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"Failed to read qemu ack on postcopy set-mem-table\n");
+			return -1;
+		}
+		if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"Bad qemu ack on postcopy set-mem-table (%d)\n",
+				ack_msg.request.master);
+			return -1;
+		}
+
+		/* Now userfault register and we can use the memory */
+		for (i = 0; i < memory->nregions; i++) {
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+			struct rte_vhost_mem_region *reg = &dev->mem->regions[i];
+			struct uffdio_register reg_struct;
+
+			/*
+			 * Let's register all the mmap'ed area to ensure
+			 * alignment on page boundary.
+			 */
+			reg_struct.range.start =
+				(uint64_t)(uintptr_t)reg->mmap_addr;
+			reg_struct.range.len = reg->mmap_size;
+			reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+			if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
+						&reg_struct)) {
+				RTE_LOG(ERR, VHOST_CONFIG,
+					"Failed to register ufd for region %d: (ufd = %d) %s\n",
+					i, dev->postcopy_ufd,
+					strerror(errno));
+				return -1;
+			}
+			RTE_LOG(INFO, VHOST_CONFIG,
+				"\t userfaultfd registered for range : %llx - %llx\n",
+				reg_struct.range.start,
+				reg_struct.range.start +
+				reg_struct.range.len - 1);
+#else
+			return -1;
+#endif
+		}
+	}
+
+	return 0;
+}
+
+static void
+af_unix_unmap_mem_regions(struct virtio_net *dev)
+{
+	uint32_t i;
+	struct rte_vhost_mem_region *reg;
+
+	for (i = 0; i < dev->mem->nregions; i++) {
+		reg = &dev->mem->regions[i];
+		if (reg->host_user_addr) {
+			munmap(reg->mmap_addr, reg->mmap_size);
+			close(reg->fd);
+		}
+	}
+}
+
 const struct vhost_transport_ops af_unix_trans_ops = {
 	.socket_size = sizeof(struct af_unix_socket),
 	.device_size = sizeof(struct vhost_user_connection),
@@ -783,4 +962,6 @@ const struct vhost_transport_ops af_unix_trans_ops = {
 	.send_slave_req = af_unix_send_slave_req,
 	.process_slave_message_reply = af_unix_process_slave_message_reply,
 	.set_slave_req_fd = af_unix_set_slave_req_fd,
+	.map_mem_regions = af_unix_map_mem_regions,
+	.unmap_mem_regions = af_unix_unmap_mem_regions,
 };
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 2213fbe..28038c6 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -415,6 +415,28 @@ struct vhost_transport_ops {
 	 */
 	int (*set_slave_req_fd)(struct virtio_net *dev,
 				struct VhostUserMsg *msg);
+
+	/**
+	 * Map memory table regions in dev->mem->regions[].
+	 *
+	 * @param dev
+	 *  vhost device
+	 * @param msg
+	 *  message
+	 * @return
+	 *  0 on success, -1 on failure
+	 */
+	int (*map_mem_regions)(struct virtio_net *dev,
+				struct VhostUserMsg *msg);
+
+	/**
+	 * Unmap memory table regions in dev->mem->regions[] and free any
+	 * resources, such as file descriptors.
+	 *
+	 * @param dev
+	 *  vhost device
+	 */
+	void (*unmap_mem_regions)(struct virtio_net *dev);
 };
 
 /** The traditional AF_UNIX vhost-user protocol transport. */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index a4dcba1..ed8dbd8 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -81,17 +81,6 @@ static const char *vhost_message_str[VHOST_USER_MAX] = {
 };
 
 static int send_vhost_reply(struct virtio_net *dev, struct VhostUserMsg *msg);
-int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
-
-static uint64_t
-get_blk_size(int fd)
-{
-	struct stat stat;
-	int ret;
-
-	ret = fstat(fd, &stat);
-	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
-}
 
 /*
  * Reclaim all the outstanding zmbufs for a virtqueue.
@@ -120,7 +109,6 @@ static void
 free_mem_region(struct virtio_net *dev)
 {
 	uint32_t i;
-	struct rte_vhost_mem_region *reg;
 	struct vhost_virtqueue *vq;
 
 	if (!dev || !dev->mem)
@@ -134,13 +122,7 @@ free_mem_region(struct virtio_net *dev)
 		}
 	}
 
-	for (i = 0; i < dev->mem->nregions; i++) {
-		reg = &dev->mem->regions[i];
-		if (reg->host_user_addr) {
-			munmap(reg->mmap_addr, reg->mmap_size);
-			close(reg->fd);
-		}
-	}
+	dev->trans_ops->unmap_mem_regions(dev);
 }
 
 void
@@ -792,7 +774,7 @@ add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
 	return 0;
 }
 
-static int
+int
 add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
 		uint64_t page_size)
 {
@@ -881,18 +863,13 @@ vhost_memory_changed(struct VhostUserMemory *new,
 
 static int
 vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
-			int main_fd)
+			int main_fd __rte_unused)
 {
 	struct virtio_net *dev = *pdev;
 	struct VhostUserMemory *memory = &msg->payload.memory;
 	struct rte_vhost_mem_region *reg;
-	void *mmap_addr;
-	uint64_t mmap_size;
 	uint64_t mmap_offset;
-	uint64_t alignment;
 	uint32_t i;
-	int populate;
-	int fd;
 
 	if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
 		RTE_LOG(ERR, VHOST_CONFIG,
@@ -904,8 +881,11 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 		RTE_LOG(INFO, VHOST_CONFIG,
 			"(%d) memory regions not changed\n", dev->vid);
 
-		for (i = 0; i < memory->nregions; i++)
-			close(msg->fds[i]);
+		for (i = 0; i < memory->nregions; i++) {
+			if (msg->fds[i] >= 0) {
+				close(msg->fds[i]);
+			}
+		}
 
 		return RTE_VHOST_MSG_RESULT_OK;
 	}
@@ -946,13 +926,15 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 	dev->mem->nregions = memory->nregions;
 
 	for (i = 0; i < memory->nregions; i++) {
-		fd  = msg->fds[i];
 		reg = &dev->mem->regions[i];
 
 		reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
 		reg->guest_user_addr = memory->regions[i].userspace_addr;
 		reg->size            = memory->regions[i].memory_size;
-		reg->fd              = fd;
+		reg->mmap_size       = reg->size + memory->regions[i].mmap_offset;
+		reg->mmap_addr       = NULL;
+		reg->host_user_addr  = 0;
+		reg->fd              = msg->fds[i];
 
 		mmap_offset = memory->regions[i].mmap_offset;
 
@@ -962,132 +944,13 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 				"mmap_offset (%#"PRIx64") and memory_size "
 				"(%#"PRIx64") overflow\n",
 				mmap_offset, reg->size);
-			goto err_mmap;
-		}
-
-		mmap_size = reg->size + mmap_offset;
-
-		/* mmap() without flag of MAP_ANONYMOUS, should be called
-		 * with length argument aligned with hugepagesz at older
-		 * longterm version Linux, like 2.6.32 and 3.2.72, or
-		 * mmap() will fail with EINVAL.
-		 *
-		 * to avoid failure, make sure in caller to keep length
-		 * aligned.
-		 */
-		alignment = get_blk_size(fd);
-		if (alignment == (uint64_t)-1) {
-			RTE_LOG(ERR, VHOST_CONFIG,
-				"couldn't get hugepage size through fstat\n");
-			goto err_mmap;
-		}
-		mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
-
-		populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
-		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
-				 MAP_SHARED | populate, fd, 0);
-
-		if (mmap_addr == MAP_FAILED) {
-			RTE_LOG(ERR, VHOST_CONFIG,
-				"mmap region %u failed.\n", i);
-			goto err_mmap;
+			goto err;
 		}
 
-		reg->mmap_addr = mmap_addr;
-		reg->mmap_size = mmap_size;
-		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
-				      mmap_offset;
-
-		if (dev->dequeue_zero_copy)
-			if (add_guest_pages(dev, reg, alignment) < 0) {
-				RTE_LOG(ERR, VHOST_CONFIG,
-					"adding guest pages to region %u failed.\n",
-					i);
-				goto err_mmap;
-			}
-
-		RTE_LOG(INFO, VHOST_CONFIG,
-			"guest memory region %u, size: 0x%" PRIx64 "\n"
-			"\t guest physical addr: 0x%" PRIx64 "\n"
-			"\t guest virtual  addr: 0x%" PRIx64 "\n"
-			"\t host  virtual  addr: 0x%" PRIx64 "\n"
-			"\t mmap addr : 0x%" PRIx64 "\n"
-			"\t mmap size : 0x%" PRIx64 "\n"
-			"\t mmap align: 0x%" PRIx64 "\n"
-			"\t mmap off  : 0x%" PRIx64 "\n",
-			i, reg->size,
-			reg->guest_phys_addr,
-			reg->guest_user_addr,
-			reg->host_user_addr,
-			(uint64_t)(uintptr_t)mmap_addr,
-			mmap_size,
-			alignment,
-			mmap_offset);
-
-		if (dev->postcopy_listening) {
-			/*
-			 * We haven't a better way right now than sharing
-			 * DPDK's virtual address with Qemu, so that Qemu can
-			 * retrieve the region offset when handling userfaults.
-			 */
-			memory->regions[i].userspace_addr =
-				reg->host_user_addr;
-		}
 	}
-	if (dev->postcopy_listening) {
-		/* Send the addresses back to qemu */
-		msg->fd_num = 0;
-		send_vhost_reply(dev, msg);
-
-		/* Wait for qemu to acknolwedge it's got the addresses
-		 * we've got to wait before we're allowed to generate faults.
-		 */
-		VhostUserMsg ack_msg;
-		if (read_vhost_message(main_fd, &ack_msg) <= 0) {
-			RTE_LOG(ERR, VHOST_CONFIG,
-				"Failed to read qemu ack on postcopy set-mem-table\n");
-			goto err_mmap;
-		}
-		if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
-			RTE_LOG(ERR, VHOST_CONFIG,
-				"Bad qemu ack on postcopy set-mem-table (%d)\n",
-				ack_msg.request.master);
-			goto err_mmap;
-		}
-
-		/* Now userfault register and we can use the memory */
-		for (i = 0; i < memory->nregions; i++) {
-#ifdef RTE_LIBRTE_VHOST_POSTCOPY
-			reg = &dev->mem->regions[i];
-			struct uffdio_register reg_struct;
 
-			/*
-			 * Let's register all the mmap'ed area to ensure
-			 * alignment on page boundary.
-			 */
-			reg_struct.range.start =
-				(uint64_t)(uintptr_t)reg->mmap_addr;
-			reg_struct.range.len = reg->mmap_size;
-			reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
-
-			if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
-						&reg_struct)) {
-				RTE_LOG(ERR, VHOST_CONFIG,
-					"Failed to register ufd for region %d: (ufd = %d) %s\n",
-					i, dev->postcopy_ufd,
-					strerror(errno));
-				goto err_mmap;
-			}
-			RTE_LOG(INFO, VHOST_CONFIG,
-				"\t userfaultfd registered for range : %llx - %llx\n",
-				reg_struct.range.start,
-				reg_struct.range.start +
-				reg_struct.range.len - 1);
-#else
-			goto err_mmap;
-#endif
-		}
-	}
+	if (dev->trans_ops->map_mem_regions(dev, msg) < 0)
+		goto err;
 
 	for (i = 0; i < dev->nr_vring; i++) {
 		struct vhost_virtqueue *vq = dev->virtqueue[i];
@@ -1103,7 +966,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 			dev = translate_ring_addresses(dev, i);
 			if (!dev) {
 				dev = *pdev;
-				goto err_mmap;
+				goto err;
 			}
 
 			*pdev = dev;
@@ -1114,7 +977,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 
 	return RTE_VHOST_MSG_RESULT_OK;
 
-err_mmap:
+err:
 	free_mem_region(dev);
 	rte_free(dev->mem);
 	dev->mem = NULL;
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 0169bd2..200e47b 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -147,6 +147,9 @@ typedef struct VhostUserMsg {
 
 /* vhost_user.c */
 int vhost_user_msg_handler(int vid, int fd, const struct VhostUserMsg *msg);
+int add_guest_pages(struct virtio_net *dev,
+		   struct rte_vhost_mem_region *reg,
+		   uint64_t page_size);
 int vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm);
 
 #endif
-- 
2.7.4



More information about the dev mailing list