[dpdk-dev] [PATCH v7 2/5] vfio: add multi container support

Xiao Wang xiao.w.wang at intel.com
Sun Apr 15 17:33:46 CEST 2018


This patch adds APIs to support container create/destroy and device
bind/unbind with a container. It also provides an API for IOMMU
programming on a specified container.

A driver can use the "rte_vfio_container_create" helper to create a
new container from EAL, then use "rte_vfio_bind_group" to bind a
device's IOMMU group to the newly created container. During
rte_vfio_setup_device(), the container bound to the device will be
used for IOMMU setup.
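
As a usage illustration (not part of this patch), below is a minimal
sketch of the intended driver-side flow; the helper name is
hypothetical, and the device's IOMMU group number is assumed to have
been discovered elsewhere (e.g. from sysfs):

	#include <rte_vfio.h>

	/* Hypothetical helper, for illustration only: create a dedicated
	 * container, bind the device's IOMMU group to it, and program that
	 * container's IOMMU for one memory region.
	 * Returns the container fd on success, -1 on failure.
	 */
	static int
	setup_device_container(int iommu_group_no, uint64_t vaddr,
			uint64_t iova, uint64_t len)
	{
		int container_fd, vfio_group_fd;

		container_fd = rte_vfio_container_create();
		if (container_fd < 0)
			return -1;

		vfio_group_fd = rte_vfio_bind_group(container_fd,
				iommu_group_no);
		if (vfio_group_fd < 0) {
			rte_vfio_container_destroy(container_fd);
			return -1;
		}

		/* only this container's IOMMU is programmed */
		if (rte_vfio_container_dma_map(container_fd, vaddr, iova,
				len) < 0) {
			/* destroy also unbinds any bound groups */
			rte_vfio_container_destroy(container_fd);
			return -1;
		}

		return container_fd;
	}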

Signed-off-by: Junjie Chen <junjie.j.chen at intel.com>
Signed-off-by: Xiao Wang <xiao.w.wang at intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin at redhat.com>
Reviewed-by: Ferruh Yigit <ferruh.yigit at intel.com>
---
 lib/librte_eal/bsdapp/eal/eal.c          |  52 +++++
 lib/librte_eal/common/include/rte_vfio.h | 119 ++++++++++++
 lib/librte_eal/linuxapp/eal/eal_vfio.c   | 316 +++++++++++++++++++++++++++++++
 lib/librte_eal/rte_eal_version.map       |   6 +
 4 files changed, 493 insertions(+)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 727adc5d2..c5106d0d6 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -769,6 +769,14 @@ int rte_vfio_noiommu_is_enabled(void);
 int rte_vfio_clear_group(int vfio_group_fd);
 int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
 int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
+int rte_vfio_container_create(void);
+int rte_vfio_container_destroy(int container_fd);
+int rte_vfio_bind_group(int container_fd, int iommu_group_no);
+int rte_vfio_unbind_group(int container_fd, int iommu_group_no);
+int rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
+		uint64_t iova, uint64_t len);
+int rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
+		uint64_t iova, uint64_t len);
 
 int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
 		      __rte_unused const char *dev_addr,
@@ -818,3 +826,47 @@ rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
 {
 	return -1;
 }
+
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_bind_group(__rte_unused int container_fd,
+		__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_unbind_group(__rte_unused int container_fd,
+		__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+			__rte_unused uint64_t vaddr,
+			__rte_unused uint64_t iova,
+			__rte_unused uint64_t len)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+			__rte_unused uint64_t vaddr,
+			__rte_unused uint64_t iova,
+			__rte_unused uint64_t len)
+{
+	return -1;
+}
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index d26ab01cb..0c1509b29 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -168,6 +168,125 @@ rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
 int __rte_experimental
 rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Create a new container for device binding.
+ *
+ * @return
+ *   the container fd if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_create(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Destroy the container, unbinding all vfio groups within it.
+ *
+ * @param container_fd
+ *   the container fd to destroy
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_destroy(int container_fd);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Bind an IOMMU group to a container.
+ *
+ * @param container_fd
+ *   the container's fd
+ *
+ * @param iommu_group_no
+ *   the iommu_group_no to bind to the container
+ *
+ * @return
+ *   the group fd if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_bind_group(int container_fd, int iommu_group_no);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Unbind an IOMMU group from a container.
+ *
+ * @param container_fd
+ *   the container's fd
+ *
+ * @param iommu_group_no
+ *   the iommu_group_no to unbind from the container
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_unbind_group(int container_fd, int iommu_group_no);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Perform DMA mapping for devices in a container.
+ *
+ * @param container_fd
+ *   the specified container fd
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be mapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be mapped.
+ *
+ * @param len
+ *   Length of memory segment being mapped.
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
+		uint64_t iova, uint64_t len);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Perform DMA unmapping for devices in a container.
+ *
+ * @param container_fd
+ *   the specified container fd
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be unmapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be unmapped.
+ *
+ * @param len
+ *   Length of memory segment being unmapped.
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
+		uint64_t iova, uint64_t len);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 46fba2d8d..2f566a621 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1668,6 +1668,278 @@ rte_vfio_noiommu_is_enabled(void)
 	return c == 'Y';
 }
 
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+	int i;
+
+	/* Find an empty slot for the new vfio config; slot 0 is the default */
+	for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
+		if (vfio_cfgs[i].vfio_container_fd == -1)
+			break;
+	}
+
+	if (i == VFIO_MAX_CONTAINERS) {
+		RTE_LOG(ERR, EAL, "exceeded max vfio container limit\n");
+		return -1;
+	}
+
+	vfio_cfgs[i].vfio_container_fd = vfio_get_container_fd();
+	if (vfio_cfgs[i].vfio_container_fd < 0) {
+		RTE_LOG(NOTICE, EAL, "failed to create a new container\n");
+		return -1;
+	}
+
+	return vfio_cfgs[i].vfio_container_fd;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(int container_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_no != -1)
+			rte_vfio_unbind_group(container_fd,
+				vfio_cfg->vfio_groups[i].group_no);
+
+	close(container_fd);
+	vfio_cfg->vfio_container_fd = -1;
+	vfio_cfg->vfio_active_groups = 0;
+	vfio_cfg->vfio_iommu_type = NULL;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_vfio_bind_group(int container_fd, int iommu_group_no)
+{
+	struct vfio_config *vfio_cfg;
+	struct vfio_group *cur_grp;
+	int vfio_group_fd;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	/* Check room for new group */
+	if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+		return -1;
+	}
+
+	/* Get an index for the new group */
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_no == -1) {
+			cur_grp = &vfio_cfg->vfio_groups[i];
+			break;
+		}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+		return -1;
+	}
+
+	vfio_group_fd = vfio_open_group_fd(iommu_group_no);
+	if (vfio_group_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_no);
+		return -1;
+	}
+	cur_grp->group_no = iommu_group_no;
+	cur_grp->fd = vfio_group_fd;
+	cur_grp->devices = 0;
+	vfio_cfg->vfio_active_groups++;
+
+	return vfio_group_fd;
+}
+
+int __rte_experimental
+rte_vfio_unbind_group(int container_fd, int iommu_group_no)
+{
+	struct vfio_config *vfio_cfg;
+	struct vfio_group *cur_grp;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
+		if (vfio_cfg->vfio_groups[i].group_no == iommu_group_no) {
+			cur_grp = &vfio_cfg->vfio_groups[i];
+			break;
+		}
+	}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Specified group number not found\n");
+		return -1;
+	}
+
+	if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
+		RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
+			" iommu_group_no %d\n", iommu_group_no);
+		return -1;
+	}
+	cur_grp->group_no = -1;
+	cur_grp->fd = -1;
+	cur_grp->devices = 0;
+	vfio_cfg->vfio_active_groups--;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len)
+{
+	struct user_mem_map *new_map;
+	struct vfio_config *vfio_cfg;
+	struct user_mem_maps *user_mem_maps;
+	int ret = 0;
+
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	user_mem_maps = &vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
+	if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+		RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
+		rte_errno = ENOMEM;
+		ret = -1;
+		goto out;
+	}
+	/* map the entry */
+	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
+		/* technically, this will fail if there are currently no
+		 * devices plugged in, even though the mapping might have
+		 * succeeded had a device been added later. however, since we
+		 * cannot verify that the mapping is valid without a device
+		 * attached, consider it unsupported: we can't just store an
+		 * arbitrary mapping and pollute the list of active mappings.
+		 */
+		RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
+		ret = -1;
+		goto out;
+	}
+	/* create new user mem map entry */
+	new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+	new_map->addr = vaddr;
+	new_map->iova = iova;
+	new_map->len = len;
+
+	compact_user_maps(user_mem_maps);
+out:
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+	return ret;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len)
+{
+	struct user_mem_map *map, *new_map = NULL;
+	struct vfio_config *vfio_cfg;
+	struct user_mem_maps *user_mem_maps;
+	int ret = 0;
+
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+	if (vfio_cfg == NULL) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	user_mem_maps = &vfio_cfg->mem_maps;
+	rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+	/* find our mapping */
+	map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
+	if (!map) {
+		RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
+		rte_errno = EINVAL;
+		ret = -1;
+		goto out;
+	}
+	if (map->addr != vaddr || map->iova != iova || map->len != len) {
+		/* we're partially unmapping a previously mapped region, so we
+		 * need to split the entry into two.
+		 */
+		if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+			RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+			rte_errno = ENOMEM;
+			ret = -1;
+			goto out;
+		}
+		new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+	}
+
+	/* unmap the entry */
+	if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
+		/* there may not be any devices plugged in, so unmapping will
+		 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
+		 * stop us from removing the mapping, as the assumption is we
+		 * won't be needing this memory any more and thus will want to
+		 * prevent it from being remapped again on hotplug. so, only
+		 * fail if we indeed failed to unmap (e.g. if the mapping was
+		 * within our mapped range but had invalid alignment).
+		 */
+		if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
+			RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
+			ret = -1;
+			goto out;
+		} else {
+			RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
+		}
+	}
+	/* remove map from the list of active mappings */
+	if (new_map != NULL) {
+		adjust_map(map, new_map, vaddr, len);
+
+		/* if we've created a new map by splitting, sort everything */
+		if (!is_null_map(new_map)) {
+			compact_user_maps(user_mem_maps);
+		} else {
+			/* we've created a new mapping, but it was unused */
+			user_mem_maps->n_maps--;
+		}
+	} else {
+		memset(map, 0, sizeof(*map));
+		compact_user_maps(user_mem_maps);
+		user_mem_maps->n_maps--;
+	}
+
+out:
+	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+	return ret;
+}
+
 #else
 
 int __rte_experimental
@@ -1684,4 +1956,48 @@ rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
 	return -1;
 }
 
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_bind_group(__rte_unused int container_fd,
+		__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_unbind_group(__rte_unused int container_fd,
+		__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+		__rte_unused uint64_t vaddr,
+		__rte_unused uint64_t iova,
+		__rte_unused uint64_t len)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+		__rte_unused uint64_t vaddr,
+		__rte_unused uint64_t iova,
+		__rte_unused uint64_t len)
+{
+	return -1;
+}
+
 #endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 2b5b1dcf5..c5eff065e 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -284,7 +284,13 @@ EXPERIMENTAL {
 	rte_service_start_with_defaults;
 	rte_socket_count;
 	rte_socket_id_by_idx;
+	rte_vfio_bind_group;
+	rte_vfio_container_create;
+	rte_vfio_container_destroy;
+	rte_vfio_container_dma_map;
+	rte_vfio_container_dma_unmap;
 	rte_vfio_dma_map;
 	rte_vfio_dma_unmap;
+	rte_vfio_unbind_group;
 
 } DPDK_18.02;
-- 
2.15.1


