[dpdk-dev] [PATCH v4 1/4] eal/vfio: add multiple container support

Xiao Wang xiao.w.wang at intel.com
Wed Apr 4 16:40:39 CEST 2018


Currently, the EAL VFIO framework binds a VFIO group fd to the default
container fd during rte_vfio_setup_device. In some cases, e.g. vDPA
(vhost data path acceleration), we want to put a VFIO group into a
separate container and program the IOMMU via that container.

This patch adds APIs to support creating containers and binding
devices to a container.

A driver can use the "rte_vfio_create_container" helper to create a
new container from EAL, and "rte_vfio_bind_group" to bind a device's
IOMMU group to the newly created container.

During rte_vfio_setup_device, the container bound to the device will
be used for IOMMU setup.
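
For example, a vDPA driver could drive the new APIs as below. This is
an illustrative sketch only: the sysfs base, device address and IOMMU
group number are placeholders, error unwinding is elided, and it
assumes the RTE_VFIO_TYPE1 constant is visible to the driver.

	#include <rte_vfio.h>
	#include <rte_memory.h>

	int container_fd, group_fd, vfio_dev_fd, i;
	int iommu_group_no = 42;	/* placeholder group number */
	struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
	const struct rte_memseg *ms = rte_eal_get_physmem_layout();

	/* create a container separate from the default one */
	container_fd = rte_vfio_create_container();

	/* bind the device's IOMMU group; the group fd is returned */
	group_fd = rte_vfio_bind_group(container_fd, iommu_group_no);

	/* IOMMU setup now uses the container bound above */
	rte_vfio_setup_device("/sys/bus/pci/devices", "0000:01:00.0",
			&vfio_dev_fd, &dev_info);

	/* program the IOMMU of the new container per memory segment */
	for (i = 0; i < RTE_MAX_MEMSEG && ms[i].addr != NULL; i++)
		rte_vfio_dma_map(container_fd, RTE_VFIO_TYPE1, &ms[i]);

	/* on teardown, unbind the remaining groups and close the fd */
	rte_vfio_destroy_container(container_fd);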

Signed-off-by: Junjie Chen <junjie.j.chen at intel.com>
Signed-off-by: Xiao Wang <xiao.w.wang at intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin at redhat.com>
---
v4:
- Remove the "rte_vfio_get_group_fd" API; "rte_vfio_bind_group" now returns the fd.
- Align the naming of the internal vfio_cfg search APIs.
---
 config/common_base                       |   1 +
 lib/librte_eal/bsdapp/eal/eal.c          |  52 ++-
 lib/librte_eal/common/include/rte_vfio.h | 113 +++++++
 lib/librte_eal/linuxapp/eal/eal_vfio.c   | 521 +++++++++++++++++++++++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h   |   1 +
 lib/librte_eal/rte_eal_version.map       |   6 +
 6 files changed, 593 insertions(+), 101 deletions(-)

diff --git a/config/common_base b/config/common_base
index 7abf7c6fc..2c40b2603 100644
--- a/config/common_base
+++ b/config/common_base
@@ -74,6 +74,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
 CONFIG_RTE_EAL_IGB_UIO=n
 CONFIG_RTE_EAL_VFIO=n
 CONFIG_RTE_MAX_VFIO_GROUPS=64
+CONFIG_RTE_MAX_VFIO_CONTAINERS=64
 CONFIG_RTE_MALLOC_DEBUG=n
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
 
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 4eafcb5ad..76f3beb39 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -38,6 +38,7 @@
 #include <rte_interrupts.h>
 #include <rte_bus.h>
 #include <rte_dev.h>
+#include <rte_vfio.h>
 #include <rte_devargs.h>
 #include <rte_version.h>
 #include <rte_atomic.h>
@@ -738,15 +739,6 @@ rte_eal_vfio_intr_mode(void)
 /* dummy forward declaration. */
 struct vfio_device_info;
 
-/* dummy prototypes. */
-int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
-		int *vfio_dev_fd, struct vfio_device_info *device_info);
-int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
-int rte_vfio_enable(const char *modname);
-int rte_vfio_is_enabled(const char *modname);
-int rte_vfio_noiommu_is_enabled(void);
-int rte_vfio_clear_group(int vfio_group_fd);
-
 int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
 		      __rte_unused const char *dev_addr,
 		      __rte_unused int *vfio_dev_fd,
@@ -781,3 +773,45 @@ int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
 {
 	return 0;
 }
+
+int __rte_experimental
+rte_vfio_create_container(void)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_destroy_container(__rte_unused int container_fd)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_bind_group(__rte_unused int container_fd,
+	__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_unbind_group(__rte_unused int container_fd,
+	__rte_unused int iommu_group_no)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_dma_map(__rte_unused int container_fd,
+	__rte_unused int dma_type,
+	__rte_unused const struct rte_memseg *ms)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(__rte_unused int container_fd,
+	__rte_unused int dma_type,
+	__rte_unused const struct rte_memseg *ms)
+{
+	return -1;
+}
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 249095e46..b6eb7bdb4 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -32,6 +32,8 @@
 extern "C" {
 #endif
 
+struct rte_memseg;
+
 /**
  * Setup vfio_cfg for the device identified by its address.
  * It discovers the configured I/O MMU groups or sets a new one for the device.
@@ -131,6 +133,117 @@ rte_vfio_clear_group(int vfio_group_fd);
 }
 #endif
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Create a new container.
+ *
+ * @return
+ *   the container fd if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_create_container(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Destroy the container and unbind all VFIO groups within it.
+ *
+ * @param container_fd
+ *   the container fd to destroy
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_destroy_container(int container_fd);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Bind an IOMMU group to the specified container.
+ *
+ * @param container_fd
+ *   the container's fd
+ *
+ * @param iommu_group_no
+ *   the iommu_group_no to bind to container
+ *
+ * @return
+ *   group fd if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_bind_group(int container_fd, int iommu_group_no);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Unbind a group from specified container.
+ *
+ * @param container_fd
+ *   the container's fd
+ *
+ * @param iommu_group_no
+ *   the iommu_group_no to unbind from the container
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_unbind_group(int container_fd, int iommu_group_no);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Perform DMA mapping for devices in the specified container.
+ *
+ * @param container_fd
+ *   the specified container fd
+ *
+ * @param dma_type
+ *   the DMA type for mapping
+ *
+ * @param ms
+ *   the dma address region to map
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_dma_map(int container_fd, int dma_type, const struct rte_memseg *ms);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Perform DMA unmapping for devices in the specified container.
+ *
+ * @param container_fd
+ *   the specified container fd
+ *
+ * @param dma_type
+ *   the DMA unmap type
+ *
+ * @param ms
+ *   the dma address region to unmap
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_dma_unmap(int container_fd, int dma_type, const struct rte_memseg *ms);
+
 #endif /* VFIO_PRESENT */
 
 #endif /* _RTE_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d04..1685745ac 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -9,6 +9,7 @@
 
 #include <rte_log.h>
 #include <rte_memory.h>
+#include <rte_malloc.h>
 #include <rte_eal_memconfig.h>
 #include <rte_vfio.h>
 
@@ -19,7 +20,9 @@
 #ifdef VFIO_PRESENT
 
 /* per-process VFIO config */
-static struct vfio_config vfio_cfg;
+static struct vfio_config default_vfio_cfg;
+
+static struct vfio_config *vfio_cfgs[VFIO_MAX_CONTAINERS] = {&default_vfio_cfg};
 
 static int vfio_type1_dma_map(int);
 static int vfio_spapr_dma_map(int);
@@ -35,38 +38,13 @@ static const struct vfio_iommu_type iommu_types[] = {
 	{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
 };
 
-int
-vfio_get_group_fd(int iommu_group_no)
+static int
+vfio_open_group_fd(int iommu_group_no)
 {
-	int i;
 	int vfio_group_fd;
 	char filename[PATH_MAX];
-	struct vfio_group *cur_grp;
-
-	/* check if we already have the group descriptor open */
-	for (i = 0; i < VFIO_MAX_GROUPS; i++)
-		if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
-			return vfio_cfg.vfio_groups[i].fd;
-
-	/* Lets see first if there is room for a new group */
-	if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
-		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
-		return -1;
-	}
-
-	/* Now lets get an index for the new group */
-	for (i = 0; i < VFIO_MAX_GROUPS; i++)
-		if (vfio_cfg.vfio_groups[i].group_no == -1) {
-			cur_grp = &vfio_cfg.vfio_groups[i];
-			break;
-		}
 
-	/* This should not happen */
-	if (i == VFIO_MAX_GROUPS) {
-		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
-		return -1;
-	}
-	/* if primary, try to open the group */
+	/* if in primary process, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 		/* try regular group format */
 		snprintf(filename, sizeof(filename),
@@ -75,8 +53,8 @@ vfio_get_group_fd(int iommu_group_no)
 		if (vfio_group_fd < 0) {
 			/* if file not found, it's not an error */
 			if (errno != ENOENT) {
-				RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
-						strerror(errno));
+				RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+					filename, strerror(errno));
 				return -1;
 			}
 
@@ -86,8 +64,10 @@ vfio_get_group_fd(int iommu_group_no)
 			vfio_group_fd = open(filename, O_RDWR);
 			if (vfio_group_fd < 0) {
 				if (errno != ENOENT) {
-					RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
-							strerror(errno));
+					RTE_LOG(ERR, EAL,
+						"Cannot open %s: %s\n",
+						filename,
+						strerror(errno));
 					return -1;
 				}
 				return 0;
@@ -95,21 +75,19 @@ vfio_get_group_fd(int iommu_group_no)
 			/* noiommu group found */
 		}
 
-		cur_grp->group_no = iommu_group_no;
-		cur_grp->fd = vfio_group_fd;
-		vfio_cfg.vfio_active_groups++;
 		return vfio_group_fd;
 	}
-	/* if we're in a secondary process, request group fd from the primary
+	/*
+	 * if we're in a secondary process, request group fd from the primary
 	 * process via our socket
 	 */
 	else {
-		int socket_fd, ret;
-
-		socket_fd = vfio_mp_sync_connect_to_primary();
+		int ret;
+		int socket_fd = vfio_mp_sync_connect_to_primary();
 
 		if (socket_fd < 0) {
-			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+			RTE_LOG(ERR, EAL,
+				"  cannot connect to primary process!\n");
 			return -1;
 		}
 		if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
@@ -122,6 +100,7 @@ vfio_get_group_fd(int iommu_group_no)
 			close(socket_fd);
 			return -1;
 		}
+
 		ret = vfio_mp_sync_receive_request(socket_fd);
 		switch (ret) {
 		case SOCKET_NO_FD:
@@ -132,9 +111,6 @@ vfio_get_group_fd(int iommu_group_no)
 			/* if we got the fd, store it and return it */
 			if (vfio_group_fd > 0) {
 				close(socket_fd);
-				cur_grp->group_no = iommu_group_no;
-				cur_grp->fd = vfio_group_fd;
-				vfio_cfg.vfio_active_groups++;
 				return vfio_group_fd;
 			}
 			/* fall-through on error */
@@ -147,70 +123,348 @@ vfio_get_group_fd(int iommu_group_no)
 	return -1;
 }
 
+static struct vfio_config *
+get_vfio_cfg_by_group_fd(int vfio_group_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i, j;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		if (!vfio_cfgs[i])
+			continue;
+
+		vfio_cfg = vfio_cfgs[i];
+		for (j = 0; j < VFIO_MAX_GROUPS; j++)
+			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+				return vfio_cfg;
+	}
+
+	return &default_vfio_cfg;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_group_no(int iommu_group_no)
+{
+	struct vfio_config *vfio_cfg;
+	int i, j;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		if (!vfio_cfgs[i])
+			continue;
+
+		vfio_cfg = vfio_cfgs[i];
+		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+			if (vfio_cfg->vfio_groups[j].group_no ==
+					iommu_group_no)
+				return vfio_cfg;
+		}
+	}
+
+	return &default_vfio_cfg;
+}
 
 static int
-get_vfio_group_idx(int vfio_group_fd)
+get_container_idx(int container_fd)
 {
 	int i;
-	for (i = 0; i < VFIO_MAX_GROUPS; i++)
-		if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd)
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		if (!vfio_cfgs[i])
+			continue;
+
+		if (vfio_cfgs[i]->vfio_container_fd == container_fd)
 			return i;
+	}
+
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_create_container(void)
+{
+	struct vfio_config *vfio_cfg;
+	int i;
+
+	/* Find an empty slot to store new vfio config */
+	for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
+		if (vfio_cfgs[i] == NULL)
+			break;
+	}
+
+	if (i == VFIO_MAX_CONTAINERS) {
+		RTE_LOG(ERR, EAL, "exceeded max VFIO container limit\n");
+		return -1;
+	}
+
+	vfio_cfgs[i] = rte_zmalloc("vfio_container", sizeof(struct vfio_config),
+		RTE_CACHE_LINE_SIZE);
+	if (vfio_cfgs[i] == NULL)
+		return -ENOMEM;
+
+	RTE_LOG(INFO, EAL, "alloc container at slot %d\n", i);
+	vfio_cfg = vfio_cfgs[i];
+	vfio_cfg->vfio_active_groups = 0;
+	vfio_cfg->vfio_container_fd = vfio_get_container_fd();
+
+	if (vfio_cfg->vfio_container_fd < 0) {
+		rte_free(vfio_cfgs[i]);
+		vfio_cfgs[i] = NULL;
+		return -1;
+	}
+
+	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
+		vfio_cfg->vfio_groups[i].group_no = -1;
+		vfio_cfg->vfio_groups[i].fd = -1;
+	}
+
+	return vfio_cfg->vfio_container_fd;
+}
+
+int __rte_experimental
+rte_vfio_destroy_container(int container_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i, idx;
+
+	idx = get_container_idx(container_fd);
+	if (idx < 0) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	vfio_cfg = vfio_cfgs[idx];
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_no != -1)
+			rte_vfio_unbind_group(container_fd,
+				vfio_cfg->vfio_groups[i].group_no);
+
+	rte_free(vfio_cfgs[idx]);
+	vfio_cfgs[idx] = NULL;
+	close(container_fd);
+
+	return 0;
+}
+
+int __rte_experimental
+rte_vfio_bind_group(int container_fd, int iommu_group_no)
+{
+	struct vfio_config *cur_vfio_cfg;
+	struct vfio_group *cur_grp;
+	int vfio_group_fd;
+	int i;
+
+	i = get_container_idx(container_fd);
+	if (i < 0) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	cur_vfio_cfg = vfio_cfgs[i];
+	/* Check room for new group */
+	if (cur_vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+		return -1;
+	}
+
+	/* Get an index for the new group */
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (cur_vfio_cfg->vfio_groups[i].group_no == -1) {
+			cur_grp = &cur_vfio_cfg->vfio_groups[i];
+			break;
+		}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+		return -1;
+	}
+
+	vfio_group_fd = vfio_open_group_fd(iommu_group_no);
+	if (vfio_group_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_no);
+		return -1;
+	}
+	cur_grp->group_no = iommu_group_no;
+	cur_grp->fd = vfio_group_fd;
+	cur_vfio_cfg->vfio_active_groups++;
+
+	return vfio_group_fd;
+}
+
+int __rte_experimental
+rte_vfio_unbind_group(int container_fd, int iommu_group_no)
+{
+	struct vfio_config *cur_vfio_cfg;
+	struct vfio_group *cur_grp;
+	int i;
+
+	i = get_container_idx(container_fd);
+	if (i < 0) {
+		RTE_LOG(ERR, EAL, "Invalid container fd\n");
+		return -1;
+	}
+
+	cur_vfio_cfg = vfio_cfgs[i];
+	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
+		if (cur_vfio_cfg->vfio_groups[i].group_no == iommu_group_no) {
+			cur_grp = &cur_vfio_cfg->vfio_groups[i];
+			break;
+		}
+	}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Specified group number not found\n");
+		return -1;
+	}
+
+	if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
+		RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
+				" iommu_group_no %d\n",
+			iommu_group_no);
+		return -1;
+	}
+	cur_grp->group_no = -1;
+	cur_grp->fd = -1;
+	cur_vfio_cfg->vfio_active_groups--;
+
+	return 0;
+}
+
+int
+vfio_get_group_fd(int iommu_group_no)
+{
+	struct vfio_group *cur_grp;
+	struct vfio_config *vfio_cfg;
+	int vfio_group_fd;
+	int i;
+
+	vfio_cfg = get_vfio_cfg_by_group_no(iommu_group_no);
+
+	/* check if we already have the group descriptor open */
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_no == iommu_group_no)
+			return vfio_cfg->vfio_groups[i].fd;
+
+	/* Let's first see if there is room for a new group */
+	if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+		return -1;
+	}
+
+	/* Now let's get an index for the new group */
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg->vfio_groups[i].group_no == -1) {
+			cur_grp = &vfio_cfg->vfio_groups[i];
+			break;
+		}
+
+	/* This should not happen */
+	if (i == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+		return -1;
+	}
+
+	vfio_group_fd = vfio_open_group_fd(iommu_group_no);
+	if (vfio_group_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_no);
+		return -1;
+	}
+
+	cur_grp->group_no = iommu_group_no;
+	cur_grp->fd = vfio_group_fd;
+	vfio_cfg->vfio_active_groups++;
+
+	return vfio_group_fd;
+}
+
+static int
+get_vfio_group_idx(int vfio_group_fd)
+{
+	struct vfio_config *vfio_cfg;
+	int i, j;
+
+	for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+		if (!vfio_cfgs[i])
+			continue;
+
+		vfio_cfg = vfio_cfgs[i];
+		for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+			if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+				return j;
+		}
+	}
+
 	return -1;
 }
 
 static void
 vfio_group_device_get(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+
 	i = get_vfio_group_idx(vfio_group_fd);
 	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
 		RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 	else
-		vfio_cfg.vfio_groups[i].devices++;
+		vfio_cfg->vfio_groups[i].devices++;
 }
 
 static void
 vfio_group_device_put(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+
 	i = get_vfio_group_idx(vfio_group_fd);
 	if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
 		RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 	else
-		vfio_cfg.vfio_groups[i].devices--;
+		vfio_cfg->vfio_groups[i].devices--;
 }
 
 static int
 vfio_group_device_count(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+
 	i = get_vfio_group_idx(vfio_group_fd);
 	if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
 		RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 		return -1;
 	}
 
-	return vfio_cfg.vfio_groups[i].devices;
+	return vfio_cfg->vfio_groups[i].devices;
 }
 
 int
 rte_vfio_clear_group(int vfio_group_fd)
 {
+	struct vfio_config *vfio_cfg;
 	int i;
 	int socket_fd, ret;
 
+	vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 
 		i = get_vfio_group_idx(vfio_group_fd);
-		if (i < 0)
+		if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
+			RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
 			return -1;
-		vfio_cfg.vfio_groups[i].group_no = -1;
-		vfio_cfg.vfio_groups[i].fd = -1;
-		vfio_cfg.vfio_groups[i].devices = 0;
-		vfio_cfg.vfio_active_groups--;
+		}
+		vfio_cfg->vfio_groups[i].group_no = -1;
+		vfio_cfg->vfio_groups[i].fd = -1;
+		vfio_cfg->vfio_groups[i].devices = 0;
+		vfio_cfg->vfio_active_groups--;
 		return 0;
 	}
 
@@ -261,6 +515,8 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 	struct vfio_group_status group_status = {
 			.argsz = sizeof(group_status)
 	};
+	struct vfio_config *vfio_cfg;
+	int vfio_container_fd;
 	int vfio_group_fd;
 	int iommu_group_no;
 	int ret;
@@ -309,12 +565,14 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		return -1;
 	}
 
+	vfio_cfg = get_vfio_cfg_by_group_no(iommu_group_no);
+	vfio_container_fd = vfio_cfg->vfio_container_fd;
+
 	/* check if group does not have a container yet */
 	if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
-
 		/* add group to a container */
 		ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
-				&vfio_cfg.vfio_container_fd);
+				&vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container, "
 					"error %i (%s)\n", dev_addr, errno, strerror(errno));
@@ -331,11 +589,12 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		 * Note this can happen several times with the hotplug
 		 * functionality.
 		 */
+
 		if (internal_config.process_type == RTE_PROC_PRIMARY &&
-				vfio_cfg.vfio_active_groups == 1) {
+				vfio_cfg->vfio_active_groups == 1) {
 			/* select an IOMMU type which we will be using */
 			const struct vfio_iommu_type *t =
-				vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+				vfio_set_iommu_type(vfio_container_fd);
 			if (!t) {
 				RTE_LOG(ERR, EAL,
 					"  %s failed to select IOMMU type\n",
@@ -344,7 +603,13 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 				rte_vfio_clear_group(vfio_group_fd);
 				return -1;
 			}
-			ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
+			/* DMA map for the default container only. */
+			if (default_vfio_cfg.vfio_container_fd ==
+				vfio_container_fd)
+				ret = t->dma_map_func(vfio_container_fd);
+			else
+				ret = 0;
+
 			if (ret) {
 				RTE_LOG(ERR, EAL,
 					"  %s DMA remapping failed, error %i (%s)\n",
@@ -388,7 +653,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 
 int
 rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
-		    int vfio_dev_fd)
+			int vfio_dev_fd)
 {
 	struct vfio_group_status group_status = {
 			.argsz = sizeof(group_status)
@@ -456,9 +721,9 @@ rte_vfio_enable(const char *modname)
 	int vfio_available;
 
 	for (i = 0; i < VFIO_MAX_GROUPS; i++) {
-		vfio_cfg.vfio_groups[i].fd = -1;
-		vfio_cfg.vfio_groups[i].group_no = -1;
-		vfio_cfg.vfio_groups[i].devices = 0;
+		default_vfio_cfg.vfio_groups[i].fd = -1;
+		default_vfio_cfg.vfio_groups[i].group_no = -1;
+		default_vfio_cfg.vfio_groups[i].devices = 0;
 	}
 
 	/* inform the user that we are probing for VFIO */
@@ -480,12 +745,12 @@ rte_vfio_enable(const char *modname)
 		return 0;
 	}
 
-	vfio_cfg.vfio_container_fd = vfio_get_container_fd();
+	default_vfio_cfg.vfio_container_fd = vfio_get_container_fd();
 
 	/* check if we have VFIO driver enabled */
-	if (vfio_cfg.vfio_container_fd != -1) {
+	if (default_vfio_cfg.vfio_container_fd != -1) {
 		RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
-		vfio_cfg.vfio_enabled = 1;
+		default_vfio_cfg.vfio_enabled = 1;
 	} else {
 		RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
 	}
@@ -497,7 +762,7 @@ int
 rte_vfio_is_enabled(const char *modname)
 {
 	const int mod_available = rte_eal_check_module(modname) > 0;
-	return vfio_cfg.vfio_enabled && mod_available;
+	return default_vfio_cfg.vfio_enabled && mod_available;
 }
 
 const struct vfio_iommu_type *
@@ -665,41 +930,80 @@ vfio_get_group_no(const char *sysfs_base,
 }
 
 static int
-vfio_type1_dma_map(int vfio_container_fd)
+do_vfio_type1_dma_map(int vfio_container_fd, const struct rte_memseg *ms)
 {
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	int i, ret;
+	int ret;
+	struct vfio_iommu_type1_dma_map dma_map;
 
-	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
-		struct vfio_iommu_type1_dma_map dma_map;
+	memset(&dma_map, 0, sizeof(dma_map));
+	dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+	dma_map.vaddr = ms->addr_64;
+	dma_map.size = ms->len;
 
-		if (ms[i].addr == NULL)
-			break;
+	if (rte_eal_iova_mode() == RTE_IOVA_VA)
+		dma_map.iova = dma_map.vaddr;
+	else
+		dma_map.iova = ms->iova;
+	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 
-		memset(&dma_map, 0, sizeof(dma_map));
-		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-		dma_map.vaddr = ms[i].addr_64;
-		dma_map.size = ms[i].len;
-		if (rte_eal_iova_mode() == RTE_IOVA_VA)
-			dma_map.iova = dma_map.vaddr;
-		else
-			dma_map.iova = ms[i].iova;
-		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+	if (ret) {
+		RTE_LOG(ERR, EAL,
+			"  cannot set up DMA remapping, error %i (%s)\n",
+			errno,
+			strerror(errno));
+		return -1;
+	}
 
-		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+	return 0;
+}
 
-		if (ret) {
-			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-					  "error %i (%s)\n", errno,
-					  strerror(errno));
-			return -1;
-		}
+static int
+do_vfio_type1_dma_unmap(int vfio_container_fd, const struct rte_memseg *ms)
+{
+	int ret;
+	struct vfio_iommu_type1_dma_unmap dma_unmap;
+
+	memset(&dma_unmap, 0, sizeof(dma_unmap));
+	dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+	dma_unmap.size = ms->len;
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA)
+		dma_unmap.iova = ms->addr_64;
+	else
+		dma_unmap.iova = ms->iova;
+	dma_unmap.flags = 0;
+
+	ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+	if (ret) {
+		RTE_LOG(ERR, EAL,
+			"  cannot unmap DMA, error %i (%s)\n",
+			errno,
+			strerror(errno));
+		return -1;
 	}
 
 	return 0;
 }
 
+static int
+vfio_type1_dma_map(int vfio_container_fd)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	int i;
+	int ret = 0;
+
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		if (ms[i].addr == NULL)
+			break;
+		ret = do_vfio_type1_dma_map(vfio_container_fd, &ms[i]);
+		if (ret < 0)
+			return ret;
+	}
+
+	return ret;
+}
+
 static int
 vfio_spapr_dma_map(int vfio_container_fd)
 {
@@ -843,4 +1147,37 @@ rte_vfio_noiommu_is_enabled(void)
 	return c == 'Y';
 }
 
+int __rte_experimental
+rte_vfio_dma_map(int container_fd, int dma_type, const struct rte_memseg *ms)
+{
+
+	if (dma_type == RTE_VFIO_TYPE1) {
+		return do_vfio_type1_dma_map(container_fd, ms);
+	} else if (dma_type == RTE_VFIO_SPAPR) {
+		RTE_LOG(ERR, EAL,
+			"Additional DMA map for SPAPR type not supported yet.\n");
+		return -1;
+	} else if (dma_type == RTE_VFIO_NOIOMMU) {
+		return 0;
+	}
+
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(int container_fd, int dma_type, const struct rte_memseg *ms)
+{
+	if (dma_type == RTE_VFIO_TYPE1) {
+		return do_vfio_type1_dma_unmap(container_fd, ms);
+	} else if (dma_type == RTE_VFIO_SPAPR) {
+		RTE_LOG(ERR, EAL,
+			"Additional DMA unmap for SPAPR type not supported yet.\n");
+		return -1;
+	} else if (dma_type == RTE_VFIO_NOIOMMU) {
+		return 0;
+	}
+
+	return -1;
+}
+
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 80595773e..23a1e3608 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -86,6 +86,7 @@ struct vfio_iommu_spapr_tce_info {
 #endif
 
 #define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
+#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS
 
 /*
  * Function prototypes for VFIO multiprocess sync functions
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f331f54c9..fcf9494d1 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -255,5 +255,11 @@ EXPERIMENTAL {
 	rte_service_set_runstate_mapped_check;
 	rte_service_set_stats_enable;
 	rte_service_start_with_defaults;
+	rte_vfio_bind_group;
+	rte_vfio_create_container;
+	rte_vfio_destroy_container;
+	rte_vfio_dma_map;
+	rte_vfio_dma_unmap;
+	rte_vfio_unbind_group;
 
 } DPDK_18.02;
-- 
2.15.1


