[dpdk-dev] [PATCH v6 46/70] vfio: allow to map other memory regions

From: Anatoly Burakov <anatoly.burakov at intel.com>
Date: Wed Apr 11 14:30:21 CEST 2018


Currently it is not possible to use memory that is not owned by DPDK to
perform DMA. This scenario arises in vhost applications (such as SPDK)
where the guest sends its own memory table. To fill this gap, provide an
API for registering arbitrary memory regions in the VFIO container.
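
For illustration, a vhost-style backend receiving a guest memory table
could register each region roughly as sketched below. The structure and
helper names are hypothetical and not part of this patch; note that at
least one device must be attached to the VFIO container for the map call
to succeed.

    #include <stdint.h>

    #include <rte_log.h>
    #include <rte_vfio.h>

    /* hypothetical descriptor for a guest-supplied memory region; the
     * field names are illustrative only
     */
    struct guest_mem_region {
    	uint64_t host_vaddr;	/* region address in our process */
    	uint64_t guest_iova;	/* address the device will use for DMA */
    	uint64_t len;		/* region length */
    };

    static int
    register_guest_region(const struct guest_mem_region *r)
    {
    	/* make the externally owned memory visible to VFIO-bound devices */
    	if (rte_vfio_dma_map(r->host_vaddr, r->guest_iova, r->len) < 0) {
    		RTE_LOG(ERR, USER1, "guest region DMA map failed\n");
    		return -1;
    	}
    	return 0;
    }

    static void
    unregister_guest_region(const struct guest_mem_region *r)
    {
    	/* drop the mapping once the guest withdraws the region */
    	if (rte_vfio_dma_unmap(r->host_vaddr, r->guest_iova, r->len) < 0)
    		RTE_LOG(WARNING, USER1, "guest region DMA unmap failed\n");
    }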

Signed-off-by: Pawel Wodkowski <pawelx.wodkowski at intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
Signed-off-by: Gowrishankar Muthukrishnan <gowrishankar.m at linux.vnet.ibm.com>
Tested-by: Santosh Shukla <santosh.shukla at caviumnetworks.com>
Tested-by: Hemant Agrawal <hemant.agrawal at nxp.com>
Tested-by: Gowrishankar Muthukrishnan <gowrishankar.m at linux.vnet.ibm.com>
---
 lib/librte_eal/bsdapp/eal/eal.c          |  16 +
 lib/librte_eal/common/include/rte_vfio.h |  41 ++
 lib/librte_eal/linuxapp/eal/eal_vfio.c   | 708 +++++++++++++++++++++++++++----
 lib/librte_eal/linuxapp/eal/eal_vfio.h   |  12 +
 lib/librte_eal/rte_eal_version.map       |   2 +
 5 files changed, 705 insertions(+), 74 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 8e25d78..032a5ea 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -749,6 +749,8 @@ int rte_vfio_enable(const char *modname);
 int rte_vfio_is_enabled(const char *modname);
 int rte_vfio_noiommu_is_enabled(void);
 int rte_vfio_clear_group(int vfio_group_fd);
+int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
 
 int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
 		      __rte_unused const char *dev_addr,
@@ -784,3 +786,17 @@ int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
 {
 	return 0;
 }
+
+int __rte_experimental
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+		  __rte_unused uint64_t len)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+		    __rte_unused uint64_t len)
+{
+	return -1;
+}
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index 249095e..d26ab01 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -127,6 +127,47 @@ int rte_vfio_noiommu_is_enabled(void);
 int
 rte_vfio_clear_group(int vfio_group_fd);
 
+/**
+ * Map memory region for use with VFIO.
+ *
+ * @note requires at least one device to be attached at the time of mapping.
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be mapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be mapped.
+ *
+ * @param len
+ *   Length of memory segment being mapped.
+ *
+ * @return
+ *   0 on success.
+ *   -1 on error.
+ */
+int __rte_experimental
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
+
+
+/**
+ * Unmap memory region from VFIO.
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be unmapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be unmapped.
+ *
+ * @param len
+ *   Length of memory segment being unmapped.
+ *
+ * @return
+ *   0 on success.
+ *   -1 on error.
+ */
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index fb41e82..c1f0f87 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -2,11 +2,13 @@
  * Copyright(c) 2010-2014 Intel Corporation
  */
 
+#include <inttypes.h>
 #include <string.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
 
+#include <rte_errno.h>
 #include <rte_log.h>
 #include <rte_memory.h>
 #include <rte_eal_memconfig.h>
@@ -22,19 +24,227 @@
 static struct vfio_config vfio_cfg;
 
 static int vfio_type1_dma_map(int);
+static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
 static int vfio_spapr_dma_map(int);
+static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
 static int vfio_noiommu_dma_map(int);
+static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
+static int vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len,
+		int do_map);
 
 /* IOMMU types we support */
 static const struct vfio_iommu_type iommu_types[] = {
 	/* x86 IOMMU, otherwise known as type 1 */
-	{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+	{
+		.type_id = RTE_VFIO_TYPE1,
+		.name = "Type 1",
+		.dma_map_func = &vfio_type1_dma_map,
+		.dma_user_map_func = &vfio_type1_dma_mem_map
+	},
 	/* ppc64 IOMMU, otherwise known as spapr */
-	{ RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
+	{
+		.type_id = RTE_VFIO_SPAPR,
+		.name = "sPAPR",
+		.dma_map_func = &vfio_spapr_dma_map,
+		.dma_user_map_func = &vfio_spapr_dma_mem_map
+	},
 	/* IOMMU-less mode */
-	{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
+	{
+		.type_id = RTE_VFIO_NOIOMMU,
+		.name = "No-IOMMU",
+		.dma_map_func = &vfio_noiommu_dma_map,
+		.dma_user_map_func = &vfio_noiommu_dma_mem_map
+	},
 };
 
+/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
+ * recreate the mappings for DPDK segments, but we cannot do so for memory that
+ * was registered by the user themselves, so we need to store the user mappings
+ * somewhere, to recreate them later.
+ */
+#define VFIO_MAX_USER_MEM_MAPS 256
+struct user_mem_map {
+	uint64_t addr;
+	uint64_t iova;
+	uint64_t len;
+};
+static struct {
+	rte_spinlock_t lock;
+	int n_maps;
+	struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
+} user_mem_maps = {
+	.lock = RTE_SPINLOCK_INITIALIZER
+};
+
+static int
+is_null_map(const struct user_mem_map *map)
+{
+	return map->addr == 0 && map->iova == 0 && map->len == 0;
+}
+
+/* we may need to merge user mem maps together in case of user mapping/unmapping
+ * chunks of memory, so we'll need a comparator function to sort segments.
+ */
+static int
+user_mem_map_cmp(const void *a, const void *b)
+{
+	const struct user_mem_map *umm_a = a;
+	const struct user_mem_map *umm_b = b;
+
+	/* move null entries to end */
+	if (is_null_map(umm_a))
+		return 1;
+	if (is_null_map(umm_b))
+		return -1;
+
+	/* sort by iova first */
+	if (umm_a->iova < umm_b->iova)
+		return -1;
+	if (umm_a->iova > umm_b->iova)
+		return 1;
+
+	if (umm_a->addr < umm_b->addr)
+		return -1;
+	if (umm_a->addr > umm_b->addr)
+		return 1;
+
+	if (umm_a->len < umm_b->len)
+		return -1;
+	if (umm_a->len > umm_b->len)
+		return 1;
+
+	return 0;
+}
+
+/* adjust user map entry. this may result in shortening of existing map, or in
+ * splitting existing map in two pieces.
+ */
+static void
+adjust_map(struct user_mem_map *src, struct user_mem_map *end,
+		uint64_t remove_va_start, uint64_t remove_len)
+{
+	/* if va start is same as start address, we're simply moving start */
+	if (remove_va_start == src->addr) {
+		src->addr += remove_len;
+		src->iova += remove_len;
+		src->len -= remove_len;
+	} else if (remove_va_start + remove_len == src->addr + src->len) {
+		/* we're shrinking mapping from the end */
+		src->len -= remove_len;
+	} else {
+		/* we're blowing a hole in the middle */
+		struct user_mem_map tmp;
+		uint64_t total_len = src->len;
+
+		/* adjust source segment length */
+		src->len = remove_va_start - src->addr;
+
+		/* create temporary segment in the middle */
+		tmp.addr = src->addr + src->len;
+		tmp.iova = src->iova + src->len;
+		tmp.len = remove_len;
+
+		/* populate end segment - this one we will be keeping */
+		end->addr = tmp.addr + tmp.len;
+		end->iova = tmp.iova + tmp.len;
+		end->len = total_len - src->len - tmp.len;
+	}
+}
+
+/* try merging two maps into one, return 1 if succeeded */
+static int
+merge_map(struct user_mem_map *left, struct user_mem_map *right)
+{
+	if (left->addr + left->len != right->addr)
+		return 0;
+	if (left->iova + left->len != right->iova)
+		return 0;
+
+	left->len += right->len;
+
+	memset(right, 0, sizeof(*right));
+
+	return 1;
+}
+
+static struct user_mem_map *
+find_user_mem_map(uint64_t addr, uint64_t iova, uint64_t len)
+{
+	uint64_t va_end = addr + len;
+	uint64_t iova_end = iova + len;
+	int i;
+
+	for (i = 0; i < user_mem_maps.n_maps; i++) {
+		struct user_mem_map *map = &user_mem_maps.maps[i];
+		uint64_t map_va_end = map->addr + map->len;
+		uint64_t map_iova_end = map->iova + map->len;
+
+		/* check start VA */
+		if (addr < map->addr || addr >= map_va_end)
+			continue;
+		/* check if VA end is within boundaries */
+		if (va_end <= map->addr || va_end >= map_va_end)
+			continue;
+
+		/* check start IOVA */
+		if (iova < map->iova || iova >= map_iova_end)
+			continue;
+		/* check if IOVA end is within boundaries */
+		if (iova_end <= map->iova || iova_end >= map_iova_end)
+			continue;
+
+		/* we've found our map */
+		return map;
+	}
+	return NULL;
+}
+
+/* this will sort all user maps, and merge/compact any adjacent maps */
+static void
+compact_user_maps(void)
+{
+	int i, n_merged, cur_idx;
+
+	qsort(user_mem_maps.maps, user_mem_maps.n_maps,
+			sizeof(user_mem_maps.maps[0]), user_mem_map_cmp);
+
+	/* we'll go over the list backwards when merging */
+	n_merged = 0;
+	for (i = user_mem_maps.n_maps - 2; i >= 0; i--) {
+		struct user_mem_map *l, *r;
+
+		l = &user_mem_maps.maps[i];
+		r = &user_mem_maps.maps[i + 1];
+
+		if (is_null_map(l) || is_null_map(r))
+			continue;
+
+		if (merge_map(l, r))
+			n_merged++;
+	}
+
+	/* the entries are still sorted, but now they have holes in them, so
+	 * walk through the list and remove the holes
+	 */
+	if (n_merged > 0) {
+		cur_idx = 0;
+		for (i = 0; i < user_mem_maps.n_maps; i++) {
+			if (!is_null_map(&user_mem_maps.maps[i])) {
+				struct user_mem_map *src, *dst;
+
+				src = &user_mem_maps.maps[i];
+				dst = &user_mem_maps.maps[cur_idx++];
+
+				if (src != dst) {
+					memcpy(dst, src, sizeof(*src));
+					memset(src, 0, sizeof(*src));
+				}
+			}
+		}
+		user_mem_maps.n_maps = cur_idx;
+	}
+}
+
 int
 vfio_get_group_fd(int iommu_group_no)
 {
@@ -263,7 +473,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 	};
 	int vfio_group_fd;
 	int iommu_group_no;
-	int ret;
+	int i, ret;
 
 	/* get group number */
 	ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no);
@@ -333,9 +543,10 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		 */
 		if (internal_config.process_type == RTE_PROC_PRIMARY &&
 				vfio_cfg.vfio_active_groups == 1) {
+			const struct vfio_iommu_type *t;
+
 			/* select an IOMMU type which we will be using */
-			const struct vfio_iommu_type *t =
-				vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
+			t = vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
 			if (!t) {
 				RTE_LOG(ERR, EAL,
 					"  %s failed to select IOMMU type\n",
@@ -353,6 +564,38 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 				rte_vfio_clear_group(vfio_group_fd);
 				return -1;
 			}
+
+			vfio_cfg.vfio_iommu_type = t;
+
+			/* re-map all user-mapped segments */
+			rte_spinlock_lock(&user_mem_maps.lock);
+
+			/* this IOMMU type may not support DMA mapping, but
+			 * if we have mappings in the list - that means we have
+			 * previously mapped something successfully, so we can
+			 * be sure that DMA mapping is supported.
+			 */
+			for (i = 0; i < user_mem_maps.n_maps; i++) {
+				struct user_mem_map *map;
+				map = &user_mem_maps.maps[i];
+
+				ret = t->dma_user_map_func(
+						vfio_cfg.vfio_container_fd,
+						map->addr, map->iova, map->len,
+						1);
+				if (ret) {
+					RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
+							"va: 0x%" PRIx64 " "
+							"iova: 0x%" PRIx64 " "
+							"len: 0x%" PRIx64 "\n",
+							map->addr, map->iova,
+							map->len);
+					rte_spinlock_unlock(
+							&user_mem_maps.lock);
+					return -1;
+				}
+			}
+			rte_spinlock_unlock(&user_mem_maps.lock);
 		}
 	}
 
@@ -668,23 +911,49 @@ static int
 type1_map(const struct rte_memseg *ms, void *arg)
 {
 	int *vfio_container_fd = arg;
+
+	return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+			ms->len, 1);
+}
+
+static int
+vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map)
+{
 	struct vfio_iommu_type1_dma_map dma_map;
+	struct vfio_iommu_type1_dma_unmap dma_unmap;
 	int ret;
 
-	memset(&dma_map, 0, sizeof(dma_map));
-	dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-	dma_map.vaddr = ms->addr_64;
-	dma_map.size = ms->len;
-	dma_map.iova = ms->iova;
-	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	if (do_map != 0) {
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = vaddr;
+		dma_map.size = len;
+		dma_map.iova = iova;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE;
 
-	ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
-
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
 				errno, strerror(errno));
-		return -1;
+			return -1;
+		}
+	} else {
+		memset(&dma_unmap, 0, sizeof(dma_unmap));
+		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+		dma_unmap.size = len;
+		dma_unmap.iova = iova;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+				&dma_unmap);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
+					errno, strerror(errno));
+			return -1;
+		}
 	}
+
 	return 0;
 }
 
@@ -694,12 +963,78 @@ vfio_type1_dma_map(int vfio_container_fd)
 	return rte_memseg_walk(type1_map, &vfio_container_fd);
 }
 
+static int
+vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map)
+{
+	struct vfio_iommu_type1_dma_map dma_map;
+	struct vfio_iommu_type1_dma_unmap dma_unmap;
+	int ret;
+
+	if (do_map != 0) {
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = vaddr;
+		dma_map.size = len;
+		dma_map.iova = iova;
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
+				errno, strerror(errno));
+			return -1;
+		}
+
+	} else {
+		struct vfio_iommu_spapr_register_memory reg = {
+			.argsz = sizeof(reg),
+			.flags = 0
+		};
+		reg.vaddr = (uintptr_t) vaddr;
+		reg.size = len;
+
+		ret = ioctl(vfio_container_fd,
+				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
+					errno, strerror(errno));
+			return -1;
+		}
+
+		memset(&dma_unmap, 0, sizeof(dma_unmap));
+		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+		dma_unmap.size = len;
+		dma_unmap.iova = iova;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+				&dma_unmap);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
+					errno, strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int
+vfio_spapr_map_walk(const struct rte_memseg *ms, void *arg)
+{
+	int *vfio_container_fd = arg;
+
+	return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+			ms->len, 1);
+}
+
 struct spapr_walk_param {
 	uint64_t window_size;
 	uint64_t hugepage_sz;
 };
 static int
-spapr_window_size(const struct rte_memseg *ms, void *arg)
+vfio_spapr_window_size_walk(const struct rte_memseg *ms, void *arg)
 {
 	struct spapr_walk_param *param = arg;
 	uint64_t max = ms->iova + ms->len;
@@ -713,39 +1048,43 @@ spapr_window_size(const struct rte_memseg *ms, void *arg)
 }
 
 static int
-spapr_map(const struct rte_memseg *ms, void *arg)
-{
-	struct vfio_iommu_type1_dma_map dma_map;
-	struct vfio_iommu_spapr_register_memory reg = {
-		.argsz = sizeof(reg),
-		.flags = 0
+vfio_spapr_create_new_dma_window(int vfio_container_fd,
+		struct vfio_iommu_spapr_tce_create *create) {
+	struct vfio_iommu_spapr_tce_remove remove = {
+		.argsz = sizeof(remove),
+	};
+	struct vfio_iommu_spapr_tce_info info = {
+		.argsz = sizeof(info),
 	};
-	int *vfio_container_fd = arg;
 	int ret;
 
-	reg.vaddr = (uintptr_t) ms->addr;
-	reg.size = ms->len;
-	ret = ioctl(*vfio_container_fd,
-		VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+	/* query spapr iommu info */
+	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot register vaddr for IOMMU, error %i (%s)\n",
-				errno, strerror(errno));
+		RTE_LOG(ERR, EAL, "  cannot get iommu info, "
+				"error %i (%s)\n", errno, strerror(errno));
 		return -1;
 	}
 
-	memset(&dma_map, 0, sizeof(dma_map));
-	dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-	dma_map.vaddr = ms->addr_64;
-	dma_map.size = ms->len;
-	dma_map.iova = ms->iova;
-	dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
-			 VFIO_DMA_MAP_FLAG_WRITE;
-
-	ret = ioctl(*vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+	/* remove default DMA of 32 bit window */
+	remove.start_addr = info.dma32_window_start;
+	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
+				"error %i (%s)\n", errno, strerror(errno));
+		return -1;
+	}
 
+	/* create new DMA window */
+	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
 	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
-				errno, strerror(errno));
+		RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
+				"error %i (%s)\n", errno, strerror(errno));
+		return -1;
+	}
+
+	if (create->start_addr != 0) {
+		RTE_LOG(ERR, EAL, "  DMA window start address != 0\n");
 		return -1;
 	}
 
@@ -753,61 +1092,116 @@ spapr_map(const struct rte_memseg *ms, void *arg)
 }
 
 static int
-vfio_spapr_dma_map(int vfio_container_fd)
+vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map)
 {
 	struct spapr_walk_param param;
-	int ret;
-	struct vfio_iommu_spapr_tce_info info = {
-		.argsz = sizeof(info),
-	};
 	struct vfio_iommu_spapr_tce_create create = {
 		.argsz = sizeof(create),
 	};
-	struct vfio_iommu_spapr_tce_remove remove = {
-		.argsz = sizeof(remove),
-	};
+	int i, ret = 0;
+
+	rte_spinlock_lock(&user_mem_maps.lock);
 
+	/* check if window size needs to be adjusted */
 	memset(&param, 0, sizeof(param));
 
-	/* query spapr iommu info */
-	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot get iommu info, "
-				"error %i (%s)\n", errno, strerror(errno));
-		return -1;
+	if (rte_memseg_walk(vfio_spapr_window_size_walk, &param) < 0) {
+		RTE_LOG(ERR, EAL, "Could not get window size\n");
+		ret = -1;
+		goto out;
 	}
 
-	/* remove default DMA of 32 bit window */
-	remove.start_addr = info.dma32_window_start;
-	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
-				"error %i (%s)\n", errno, strerror(errno));
-		return -1;
+	/* also check user maps */
+	for (i = 0; i < user_mem_maps.n_maps; i++) {
+		uint64_t max = user_mem_maps.maps[i].iova +
+				user_mem_maps.maps[i].len;
+		param.window_size = RTE_MAX(param.window_size, max);
 	}
 
-	/* create DMA window from 0 to max(phys_addr + len) */
-	rte_memseg_walk(spapr_window_size, &param);
-
 	/* sPAPR requires window size to be a power of 2 */
 	create.window_size = rte_align64pow2(param.window_size);
 	create.page_shift = __builtin_ctzll(param.hugepage_sz);
 	create.levels = 1;
 
-	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
-	if (ret) {
-		RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
-				"error %i (%s)\n", errno, strerror(errno));
-		return -1;
+	if (do_map) {
+		/* re-create window and remap the entire memory */
+		if (iova > create.window_size) {
+			if (vfio_spapr_create_new_dma_window(vfio_container_fd,
+					&create) < 0) {
+				RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
+				ret = -1;
+				goto out;
+			}
+			if (rte_memseg_walk(vfio_spapr_map_walk,
+					&vfio_container_fd) < 0) {
+				RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
+				ret = -1;
+				goto out;
+			}
+			/* remap all user maps */
+			for (i = 0; i < user_mem_maps.n_maps; i++) {
+				struct user_mem_map *map =
+						&user_mem_maps.maps[i];
+				if (vfio_spapr_dma_do_map(vfio_container_fd,
+						map->addr, map->iova, map->len,
+						1)) {
+					RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n");
+					ret = -1;
+					goto out;
+				}
+			}
+		}
+
+		/* now that we've remapped all of the memory that was present
+		 * before, map the segment that we were requested to map.
+		 */
+		if (vfio_spapr_dma_do_map(vfio_container_fd,
+				vaddr, iova, len, 1) < 0) {
+			RTE_LOG(ERR, EAL, "Could not map segment\n");
+			ret = -1;
+			goto out;
+		}
+	} else {
+		/* for unmap, check if iova within DMA window */
+		if (iova > create.window_size) {
+			RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap\n");
+			ret = -1;
+			goto out;
+		}
+
+		vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
 	}
+out:
+	rte_spinlock_unlock(&user_mem_maps.lock);
+	return ret;
+}
 
-	if (create.start_addr != 0) {
-		RTE_LOG(ERR, EAL, "  DMA window start address != 0\n");
+static int
+vfio_spapr_dma_map(int vfio_container_fd)
+{
+	struct vfio_iommu_spapr_tce_create create = {
+		.argsz = sizeof(create),
+	};
+	struct spapr_walk_param param;
+
+	memset(&param, 0, sizeof(param));
+
+	/* create DMA window from 0 to max(phys_addr + len) */
+	rte_memseg_walk(vfio_spapr_window_size_walk, &param);
+
+	/* sPAPR requires window size to be a power of 2 */
+	create.window_size = rte_align64pow2(param.window_size);
+	create.page_shift = __builtin_ctzll(param.hugepage_sz);
+	create.levels = 1;
+
+	if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) {
+		RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
 		return -1;
 	}
 
 	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-	if (rte_memseg_walk(spapr_map, &vfio_container_fd) < 0)
+	if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
 		return -1;
 
 	return 0;
@@ -820,6 +1214,156 @@ vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
 	return 0;
 }
 
+static int
+vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
+			 uint64_t __rte_unused vaddr,
+			 uint64_t __rte_unused iova, uint64_t __rte_unused len,
+			 int __rte_unused do_map)
+{
+	/* No-IOMMU mode does not need DMA mapping */
+	return 0;
+}
+
+static int
+vfio_dma_mem_map(uint64_t vaddr, uint64_t iova, uint64_t len, int do_map)
+{
+	const struct vfio_iommu_type *t = vfio_cfg.vfio_iommu_type;
+
+	if (!t) {
+		RTE_LOG(ERR, EAL, "  VFIO support not initialized\n");
+		rte_errno = ENODEV;
+		return -1;
+	}
+
+	if (!t->dma_user_map_func) {
+		RTE_LOG(ERR, EAL,
+			"  VFIO custom DMA region mapping not supported by IOMMU %s\n",
+			t->name);
+		rte_errno = ENOTSUP;
+		return -1;
+	}
+
+	return t->dma_user_map_func(vfio_cfg.vfio_container_fd, vaddr, iova,
+			len, do_map);
+}
+
+int __rte_experimental
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+	struct user_mem_map *new_map;
+	int ret = 0;
+
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	rte_spinlock_lock(&user_mem_maps.lock);
+	if (user_mem_maps.n_maps == VFIO_MAX_USER_MEM_MAPS) {
+		RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
+		rte_errno = ENOMEM;
+		ret = -1;
+		goto out;
+	}
+	/* map the entry */
+	if (vfio_dma_mem_map(vaddr, iova, len, 1)) {
+		/* technically, this will fail if there are currently no
+		 * devices plugged in, even though the mapping might have
+		 * succeeded if a device were added later. however, since we
+		 * cannot verify that a mapping is valid without a device
+		 * attached, treat this as unsupported rather than storing an
+		 * unverified mapping in the list of active mappings.
+		 */
+		RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
+		ret = -1;
+		goto out;
+	}
+	/* create new user mem map entry */
+	new_map = &user_mem_maps.maps[user_mem_maps.n_maps++];
+	new_map->addr = vaddr;
+	new_map->iova = iova;
+	new_map->len = len;
+
+	compact_user_maps();
+out:
+	rte_spinlock_unlock(&user_mem_maps.lock);
+	return ret;
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+	struct user_mem_map *map, *new_map = NULL;
+	int ret = 0;
+
+	if (len == 0) {
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	rte_spinlock_lock(&user_mem_maps.lock);
+
+	/* find our mapping */
+	map = find_user_mem_map(vaddr, iova, len);
+	if (!map) {
+		RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
+		rte_errno = EINVAL;
+		ret = -1;
+		goto out;
+	}
+	if (map->addr != vaddr || map->iova != iova || map->len != len) {
+		/* we're partially unmapping a previously mapped region, so we
+		 * need to split entry into two.
+		 */
+		if (user_mem_maps.n_maps == VFIO_MAX_USER_MEM_MAPS) {
+			RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+			rte_errno = ENOMEM;
+			ret = -1;
+			goto out;
+		}
+		new_map = &user_mem_maps.maps[user_mem_maps.n_maps++];
+	}
+
+	/* unmap the entry */
+	if (vfio_dma_mem_map(vaddr, iova, len, 0)) {
+		/* there may not be any devices plugged in, so unmapping will
+		 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
+		 * stop us from removing the mapping, as the assumption is we
+		 * won't be needing this memory any more and thus will want to
+		 * prevent it from being remapped again on hotplug. so, only
+		 * fail if we indeed failed to unmap (e.g. if the mapping was
+		 * within our mapped range but had invalid alignment).
+		 */
+		if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
+			RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
+			ret = -1;
+			goto out;
+		} else {
+			RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
+		}
+	}
+	/* remove map from the list of active mappings */
+	if (new_map != NULL) {
+		adjust_map(map, new_map, vaddr, len);
+
+		/* if we've created a new map by splitting, sort everything */
+		if (!is_null_map(new_map)) {
+			compact_user_maps();
+		} else {
+			/* we've created a new mapping, but it was unused */
+			user_mem_maps.n_maps--;
+		}
+	} else {
+		memset(map, 0, sizeof(*map));
+		compact_user_maps();
+		user_mem_maps.n_maps--;
+	}
+
+out:
+	rte_spinlock_unlock(&user_mem_maps.lock);
+	return ret;
+}
+
 int
 rte_vfio_noiommu_is_enabled(void)
 {
@@ -852,4 +1396,20 @@ rte_vfio_noiommu_is_enabled(void)
 	return c == 'Y';
 }
 
+#else
+
+int __rte_experimental
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+		  __rte_unused uint64_t len)
+{
+	return -1;
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+		    __rte_unused uint64_t len)
+{
+	return -1;
+}
+
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..549f442 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -19,6 +19,7 @@
 
 #ifdef VFIO_PRESENT
 
+#include <stdint.h>
 #include <linux/vfio.h>
 
 #define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
@@ -26,6 +27,7 @@
 #ifndef VFIO_SPAPR_TCE_v2_IOMMU
 #define RTE_VFIO_SPAPR 7
 #define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
 #define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
 #define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)
 
@@ -110,6 +112,7 @@ struct vfio_config {
 	int vfio_enabled;
 	int vfio_container_fd;
 	int vfio_active_groups;
+	const struct vfio_iommu_type *vfio_iommu_type;
 	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
 };
 
@@ -119,9 +122,18 @@ struct vfio_config {
  * */
 typedef int (*vfio_dma_func_t)(int);
 
+/* Custom memory region DMA mapping function prototype.
+ * Takes VFIO container fd, virtual address, IOVA, length and
+ * operation type (0 to unmap, 1 to map) as parameters.
+ * Returns 0 on success, -1 on error.
+ */
+typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
+		uint64_t len, int do_map);
+
 struct vfio_iommu_type {
 	int type_id;
 	const char *name;
+	vfio_dma_user_func_t dma_user_map_func;
 	vfio_dma_func_t dma_map_func;
 };
 
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 79433b7..76209f9 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -263,5 +263,7 @@ EXPERIMENTAL {
 	rte_service_start_with_defaults;
 	rte_socket_count;
 	rte_socket_id_by_idx;
+	rte_vfio_dma_map;
+	rte_vfio_dma_unmap;
 
 } DPDK_18.02;
-- 
2.7.4

