[dpdk-dev] [PATCH] vfio: fix expanding DMA area in ppc64le

Takeshi T Yoshimura TYOS at jp.ibm.com
Thu Jun 13 04:30:47 CEST 2019


>To: dev at dpdk.org
>From: Takeshi Yoshimura <tyos at jp.ibm.com>
>Date: 2019/06/13 11:22AM
>Cc: drc at ibm.com, pradeep at us.ibm.com, Takeshi Yoshimura
><tyos at jp.ibm.com>
>Subject: [EXTERNAL] [PATCH] vfio: fix expanding DMA area in ppc64le
>
>In ppc64le, expanding the DMA area always fails because we cannot
>remove the existing DMA window. As a result, we cannot allocate
>more than one memseg in ppc64le. The root cause is that
>vfio_spapr_dma_mem_map() does not unmap all mapped DMA regions
>before removing the window. This patch fixes that incorrect
>behavior.
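
For reference, a minimal sketch of the ordering this fix enforces
(assumptions: error handling is trimmed, the window base is taken as 0
for brevity, and unmap_all_mapped_segments() is a hypothetical helper
standing in for the rte_memseg_walk_thread_unsafe() unmap pass in the
patch):

    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* hypothetical helper: unmaps every segment in the old window */
    int unmap_all_mapped_segments(int container_fd);

    static int
    recreate_window(int container_fd,
                    struct vfio_iommu_spapr_tce_create *create)
    {
        struct vfio_iommu_spapr_tce_remove remove = {
            .argsz = sizeof(remove),
            .start_addr = 0,    /* window base, assumed 0 here */
        };

        /* 1. unmap all DMA still backed by the old window */
        if (unmap_all_mapped_segments(container_fd) < 0)
            return -1;
        /* 2. only then will the kernel let us remove the window */
        if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove))
            return -1;
        /* 3. create the larger window, then remap everything */
        return ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
    }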
>
>I added a global variable to track the current window size, since
>there is no better way to obtain it: the sPAPR IOMMU does not seem
>to provide any ioctl interface for querying the window size.
>rte_memseg_walk*() is currently used to calculate the window size,
>but it walks memsegs that are marked as used, not mapped. We would
>therefore need to determine whether a given memseg is actually
>mapped; otherwise the ioctl reports errors for attempts to
>unregister memory addresses that were never registered. The global
>variable is excluded from non-ppc64le builds.
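
The only record of the window geometry is therefore the one we keep
ourselves. A sketch of the caching pattern (the names mirror the hunks
below; this is an illustration, not the patch itself):

    #include <stdint.h>
    #include <string.h>
    #include <linux/vfio.h>

    /* last window geometry we successfully created; the sPAPR IOMMU
     * offers no ioctl to query this back from the kernel */
    static struct vfio_iommu_spapr_tce_create prev_create;

    /* call after every successful VFIO_IOMMU_SPAPR_TCE_CREATE */
    static void
    remember_window(const struct vfio_iommu_spapr_tce_create *create)
    {
        memcpy(&prev_create, create, sizeof(prev_create));
    }

    /* use instead of walking memsegs, which may be used but unmapped */
    static uint64_t
    current_window_size(void)
    {
        return prev_create.window_size;
    }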
>
>A similar problem exists for user maps. We must avoid attempting
>to unmap the address that was passed in as the function's
>parameter. In addition, the compaction of user maps prevents us
>from passing the correct length when unmapping DMA during window
>recreation, so I disabled compaction on ppc64le.
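
To illustrate with two chunks an application mapped separately (the
addresses are made up):

    #include <stdint.h>

    struct user_mem_map { uint64_t addr, iova, len; };

    struct user_mem_map maps[] = {
        { .addr = 0x10000, .iova = 0x10000, .len = 0x10000 },
        { .addr = 0x20000, .iova = 0x20000, .len = 0x10000 },
    };
    /* compact_user_maps() merges these into a single entry,
     *   { .addr = 0x10000, .iova = 0x10000, .len = 0x20000 },
     * so at window recreation we can no longer replay the exact
     * (addr, len) pairs the user mapped, and skipping or unmapping
     * the second chunk by its original length finds no match */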
>
>I also fixed the order of the unregister and unmap ioctls. The
>unregister ioctl sometimes reports device-busy errors because a
>mapped area still exists.
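
A sketch of the corrected per-segment teardown order (vaddr/iova/len
are assumed to describe a segment that is currently mapped and
registered; this mirrors the reordered hunks below):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static int
    spapr_unmap_then_unregister(int container_fd, uint64_t vaddr,
                                uint64_t iova, uint64_t len)
    {
        struct vfio_iommu_type1_dma_unmap dma_unmap = {
            .argsz = sizeof(dma_unmap),
            .iova = iova,
            .size = len,
        };
        struct vfio_iommu_spapr_register_memory reg = {
            .argsz = sizeof(reg),
            .vaddr = vaddr,
            .size = len,
        };

        /* unmap first: unregistering memory that still backs a live
         * mapping is what triggers the device-busy errors */
        if (ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap))
            return -1;
        return ioctl(container_fd,
                     VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    }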
>
>Signed-off-by: Takeshi Yoshimura <tyos at jp.ibm.com>
>---
> lib/librte_eal/linux/eal/eal_vfio.c | 154 +++++++++++++++++++---------
> 1 file changed, 103 insertions(+), 51 deletions(-)
>
>diff --git a/lib/librte_eal/linux/eal/eal_vfio.c b/lib/librte_eal/linux/eal/eal_vfio.c
>index f16c5c3c0..c1b275b56 100644
>--- a/lib/librte_eal/linux/eal/eal_vfio.c
>+++ b/lib/librte_eal/linux/eal/eal_vfio.c
>@@ -93,6 +93,7 @@ is_null_map(const struct user_mem_map *map)
> 	return map->addr == 0 && map->iova == 0 && map->len == 0;
> }
> 
>+#ifndef RTE_ARCH_PPC_64
> /* we may need to merge user mem maps together in case of user mapping/unmapping
>  * chunks of memory, so we'll need a comparator function to sort segments.
>  */
>@@ -126,6 +127,7 @@ user_mem_map_cmp(const void *a, const void *b)
> 
> 	return 0;
> }
>+#endif
> 
> /* adjust user map entry. this may result in shortening of existing map, or in
>  * splitting existing map in two pieces.
>@@ -162,6 +164,7 @@ adjust_map(struct user_mem_map *src, struct user_mem_map *end,
> 	}
> }
> 
>+#ifndef RTE_ARCH_PPC_64
> /* try merging two maps into one, return 1 if succeeded */
> static int
> merge_map(struct user_mem_map *left, struct user_mem_map *right)
>@@ -177,6 +180,7 @@ merge_map(struct user_mem_map *left, struct user_mem_map *right)
> 
> 	return 1;
> }
>+#endif
> 
> static struct user_mem_map *
> find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
>@@ -211,6 +215,16 @@ find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
> 	return NULL;
> }
> 
>+#ifdef RTE_ARCH_PPC_64
>+/* Recreation of DMA window requires unregistering DMA memory.
>+ * Compaction confuses the logic and causes false reports in the recreation.
>+ * For now, we do not compact user maps in ppc64le.
>+ */
>+static void
>+compact_user_maps(__rte_unused struct user_mem_maps *user_mem_maps)
>+{
>+}
>+#else
> /* this will sort all user maps, and merge/compact any adjacent maps */
> static void
> compact_user_maps(struct user_mem_maps *user_mem_maps)
>@@ -256,6 +270,7 @@ compact_user_maps(struct user_mem_maps *user_mem_maps)
> 		user_mem_maps->n_maps = cur_idx;
> 	}
> }
>+#endif
> 
> static int
> vfio_open_group_fd(int iommu_group_num)
>@@ -1306,6 +1321,7 @@ vfio_type1_dma_map(int vfio_container_fd)
> 	return rte_memseg_walk(type1_map, &vfio_container_fd);
> }
> 
>+#ifdef RTE_ARCH_PPC_64
> static int
> vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
> 		uint64_t len, int do_map)
>@@ -1357,14 +1373,6 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
> 		}
> 
> 	} else {
>-		ret = ioctl(vfio_container_fd,
>-				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
>-		if (ret) {
>-			RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
>-					errno, strerror(errno));
>-			return -1;
>-		}
>-
> 		memset(&dma_unmap, 0, sizeof(dma_unmap));
> 		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
> 		dma_unmap.size = len;
>@@ -1377,24 +1385,50 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
> 					errno, strerror(errno));
> 			return -1;
> 		}
>+
>+		ret = ioctl(vfio_container_fd,
>+				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
>+		if (ret) {
>+			RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
>+					errno, strerror(errno));
>+			return -1;
>+		}
> 	}
> 
> 	return 0;
> }
> 
>+struct spapr_remap_walk_param {
>+	int vfio_container_fd;
>+	uint64_t window_size;
>+};
>+
> static int
> vfio_spapr_map_walk(const struct rte_memseg_list *msl,
> 		const struct rte_memseg *ms, void *arg)
> {
>-	int *vfio_container_fd = arg;
>+	struct spapr_remap_walk_param *p = arg;
> 
>-	if (msl->external)
>+	if (msl->external || ms->iova >= p->window_size)
> 		return 0;
> 
>-	return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
>+	return vfio_spapr_dma_do_map(p->vfio_container_fd, ms->addr_64, ms->iova,
> 			ms->len, 1);
> }
> 
>+static int
>+vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
>+		const struct rte_memseg *ms, void *arg)
>+{
>+	struct spapr_remap_walk_param *p = arg;
>+
>+	if (msl->external || ms->iova >= p->window_size)
>+		return 0;
>+
>+	return vfio_spapr_dma_do_map(p->vfio_container_fd, ms->addr_64, ms->iova,
>+			ms->len, 0);
>+}
>+
> struct spapr_walk_param {
> 	uint64_t window_size;
> 	uint64_t hugepage_sz;
>@@ -1481,14 +1515,13 @@ vfio_spapr_create_new_dma_window(int vfio_container_fd,
> 	return 0;
> }
> 
>+static struct vfio_iommu_spapr_tce_create prev_create;
>+
> static int
> vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
> 		uint64_t len, int do_map)
> {
>-	struct spapr_walk_param param;
>-	struct vfio_iommu_spapr_tce_create create = {
>-		.argsz = sizeof(create),
>-	};
>+	struct vfio_iommu_spapr_tce_create create;
> 	struct vfio_config *vfio_cfg;
> 	struct user_mem_maps *user_mem_maps;
> 	int i, ret = 0;
>@@ -1502,43 +1535,59 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
> 	user_mem_maps = &vfio_cfg->mem_maps;
> 	rte_spinlock_recursive_lock(&user_mem_maps->lock);
> 
>-	/* check if window size needs to be adjusted */
>-	memset(&param, 0, sizeof(param));
>-
>-	/* we're inside a callback so use thread-unsafe version */
>-	if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
>-				&param) < 0) {
>-		RTE_LOG(ERR, EAL, "Could not get window size\n");
>-		ret = -1;
>-		goto out;
>-	}
>+	memcpy(&create, &prev_create, sizeof(create));
> 
> 	/* also check user maps */
> 	for (i = 0; i < user_mem_maps->n_maps; i++) {
>-		uint64_t max = user_mem_maps->maps[i].iova +
>-				user_mem_maps->maps[i].len;
>-		create.window_size = RTE_MAX(create.window_size, max);
>+		struct user_mem_map *map = &user_mem_maps->maps[i];
>+
>+		if (vaddr == map->addr && len == map->len)
>+			continue;
>+		create.window_size = RTE_MAX(create.window_size, map->iova + map->len);
> 	}
> 
> 	/* sPAPR requires window size to be a power of 2 */
>-	create.window_size = rte_align64pow2(param.window_size);
>-	create.page_shift = __builtin_ctzll(param.hugepage_sz);
>-	create.levels = 1;
>+	create.window_size = rte_align64pow2(create.window_size);
> 
> 	if (do_map) {
>-		void *addr;
> 		/* re-create window and remap the entire memory */
>-		if (iova > create.window_size) {
>+		if (iova + len > create.window_size) {
>+			struct spapr_remap_walk_param param = {
>+				.vfio_container_fd = vfio_container_fd,
>+				.window_size = create.window_size,
>+			};
>+
>+			/* we're inside a callback, so use thread-unsafe version
>+			 */
>+			rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk,
>+					&param);
>+			/* destruct all user maps */
>+			for (i = 0; i < user_mem_maps->n_maps; i++) {
>+				struct user_mem_map *map =
>+						&user_mem_maps->maps[i];
>+				if (vaddr == map->addr && len == map->len)
>+					continue;
>+				if (vfio_spapr_dma_do_map(vfio_container_fd,
>+						map->addr, map->iova, map->len,
>+						0)) {
>+					RTE_LOG(ERR, EAL, "Could not destruct user DMA maps\n");
>+					ret = -1;
>+					goto out;
>+				}
>+			}
>+
>+			create.window_size = rte_align64pow2(iova + len);
> 			if (vfio_spapr_create_new_dma_window(vfio_container_fd,
> 					&create) < 0) {
> 				RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
> 				ret = -1;
> 				goto out;
> 			}
>+			memcpy(&prev_create, &create, sizeof(create));
> 			/* we're inside a callback, so use thread-unsafe version
> 			 */
> 			if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
>-					&vfio_container_fd) < 0) {
>+					&param) < 0) {
> 				RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
> 				ret = -1;
> 				goto out;
>@@ -1547,6 +1596,8 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
> 			for (i = 0; i < user_mem_maps->n_maps; i++) {
> 				struct user_mem_map *map =
> 						&user_mem_maps->maps[i];
>+				if (vaddr == map->addr && len == map->len)
>+					continue;
> 				if (vfio_spapr_dma_do_map(vfio_container_fd,
> 						map->addr, map->iova, map->len,
> 						1)) {
>@@ -1556,23 +1607,8 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
> 				}
> 			}
> 		}
>-
>-		/* now that we've remapped all of the memory that was present
>-		 * before, map the segment that we were requested to map.
>-		 *
>-		 * however, if we were called by the callback, the memory we
>-		 * were called with was already in the memseg list, so previous
>-		 * mapping should've mapped that segment already.
>-		 *
>-		 * virt2memseg_list is a relatively cheap check, so use that. if
>-		 * memory is within any memseg list, it's a memseg, so it's
>-		 * already mapped.
>-		 */
>-		addr = (void *)(uintptr_t)vaddr;
>-		if (rte_mem_virt2memseg_list(addr) == NULL &&
>-				vfio_spapr_dma_do_map(vfio_container_fd,
>-					vaddr, iova, len, 1) < 0) {
>-			RTE_LOG(ERR, EAL, "Could not map segment\n");
>+		if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) {
>+			RTE_LOG(ERR, EAL, "Failed to map DMA\n");
> 			ret = -1;
> 			goto out;
> 		}
>@@ -1613,6 +1649,7 @@ vfio_spapr_dma_map(int vfio_container_fd)
> 		RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
> 		return -1;
> 	}
>+	memcpy(&prev_create, &create, sizeof(create));
> 
> 	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> 	if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
>@@ -1620,6 +1657,21 @@ vfio_spapr_dma_map(int vfio_container_fd)
> 
> 	return 0;
> }
>+#else
>+static int
>+vfio_spapr_dma_mem_map(int __rte_unused vfio_container_fd,
>+			uint64_t __rte_unused vaddr,
>+			uint64_t __rte_unused iova, uint64_t __rte_unused len,
>+			int __rte_unused do_map)
>+{
>+	return 0;
>+}
>+static int
>+vfio_spapr_dma_map(int __rte_unused vfio_container_fd)
>+{
>+	return 0;
>+}
>+#endif
> 
> static int
> vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
>-- 
>2.17.1
>
>

Added CC: aconole at redhat.com
I updated the patch so that it does not break builds on x86.


