[dpdk-dev] [PATCH v3 1/1] vfio: modify spapr iommu support to use static window sizing

David Christensen drc at linux.vnet.ibm.com
Thu Sep 3 20:55:54 CEST 2020


Ping

On 8/10/20 2:07 PM, David Christensen wrote:
> The SPAPR IOMMU requires that a DMA window size be defined before memory
> can be mapped for DMA. The current code dynamically modifies the DMA
> window size in response to every new memory allocation, which is
> potentially dangerous because all existing mappings need to be
> unmapped/remapped in order to resize the DMA window, leaving hardware
> holding IOVA addresses that are temporarily unmapped. The new SPAPR
> code statically assigns the DMA window size on first use, using the
> largest physical memory address when IOVA=PA and the highest existing
> memseg virtual address when IOVA=VA.
> 
> Signed-off-by: David Christensen <drc at linux.vnet.ibm.com>
> ---
>   lib/librte_eal/linux/eal_vfio.c | 412 ++++++++++++++------------------
>   1 file changed, 186 insertions(+), 226 deletions(-)
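> 
> As a rough illustration of the sizing rule above, a minimal sketch
> (align64pow2() mirrors DPDK's real rte_align64pow2() helper; the two
> wrapper functions are illustrative only and not part of the patch):
> 
> 	#include <stdint.h>
> 
> 	/* round up to the next power of two, as rte_align64pow2() does */
> 	static inline uint64_t align64pow2(uint64_t v)
> 	{
> 		v--;
> 		v |= v >> 1;
> 		v |= v >> 2;
> 		v |= v >> 4;
> 		v |= v >> 8;
> 		v |= v >> 16;
> 		v |= v >> 32;
> 		return v + 1;
> 	}
> 
> 	/* IOVA=PA: cover the highest "System RAM" address in /proc/iomem */
> 	static uint64_t window_len_pa(uint64_t max_phys_addr)
> 	{
> 		return align64pow2(max_phys_addr + 1);
> 	}
> 
> 	/* IOVA=VA: cover the highest memseg virtual address */
> 	static uint64_t window_len_va(uint64_t max_virt_addr)
> 	{
> 		return align64pow2(max_virt_addr);
> 	}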
> 
> diff --git a/lib/librte_eal/linux/eal_vfio.c b/lib/librte_eal/linux/eal_vfio.c
> index e07979936..4456761fc 100644
> --- a/lib/librte_eal/linux/eal_vfio.c
> +++ b/lib/librte_eal/linux/eal_vfio.c
> @@ -18,6 +18,7 @@
>   #include "eal_memcfg.h"
>   #include "eal_vfio.h"
>   #include "eal_private.h"
> +#include "eal_internal_cfg.h"
> 
>   #ifdef VFIO_PRESENT
> 
> @@ -536,17 +537,6 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
>   		return;
>   	}
> 
> -#ifdef RTE_ARCH_PPC_64
> -	ms = rte_mem_virt2memseg(addr, msl);
> -	while (cur_len < len) {
> -		int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
> -
> -		rte_fbarray_set_free(&msl->memseg_arr, idx);
> -		cur_len += ms->len;
> -		++ms;
> -	}
> -	cur_len = 0;
> -#endif
>   	/* memsegs are contiguous in memory */
>   	ms = rte_mem_virt2memseg(addr, msl);
> 
> @@ -607,17 +597,6 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
>   						iova_expected - iova_start, 0);
>   		}
>   	}
> -#ifdef RTE_ARCH_PPC_64
> -	cur_len = 0;
> -	ms = rte_mem_virt2memseg(addr, msl);
> -	while (cur_len < len) {
> -		int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
> -
> -		rte_fbarray_set_used(&msl->memseg_arr, idx);
> -		cur_len += ms->len;
> -		++ms;
> -	}
> -#endif
>   }
> 
>   static int
> @@ -1433,21 +1412,30 @@ vfio_type1_dma_map(int vfio_container_fd)
>   	return rte_memseg_walk(type1_map, &vfio_container_fd);
>   }
> 
> +/* Track the size of the statically allocated DMA window for SPAPR */
> +uint64_t spapr_dma_win_len;
> +uint64_t spapr_dma_win_page_sz;
> +
>   static int
>   vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
>   		uint64_t len, int do_map)
>   {
> -	struct vfio_iommu_type1_dma_map dma_map;
> -	struct vfio_iommu_type1_dma_unmap dma_unmap;
> -	int ret;
>   	struct vfio_iommu_spapr_register_memory reg = {
>   		.argsz = sizeof(reg),
> +		.vaddr = (uintptr_t) vaddr,
> +		.size = len,
>   		.flags = 0
>   	};
> -	reg.vaddr = (uintptr_t) vaddr;
> -	reg.size = len;
> +	int ret;
> 
>   	if (do_map != 0) {
> +		struct vfio_iommu_type1_dma_map dma_map;
> +
> +		if (iova + len > spapr_dma_win_len) {
> +			RTE_LOG(ERR, EAL, "  DMA map attempt outside DMA window\n");
> +			return -1;
> +		}
> +
>   		ret = ioctl(vfio_container_fd,
>   				VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
>   		if (ret) {
> @@ -1466,24 +1454,14 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
> 
>   		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
>   		if (ret) {
> -			/**
> -			 * In case the mapping was already done EBUSY will be
> -			 * returned from kernel.
> -			 */
> -			if (errno == EBUSY) {
> -				RTE_LOG(DEBUG, EAL,
> -					" Memory segment is already mapped,"
> -					" skipping");
> -			} else {
> -				RTE_LOG(ERR, EAL,
> -					"  cannot set up DMA remapping,"
> -					" error %i (%s)\n", errno,
> -					strerror(errno));
> -				return -1;
> -			}
> +			RTE_LOG(ERR, EAL, "  cannot map vaddr for IOMMU, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +			return -1;
>   		}
> 
>   	} else {
> +		struct vfio_iommu_type1_dma_unmap dma_unmap;
> +
>   		memset(&dma_unmap, 0, sizeof(dma_unmap));
>   		dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
>   		dma_unmap.size = len;
> @@ -1492,21 +1470,21 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
>   		ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
>   				&dma_unmap);
>   		if (ret) {
> -			RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
> -					errno, strerror(errno));
> +			RTE_LOG(ERR, EAL, "  cannot unmap vaddr for IOMMU, "
> +				"error %i (%s)\n", errno, strerror(errno));
>   			return -1;
>   		}
> 
>   		ret = ioctl(vfio_container_fd,
>   				VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
>   		if (ret) {
> -			RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
> -					errno, strerror(errno));
> +			RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, "
> +				"error %i (%s)\n", errno, strerror(errno));
>   			return -1;
>   		}
>   	}
> 
> -	return 0;
> +	return ret;
>   }
> 
>   static int
> @@ -1523,251 +1501,233 @@ vfio_spapr_map_walk(const struct rte_memseg_list *msl,
>   	if (ms->iova == RTE_BAD_IOVA)
>   		return 0;
> 
> -	return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
> -			ms->len, 1);
> +	return vfio_spapr_dma_do_map(*vfio_container_fd,
> +		ms->addr_64, ms->iova, ms->len, 1);
>   }
> 
> +struct spapr_size_walk_param {
> +	uint64_t max_va;
> +	uint64_t page_sz;
> +	int external;
> +};
> +
> +/*
> + * In order to set the DMA window size required for the SPAPR IOMMU
> + * we need to walk the existing virtual memory allocations as well as
> + * find the hugepage size used.
> + */
>   static int
> -vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
> -		const struct rte_memseg *ms, void *arg)
> +vfio_spapr_size_walk(const struct rte_memseg_list *msl, void *arg)
>   {
> -	int *vfio_container_fd = arg;
> +	struct spapr_size_walk_param *param = arg;
> +	uint64_t max = (uint64_t) msl->base_va + (uint64_t) msl->len;
> 
> -	/* skip external memory that isn't a heap */
> -	if (msl->external && !msl->heap)
> -		return 0;
> +	if (msl->external) {
> +		param->external++;
> +		if (!msl->heap)
> +			return 0;
> +	}
> 
> -	/* skip any segments with invalid IOVA addresses */
> -	if (ms->iova == RTE_BAD_IOVA)
> -		return 0;
> +	if (max > param->max_va) {
> +		param->page_sz = msl->page_sz;
> +		param->max_va = max;
> +	}
> 
> -	return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
> -			ms->len, 0);
> +	return 0;
>   }
> 
> -struct spapr_walk_param {
> -	uint64_t window_size;
> -	uint64_t hugepage_sz;
> -};
> -
> +/*
> + * The SPAPRv2 IOMMU supports 2 DMA windows with starting
> + * address at 0 or 1<<59.  By default, a DMA window is set
> + * at address 0, 2GB long, with 4KB pages.  For DPDK we
> + * must remove the default window and set up a new DMA window
> + * based on the hugepage size and memory requirements of
> + * the application before we can map memory for DMA.
> + */
>   static int
> -vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
> -		const struct rte_memseg *ms, void *arg)
> +spapr_dma_win_size(void)
>   {
> -	struct spapr_walk_param *param = arg;
> -	uint64_t max = ms->iova + ms->len;
> +	struct spapr_size_walk_param param;
> 
> -	/* skip external memory that isn't a heap */
> -	if (msl->external && !msl->heap)
> +	/* only create DMA window once */
> +	if (spapr_dma_win_len > 0)
>   		return 0;
> 
> -	/* skip any segments with invalid IOVA addresses */
> -	if (ms->iova == RTE_BAD_IOVA)
> -		return 0;
> +	/* walk the memseg list to find the page size/max VA address */
> +	memset(&param, 0, sizeof(param));
> +	if (rte_memseg_list_walk(vfio_spapr_size_walk, &param) < 0) {
> +		RTE_LOG(ERR, EAL, "Failed to walk memseg list for DMA "
> +			"window size\n");
> +		return -1;
> +	}
> +
> +	/* We can't be sure if the DMA window covers external memory */
> +	if (param.external > 0)
> +		RTE_LOG(WARNING, EAL, "Detected external memory which may "
> +			"not be managed by the IOMMU\n");
> +
> +	/* find the maximum IOVA address for setting the DMA window size */
> +	if (rte_eal_iova_mode() == RTE_IOVA_PA) {
> +		static const char proc_iomem[] = "/proc/iomem";
> +		static const char str_sysram[] = "System RAM";
> +		uint64_t start, end, max = 0;
> +		char *line = NULL;
> +		char *dash, *space;
> +		size_t line_len;
> +
> +		/*
> +		 * Example "System RAM" in /proc/iomem:
> +		 * 00000000-1fffffffff : System RAM
> +		 * 200000000000-201fffffffff : System RAM
> +		 */
> +		FILE *fd = fopen(proc_iomem, "r");
> +		if (fd == NULL) {
> +			RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_iomem);
> +			return -1;
> +		}
> +		/* Scan /proc/iomem for the highest PA in the system */
> +		while (getline(&line, &line_len, fd) != -1) {
> +			if (strstr(line, str_sysram) == NULL)
> +				continue;
> +
> +			space = strstr(line, " ");
> +			dash = strstr(line, "-");
> +
> +			/* Validate the format of the memory string */
> +			if (space == NULL || dash == NULL || space < dash) {
> +				RTE_LOG(ERR, EAL, "Can't parse line \"%s\" in "
> +					"file %s\n", line, proc_iomem);
> +				continue;
> +			}
> +
> +			start = strtoull(line, NULL, 16);
> +			end   = strtoull(dash + 1, NULL, 16);
> +			RTE_LOG(DEBUG, EAL, "Found system RAM from 0x%"
> +				PRIx64 " to 0x%" PRIx64 "\n", start, end);
> +			if (end > max)
> +				max = end;
> +		}
> +		free(line);
> +		fclose(fd);
> 
> -	if (max > param->window_size) {
> -		param->hugepage_sz = ms->hugepage_sz;
> -		param->window_size = max;
> +		if (max == 0) {
> +			RTE_LOG(ERR, EAL, "Failed to find valid \"System RAM\" "
> +				"entry in file %s\n", proc_iomem);
> +			return -1;
> +		}
> +
> +		spapr_dma_win_len = rte_align64pow2(max + 1);
> +		RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%"
> +			PRIx64 "\n", spapr_dma_win_len);
> +	} else if (rte_eal_iova_mode() == RTE_IOVA_VA) {
> +		RTE_LOG(DEBUG, EAL, "Highest VA address in memseg list is 0x%"
> +			PRIx64 "\n", param.max_va);
> +		spapr_dma_win_len = rte_align64pow2(param.max_va);
> +		RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%"
> +			PRIx64 "\n", spapr_dma_win_len);
> +	} else {
> +		RTE_LOG(ERR, EAL, "Unsupported IOVA mode\n");
> +		return -1;
>   	}
> 
> +	spapr_dma_win_page_sz = param.page_sz;
> +	rte_mem_set_dma_mask(__builtin_ctzll(spapr_dma_win_len));
>   	return 0;
>   }
> 
>   static int
> -vfio_spapr_create_new_dma_window(int vfio_container_fd,
> -		struct vfio_iommu_spapr_tce_create *create) {
> +vfio_spapr_create_dma_window(int vfio_container_fd)
> +{
> +	struct vfio_iommu_spapr_tce_create create = {
> +		.argsz = sizeof(create), };
>   	struct vfio_iommu_spapr_tce_remove remove = {
> -		.argsz = sizeof(remove),
> -	};
> +		.argsz = sizeof(remove), };
>   	struct vfio_iommu_spapr_tce_info info = {
> -		.argsz = sizeof(info),
> -	};
> +		.argsz = sizeof(info), };
>   	int ret;
> 
> -	/* query spapr iommu info */
> +	ret = spapr_dma_win_size();
> +	if (ret < 0)
> +		return ret;
> +
>   	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
>   	if (ret) {
> -		RTE_LOG(ERR, EAL, "  cannot get iommu info, "
> -				"error %i (%s)\n", errno, strerror(errno));
> +		RTE_LOG(ERR, EAL, "  can't get iommu info, "
> +			"error %i (%s)\n", errno, strerror(errno));
>   		return -1;
>   	}
> 
> -	/* remove default DMA of 32 bit window */
> +	/* remove default DMA window */
>   	remove.start_addr = info.dma32_window_start;
>   	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> -	if (ret) {
> -		RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
> -				"error %i (%s)\n", errno, strerror(errno));
> +	if (ret)
>   		return -1;
> -	}
> 
> -	/* create new DMA window */
> -	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
> +	/* create a new DMA window (start address is not selectable) */
> +	create.window_size = spapr_dma_win_len;
> +	create.page_shift  = __builtin_ctzll(spapr_dma_win_page_sz);
> +	create.levels = 1;
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
>   	if (ret) {
> -#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
> -		/* try possible page_shift and levels for workaround */
> +		/* if at first we don't succeed, try more levels */
>   		uint32_t levels;
> 
> -		for (levels = create->levels + 1;
> +		for (levels = create.levels + 1;
>   			ret && levels <= info.ddw.levels; levels++) {
> -			create->levels = levels;
> +			create.levels = levels;
>   			ret = ioctl(vfio_container_fd,
> -				VFIO_IOMMU_SPAPR_TCE_CREATE, create);
> -		}
> -#endif
> -		if (ret) {
> -			RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
> -					"error %i (%s)\n", errno, strerror(errno));
> -			return -1;
> +				VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
>   		}
>   	}
> -
> -	if (create->start_addr != 0) {
> -		RTE_LOG(ERR, EAL, "  DMA window start address != 0\n");
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
> +			"error %i (%s)\n", errno, strerror(errno));
> +		RTE_LOG(ERR, EAL, "  consider using a larger hugepage size if "
> +			"supported by the system\n");
>   		return -1;
>   	}
> 
> -	return 0;
> +	/* verify the start address */
> +	if (create.start_addr != 0) {
> +		RTE_LOG(ERR, EAL, "  received unsupported start address 0x%"
> +			PRIx64 "\n", (uint64_t)create.start_addr);
> +		return -1;
> +	}
> +	return ret;
>   }
> 
>   static int
> -vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
> -		uint64_t len, int do_map)
> +vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr,
> +	uint64_t iova, uint64_t len, int do_map)
>   {
> -	struct spapr_walk_param param;
> -	struct vfio_iommu_spapr_tce_create create = {
> -		.argsz = sizeof(create),
> -	};
> -	struct vfio_config *vfio_cfg;
> -	struct user_mem_maps *user_mem_maps;
> -	int i, ret = 0;
> -
> -	vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd);
> -	if (vfio_cfg == NULL) {
> -		RTE_LOG(ERR, EAL, "  invalid container fd!\n");
> -		return -1;
> -	}
> -
> -	user_mem_maps = &vfio_cfg->mem_maps;
> -	rte_spinlock_recursive_lock(&user_mem_maps->lock);
> -
> -	/* check if window size needs to be adjusted */
> -	memset(&param, 0, sizeof(param));
> -
> -	/* we're inside a callback so use thread-unsafe version */
> -	if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
> -				&param) < 0) {
> -		RTE_LOG(ERR, EAL, "Could not get window size\n");
> -		ret = -1;
> -		goto out;
> -	}
> -
> -	/* also check user maps */
> -	for (i = 0; i < user_mem_maps->n_maps; i++) {
> -		uint64_t max = user_mem_maps->maps[i].iova +
> -				user_mem_maps->maps[i].len;
> -		param.window_size = RTE_MAX(param.window_size, max);
> -	}
> -
> -	/* sPAPR requires window size to be a power of 2 */
> -	create.window_size = rte_align64pow2(param.window_size);
> -	create.page_shift = __builtin_ctzll(param.hugepage_sz);
> -	create.levels = 1;
> +	int ret = 0;
> 
>   	if (do_map) {
> -		/* re-create window and remap the entire memory */
> -		if (iova + len > create.window_size) {
> -			/* release all maps before recreating the window */
> -			if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk,
> -					&vfio_container_fd) < 0) {
> -				RTE_LOG(ERR, EAL, "Could not release DMA maps\n");
> -				ret = -1;
> -				goto out;
> -			}
> -			/* release all user maps */
> -			for (i = 0; i < user_mem_maps->n_maps; i++) {
> -				struct user_mem_map *map =
> -						&user_mem_maps->maps[i];
> -				if (vfio_spapr_dma_do_map(vfio_container_fd,
> -						map->addr, map->iova, map->len,
> -						0)) {
> -					RTE_LOG(ERR, EAL, "Could not release user DMA maps\n");
> -					ret = -1;
> -					goto out;
> -				}
> -			}
> -			create.window_size = rte_align64pow2(iova + len);
> -			if (vfio_spapr_create_new_dma_window(vfio_container_fd,
> -					&create) < 0) {
> -				RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
> -				ret = -1;
> -				goto out;
> -			}
> -			/* we're inside a callback, so use thread-unsafe version
> -			 */
> -			if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
> -					&vfio_container_fd) < 0) {
> -				RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
> -				ret = -1;
> -				goto out;
> -			}
> -			/* remap all user maps */
> -			for (i = 0; i < user_mem_maps->n_maps; i++) {
> -				struct user_mem_map *map =
> -						&user_mem_maps->maps[i];
> -				if (vfio_spapr_dma_do_map(vfio_container_fd,
> -						map->addr, map->iova, map->len,
> -						1)) {
> -					RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n");
> -					ret = -1;
> -					goto out;
> -				}
> -			}
> -		}
> -		if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) {
> +		if (vfio_spapr_dma_do_map(vfio_container_fd,
> +			vaddr, iova, len, 1)) {
>   			RTE_LOG(ERR, EAL, "Failed to map DMA\n");
>   			ret = -1;
> -			goto out;
>   		}
>   	} else {
> -		/* for unmap, check if iova within DMA window */
> -		if (iova > create.window_size) {
> -			RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap");
> +		if (vfio_spapr_dma_do_map(vfio_container_fd,
> +			vaddr, iova, len, 0)) {
> +			RTE_LOG(ERR, EAL, "Failed to unmap DMA\n");
>   			ret = -1;
> -			goto out;
>   		}
> -
> -		vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
>   	}
> -out:
> -	rte_spinlock_recursive_unlock(&user_mem_maps->lock);
> +
>   	return ret;
>   }
> 
>   static int
>   vfio_spapr_dma_map(int vfio_container_fd)
>   {
> -	struct vfio_iommu_spapr_tce_create create = {
> -		.argsz = sizeof(create),
> -	};
> -	struct spapr_walk_param param;
> -
> -	memset(&param, 0, sizeof(param));
> -
> -	/* create DMA window from 0 to max(phys_addr + len) */
> -	rte_memseg_walk(vfio_spapr_window_size_walk, &param);
> -
> -	/* sPAPR requires window size to be a power of 2 */
> -	create.window_size = rte_align64pow2(param.window_size);
> -	create.page_shift = __builtin_ctzll(param.hugepage_sz);
> -	create.levels = 1;
> -
> -	if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) {
> -		RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
> +	if (vfio_spapr_create_dma_window(vfio_container_fd) < 0) {
> +		RTE_LOG(ERR, EAL, "Could not create new DMA window!\n");
>   		return -1;
>   	}
> 
> -	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> +	/* map all existing DPDK segments for DMA */
>   	if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
>   		return -1;
> 
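> 
> For readers less familiar with the VFIO SPAPR interface, the one-time
> window setup above reduces to the following sequence (a condensed
> sketch of vfio_spapr_create_dma_window() from the patch, with logging
> and the multi-level create retry trimmed):
> 
> 	#include <stdint.h>
> 	#include <sys/ioctl.h>
> 	#include <linux/vfio.h>
> 
> 	static int setup_spapr_window(int container_fd, uint64_t win_len,
> 			uint64_t page_sz)
> 	{
> 		struct vfio_iommu_spapr_tce_info info =
> 			{ .argsz = sizeof(info) };
> 		struct vfio_iommu_spapr_tce_remove remove =
> 			{ .argsz = sizeof(remove) };
> 		struct vfio_iommu_spapr_tce_create create =
> 			{ .argsz = sizeof(create) };
> 
> 		if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info))
> 			return -1;
> 
> 		/* drop the default 2GB, 4KB-page window at address 0 */
> 		remove.start_addr = info.dma32_window_start;
> 		if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove))
> 			return -1;
> 
> 		/* create the single, statically sized replacement window */
> 		create.window_size = win_len;
> 		create.page_shift = __builtin_ctzll(page_sz);
> 		create.levels = 1;
> 		if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create))
> 			return -1;
> 
> 		/* the mapping code assumes the window starts at IOVA 0 */
> 		return create.start_addr == 0 ? 0 : -1;
> 	}
> 
> Once the window exists, each map call is a fixed two-step pair of
> VFIO_IOMMU_SPAPR_REGISTER_MEMORY followed by VFIO_IOMMU_MAP_DMA, with
> only a bounds check against the static window length.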

