[dpdk-dev] [RFC PATCH] Enable uio_pci_generic support

Danny Zhou danny.zhou at intel.com
Mon Oct 27 18:48:02 CET 2014


Linux kernel provides UIO as well as VFIO mechanism to support writing user
space device driver. Comparing to UIO which is available since 2.6.32 kernel,
the VFIO is introduced into kernel since version 3.6.0 with better interrupt
and memory protection (build on top of Intel VT-d technology) supports.
Basically, UIO and VFIO do two common things below:
1) Map PCIe device's I/O memory space to user space driver
2) Support device interrupt notification mechanism allows user space
   driver/application is notified when device interrupt triggers.

To run an DPDK application and make use of VFIO, two in_kernel modules
vfio and vfio-pci module must be loaded. But to use UIO, a DPDK kernel
module igb_uio, which was there since DPDK is invented, must be loaded to
attach to in_kernel uio module. As an possible solution to deprecate igb_uio, 
this RFC patch leverages uio_pci_generic this in_kernel module to support
DPDK user space PMD in a generic fashion (like how VFIO works today), to
partially (DPDK KNI module rte_kni still sits in kernel) remove DPDK dependency
on GPL code igb_uio in kernel. Tests with uio_pci_generic shows performance
remains same and LSC interrupts works as before.

Example to bind Network Ports to uio_pci_generic:
	modprobe uio
	modprobe uio_pci_generic
	/* to bind device 08:00.0, to the uio_pci_generic driver */
	./tools/dpdk_nic_bind.py -b uio_pci_generic 08:00.0

Note: this RFC patch does not completely remove igb_uio support due to
igb_uio supports creating maximum number of SR-IOV VFs (Virtual Functions)
by using max_vfs kernel parameter on older kernels (kernel 3.7.x and below).
Specifically, igb_uio explicitly calls pci_enable_sriov() to create VFs, while
it is not invoked in either uio or uio_pci_generic. On kernel 3.8.x
and above, user can use the standard sysfs to enable VFs. For examples:

#echo $num_vf_enabled > /sys/class/net/$dev/device/sriov_numvfs   //enable VFs
#echo 0 > /sys/class/net/$dev/device/sriov_numvfs                 //disable VFs

---
 lib/librte_eal/common/include/rte_pci.h            |   1 +
 lib/librte_eal/linuxapp/eal/eal_interrupts.c       |  67 +++++--
 lib/librte_eal/linuxapp/eal/eal_pci.c              |   2 +-
 lib/librte_eal/linuxapp/eal/eal_pci_uio.c          | 203 ++++++++++++++++-----
 .../linuxapp/eal/include/exec-env/rte_interrupts.h |   1 +
 tools/dpdk_nic_bind.py                             |   2 +-
 6 files changed, 214 insertions(+), 62 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h
index 66ed793..71ca882 100644
--- a/lib/librte_eal/common/include/rte_pci.h
+++ b/lib/librte_eal/common/include/rte_pci.h
@@ -148,6 +148,7 @@ struct rte_pci_device {
 	struct rte_pci_id id;                   /**< PCI ID. */
 	struct rte_pci_resource mem_resource[PCI_MAX_RESOURCE];   /**< PCI Memory Resource */
 	struct rte_intr_handle intr_handle;     /**< Interrupt handle */
+	char driver_name[BUFSIZ];               /**< driver name */
 	const struct rte_pci_driver *driver;    /**< Associated driver */
 	uint16_t max_vfs;                       /**< sriov enable if not zero */
 	int numa_node;                          /**< NUMA node connection */
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
index dc2668a..f2da778 100644
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
+++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
@@ -361,6 +361,53 @@ vfio_disable_msix(struct rte_intr_handle *intr_handle) {
 }
 #endif
 
+static int
+uio_intr_enable(struct rte_intr_handle *intr_handle)
+{
+	unsigned char command_high;
+
+	/* use config file descriptor for uio_pci_generic */
+	if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+		RTE_LOG(ERR, EAL,
+			"Error reading interrupts status for fd %d\n",
+			intr_handle->uio_cfg_fd);
+		return -1;
+	}
+	/* disable interrupts */
+	command_high |= 0x4;
+	if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+		RTE_LOG(ERR, EAL,
+			"Error disabling interrupts for fd %d\n",
+			intr_handle->uio_cfg_fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+uio_intr_disable(struct rte_intr_handle *intr_handle)
+{
+	unsigned char command_high;
+
+	if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+		RTE_LOG(ERR, EAL,
+			"Error reading interrupts status for fd %d\n",
+			intr_handle->uio_cfg_fd);
+		return -1;
+	}
+	/* enable interrupts */
+	command_high &= ~0x4;
+	if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+		RTE_LOG(ERR, EAL,
+			"Error enabling interrupts for fd %d\n",
+			intr_handle->uio_cfg_fd);
+		return -1;
+	}
+
+	return 0;
+}
+
 int
 rte_intr_callback_register(struct rte_intr_handle *intr_handle,
 			rte_intr_callback_fn cb, void *cb_arg)
@@ -500,20 +547,14 @@ rte_intr_callback_unregister(struct rte_intr_handle *intr_handle,
 int
 rte_intr_enable(struct rte_intr_handle *intr_handle)
 {
-	const int value = 1;
-
-	if (!intr_handle || intr_handle->fd < 0)
+	if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd)
 		return -1;
 
 	switch (intr_handle->type){
 	/* write to the uio fd to enable the interrupt */
 	case RTE_INTR_HANDLE_UIO:
-		if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
-			RTE_LOG(ERR, EAL,
-				"Error enabling interrupts for fd %d\n",
-							intr_handle->fd);
+		if (uio_intr_enable(intr_handle))
 			return -1;
-		}
 		break;
 	/* not used at this moment */
 	case RTE_INTR_HANDLE_ALARM:
@@ -546,20 +587,14 @@ rte_intr_enable(struct rte_intr_handle *intr_handle)
 int
 rte_intr_disable(struct rte_intr_handle *intr_handle)
 {
-	const int value = 0;
-
-	if (!intr_handle || intr_handle->fd < 0)
+	if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd)
 		return -1;
 
 	switch (intr_handle->type){
 	/* write to the uio fd to disable the interrupt */
 	case RTE_INTR_HANDLE_UIO:
-		if (write(intr_handle->fd, &value, sizeof(value)) < 0){
-			RTE_LOG(ERR, EAL,
-				"Error disabling interrupts for fd %d\n",
-							intr_handle->fd);
+		if (uio_intr_disable(intr_handle))
 			return -1;
-		}
 		break;
 	/* not used at this moment */
 	case RTE_INTR_HANDLE_ALARM:
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index 5fe3961..5c73908 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -498,7 +498,7 @@ pci_map_device(struct rte_pci_device *dev)
 			return ret;
 	}
 #endif
-	/* map resources for devices that use igb_uio */
+	/* map resources for devices that use uio_pci_generic or igb_uio */
 	if (!mapped) {
 		ret = pci_uio_map_resource(dev);
 		if (ret != 0)
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
index 7e62266..4b159e2 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
@@ -35,6 +35,7 @@
 #include <fcntl.h>
 #include <dirent.h>
 #include <sys/stat.h>
+#include <linux/pci_regs.h>
 
 #include <rte_log.h>
 #include <rte_pci.h>
@@ -46,72 +47,130 @@
 #include "eal_filesystem.h"
 #include "eal_pci_init.h"
 
+#define IORESOURCE_MEM        0x00000200
+#define UIO_PCI_GENERIC_NAME  "uio_pci_generic"
+
 static int pci_parse_sysfs_value(const char *filename, uint64_t *val);
 
 
 #define OFF_MAX              ((uint64_t)(off_t)-1)
 static int
-pci_uio_get_mappings(const char *devname, struct pci_map maps[], int nb_maps)
+pci_uio_get_mappings(struct rte_pci_device *dev, const char *devname,
+				struct pci_map maps[], int nb_maps)
 {
-	int i;
+	struct rte_pci_addr *loc = &dev->addr;
+	int i = 0;
 	char dirname[PATH_MAX];
 	char filename[PATH_MAX];
 	uint64_t offset, size;
+	unsigned long long start_addr, end_addr, flags;
+	FILE *f;
 
-	for (i = 0; i != nb_maps; i++) {
-
-		/* check if map directory exists */
-		snprintf(dirname, sizeof(dirname),
-			"%s/maps/map%u", devname, i);
-
-		if (access(dirname, F_OK) != 0)
-			break;
-
-		/* get mapping offset */
+	if (strncmp(dev->driver_name, UIO_PCI_GENERIC_NAME,
+			strlen(UIO_PCI_GENERIC_NAME)) == 0) {
 		snprintf(filename, sizeof(filename),
-			"%s/offset", dirname);
-		if (pci_parse_sysfs_value(filename, &offset) < 0) {
-			RTE_LOG(ERR, EAL,
-				"%s(): cannot parse offset of %s\n",
-				__func__, dirname);
-			return -1;
-		}
+			SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource",
+			loc->domain, loc->bus, loc->devid, loc->function);
 
-		/* get mapping size */
-		snprintf(filename, sizeof(filename),
-			"%s/size", dirname);
-		if (pci_parse_sysfs_value(filename, &size) < 0) {
+		f = fopen(filename, "r");
+		if (f == NULL) {
 			RTE_LOG(ERR, EAL,
-				"%s(): cannot parse size of %s\n",
-				__func__, dirname);
+			"%s(): cannot open sysfs %s\n",
+			__func__, filename);
 			return -1;
 		}
 
-		/* get mapping physical address */
-		snprintf(filename, sizeof(filename),
-			"%s/addr", dirname);
-		if (pci_parse_sysfs_value(filename, &maps[i].phaddr) < 0) {
-			RTE_LOG(ERR, EAL,
-				"%s(): cannot parse addr of %s\n",
-				__func__, dirname);
-			return -1;
+		while (fscanf(f, "%llx %llx %llx", &start_addr,
+				&end_addr, &flags) == 3 && i < nb_maps) {
+			if (flags & IORESOURCE_MEM) {
+				maps[i].offset = 0x0;
+				maps[i].size = end_addr - start_addr + 1;
+				maps[i].phaddr = start_addr;
+				i++;
+			}
 		}
+		fclose(f);
+	} else {
+		for (i = 0; i != nb_maps; i++) {
+			/* check if map directory exists */
+			snprintf(dirname, sizeof(dirname),
+				"%s/maps/map%u", devname, i);
+
+			if (access(dirname, F_OK) != 0)
+				break;
+
+			/* get mapping offset */
+			snprintf(filename, sizeof(filename),
+				"%s/offset", dirname);
+			if (pci_parse_sysfs_value(filename, &offset) < 0) {
+				RTE_LOG(ERR, EAL,
+					"%s(): cannot parse offset of %s\n",
+					__func__, dirname);
+				return -1;
+			}
 
-		if ((offset > OFF_MAX) || (size > SIZE_MAX)) {
-			RTE_LOG(ERR, EAL,
-				"%s(): offset/size exceed system max value\n",
-				__func__);
-			return -1;
-		}
+			/* get mapping size */
+			snprintf(filename, sizeof(filename),
+				"%s/size", dirname);
+			if (pci_parse_sysfs_value(filename, &size) < 0) {
+				RTE_LOG(ERR, EAL,
+					"%s(): cannot parse size of %s\n",
+					__func__, dirname);
+				return -1;
+			}
 
-		maps[i].offset = offset;
-		maps[i].size = size;
+			/* get mapping physical address */
+			snprintf(filename, sizeof(filename),
+				"%s/addr", dirname);
+			if (pci_parse_sysfs_value(filename,
+						&maps[i].phaddr) < 0) {
+				RTE_LOG(ERR, EAL,
+					"%s(): cannot parse addr of %s\n",
+					__func__, dirname);
+				return -1;
+			}
+
+			if ((offset > OFF_MAX) || (size > SIZE_MAX)) {
+				RTE_LOG(ERR, EAL,
+					"%s(): offset/size exceed system max value\n",
+					__func__);
+				return -1;
+			}
+
+			maps[i].offset = offset;
+			maps[i].size = size;
+		}
 	}
 
 	return i;
 }
 
 static int
+pci_uio_set_bus_master(int dev_fd)
+{
+	uint16_t reg;
+	int ret;
+
+	ret = pread(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
+	if (ret != sizeof(reg)) {
+		RTE_LOG(ERR, EAL,
+			"Cannot read command from PCI config space!\n");
+		return -1;
+	}
+
+	reg |= PCI_COMMAND_MASTER;
+
+	ret = pwrite(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
+	if (ret != sizeof(reg)) {
+		RTE_LOG(ERR, EAL,
+			"Cannot write command to PCI config space!\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
 pci_uio_map_secondary(struct rte_pci_device *dev)
 {
 	int fd, i;
@@ -210,6 +269,10 @@ pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
 	struct dirent *e;
 	DIR *dir;
 	char dirname[PATH_MAX];
+	FILE *f;
+	char fullpath[PATH_MAX];
+	char buf[BUFSIZ];
+	char *s;
 
 	/* depending on kernel version, uio can be located in uio/uioX
 	 * or uio:uioX */
@@ -265,6 +328,30 @@ pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
 	if (e == NULL)
 		return -1;
 
+	/* remember driver name of uio device */
+	snprintf(fullpath, sizeof(fullpath),
+			SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/uio/%s/name" ,
+			loc->domain, loc->bus, loc->devid,
+			loc->function, e->d_name);
+
+	f = fopen(fullpath, "r");
+	if (f == NULL) {
+		RTE_LOG(ERR, EAL,
+			"%s(): cannot open sysfs to get driver name\n",
+			__func__);
+		return -1;
+	}
+	s = fgets(buf, sizeof(buf), f);
+	if (s == NULL) {
+		RTE_LOG(ERR, EAL,
+			"%s(): cannot read sysfs to get driver name\n",
+			__func__);
+		fclose(f);
+		return -1;
+	}
+	strncpy(dev->driver_name, buf, sizeof(buf));
+	fclose(f);
+
 	/* create uio device if we've been asked to */
 	if (internal_config.create_uio_dev &&
 			pci_mknod_uio_dev(dstbuf, uio_num) < 0)
@@ -279,6 +366,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 {
 	int i, j;
 	char dirname[PATH_MAX];
+	char cfgname[PATH_MAX];
 	char devname[PATH_MAX]; /* contains the /dev/uioX */
 	void *mapaddr;
 	int uio_num;
@@ -291,6 +379,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 	struct pci_map *maps;
 
 	dev->intr_handle.fd = -1;
+	dev->intr_handle.uio_cfg_fd = -1;
 	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
 
 	/* secondary processes - use already recorded details */
@@ -315,6 +404,33 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 	}
 	dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
 
+	snprintf(cfgname, sizeof(cfgname),
+			"/sys/class/uio/uio%u/device/config", uio_num);
+	dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
+	if (dev->intr_handle.uio_cfg_fd < 0) {
+		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
+			cfgname, strerror(errno));
+		return -1;
+	}
+
+	/* handle Misc. stuff for uio_pci_generic  */
+	if (strncmp(dev->driver_name, UIO_PCI_GENERIC_NAME,
+			strlen(UIO_PCI_GENERIC_NAME)) == 0) {
+		/* update devname for uio_pci_generic  */
+		snprintf(devname, sizeof(devname),
+			SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource%d",
+			loc->domain, loc->bus, loc->devid, loc->function, 0);
+
+		/* set bus master that is not done by uio_pci_generic */
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) {
+				RTE_LOG(ERR, EAL,
+					"Cannot set up bus mastering!\n");
+				return -1;
+			}
+		}
+	}
+
 	/* allocate the mapping details for secondary processes*/
 	uio_res = rte_zmalloc("UIO_RES", sizeof(*uio_res), 0);
 	if (uio_res == NULL) {
@@ -327,13 +443,12 @@ pci_uio_map_resource(struct rte_pci_device *dev)
 	memcpy(&uio_res->pci_addr, &dev->addr, sizeof(uio_res->pci_addr));
 
 	/* collect info about device mappings */
-	nb_maps = pci_uio_get_mappings(dirname, uio_res->maps,
-				       RTE_DIM(uio_res->maps));
+	nb_maps = pci_uio_get_mappings(dev, dirname, uio_res->maps,
+						RTE_DIM(uio_res->maps));
 	if (nb_maps < 0) {
 		rte_free(uio_res);
 		return nb_maps;
 	}
-
 	uio_res->nb_maps = nb_maps;
 
 	/* Map all BARs */
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
index 23eafd9..151323d 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
@@ -51,6 +51,7 @@ enum rte_intr_handle_type {
 /** Handle for interrupts. */
 struct rte_intr_handle {
 	int vfio_dev_fd;                 /**< VFIO device file descriptor */
+	int uio_cfg_fd;                  /**< UIO config file descriptor */
 	int fd;                          /**< file descriptor */
 	enum rte_intr_handle_type type;  /**< handle type */
 };
diff --git a/tools/dpdk_nic_bind.py b/tools/dpdk_nic_bind.py
index 812b6a1..2483056 100755
--- a/tools/dpdk_nic_bind.py
+++ b/tools/dpdk_nic_bind.py
@@ -43,7 +43,7 @@ ETHERNET_CLASS = "0200"
 # Each device within this is itself a dictionary of device properties
 devices = {}
 # list of supported DPDK drivers
-dpdk_drivers = [ "igb_uio", "vfio-pci" ]
+dpdk_drivers = [ "igb_uio", "vfio-pci", "uio_pci_generic" ]
 
 # command-line arg flags
 b_flag = None
-- 
1.8.1.4



More information about the dev mailing list