[dpdk-dev] [RFC 3/3] bus/pci: add mdev support
Tiwei Bie
tiwei.bie at intel.com
Wed Apr 3 09:18:44 CEST 2019
This patch adds the mdev support in PCI bus driver. A mdev
driver is introduced to probe the mdev devices whose device
API is "vfio-pci" on the mdev bus.
PS. There are some hacks in this patch for now.
Signed-off-by: Cunming Liang <cunming.liang at intel.com>
Signed-off-by: Tiwei Bie <tiwei.bie at intel.com>
---
drivers/bus/pci/Makefile | 3 +
drivers/bus/pci/linux/Makefile | 4 +
drivers/bus/pci/linux/pci_vfio.c | 35 ++-
drivers/bus/pci/linux/pci_vfio_mdev.c | 305 ++++++++++++++++++++++++++
drivers/bus/pci/meson.build | 4 +-
drivers/bus/pci/pci_common.c | 17 +-
drivers/bus/pci/private.h | 9 +
drivers/bus/pci/rte_bus_pci.h | 11 +-
8 files changed, 370 insertions(+), 18 deletions(-)
create mode 100644 drivers/bus/pci/linux/pci_vfio_mdev.c
diff --git a/drivers/bus/pci/Makefile b/drivers/bus/pci/Makefile
index de53ce1bf..085ec9066 100644
--- a/drivers/bus/pci/Makefile
+++ b/drivers/bus/pci/Makefile
@@ -27,6 +27,9 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
LDLIBS += -lrte_ethdev -lrte_pci -lrte_kvargs
+ifeq ($(CONFIG_RTE_LIBRTE_MDEV_BUS),y)
+LDLIBS += -lrte_bus_mdev
+endif
include $(RTE_SDK)/drivers/bus/pci/$(SYSTEM)/Makefile
SRCS-$(CONFIG_RTE_LIBRTE_PCI_BUS) := $(addprefix $(SYSTEM)/,$(SRCS))
diff --git a/drivers/bus/pci/linux/Makefile b/drivers/bus/pci/linux/Makefile
index 90404468b..88bbc2390 100644
--- a/drivers/bus/pci/linux/Makefile
+++ b/drivers/bus/pci/linux/Makefile
@@ -4,3 +4,7 @@
SRCS += pci.c
SRCS += pci_uio.c
SRCS += pci_vfio.c
+
+ifeq ($(CONFIG_RTE_LIBRTE_MDEV_BUS),y)
+ SRCS += pci_vfio_mdev.c
+endif
diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index ebf6ccd3c..c2c4c6a50 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -13,6 +13,9 @@
#include <rte_log.h>
#include <rte_pci.h>
+#ifdef RTE_LIBRTE_MDEV_BUS
+#include <rte_bus_mdev.h>
+#endif
#include <rte_bus_pci.h>
#include <rte_eal_memconfig.h>
#include <rte_malloc.h>
@@ -20,6 +23,7 @@
#include <rte_eal.h>
#include <rte_bus.h>
#include <rte_spinlock.h>
+#include <rte_uuid.h>
#include "eal_filesystem.h"
@@ -648,6 +652,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
char pci_addr[PATH_MAX] = {0};
+ const char *sysfs_path;
int vfio_dev_fd;
struct rte_pci_addr *loc = &dev->addr;
int i, ret;
@@ -663,10 +668,20 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
#endif
/* store PCI address string */
- snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ if (dev->use_uuid) {
+#ifdef RTE_LIBRTE_MDEV_BUS
+ sysfs_path = rte_mdev_get_sysfs_path();
+ rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr));
+#else
+ return -1;
+#endif
+ } else {
+ sysfs_path = rte_pci_get_sysfs_path();
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
loc->domain, loc->bus, loc->devid, loc->function);
+ }
- ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+ ret = rte_vfio_setup_device(sysfs_path, pci_addr,
&vfio_dev_fd, &device_info);
if (ret)
return ret;
@@ -793,6 +808,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
{
struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
char pci_addr[PATH_MAX] = {0};
+ const char *sysfs_path;
int vfio_dev_fd;
struct rte_pci_addr *loc = &dev->addr;
int i, ret;
@@ -808,8 +824,19 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
#endif
/* store PCI address string */
- snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+ if (dev->use_uuid) {
+#ifdef RTE_LIBRTE_MDEV_BUS
+ sysfs_path = rte_mdev_get_sysfs_path();
+ rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr));
+#else
+ return -1;
+#endif
+ } else {
+ sysfs_path = rte_pci_get_sysfs_path();
+ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
loc->domain, loc->bus, loc->devid, loc->function);
+ }
+
/* if we're in a secondary process, just find our tailq entry */
TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
@@ -825,7 +852,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
return -1;
}
- ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+ ret = rte_vfio_setup_device(sysfs_path, pci_addr,
&vfio_dev_fd, &device_info);
if (ret)
return ret;
diff --git a/drivers/bus/pci/linux/pci_vfio_mdev.c b/drivers/bus/pci/linux/pci_vfio_mdev.c
new file mode 100644
index 000000000..92498c2fe
--- /dev/null
+++ b/drivers/bus/pci/linux/pci_vfio_mdev.c
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#include <string.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/pci_regs.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+#include <rte_devargs.h>
+#include <rte_memcpy.h>
+#include <rte_vfio.h>
+#include <rte_bus_mdev.h>
+
+#include "eal_private.h"
+#include "eal_filesystem.h"
+
+#include "private.h"
+
+extern struct rte_pci_bus rte_pci_bus;
+
+static int
+get_pci_id(const char *sysfs_base, const char *dev_addr,
+ struct rte_pci_id *pci_id)
+{
+ int ret = 0;
+ int iommu_group_num;
+ int vfio_group_fd;
+ int vfio_dev_fd;
+ int container;
+ int class;
+ char name[PATH_MAX];
+ struct vfio_group_status group_status = {
+ .argsz = sizeof(group_status) };
+
+ container = open("/dev/vfio/vfio", O_RDWR);
+ if (container < 0) {
+ RTE_LOG(WARNING, EAL, "Failed to open VFIO container\n");
+ ret = -1;
+ goto out;
+ }
+
+ if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
+ /* Unknown API version */
+ RTE_LOG(WARNING, EAL, "Unknown VFIO API version\n");
+ ret = -1;
+ goto close_container;
+ }
+
+ if (rte_vfio_get_group_num(sysfs_base, dev_addr,
+ &iommu_group_num) <= 0) {
+ RTE_LOG(WARNING, EAL, "%s not managed by VFIO driver\n",
+ dev_addr);
+ ret = -1;
+ goto close_container;
+ }
+
+ snprintf(name, sizeof(name), "/dev/vfio/%d", iommu_group_num);
+
+ vfio_group_fd = open(name, O_RDWR);
+ if (vfio_group_fd < 0) {
+ ret = -1;
+ goto close_container;
+ }
+
+ /* if group_fd == 0, that means the device isn't managed by VFIO */
+ if (vfio_group_fd == 0) {
+ RTE_LOG(WARNING, EAL, "%s not managed by VFIO driver\n",
+ dev_addr);
+ ret = -1;
+ goto close_group;
+ }
+
+ if (ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+ RTE_LOG(ERR, EAL, "%s cannot get group status, error %i (%s)\n",
+ dev_addr, errno, strerror(errno));
+ ret = -1;
+ goto close_group;
+ }
+
+ if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+ RTE_LOG(ERR, EAL, "%s VFIO group is not viable!\n", dev_addr);
+ ret = -1;
+ goto close_group;
+ }
+
+ if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+ if (ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+ &container)) {
+ RTE_LOG(ERR, EAL, "%s cannot add VFIO group to container, error %i (%s)\n",
+ dev_addr, errno, strerror(errno));
+ ret = -1;
+ goto close_group;
+ }
+ }
+
+ if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
+ RTE_LOG(ERR, EAL, "%s cannot set iommu, error %i (%s)\n",
+ dev_addr, errno, strerror(errno));
+ ret = -1;
+ goto close_group;
+ }
+
+ vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
+ if (vfio_dev_fd < 0) {
+ /* if we cannot get a device fd, this implies a problem with
+ * the VFIO group or the container not having IOMMU configured.
+ */
+ RTE_LOG(ERR, EAL, "Getting a vfio_dev_fd for %s failed errno %d\n",
+ dev_addr, errno);
+ ret = -1;
+ goto close_group;
+ }
+
+ /* vendor_id */
+ if (pread64(vfio_dev_fd, &pci_id->vendor_id, sizeof(uint16_t),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_VENDOR_ID) != sizeof(uint16_t)) {
+ RTE_LOG(ERR, EAL, "Cannot read VendorID from PCI config space\n");
+ ret = -1;
+ goto close_device;
+ }
+
+ /* device_id */
+ if (pread64(vfio_dev_fd, &pci_id->device_id, sizeof(uint16_t),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_DEVICE_ID) != sizeof(uint16_t)) {
+ RTE_LOG(ERR, EAL, "Cannot read DeviceID from PCI config space\n");
+ ret = -1;
+ goto close_device;
+ }
+
+ /* subsystem_vendor_id */
+ if (pread64(vfio_dev_fd, &pci_id->subsystem_vendor_id, sizeof(uint16_t),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_SUBSYSTEM_VENDOR_ID) != sizeof(uint16_t)) {
+ RTE_LOG(ERR, EAL, "Cannot read SubVendorID from PCI config space\n");
+ ret = -1;
+ goto close_device;
+ }
+
+ /* subsystem_device_id */
+ if (pread64(vfio_dev_fd, &pci_id->subsystem_device_id, sizeof(uint16_t),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_SUBSYSTEM_ID) != sizeof(uint16_t)) {
+ RTE_LOG(ERR, EAL, "Cannot read SubDeviceID from PCI config space\n");
+ ret = -1;
+ goto close_device;
+ }
+
+ /* class_id */
+ if (pread64(vfio_dev_fd, &class, sizeof(uint32_t),
+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+ PCI_CLASS_REVISION) != sizeof(uint32_t)) {
+ RTE_LOG(ERR, EAL, "Cannot read ClassID from PCI config space\n");
+ ret = -1;
+ goto close_device;
+ }
+ pci_id->class_id = class >> 8;
+
+close_device:
+ if (close(vfio_dev_fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing VFIO device for %s\n",
+ dev_addr);
+ ret = -1;
+ }
+
+close_group:
+ if (close(vfio_group_fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing VFIO group for %s\n",
+ dev_addr);
+ ret = -1;
+ }
+
+close_container:
+ if (close(container) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing VFIO container\n");
+ ret = -1;
+ }
+
+out:
+ return ret;
+}
+
+static int vfio_pci_probe(struct rte_mdev_driver *mdev_drv __rte_unused,
+ struct rte_mdev_device *mdev_dev)
+{
+ char name[RTE_UUID_STRLEN];
+ struct rte_pci_device *dev;
+ struct rte_bus *bus;
+ int ret;
+
+ bus = rte_bus_find_by_name("pci");
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus pci\n");
+ return -ENOENT;
+ }
+
+ if (bus->plug == NULL) {
+ RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n",
+ bus->name);
+ return -ENOTSUP;
+ }
+
+ dev = malloc(sizeof(*dev));
+ if (dev == NULL)
+ return -ENOMEM;
+
+ memset(dev, 0, sizeof(*dev));
+ dev->device.bus = &rte_pci_bus.bus;
+ rte_uuid_unparse(mdev_dev->addr, name, sizeof(name));
+
+ if (get_pci_id(rte_mdev_get_sysfs_path(), name, &dev->id)) {
+ free(dev);
+ return -1;
+ }
+
+ snprintf(dev->name, sizeof(dev->name), "%s", name);
+ dev->device.name = dev->name;
+ dev->kdrv = RTE_KDRV_VFIO;
+ dev->use_uuid = 1;
+ rte_uuid_copy(dev->uuid, mdev_dev->addr);
+
+ // TODO: dev->device.devargs, etc
+
+ memset(&dev->addr, -1, sizeof(dev->addr)); // XXX: TODO
+
+ /* device is valid, add to the list (sorted) */
+ if (TAILQ_EMPTY(&rte_pci_bus.device_list)) {
+ rte_pci_add_device(dev);
+ } else {
+ struct rte_pci_device *dev2;
+ int ret;
+
+ TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
+ // XXX
+ ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr);
+ if (ret == 0)
+ ret = strncmp(dev->name, dev2->name,
+ sizeof(dev->name));
+ if (ret > 0)
+ continue;
+ if (ret < 0) {
+ rte_pci_insert_device(dev2, dev);
+ goto plug;
+ }
+ /* already registered */
+ free(dev);
+ return 0;
+ }
+
+ rte_pci_add_device(dev);
+ }
+
+plug:
+ ret = bus->plug(&dev->device);
+ if (ret != 0) {
+ rte_pci_remove_device(dev);
+ free(dev);
+ } else {
+ mdev_dev->private = dev;
+ }
+ return ret;
+}
+
+static int vfio_pci_remove(struct rte_mdev_device *mdev_dev)
+{
+ struct rte_pci_device *dev = mdev_dev->private;
+ struct rte_bus *bus;
+ int ret;
+
+ if (dev == NULL)
+ return 0;
+
+ bus = rte_bus_find_by_name("pci");
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus pci\n");
+ return -ENOENT;
+ }
+
+ if (bus->unplug == NULL) {
+ RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n",
+ bus->name);
+ return -ENOTSUP;
+ }
+
+ ret = bus->unplug(&dev->device);
+ if (ret == 0)
+ mdev_dev->private = NULL;
+
+ return ret;
+}
+
+static struct rte_mdev_driver vfio_pci_drv = {
+ .dev_api = RTE_MDEV_DEV_API_VFIO_PCI,
+ .probe = vfio_pci_probe,
+ .remove = vfio_pci_remove
+};
+
+RTE_MDEV_REGISTER_DRIVER(mdev_vfio_pci, vfio_pci_drv);
diff --git a/drivers/bus/pci/meson.build b/drivers/bus/pci/meson.build
index a3140ff97..c3e884657 100644
--- a/drivers/bus/pci/meson.build
+++ b/drivers/bus/pci/meson.build
@@ -11,8 +11,10 @@ sources = files('pci_common.c',
if host_machine.system() == 'linux'
sources += files('linux/pci.c',
'linux/pci_uio.c',
- 'linux/pci_vfio.c')
+ 'linux/pci_vfio.c',
+ 'linux/pci_vfio_mdev.c')
includes += include_directories('linux')
+ deps += ['bus_mdev']
else
sources += files('bsd/pci.c')
includes += include_directories('bsd')
diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
index 704b9d71a..6b47333e6 100644
--- a/drivers/bus/pci/pci_common.c
+++ b/drivers/bus/pci/pci_common.c
@@ -124,21 +124,17 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
{
int ret;
bool already_probed;
- struct rte_pci_addr *loc;
if ((dr == NULL) || (dev == NULL))
return -EINVAL;
- loc = &dev->addr;
-
/* The device is not blacklisted; Check if driver supports it */
if (!rte_pci_match(dr, dev))
/* Match of device and driver failed */
return 1;
- RTE_LOG(INFO, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n",
- loc->domain, loc->bus, loc->devid, loc->function,
- dev->device.numa_node);
+ RTE_LOG(INFO, EAL, "PCI device %s on NUMA socket %i\n",
+ dev->name, dev->device.numa_node);
/* no initialization when blacklisted, return without error */
if (dev->device.devargs != NULL &&
@@ -208,7 +204,6 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
static int
rte_pci_detach_dev(struct rte_pci_device *dev)
{
- struct rte_pci_addr *loc;
struct rte_pci_driver *dr;
int ret = 0;
@@ -216,11 +211,9 @@ rte_pci_detach_dev(struct rte_pci_device *dev)
return -EINVAL;
dr = dev->driver;
- loc = &dev->addr;
- RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n",
- loc->domain, loc->bus, loc->devid,
- loc->function, dev->device.numa_node);
+ RTE_LOG(DEBUG, EAL, "PCI device %s on NUMA socket %i\n",
+ dev->name, dev->device.numa_node);
RTE_LOG(DEBUG, EAL, " remove driver: %x:%x %s\n", dev->id.vendor_id,
dev->id.device_id, dr->driver.name);
@@ -387,7 +380,7 @@ rte_pci_insert_device(struct rte_pci_device *exist_pci_dev,
}
/* Remove a device from PCI bus */
-static void
+void
rte_pci_remove_device(struct rte_pci_device *pci_dev)
{
TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next);
diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
index 13c3324bb..d5815ee44 100644
--- a/drivers/bus/pci/private.h
+++ b/drivers/bus/pci/private.h
@@ -67,6 +67,15 @@ void rte_pci_add_device(struct rte_pci_device *pci_dev);
void rte_pci_insert_device(struct rte_pci_device *exist_pci_dev,
struct rte_pci_device *new_pci_dev);
+/**
+ * Remove a PCI device from the PCI Bus.
+ *
+ * @param pci_dev
+ * PCI device to remove
+ * @return void
+ */
+void rte_pci_remove_device(struct rte_pci_device *pci_dev);
+
/**
* Update a pci device object by asking the kernel for the latest information.
*
diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
index 06e004cd3..465a44935 100644
--- a/drivers/bus/pci/rte_bus_pci.h
+++ b/drivers/bus/pci/rte_bus_pci.h
@@ -51,6 +51,13 @@ TAILQ_HEAD(rte_pci_driver_list, rte_pci_driver);
struct rte_devargs;
+/* It's RTE_UUID_STRLEN, which is bigger than PCI_PRI_STR_SIZE. */
+#define RTE_PCI_NAME_LEN (36 + 1)
+
+// XXX: we can't include rte_uuid.h directly due to the conflicts
+// introduced by stdbool.h
+typedef unsigned char rte_uuid_t[16];
+
/**
* A structure describing a PCI device.
*/
@@ -58,6 +65,8 @@ struct rte_pci_device {
TAILQ_ENTRY(rte_pci_device) next; /**< Next probed PCI device. */
struct rte_device device; /**< Inherit core device */
struct rte_pci_addr addr; /**< PCI location. */
+ rte_uuid_t uuid; /**< Mdev location. */
+ uint8_t use_uuid; /**< True if uuid field valid. */
struct rte_pci_id id; /**< PCI ID. */
struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
/**< PCI Memory Resource */
@@ -65,7 +74,7 @@ struct rte_pci_device {
struct rte_pci_driver *driver; /**< PCI driver used in probing */
uint16_t max_vfs; /**< sriov enable if not zero */
enum rte_kernel_driver kdrv; /**< Kernel driver passthrough */
- char name[PCI_PRI_STR_SIZE+1]; /**< PCI location (ASCII) */
+ char name[RTE_PCI_NAME_LEN]; /**< PCI/Mdev location (ASCII) */
struct rte_intr_handle vfio_req_intr_handle;
/**< Handler of VFIO request interrupt */
};
--
2.17.1
More information about the dev
mailing list