[dpdk-dev] [PATCH v7 1/2] eal: add uevent monitor for hot plug
Matan Azrad
matan at mellanox.com
Tue Jan 2 18:02:55 CET 2018
Hi Jeff
I may be revisiting points from previous discussions, but please see some comments/questions below.
From: Jeff Guo:
> This patch aim to add a general uevent mechanism in eal device layer,
> to enable all linux kernel object hot plug monitoring, so user could use these
> APIs to monitor and read out the device status info that sent from the kernel
> side, then corresponding to handle it, such as detach or attach the
> device, and even benefit to use it to do smoothly fail safe work.
>
> 1) About uevent monitoring:
> a: add one epolling to poll the netlink socket, to monitor the uevent of
> the device, add device_state in struct of rte_device, to identify the
> device state machine.
> b: add enum of rte_eal_dev_event_type and struct of rte_eal_uevent.
> c: add below API in rte eal device common layer.
> rte_eal_dev_monitor_enable
> rte_dev_callback_register
> rte_dev_callback_unregister
> _rte_dev_callback_process
> rte_dev_monitor_start
> rte_dev_monitor_stop
>
> 2) About failure handler, use pci uio for example,
> add pci_remap_device in bus layer and below function to process it:
> rte_pci_remap_device
> pci_uio_remap_resource
> pci_map_private_resource
> add rte_pci_dev_bind_driver to bind pci device with explicit driver.
>
> Signed-off-by: Jeff Guo <jia.guo at intel.com>
> ---
> v7->v6:
> a.modify vdev part according to the vdev rework
> b.re-define and split the func into common and bus specific code
> c.fix some incorrect issue.
> b.fix the system hung after send packcet issue.
> ---
> drivers/bus/pci/bsd/pci.c | 30 ++
> drivers/bus/pci/linux/pci.c | 87 +++++
> drivers/bus/pci/linux/pci_init.h | 1 +
> drivers/bus/pci/pci_common.c | 43 +++
> drivers/bus/pci/pci_common_uio.c | 28 ++
> drivers/bus/pci/private.h | 12 +
> drivers/bus/pci/rte_bus_pci.h | 25 ++
> drivers/bus/vdev/vdev.c | 36 +++
> lib/librte_eal/bsdapp/eal/eal_dev.c | 64 ++++
> .../bsdapp/eal/include/exec-env/rte_dev.h | 106 ++++++
> lib/librte_eal/common/eal_common_bus.c | 30 ++
> lib/librte_eal/common/eal_common_dev.c | 169 ++++++++++
> lib/librte_eal/common/include/rte_bus.h | 69 ++++
> lib/librte_eal/common/include/rte_dev.h | 89 ++++++
> lib/librte_eal/linuxapp/eal/Makefile | 3 +-
> lib/librte_eal/linuxapp/eal/eal_alarm.c | 5 +
> lib/librte_eal/linuxapp/eal/eal_dev.c | 356
> +++++++++++++++++++++
> .../linuxapp/eal/include/exec-env/rte_dev.h | 106 ++++++
> lib/librte_eal/linuxapp/igb_uio/igb_uio.c | 6 +
> lib/librte_pci/rte_pci.c | 20 ++
> lib/librte_pci/rte_pci.h | 17 +
> 21 files changed, 1301 insertions(+), 1 deletion(-)
> create mode 100644 lib/librte_eal/bsdapp/eal/eal_dev.c
> create mode 100644 lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> create mode 100644 lib/librte_eal/linuxapp/eal/eal_dev.c
> create mode 100644 lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
>
> diff --git a/drivers/bus/pci/bsd/pci.c b/drivers/bus/pci/bsd/pci.c
> index b8e2178..d58dbf6 100644
> --- a/drivers/bus/pci/bsd/pci.c
> +++ b/drivers/bus/pci/bsd/pci.c
> @@ -126,6 +126,29 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
> }
> }
>
> +/* re-map pci device */
> +int
> +rte_pci_remap_device(struct rte_pci_device *dev)
> +{
> + int ret;
> +
> + if (dev == NULL)
> + return -EINVAL;
> +
> + switch (dev->kdrv) {
> + case RTE_KDRV_NIC_UIO:
> + ret = pci_uio_remap_resource(dev);
> + break;
> + default:
> + RTE_LOG(DEBUG, EAL,
> + " Not managed by a supported kernel driver,
> skipped\n");
> + ret = 1;
> + break;
> + }
> +
> + return ret;
> +}
> +
> void
> pci_uio_free_resource(struct rte_pci_device *dev,
> struct mapped_pci_resource *uio_res)
> @@ -678,3 +701,10 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p)
>
> return ret;
> }
> +
> +int
> +rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type)
> +{
> + return -1;
> +}
> +
> diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
> index 5da6728..792fd2c 100644
> --- a/drivers/bus/pci/linux/pci.c
> +++ b/drivers/bus/pci/linux/pci.c
> @@ -145,6 +145,38 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
> }
> }
>
> +/* Map pci device */
> +int
> +rte_pci_remap_device(struct rte_pci_device *dev)
> +{
> + int ret = -1;
> +
> + if (dev == NULL)
> + return -EINVAL;
> +
> + switch (dev->kdrv) {
> + case RTE_KDRV_VFIO:
> +#ifdef VFIO_PRESENT
> + /* no thing to do */
> +#endif
> + break;
> + case RTE_KDRV_IGB_UIO:
> + case RTE_KDRV_UIO_GENERIC:
> + if (rte_eal_using_phys_addrs()) {
> + /* map resources for devices that use uio */
> + ret = pci_uio_remap_resource(dev);
> + }
> + break;
> + default:
> + RTE_LOG(DEBUG, EAL,
> + " Not managed by a supported kernel driver,
> skipped\n");
> + ret = 1;
> + break;
> + }
> +
> + return ret;
> +}
> +
> void *
> pci_find_max_end_va(void)
> {
> @@ -386,6 +418,8 @@ pci_scan_one(const char *dirname, const struct
> rte_pci_addr *addr)
> rte_pci_add_device(dev);
> }
>
> + dev->device.state = DEVICE_PARSED;
> + TAILQ_INIT(&(dev->device.uev_cbs));
> return 0;
> }
>
> @@ -854,3 +888,56 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p)
>
> return ret;
> }
> +
> +int
> +rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type)
> +{
> + char drv_bind_path[1024];
> + char drv_override_path[1024]; /* contains the /dev/uioX */
> + int drv_override_fd;
> + int drv_bind_fd;
> +
> + RTE_SET_USED(drv_type);
> +
> + snprintf(drv_override_path, sizeof(drv_override_path),
> + "/sys/bus/pci/devices/%s/driver_override", dev_name);
> +
> + /* specify the driver for a device by writing to driver_override */
> + drv_override_fd = open(drv_override_path, O_WRONLY);
> + if (drv_override_fd < 0) {
> + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
> + drv_override_path, strerror(errno));
> + goto err;
> + }
> +
> + if (write(drv_override_fd, drv_type, sizeof(drv_type)) < 0) {
> + RTE_LOG(ERR, EAL,
> + "Error: bind failed - Cannot write "
> + "driver %s to device %s\n", drv_type, dev_name);
> + goto err;
> + }
> +
> + close(drv_override_fd);
> +
> + snprintf(drv_bind_path, sizeof(drv_bind_path),
> + "/sys/bus/pci/drivers/%s/bind", drv_type);
> +
> + /* do the bind by writing device to the specific driver */
> + drv_bind_fd = open(drv_bind_path, O_WRONLY | O_APPEND);
> + if (drv_bind_fd < 0) {
> + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
> + drv_bind_path, strerror(errno));
> + goto err;
> + }
> +
> + if (write(drv_bind_fd, dev_name, sizeof(dev_name)) < 0)
> + goto err;
> +
> + close(drv_bind_fd);
> + return 0;
> +err:
> + close(drv_override_fd);
> + close(drv_bind_fd);
> + return -1;
> +}
> +
> diff --git a/drivers/bus/pci/linux/pci_init.h b/drivers/bus/pci/linux/pci_init.h
> index f342c47..5838402 100644
> --- a/drivers/bus/pci/linux/pci_init.h
> +++ b/drivers/bus/pci/linux/pci_init.h
> @@ -58,6 +58,7 @@ int pci_uio_alloc_resource(struct rte_pci_device *dev,
> struct mapped_pci_resource **uio_res);
> void pci_uio_free_resource(struct rte_pci_device *dev,
> struct mapped_pci_resource *uio_res);
> +int pci_uio_remap_resource(struct rte_pci_device *dev);
> int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int
> res_idx,
> struct mapped_pci_resource *uio_res, int map_idx);
>
> diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
> index 104fdf9..5417b32 100644
> --- a/drivers/bus/pci/pci_common.c
> +++ b/drivers/bus/pci/pci_common.c
> @@ -282,6 +282,7 @@ pci_probe_all_drivers(struct rte_pci_device *dev)
> if (rc > 0)
> /* positive value means driver doesn't support it */
> continue;
> + dev->device.state = DEVICE_PROBED;
> return 0;
> }
> return 1;
> @@ -481,6 +482,7 @@ rte_pci_insert_device(struct rte_pci_device
> *exist_pci_dev,
> void
> rte_pci_remove_device(struct rte_pci_device *pci_dev)
> {
> + RTE_LOG(DEBUG, EAL, " rte_pci_remove_device for device list\n");
> TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next);
> }
>
> @@ -502,6 +504,44 @@ pci_find_device(const struct rte_device *start,
> rte_dev_cmp_t cmp,
> return NULL;
> }
>
> +static struct rte_device *
> +pci_find_device_by_name(const struct rte_device *start,
> + rte_dev_cmp_name_t cmp_name,
> + const void *data)
> +{
> + struct rte_pci_device *dev;
> +
> + FOREACH_DEVICE_ON_PCIBUS(dev) {
> + if (start && &dev->device == start) {
> + start = NULL; /* starting point found */
> + continue;
> + }
> + if (cmp_name(dev->device.name, data) == 0)
> + return &dev->device;
> + }
> +
> + return NULL;
> +}
> +
> +static int
> +pci_remap_device(struct rte_device *dev)
> +{
> + struct rte_pci_device *pdev;
> + int ret;
> +
> + if (dev == NULL)
> + return -EINVAL;
> +
> + pdev = RTE_DEV_TO_PCI(dev);
> +
> + /* remap resources for devices that use igb_uio */
> + ret = rte_pci_remap_device(pdev);
> + if (ret != 0)
> + RTE_LOG(ERR, EAL, "failed to remap device %s",
> + dev->name);
> + return ret;
> +}
> +
> static int
> pci_plug(struct rte_device *dev)
> {
> @@ -528,10 +568,13 @@ struct rte_pci_bus rte_pci_bus = {
> .scan = rte_pci_scan,
> .probe = rte_pci_probe,
> .find_device = pci_find_device,
> + .find_device_by_name = pci_find_device_by_name,
> .plug = pci_plug,
> .unplug = pci_unplug,
> .parse = pci_parse,
> .get_iommu_class = rte_pci_get_iommu_class,
> + .remap_device = pci_remap_device,
> + .bind_driver = rte_pci_dev_bind_driver,
> },
> .device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
> .driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
> diff --git a/drivers/bus/pci/pci_common_uio.c
> b/drivers/bus/pci/pci_common_uio.c
> index 0671131..8cb4009 100644
> --- a/drivers/bus/pci/pci_common_uio.c
> +++ b/drivers/bus/pci/pci_common_uio.c
> @@ -176,6 +176,34 @@ pci_uio_unmap(struct mapped_pci_resource
> *uio_res)
> }
> }
>
> +/* remap the PCI resource of a PCI device in private virtual memory */
> +int
> +pci_uio_remap_resource(struct rte_pci_device *dev)
> +{
> + int i;
> + uint64_t phaddr;
> + void *map_address;
> +
> + /* Map all BARs */
> + for (i = 0; i != PCI_MAX_RESOURCE; i++) {
> + /* skip empty BAR */
> + phaddr = dev->mem_resource[i].phys_addr;
> + if (phaddr == 0)
> + continue;
> + map_address = pci_map_private_resource(
> + dev->mem_resource[i].addr, 0,
> + (size_t)dev->mem_resource[i].len);
> + if (map_address == MAP_FAILED)
> + goto error;
> + memset(map_address, 0xFF, (size_t)dev-
> >mem_resource[i].len);
> + dev->mem_resource[i].addr = map_address;
> + }
> +
> + return 0;
> +error:
> + return -1;
> +}
> +
> static struct mapped_pci_resource *
> pci_uio_find_resource(struct rte_pci_device *dev)
> {
> diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
> index 2283f09..10baa1a 100644
> --- a/drivers/bus/pci/private.h
> +++ b/drivers/bus/pci/private.h
> @@ -202,6 +202,18 @@ void pci_uio_free_resource(struct rte_pci_device
> *dev,
> struct mapped_pci_resource *uio_res);
>
> /**
> + * remap the pci uio resource..
> + *
> + * @param dev
> + * Point to the struct rte pci device.
> + * @return
> + * - On success, zero.
> + * - On failure, a negative value.
> + */
> +int
> +pci_uio_remap_resource(struct rte_pci_device *dev);
> +
> +/**
> * Map device memory to uio resource
> *
> * This function is private to EAL.
> diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
> index d4a2996..1662f3b 100644
> --- a/drivers/bus/pci/rte_bus_pci.h
> +++ b/drivers/bus/pci/rte_bus_pci.h
> @@ -52,6 +52,8 @@ extern "C" {
> #include <sys/queue.h>
> #include <stdint.h>
> #include <inttypes.h>
> +#include <unistd.h>
> +#include <fcntl.h>
>
> #include <rte_debug.h>
> #include <rte_interrupts.h>
> @@ -197,6 +199,15 @@ int rte_pci_map_device(struct rte_pci_device *dev);
> void rte_pci_unmap_device(struct rte_pci_device *dev);
>
> /**
> + * Remap this device
> + *
> + * @param dev
> + * A pointer to a rte_pci_device structure describing the device
> + * to use
> + */
> +int rte_pci_remap_device(struct rte_pci_device *dev);
> +
> +/**
> * Dump the content of the PCI bus.
> *
> * @param f
> @@ -333,6 +344,20 @@ void rte_pci_ioport_read(struct rte_pci_ioport *p,
> void rte_pci_ioport_write(struct rte_pci_ioport *p,
> const void *data, size_t len, off_t offset);
>
> +/**
> + * It can be used to bind a device to a specific type of driver.
> + *
> + * @param dev_name
> + * The device name.
> + * @param drv_type
> + * The specific driver's type.
> + *
> + * @return
> + * - On success, zero.
> + * - On failure, a negative value.
> + */
> +int rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type);
> +
> #ifdef __cplusplus
> }
> #endif
> diff --git a/drivers/bus/vdev/vdev.c b/drivers/bus/vdev/vdev.c
> index fd7736d..773f6e0 100644
> --- a/drivers/bus/vdev/vdev.c
> +++ b/drivers/bus/vdev/vdev.c
> @@ -323,6 +323,39 @@ vdev_find_device(const struct rte_device *start,
> rte_dev_cmp_t cmp,
> return NULL;
> }
>
> +static struct rte_device *
> +vdev_find_device_by_name(const struct rte_device *start,
> + rte_dev_cmp_name_t cmp_name,
> + const void *data)
> +{
> + struct rte_vdev_device *dev;
> +
> + TAILQ_FOREACH(dev, &vdev_device_list, next) {
> + if (start && &dev->device == start) {
> + start = NULL;
> + continue;
> + }
> + if (cmp_name(dev->device.name, data) == 0)
> + return &dev->device;
> + }
> + return NULL;
> +}
> +
> +static int
> +vdev_remap_device(struct rte_device *dev)
> +{
> + RTE_SET_USED(dev);
> + return 0;
> +}
> +
> +static int
> +vdev_bind_driver(const char *dev_name, const char *drv_type)
> +{
> + RTE_SET_USED(dev_name);
> + RTE_SET_USED(drv_type);
> + return 0;
> +}
> +
> static int
> vdev_plug(struct rte_device *dev)
> {
> @@ -339,9 +372,12 @@ static struct rte_bus rte_vdev_bus = {
> .scan = vdev_scan,
> .probe = vdev_probe,
> .find_device = vdev_find_device,
> + .find_device_by_name = vdev_find_device_by_name,
> .plug = vdev_plug,
> .unplug = vdev_unplug,
> .parse = vdev_parse,
> + .remap_device = vdev_remap_device,
> + .bind_driver = vdev_bind_driver,
> };
>
> RTE_REGISTER_BUS(vdev, rte_vdev_bus);
> diff --git a/lib/librte_eal/bsdapp/eal/eal_dev.c
> b/lib/librte_eal/bsdapp/eal/eal_dev.c
> new file mode 100644
> index 0000000..6ea9a74
> --- /dev/null
> +++ b/lib/librte_eal/bsdapp/eal/eal_dev.c
> @@ -0,0 +1,64 @@
> +/*-
> + * Copyright(c) 2010-2017 Intel Corporation.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <sys/queue.h>
> +#include <sys/signalfd.h>
> +#include <sys/ioctl.h>
> +#include <sys/socket.h>
> +#include <linux/netlink.h>
> +#include <sys/epoll.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +
> +#include <rte_malloc.h>
> +#include <rte_bus.h>
> +#include <rte_dev.h>
> +#include <rte_devargs.h>
> +#include <rte_debug.h>
> +#include <rte_log.h>
> +
> +#include "eal_thread.h"
> +
> +int
> +rte_dev_monitor_start(void)
> +{
> + return -1;
> +}
> +
> +int
> +rte_dev_monitor_stop(void)
> +{
> + return -1;
> +}
> diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> new file mode 100644
> index 0000000..6a6feb5
> --- /dev/null
> +++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> @@ -0,0 +1,106 @@
> +/*-
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef _RTE_DEV_H_
> +#error "don't include this file directly, please include generic <rte_dev.h>"
> +#endif
> +
> +#ifndef _RTE_LINUXAPP_DEV_H_
> +#define _RTE_LINUXAPP_DEV_H_
> +
> +#include <stdio.h>
> +
> +#include <rte_dev.h>
> +
> +#define RTE_EAL_UEV_MSG_LEN 4096
> +#define RTE_EAL_UEV_MSG_ELEM_LEN 128
> +
> +enum uev_subsystem {
> + UEV_SUBSYSTEM_UIO,
> + UEV_SUBSYSTEM_VFIO,
> + UEV_SUBSYSTEM_PCI,
> + UEV_SUBSYSTEM_MAX
> +};
> +
> +enum uev_monitor_netlink_group {
> + UEV_MONITOR_KERNEL,
> + UEV_MONITOR_UDEV,
> +};
> +
> +/**
> + * The device event type.
> + */
> +enum rte_eal_dev_event_type {
> + RTE_EAL_DEV_EVENT_UNKNOWN, /**< unknown event type */
> + RTE_EAL_DEV_EVENT_ADD, /**< device adding event */
> + RTE_EAL_DEV_EVENT_REMOVE,
> + /**< device removing event */
> + RTE_EAL_DEV_EVENT_CHANGE,
> + /**< device status change event */
> + RTE_EAL_DEV_EVENT_MOVE, /**< device sys path move
> event */
> + RTE_EAL_DEV_EVENT_ONLINE, /**< device online event */
> + RTE_EAL_DEV_EVENT_OFFLINE, /**< device offline event */
> + RTE_EAL_DEV_EVENT_MAX /**< max value of this enum
> */
> +};
> +
> +struct rte_eal_uevent {
> + enum rte_eal_dev_event_type type; /**< device event type */
> + int subsystem; /**< subsystem id */
> + char *devname; /**< device name */
> + enum uev_monitor_netlink_group group; /**< device netlink
> group */
> +};
> +
> +/**
> + * Start the device uevent monitoring.
> + *
> + * @param none
> + * @return
> + * - On success, zero.
> + * - On failure, a negative value.
> + */
> +int
> +rte_dev_monitor_start(void);
> +
> +/**
> + * Stop the device uevent monitoring .
> + *
> + * @param none
> + * @return
> + * - On success, zero.
> + * - On failure, a negative value.
> + */
> +
> +int
> +rte_dev_monitor_stop(void);
> +
> +#endif /* _RTE_LINUXAPP_DEV_H_ */
> diff --git a/lib/librte_eal/common/eal_common_bus.c
> b/lib/librte_eal/common/eal_common_bus.c
> index 3e022d5..b7219c9 100644
> --- a/lib/librte_eal/common/eal_common_bus.c
> +++ b/lib/librte_eal/common/eal_common_bus.c
> @@ -51,8 +51,11 @@ rte_bus_register(struct rte_bus *bus)
> RTE_VERIFY(bus->scan);
> RTE_VERIFY(bus->probe);
> RTE_VERIFY(bus->find_device);
> + RTE_VERIFY(bus->find_device_by_name);
> /* Buses supporting driver plug also require unplug. */
> RTE_VERIFY(!bus->plug || bus->unplug);
> + RTE_VERIFY(bus->remap_device);
> + RTE_VERIFY(bus->bind_driver);
>
> TAILQ_INSERT_TAIL(&rte_bus_list, bus, next);
> RTE_LOG(DEBUG, EAL, "Registered [%s] bus.\n", bus->name);
> @@ -170,6 +173,14 @@ cmp_rte_device(const struct rte_device *dev1,
> const void *_dev2)
> }
>
> static int
> +cmp_rte_device_name(const char *dev_name1, const void *_dev_name2)
> +{
> + const char *dev_name2 = _dev_name2;
> +
> + return strcmp(dev_name1, dev_name2);
> +}
> +
> +static int
> bus_find_device(const struct rte_bus *bus, const void *_dev)
> {
> struct rte_device *dev;
> @@ -178,6 +189,25 @@ bus_find_device(const struct rte_bus *bus, const
> void *_dev)
> return dev == NULL;
> }
>
> +static struct rte_device *
> +bus_find_device_by_name(const struct rte_bus *bus, const void
> *_dev_name)
> +{
> + struct rte_device *dev;
> +
> + dev = bus->find_device_by_name(NULL, cmp_rte_device_name,
> _dev_name);
> + return dev;
> +}
> +
> +struct rte_device *
> +
> +rte_bus_find_device(const struct rte_bus *bus, const void *_dev_name)
> +{
> + struct rte_device *dev;
> +
> + dev = bus_find_device_by_name(bus, _dev_name);
> + return dev;
> +}
> +
> struct rte_bus *
> rte_bus_find_by_device(const struct rte_device *dev)
> {
> diff --git a/lib/librte_eal/common/eal_common_dev.c
> b/lib/librte_eal/common/eal_common_dev.c
> index dda8f58..47909e8 100644
> --- a/lib/librte_eal/common/eal_common_dev.c
> +++ b/lib/librte_eal/common/eal_common_dev.c
> @@ -42,9 +42,31 @@
> #include <rte_devargs.h>
> #include <rte_debug.h>
> #include <rte_log.h>
> +#include <rte_spinlock.h>
> +#include <rte_malloc.h>
>
> #include "eal_private.h"
>
> +/* spinlock for device callbacks */
> +static rte_spinlock_t rte_dev_cb_lock = RTE_SPINLOCK_INITIALIZER;
> +
> +/**
> + * The user application callback description.
> + *
> + * It contains callback address to be registered by user application,
> + * the pointer to the parameters for callback, and the event type.
> + */
> +struct rte_eal_dev_callback {
> + TAILQ_ENTRY(rte_eal_dev_callback) next; /**< Callbacks list */
> + rte_eal_dev_cb_fn cb_fn; /**< Callback address */
> + void *cb_arg; /**< Parameter for callback */
> + void *ret_param; /**< Return parameter */
> + enum rte_eal_dev_event_type event; /**< device event type */
> + uint32_t active; /**< Callback is executing */
> +};
> +
> +static struct rte_eal_dev_callback *dev_add_cb;
> +
> static int cmp_detached_dev_name(const struct rte_device *dev,
> const void *_name)
> {
> @@ -234,3 +256,150 @@ int rte_eal_hotplug_remove(const char *busname,
> const char *devname)
> rte_eal_devargs_remove(busname, devname);
> return ret;
> }
> +
> +int
> +rte_eal_dev_monitor_enable(void)
> +{
> + int ret;
> +
> + ret = rte_dev_monitor_start();
> + if (ret)
> + RTE_LOG(ERR, EAL, "Can not init device monitor\n");
> + return ret;
> +}
> +
> +int
> +rte_dev_callback_register(struct rte_device *device,
> + enum rte_eal_dev_event_type event,
> + rte_eal_dev_cb_fn cb_fn, void *cb_arg)
> +{
> + struct rte_eal_dev_callback *user_cb;
> +
> + if (!cb_fn)
> + return -EINVAL;
> +
What about checking that the device pointer is not NULL?
> + rte_spinlock_lock(&rte_dev_cb_lock);
> +
> + if (TAILQ_EMPTY(&(device->uev_cbs)))
> + TAILQ_INIT(&(device->uev_cbs));
> +
> + if (event == RTE_EAL_DEV_EVENT_ADD) {
> + user_cb = NULL;
> + } else {
> + TAILQ_FOREACH(user_cb, &(device->uev_cbs), next) {
> + if (user_cb->cb_fn == cb_fn &&
> + user_cb->cb_arg == cb_arg &&
> + user_cb->event == event) {
> + break;
> + }
> + }
> + }
> +
> + /* create a new callback. */
> + if (user_cb == NULL) {
> + /* allocate a new interrupt callback entity */
> + user_cb = rte_zmalloc("eal device event",
> + sizeof(*user_cb), 0);
> + if (user_cb == NULL) {
> + RTE_LOG(ERR, EAL, "Can not allocate memory\n");
rte_spinlock_unlock() is missing before this return.
> + return -ENOMEM;
> + }
> + user_cb->cb_fn = cb_fn;
> + user_cb->cb_arg = cb_arg;
> + user_cb->event = event;
> + if (event == RTE_EAL_DEV_EVENT_ADD)
> + dev_add_cb = user_cb;
Can only one DPDK entity register an ADD callback?
I suggest adding an option to register for all devices, perhaps by using a dummy device that holds all the "ALL_DEVICES" callbacks per event.
"All" means past, present, and future devices; this way one callback can be called for all devices, and more than one DPDK entity can register for an ADD/NEW event.
What about NEW instead of ADD?
I also suggest adding the device pointer as a parameter to the callback (which will be managed by EAL).
> + else
> + TAILQ_INSERT_TAIL(&(device->uev_cbs), user_cb,
> next);
> + }
> +
> + rte_spinlock_unlock(&rte_dev_cb_lock);
> + return 0;
> +}
> +
> +int
> +rte_dev_callback_unregister(struct rte_device *device,
> + enum rte_eal_dev_event_type event,
> + rte_eal_dev_cb_fn cb_fn, void *cb_arg)
> +{
> + int ret;
> + struct rte_eal_dev_callback *cb, *next;
> +
> + if (!cb_fn)
> + return -EINVAL;
> +
> + rte_spinlock_lock(&rte_dev_cb_lock);
> +
> + ret = 0;
> + if (event == RTE_EAL_DEV_EVENT_ADD) {
> + rte_free(dev_add_cb);
> + dev_add_cb = NULL;
> + } else {
Should the device pointer be checked for NULL here?
> + for (cb = TAILQ_FIRST(&(device->uev_cbs)); cb != NULL;
> + cb = next) {
> +
> + next = TAILQ_NEXT(cb, next);
> +
> + if (cb->cb_fn != cb_fn || cb->event != event ||
> + (cb->cb_arg != (void *)-1 &&
> + cb->cb_arg != cb_arg))
> + continue;
> +
> + /*
> + * if this callback is not executing right now,
> + * then remove it.
> + */
> + if (cb->active == 0) {
> + TAILQ_REMOVE(&(device->uev_cbs), cb,
> next);
> + rte_free(cb);
> + } else {
> + ret = -EAGAIN;
> + }
> + }
> + }
> + rte_spinlock_unlock(&rte_dev_cb_lock);
> + return ret;
> +}
> +
> +int
> +_rte_dev_callback_process(struct rte_device *device,
> + enum rte_eal_dev_event_type event,
> + void *cb_arg, void *ret_param)
> +{
> + struct rte_eal_dev_callback dev_cb;
> + struct rte_eal_dev_callback *cb_lst;
> + int rc = 0;
> +
> + rte_spinlock_lock(&rte_dev_cb_lock);
> + if (event == RTE_EAL_DEV_EVENT_ADD) {
> + if (cb_arg != NULL)
> + dev_add_cb->cb_arg = cb_arg;
> +
> + if (ret_param != NULL)
> + dev_add_cb->ret_param = ret_param;
> +
> + rte_spinlock_unlock(&rte_dev_cb_lock);
Can't someone free it while it is running?
I suggest keeping the lock held.
To prevent deadlock, callbacks would then not be allowed to use this mechanism.
> + rc = dev_add_cb->cb_fn(dev_add_cb->event,
> + dev_add_cb->cb_arg, dev_add_cb-
> >ret_param);
> + rte_spinlock_lock(&rte_dev_cb_lock);
> + } else {
> + TAILQ_FOREACH(cb_lst, &(device->uev_cbs), next) {
> + if (cb_lst->cb_fn == NULL || cb_lst->event != event)
> + continue;
> + dev_cb = *cb_lst;
> + cb_lst->active = 1;
> + if (cb_arg != NULL)
> + dev_cb.cb_arg = cb_arg;
> + if (ret_param != NULL)
> + dev_cb.ret_param = ret_param;
> +
> + rte_spinlock_unlock(&rte_dev_cb_lock);
The current active flag does not make this thread-safe; I suggest keeping the lock held.
Scenario:
1. Thread A sees active = 0 in the unregister function.
2. Context switch.
3. Thread B starts the callback.
4. Context switch.
5. Thread A frees it.
6. Context switch.
7. Segmentation fault in Thread B.
> + rc = dev_cb.cb_fn(dev_cb.event,
> + dev_cb.cb_arg, dev_cb.ret_param);
> + rte_spinlock_lock(&rte_dev_cb_lock);
> + cb_lst->active = 0;
> + }
> + }
> + rte_spinlock_unlock(&rte_dev_cb_lock);
> + return rc;
> +}
> diff --git a/lib/librte_eal/common/include/rte_bus.h
> b/lib/librte_eal/common/include/rte_bus.h
> index 6fb0834..6c4ae31 100644
> --- a/lib/librte_eal/common/include/rte_bus.h
> +++ b/lib/librte_eal/common/include/rte_bus.h
> @@ -122,6 +122,34 @@ typedef struct rte_device *
> const void *data);
>
> /**
> + * Device iterator to find a device on a bus.
> + *
> + * This function returns an rte_device if one of those held by the bus
> + * matches the data passed as parameter.
> + *
> + * If the comparison function returns zero this function should stop iterating
> + * over any more devices. To continue a search the device of a previous
> search
> + * can be passed via the start parameter.
> + *
> + * @param cmp
> + * the device name comparison function.
> + *
> + * @param data
> + * Data to compare each device against.
> + *
> + * @param start
> + * starting point for the iteration
> + *
> + * @return
> + * The first device matching the data, NULL if none exists.
> + */
> +typedef struct rte_device *
> +(*rte_bus_find_device_by_name_t)(const struct rte_device *start,
> + rte_dev_cmp_name_t cmp,
> + const void *data);
> +
> +
> +/**
> * Implementation specific probe function which is responsible for linking
> * devices on that bus with applicable drivers.
> *
> @@ -168,6 +196,37 @@ typedef int (*rte_bus_unplug_t)(struct rte_device
> *dev);
> typedef int (*rte_bus_parse_t)(const char *name, void *addr);
>
> /**
> + * Implementation specific remap function which is responsible for
> remmaping
> + * devices on that bus from original share memory resource to a private
> memory
> + * resource for the sake of device has been removal.
> + *
> + * @param dev
> + * Device pointer that was returned by a previous call to find_device.
> + *
> + * @return
> + * 0 on success.
> + * !0 on error.
> + */
> +typedef int (*rte_bus_remap_device_t)(struct rte_device *dev);
> +
> +/**
> + * Implementation specific bind driver function which is responsible for bind
> + * a explicit type of driver with a devices on that bus.
> + *
> + * @param dev_name
> + * device textual description.
> + *
> + * @param drv_type
> + * driver type textual description.
> + *
> + * @return
> + * 0 on success.
> + * !0 on error.
> + */
> +typedef int (*rte_bus_bind_driver_t)(const char *dev_name,
> + const char *drv_type);
> +
> +/**
> * Bus scan policies
> */
> enum rte_bus_scan_mode {
> @@ -206,9 +265,13 @@ struct rte_bus {
> rte_bus_scan_t scan; /**< Scan for devices attached to bus */
> rte_bus_probe_t probe; /**< Probe devices on bus */
> rte_bus_find_device_t find_device; /**< Find a device on the bus */
> + rte_bus_find_device_by_name_t find_device_by_name;
> + /**< Find a device on the bus */
> rte_bus_plug_t plug; /**< Probe single device for drivers */
> rte_bus_unplug_t unplug; /**< Remove single device from driver
> */
> rte_bus_parse_t parse; /**< Parse a device name */
> + rte_bus_remap_device_t remap_device; /**< remap a device */
> + rte_bus_bind_driver_t bind_driver; /**< bind a driver for bus device
> */
> struct rte_bus_conf conf; /**< Bus configuration */
> rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu
> class */
> };
> @@ -306,6 +369,12 @@ struct rte_bus *rte_bus_find(const struct rte_bus
> *start, rte_bus_cmp_t cmp,
> struct rte_bus *rte_bus_find_by_device(const struct rte_device *dev);
>
> /**
> + * Find the registered bus for a particular device.
> + */
> +struct rte_device *rte_bus_find_device(const struct rte_bus *bus,
> + const void *dev_name);
> +
> +/**
> * Find the registered bus for a given name.
> */
> struct rte_bus *rte_bus_find_by_name(const char *busname);
> diff --git a/lib/librte_eal/common/include/rte_dev.h
> b/lib/librte_eal/common/include/rte_dev.h
> index 9342e0c..19971d0 100644
> --- a/lib/librte_eal/common/include/rte_dev.h
> +++ b/lib/librte_eal/common/include/rte_dev.h
> @@ -51,6 +51,15 @@ extern "C" {
>
> #include <rte_log.h>
>
> +#include <exec-env/rte_dev.h>
> +
> +typedef int (*rte_eal_dev_cb_fn)(enum rte_eal_dev_event_type event,
> + void *cb_arg, void *ret_param);
> +
> +struct rte_eal_dev_callback;
> +/** @internal Structure to keep track of registered callbacks */
> +TAILQ_HEAD(rte_eal_dev_cb_list, rte_eal_dev_callback);
> +
> __attribute__((format(printf, 2, 0)))
> static inline void
> rte_pmd_debug_trace(const char *func_name, const char *fmt, ...)
> @@ -157,6 +166,13 @@ struct rte_driver {
> */
> #define RTE_DEV_NAME_MAX_LEN 64
>
> +enum device_state {
> + DEVICE_UNDEFINED,
> + DEVICE_FAULT,
> + DEVICE_PARSED,
> + DEVICE_PROBED,
> +};
> +
> /**
> * A structure describing a generic device.
> */
> @@ -166,6 +182,9 @@ struct rte_device {
> const struct rte_driver *driver;/**< Associated driver */
> int numa_node; /**< NUMA node connection */
> struct rte_devargs *devargs; /**< Device user arguments */
> + enum device_state state; /**< Device state */
> + /** User application callbacks for device event */
> + struct rte_eal_dev_cb_list uev_cbs;
> };
>
> /**
> @@ -248,6 +267,8 @@ int rte_eal_hotplug_remove(const char *busname,
> const char *devname);
> */
> typedef int (*rte_dev_cmp_t)(const struct rte_device *dev, const void
> *data);
>
> +typedef int (*rte_dev_cmp_name_t)(const char *dev_name, const void
> *data);
> +
> #define RTE_PMD_EXPORT_NAME_ARRAY(n, idx) n##idx[]
>
> #define RTE_PMD_EXPORT_NAME(name, idx) \
> @@ -293,4 +314,72 @@ __attribute__((used)) = str
> }
> #endif
>
> +/**
> + * It enable the device event monitoring for a specific event.
> + *
> + * @param none
> + * @return
> + * - On success, zero.
> + * - On failure, a negative value.
> + */
> +int
> +rte_eal_dev_monitor_enable(void);
> +/**
> + * It registers the callback for the specific event. Multiple
> + * callbacks cal be registered at the same time.
> + * @param event
> + * The device event type.
> + * @param cb_fn
> + * callback address.
> + * @param cb_arg
> + * address of parameter for callback.
> + *
> + * @return
> + * - On success, zero.
> + * - On failure, a negative value.
> + */
> +int rte_dev_callback_register(struct rte_device *device,
> + enum rte_eal_dev_event_type event,
> + rte_eal_dev_cb_fn cb_fn, void *cb_arg);
> +
> +/**
> + * It unregisters the callback according to the specified event.
> + *
> + * @param event
> + * The event type which corresponding to the callback.
> + * @param cb_fn
> + * callback address.
> + * address of parameter for callback, (void *)-1 means to remove all
> + * registered which has the same callback address.
> + *
> + * @return
> + * - On success, return the number of callback entities removed.
> + * - On failure, a negative value.
> + */
> +int rte_dev_callback_unregister(struct rte_device *device,
> + enum rte_eal_dev_event_type event,
> + rte_eal_dev_cb_fn cb_fn, void *cb_arg);
> +
> +/**
> + * @internal Executes all the user application registered callbacks for
> + * the specific device. It is for DPDK internal user only. User
> + * application should not call it directly.
> + *
> + * @param event
> + * The device event type.
> + * @param cb_arg
> + * callback parameter.
> + * @param ret_param
> + * To pass data back to user application.
> + * This allows the user application to decide if a particular function
> + * is permitted or not.
> + *
> + * @return
> + * - On success, return zero.
> + * - On failure, a negative value.
> + */
> +int
> +_rte_dev_callback_process(struct rte_device *device,
> + enum rte_eal_dev_event_type event,
> + void *cb_arg, void *ret_param);
> #endif /* _RTE_DEV_H_ */
> diff --git a/lib/librte_eal/linuxapp/eal/Makefile
> b/lib/librte_eal/linuxapp/eal/Makefile
> index 5a7b8b2..05a2437 100644
> --- a/lib/librte_eal/linuxapp/eal/Makefile
> +++ b/lib/librte_eal/linuxapp/eal/Makefile
> @@ -67,6 +67,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) +=
> eal_lcore.c
> SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
> SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c
> SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c
> +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c
>
> # from common dir
> SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c
> @@ -120,7 +121,7 @@ ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
> CFLAGS_eal_thread.o += -Wno-return-type
> endif
>
> -INC := rte_kni_common.h
> +INC := rte_kni_common.h rte_dev.h
>
> SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \
> $(addprefix include/exec-env/,$(INC))
> diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c
> b/lib/librte_eal/linuxapp/eal/eal_alarm.c
> index 8e4a775..29e73a7 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_alarm.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_alarm.c
> @@ -209,6 +209,7 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn,
> void *cb_arg)
> int count = 0;
> int err = 0;
> int executing;
> + int ret;
>
> if (!cb_fn) {
> rte_errno = EINVAL;
> @@ -259,6 +260,10 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn,
> void *cb_arg)
> }
> ap_prev = ap;
> }
> +
> + ret |= rte_intr_callback_unregister(&intr_handle,
> + eal_alarm_callback, NULL);
> +
> rte_spinlock_unlock(&alarm_list_lk);
> } while (executing != 0);
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c
> b/lib/librte_eal/linuxapp/eal/eal_dev.c
> new file mode 100644
> index 0000000..49fd0dc
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
> @@ -0,0 +1,356 @@
> +/*-
> + * Copyright(c) 2010-2017 Intel Corporation.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <sys/queue.h>
> +#include <sys/signalfd.h>
> +#include <sys/ioctl.h>
> +#include <sys/socket.h>
> +#include <linux/netlink.h>
> +#include <sys/epoll.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +
> +#include <rte_malloc.h>
> +#include <rte_bus.h>
> +#include <rte_dev.h>
> +#include <rte_devargs.h>
> +#include <rte_debug.h>
> +#include <rte_log.h>
> +
> +#include "eal_thread.h"
> +
> +/* uev monitoring thread */
> +static pthread_t uev_monitor_thread;
> +
> +bool udev_exit = true;
> +
> +bool no_request_thread = true;
> +
> +static void sig_handler(int signum)
> +{
> + if (signum == SIGINT || signum == SIGTERM)
> + rte_dev_monitor_stop();
> +}
> +
> +static int
> +dev_monitor_fd_new(void)
> +{
> +
> + int uevent_fd;
> +
> + uevent_fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC |
> + SOCK_NONBLOCK,
> + NETLINK_KOBJECT_UEVENT);
> + if (uevent_fd < 0) {
> + RTE_LOG(ERR, EAL, "create uevent fd failed\n");
> + return -1;
> + }
> + return uevent_fd;
> +}
> +
> +static int
> +dev_monitor_enable(int netlink_fd)
> +{
> + struct sockaddr_nl addr;
> + int ret;
> + int size = 64 * 1024;
> + int nonblock = 1;
> +
> + memset(&addr, 0, sizeof(addr));
> + addr.nl_family = AF_NETLINK;
> + addr.nl_pid = 0;
> + addr.nl_groups = 0xffffffff;
> +
> + if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
> + RTE_LOG(ERR, EAL, "bind failed\n");
> + goto err;
> + }
> +
> + setsockopt(netlink_fd, SOL_SOCKET, SO_PASSCRED, &size,
> sizeof(size));
> +
> + ret = ioctl(netlink_fd, FIONBIO, &nonblock);
> + if (ret != 0) {
> + RTE_LOG(ERR, EAL, "ioctl(FIONBIO) failed\n");
> + goto err;
> + }
> + return 0;
> +err:
> + close(netlink_fd);
> + return -1;
> +}
> +
> +static void
> +dev_uev_parse(const char *buf, struct rte_eal_uevent *event)
> +{
> + char action[RTE_EAL_UEV_MSG_ELEM_LEN];
> + char subsystem[RTE_EAL_UEV_MSG_ELEM_LEN];
> + char dev_path[RTE_EAL_UEV_MSG_ELEM_LEN];
> + char pci_slot_name[RTE_EAL_UEV_MSG_ELEM_LEN];
> + int i = 0;
> +
> + memset(action, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> + memset(subsystem, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> + memset(dev_path, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> + memset(pci_slot_name, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> +
> + while (i < RTE_EAL_UEV_MSG_LEN) {
> + for (; i < RTE_EAL_UEV_MSG_LEN; i++) {
> + if (*buf)
> + break;
> + buf++;
> + }
> + if (!strncmp(buf, "libudev", 7)) {
> + buf += 7;
> + i += 7;
> + event->group = UEV_MONITOR_UDEV;
> + }
> + if (!strncmp(buf, "ACTION=", 7)) {
> + buf += 7;
> + i += 7;
> + snprintf(action, sizeof(action), "%s", buf);
> + } else if (!strncmp(buf, "DEVPATH=", 8)) {
> + buf += 8;
> + i += 8;
> + snprintf(dev_path, sizeof(dev_path), "%s", buf);
> + } else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
> + buf += 10;
> + i += 10;
> + snprintf(subsystem, sizeof(subsystem), "%s", buf);
> + } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
> + buf += 14;
> + i += 14;
> + snprintf(pci_slot_name, sizeof(subsystem), "%s",
> buf);
> + event->devname = pci_slot_name;
> + }
> + for (; i < RTE_EAL_UEV_MSG_LEN; i++) {
> + if (*buf == '\0')
> + break;
> + buf++;
> + }
> + }
> +
> + if (!strncmp(subsystem, "pci", 3))
> + event->subsystem = UEV_SUBSYSTEM_PCI;
> + if (!strncmp(action, "add", 3))
> + event->type = RTE_EAL_DEV_EVENT_ADD;
> + if (!strncmp(action, "remove", 6))
> + event->type = RTE_EAL_DEV_EVENT_REMOVE;
> + event->devname = pci_slot_name;
> +}
> +
> +static int
> +dev_uev_receive(int fd, struct rte_eal_uevent *uevent)
> +{
> + int ret;
> + char buf[RTE_EAL_UEV_MSG_LEN];
> +
> + memset(uevent, 0, sizeof(struct rte_eal_uevent));
> + memset(buf, 0, RTE_EAL_UEV_MSG_LEN);
> +
> + ret = recv(fd, buf, RTE_EAL_UEV_MSG_LEN - 1, MSG_DONTWAIT);
> + if (ret < 0) {
> + RTE_LOG(ERR, EAL,
> + "Socket read error(%d): %s\n",
> + errno, strerror(errno));
> + return -1;
> + } else if (ret == 0)
> + /* connection closed */
> + return -1;
> +
> + dev_uev_parse(buf, uevent);
> +
> + return 0;
> +}
> +
> +static int
> +dev_uev_process(struct epoll_event *events, int nfds)
> +{
> + struct rte_bus *bus;
> + struct rte_device *dev;
> + struct rte_eal_uevent uevent;
> + int ret;
> + int i;
> +
> + for (i = 0; i < nfds; i++) {
> + /**
> + * check device uevent from kernel side, no need to check
> + * uevent from udev.
> + */
> + if ((dev_uev_receive(events[i].data.fd, &uevent)) ||
> + (uevent.group == UEV_MONITOR_UDEV))
> + return 0;
> +
> + /* default handle all pci devcie when is being hot plug */
> + if (uevent.subsystem == UEV_SUBSYSTEM_PCI) {
> + bus = rte_bus_find_by_name("pci");
> + dev = rte_bus_find_device(bus, uevent.devname);
> + if (uevent.type == RTE_EAL_DEV_EVENT_REMOVE) {
> +
> + if ((!dev) || dev->state ==
> DEVICE_UNDEFINED)
> + return 0;
> + dev->state = DEVICE_FAULT;
> +
> + /**
> + * remap the resource to be fake
> + * before user's removal processing
> + */
> + ret = bus->remap_device(dev);
> + if (!ret)
> +
> return(_rte_dev_callback_process(dev,
> + RTE_EAL_DEV_EVENT_REMOVE,
> + NULL, NULL));
What is the reason for keeping this device in the EAL device list after the removal?
I suggest removing it (driver remove, bus remove and EAL remove) after the callbacks have run.
This way, EAL can initiate all device removals.
> + } else if (uevent.type == RTE_EAL_DEV_EVENT_ADD)
> {
> + if (dev == NULL) {
> + /**
> + * bind the driver to the device
> + * before user's add processing
> + */
> + bus->bind_driver(
> + uevent.devname,
> + "igb_uio");
> +
A similar comment applies here:
EAL can initiate all device probe operations by adding the device and probing it here, before the callbacks run.
Then the device pointer can also be passed to the callbacks.
> return(_rte_dev_callback_process(NULL,
> + RTE_EAL_DEV_EVENT_ADD,
> + uevent.devname, NULL));
> + }
> + }
> + }
> + }
> + return 0;
> +}
> +
> +/**
> + * It builds/rebuilds up the epoll file descriptor with all the
> + * file descriptors being waited on. Then handles the interrupts.
> + *
> + * @param arg
> + * pointer. (unused)
> + *
> + * @return
> + * never return;
> + */
> +static __attribute__((noreturn)) void *
> +dev_uev_monitoring(__rte_unused void *arg)
> +{
> + struct sigaction act;
> + sigset_t mask;
> + int netlink_fd;
> + struct epoll_event ep_kernel;
> + int fd_ep;
> +
> + udev_exit = false;
> +
> + /* set signal handlers */
> + memset(&act, 0x00, sizeof(struct sigaction));
> + act.sa_handler = sig_handler;
> + sigemptyset(&act.sa_mask);
> + act.sa_flags = SA_RESTART;
> + sigaction(SIGINT, &act, NULL);
> + sigaction(SIGTERM, &act, NULL);
> + sigemptyset(&mask);
> + sigaddset(&mask, SIGINT);
> + sigaddset(&mask, SIGTERM);
> + sigprocmask(SIG_UNBLOCK, &mask, NULL);
> +
> + fd_ep = epoll_create1(EPOLL_CLOEXEC);
> + if (fd_ep < 0) {
> + RTE_LOG(ERR, EAL, "error creating epoll fd: %m\n");
> + goto out;
> + }
> +
> + netlink_fd = dev_monitor_fd_new();
> +
> + if (dev_monitor_enable(netlink_fd) < 0) {
> + RTE_LOG(ERR, EAL, "error subscribing to kernel events\n");
> + goto out;
> + }
> +
> + memset(&ep_kernel, 0, sizeof(struct epoll_event));
> + ep_kernel.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
> + ep_kernel.data.fd = netlink_fd;
> + if (epoll_ctl(fd_ep, EPOLL_CTL_ADD, netlink_fd,
> + &ep_kernel) < 0) {
> + RTE_LOG(ERR, EAL, "error addding fd to epoll: %m\n");
> + goto out;
> + }
> +
> + while (!udev_exit) {
> + int fdcount;
> + struct epoll_event ev[1];
> +
> + fdcount = epoll_wait(fd_ep, ev, 1, -1);
> + if (fdcount < 0) {
> + if (errno != EINTR)
> + RTE_LOG(ERR, EAL, "error receiving uevent "
> + "message: %m\n");
> + continue;
> + }
> +
> + /* epoll_wait has at least one fd ready to read */
> + if (dev_uev_process(ev, fdcount) < 0) {
> + if (errno != EINTR)
> + RTE_LOG(ERR, EAL, "error processing uevent
> "
> + "message: %m\n");
> + }
> + }
> +out:
> + if (fd_ep >= 0)
> + close(fd_ep);
> + if (netlink_fd >= 0)
> + close(netlink_fd);
> + rte_panic("uev monitoring fail\n");
> +}
> +
> +int
> +rte_dev_monitor_start(void)
> +{
Maybe also add an option to start it via a new EAL command-line parameter?
> + int ret;
> +
> + if (!no_request_thread)
> + return 0;
> + no_request_thread = false;
> +
> + /* create the host thread to wait/handle the uevent from kernel */
> + ret = pthread_create(&uev_monitor_thread, NULL,
> + dev_uev_monitoring, NULL);
What is the reason for creating a new thread for hotplug?
Why not use the existing DPDK host thread via the alarm mechanism?
> + return ret;
> +}
> +
> +int
> +rte_dev_monitor_stop(void)
> +{
> + udev_exit = true;
> + no_request_thread = true;
> + return 0;
> +}
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> new file mode 100644
> index 0000000..6a6feb5
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> @@ -0,0 +1,106 @@
> +/*-
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef _RTE_DEV_H_
> +#error "don't include this file directly, please include generic <rte_dev.h>"
> +#endif
> +
> +#ifndef _RTE_LINUXAPP_DEV_H_
> +#define _RTE_LINUXAPP_DEV_H_
> +
> +#include <stdio.h>
> +
> +#include <rte_dev.h>
> +
> +#define RTE_EAL_UEV_MSG_LEN 4096
> +#define RTE_EAL_UEV_MSG_ELEM_LEN 128
> +
> +enum uev_subsystem {
> + UEV_SUBSYSTEM_UIO,
> + UEV_SUBSYSTEM_VFIO,
> + UEV_SUBSYSTEM_PCI,
> + UEV_SUBSYSTEM_MAX
> +};
> +
> +enum uev_monitor_netlink_group {
> + UEV_MONITOR_KERNEL,
> + UEV_MONITOR_UDEV,
> +};
> +
> +/**
> + * The device event type.
> + */
> +enum rte_eal_dev_event_type {
> + RTE_EAL_DEV_EVENT_UNKNOWN, /**< unknown event type */
> + RTE_EAL_DEV_EVENT_ADD, /**< device adding event */
> + RTE_EAL_DEV_EVENT_REMOVE,
> + /**< device removing event */
> + RTE_EAL_DEV_EVENT_CHANGE,
> + /**< device status change event */
> + RTE_EAL_DEV_EVENT_MOVE, /**< device sys path move
> event */
> + RTE_EAL_DEV_EVENT_ONLINE, /**< device online event */
> + RTE_EAL_DEV_EVENT_OFFLINE, /**< device offline event */
> + RTE_EAL_DEV_EVENT_MAX /**< max value of this enum
> */
> +};
> +
> +struct rte_eal_uevent {
> + enum rte_eal_dev_event_type type; /**< device event type */
> + int subsystem; /**< subsystem id */
> + char *devname; /**< device name */
> + enum uev_monitor_netlink_group group; /**< device netlink
> group */
> +};
> +
> +/**
> + * Start the device uevent monitoring.
> + *
> + * @param none
> + * @return
> + * - On success, zero.
> + * - On failure, a negative value.
> + */
> +int
> +rte_dev_monitor_start(void);
> +
> +/**
> + * Stop the device uevent monitoring .
> + *
> + * @param none
> + * @return
> + * - On success, zero.
> + * - On failure, a negative value.
> + */
> +
> +int
> +rte_dev_monitor_stop(void);
> +
> +#endif /* _RTE_LINUXAPP_DEV_H_ */
> diff --git a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> index a3a98c1..d0e07b4 100644
> --- a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> +++ b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> @@ -354,6 +354,12 @@ igbuio_pci_release(struct uio_info *info, struct
> inode *inode)
> struct rte_uio_pci_dev *udev = info->priv;
> struct pci_dev *dev = udev->pdev;
>
> + /* check if device have been remove before release */
> + if ((&dev->dev.kobj)->state_remove_uevent_sent == 1) {
> + pr_info("The device have been removed\n");
> + return -1;
> + }
> +
> /* disable interrupts */
> igbuio_pci_disable_interrupts(udev);
>
> diff --git a/lib/librte_pci/rte_pci.c b/lib/librte_pci/rte_pci.c
> index 0160fc1..feb5fd7 100644
> --- a/lib/librte_pci/rte_pci.c
> +++ b/lib/librte_pci/rte_pci.c
> @@ -172,6 +172,26 @@ rte_pci_addr_parse(const char *str, struct
> rte_pci_addr *addr)
> return -1;
> }
>
> +/* map a private resource from an address*/
> +void *
> +pci_map_private_resource(void *requested_addr, off_t offset, size_t size)
> +{
> + void *mapaddr;
> +
> + mapaddr = mmap(requested_addr, size,
> + PROT_READ | PROT_WRITE,
> + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
> -1, 0);
> + if (mapaddr == MAP_FAILED) {
> + RTE_LOG(ERR, EAL, "%s(): cannot mmap(%p, 0x%lx, 0x%lx): "
> + "%s (%p)\n",
> + __func__, requested_addr,
> + (unsigned long)size, (unsigned long)offset,
> + strerror(errno), mapaddr);
> + } else
> + RTE_LOG(DEBUG, EAL, " PCI memory mapped at %p\n",
> mapaddr);
> +
> + return mapaddr;
> +}
>
> /* map a particular resource from a file */
> void *
> diff --git a/lib/librte_pci/rte_pci.h b/lib/librte_pci/rte_pci.h
> index 4f2cd18..f6091a6 100644
> --- a/lib/librte_pci/rte_pci.h
> +++ b/lib/librte_pci/rte_pci.h
> @@ -227,6 +227,23 @@ int rte_pci_addr_cmp(const struct rte_pci_addr
> *addr,
> int rte_pci_addr_parse(const char *str, struct rte_pci_addr *addr);
>
> /**
> + * @internal
> + * Map to a particular private resource.
> + *
> + * @param requested_addr
> + * The starting address for the new mapping range.
> + * @param offset
> + * The offset for the mapping range.
> + * @param size
> + * The size for the mapping range.
> + * @return
> + * - On success, the function returns a pointer to the mapped area.
> + * - On error, the value MAP_FAILED is returned.
> + */
> +void *pci_map_private_resource(void *requested_addr, off_t offset,
> + size_t size);
> +
> +/**
> * Map a particular resource from a file.
> *
> * @param requested_addr
> --
> 2.7.4
More information about the dev
mailing list