[dpdk-dev] [PATCH v7 1/2] eal: add uevent monitor for hot plug

Matan Azrad matan at mellanox.com
Tue Jan 2 18:02:55 CET 2018


Hi Jeff

I may be touching on points from previous discussions, but please see some comments/questions below.

From: Jeff Guo:
> This patch aim to add a general uevent mechanism in eal device layer,
> to enable all linux kernel object hot plug monitoring, so user could use these
> APIs to monitor and read out the device status info that sent from the kernel
> side, then corresponding to handle it, such as detach or attach the
> device, and even benefit to use it to do smoothly fail safe work.
> 
> 1) About uevent monitoring:
> a: add one epolling to poll the netlink socket, to monitor the uevent of
>    the device, add device_state in struct of rte_device, to identify the
>    device state machine.
> b: add enum of rte_eal_dev_event_type and struct of rte_eal_uevent.
> c: add below API in rte eal device common layer.
>    rte_eal_dev_monitor_enable
>    rte_dev_callback_register
>    rte_dev_callback_unregister
>    _rte_dev_callback_process
>    rte_dev_monitor_start
>    rte_dev_monitor_stop
> 
> 2) About failure handler, use pci uio for example,
>    add pci_remap_device in bus layer and below function to process it:
>    rte_pci_remap_device
>    pci_uio_remap_resource
>    pci_map_private_resource
>    add rte_pci_dev_bind_driver to bind pci device with explicit driver.
> 
> Signed-off-by: Jeff Guo <jia.guo at intel.com>
> ---
> v7->v6:
> a.modify vdev part according to the vdev rework
> b.re-define and split the func into common and bus specific code
> c.fix some incorrect issue.
> b.fix the system hung after send packcet issue.
> ---
>  drivers/bus/pci/bsd/pci.c                          |  30 ++
>  drivers/bus/pci/linux/pci.c                        |  87 +++++
>  drivers/bus/pci/linux/pci_init.h                   |   1 +
>  drivers/bus/pci/pci_common.c                       |  43 +++
>  drivers/bus/pci/pci_common_uio.c                   |  28 ++
>  drivers/bus/pci/private.h                          |  12 +
>  drivers/bus/pci/rte_bus_pci.h                      |  25 ++
>  drivers/bus/vdev/vdev.c                            |  36 +++
>  lib/librte_eal/bsdapp/eal/eal_dev.c                |  64 ++++
>  .../bsdapp/eal/include/exec-env/rte_dev.h          | 106 ++++++
>  lib/librte_eal/common/eal_common_bus.c             |  30 ++
>  lib/librte_eal/common/eal_common_dev.c             | 169 ++++++++++
>  lib/librte_eal/common/include/rte_bus.h            |  69 ++++
>  lib/librte_eal/common/include/rte_dev.h            |  89 ++++++
>  lib/librte_eal/linuxapp/eal/Makefile               |   3 +-
>  lib/librte_eal/linuxapp/eal/eal_alarm.c            |   5 +
>  lib/librte_eal/linuxapp/eal/eal_dev.c              | 356
> +++++++++++++++++++++
>  .../linuxapp/eal/include/exec-env/rte_dev.h        | 106 ++++++
>  lib/librte_eal/linuxapp/igb_uio/igb_uio.c          |   6 +
>  lib/librte_pci/rte_pci.c                           |  20 ++
>  lib/librte_pci/rte_pci.h                           |  17 +
>  21 files changed, 1301 insertions(+), 1 deletion(-)
>  create mode 100644 lib/librte_eal/bsdapp/eal/eal_dev.c
>  create mode 100644 lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
>  create mode 100644 lib/librte_eal/linuxapp/eal/eal_dev.c
>  create mode 100644 lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> 
> diff --git a/drivers/bus/pci/bsd/pci.c b/drivers/bus/pci/bsd/pci.c
> index b8e2178..d58dbf6 100644
> --- a/drivers/bus/pci/bsd/pci.c
> +++ b/drivers/bus/pci/bsd/pci.c
> @@ -126,6 +126,29 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
>  	}
>  }
> 
> +/* re-map pci device */
> +int
> +rte_pci_remap_device(struct rte_pci_device *dev)
> +{
> +	int ret;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +
> +	switch (dev->kdrv) {
> +	case RTE_KDRV_NIC_UIO:
> +		ret = pci_uio_remap_resource(dev);
> +		break;
> +	default:
> +		RTE_LOG(DEBUG, EAL,
> +			"  Not managed by a supported kernel driver,
> skipped\n");
> +		ret = 1;
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
>  void
>  pci_uio_free_resource(struct rte_pci_device *dev,
>  		struct mapped_pci_resource *uio_res)
> @@ -678,3 +701,10 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p)
> 
>  	return ret;
>  }
> +
> +int
> +rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type)
> +{
> +	return -1;
> +}
> +
> diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
> index 5da6728..792fd2c 100644
> --- a/drivers/bus/pci/linux/pci.c
> +++ b/drivers/bus/pci/linux/pci.c
> @@ -145,6 +145,38 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
>  	}
>  }
> 
> +/* Map pci device */
> +int
> +rte_pci_remap_device(struct rte_pci_device *dev)
> +{
> +	int ret = -1;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +
> +	switch (dev->kdrv) {
> +	case RTE_KDRV_VFIO:
> +#ifdef VFIO_PRESENT
> +		/* no thing to do */
> +#endif
> +		break;
> +	case RTE_KDRV_IGB_UIO:
> +	case RTE_KDRV_UIO_GENERIC:
> +		if (rte_eal_using_phys_addrs()) {
> +			/* map resources for devices that use uio */
> +			ret = pci_uio_remap_resource(dev);
> +		}
> +		break;
> +	default:
> +		RTE_LOG(DEBUG, EAL,
> +			"  Not managed by a supported kernel driver,
> skipped\n");
> +		ret = 1;
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
>  void *
>  pci_find_max_end_va(void)
>  {
> @@ -386,6 +418,8 @@ pci_scan_one(const char *dirname, const struct
> rte_pci_addr *addr)
>  		rte_pci_add_device(dev);
>  	}
> 
> +	dev->device.state = DEVICE_PARSED;
> +	TAILQ_INIT(&(dev->device.uev_cbs));
>  	return 0;
>  }
> 
> @@ -854,3 +888,56 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p)
> 
>  	return ret;
>  }
> +
> +int
> +rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type)
> +{
> +	char drv_bind_path[1024];
> +	char drv_override_path[1024]; /* contains the /dev/uioX */
> +	int drv_override_fd;
> +	int drv_bind_fd;
> +
> +	RTE_SET_USED(drv_type);
> +
> +	snprintf(drv_override_path, sizeof(drv_override_path),
> +		"/sys/bus/pci/devices/%s/driver_override", dev_name);
> +
> +	/* specify the driver for a device by writing to driver_override */
> +	drv_override_fd = open(drv_override_path, O_WRONLY);
> +	if (drv_override_fd < 0) {
> +		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
> +			drv_override_path, strerror(errno));
> +		goto err;
> +	}
> +
> +	if (write(drv_override_fd, drv_type, sizeof(drv_type)) < 0) {
> +		RTE_LOG(ERR, EAL,
> +			"Error: bind failed - Cannot write "
> +			"driver %s to device %s\n", drv_type, dev_name);
> +		goto err;
> +	}
> +
> +	close(drv_override_fd);
> +
> +	snprintf(drv_bind_path, sizeof(drv_bind_path),
> +		"/sys/bus/pci/drivers/%s/bind", drv_type);
> +
> +	/* do the bind by writing device to the specific driver  */
> +	drv_bind_fd = open(drv_bind_path, O_WRONLY | O_APPEND);
> +	if (drv_bind_fd < 0) {
> +		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
> +			drv_bind_path, strerror(errno));
> +		goto err;
> +	}
> +
> +	if (write(drv_bind_fd, dev_name, sizeof(dev_name)) < 0)
> +		goto err;
> +
> +	close(drv_bind_fd);
> +	return 0;
> +err:
> +	close(drv_override_fd);
> +	close(drv_bind_fd);
> +	return -1;
> +}
> +
> diff --git a/drivers/bus/pci/linux/pci_init.h b/drivers/bus/pci/linux/pci_init.h
> index f342c47..5838402 100644
> --- a/drivers/bus/pci/linux/pci_init.h
> +++ b/drivers/bus/pci/linux/pci_init.h
> @@ -58,6 +58,7 @@ int pci_uio_alloc_resource(struct rte_pci_device *dev,
>  		struct mapped_pci_resource **uio_res);
>  void pci_uio_free_resource(struct rte_pci_device *dev,
>  		struct mapped_pci_resource *uio_res);
> +int pci_uio_remap_resource(struct rte_pci_device *dev);
>  int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int
> res_idx,
>  		struct mapped_pci_resource *uio_res, int map_idx);
> 
> diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
> index 104fdf9..5417b32 100644
> --- a/drivers/bus/pci/pci_common.c
> +++ b/drivers/bus/pci/pci_common.c
> @@ -282,6 +282,7 @@ pci_probe_all_drivers(struct rte_pci_device *dev)
>  		if (rc > 0)
>  			/* positive value means driver doesn't support it */
>  			continue;
> +		dev->device.state = DEVICE_PROBED;
>  		return 0;
>  	}
>  	return 1;
> @@ -481,6 +482,7 @@ rte_pci_insert_device(struct rte_pci_device
> *exist_pci_dev,
>  void
>  rte_pci_remove_device(struct rte_pci_device *pci_dev)
>  {
> +	RTE_LOG(DEBUG, EAL, " rte_pci_remove_device for device list\n");
>  	TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next);
>  }
> 
> @@ -502,6 +504,44 @@ pci_find_device(const struct rte_device *start,
> rte_dev_cmp_t cmp,
>  	return NULL;
>  }
> 
> +static struct rte_device *
> +pci_find_device_by_name(const struct rte_device *start,
> +		rte_dev_cmp_name_t cmp_name,
> +		const void *data)
> +{
> +	struct rte_pci_device *dev;
> +
> +	FOREACH_DEVICE_ON_PCIBUS(dev) {
> +		if (start && &dev->device == start) {
> +			start = NULL; /* starting point found */
> +			continue;
> +		}
> +		if (cmp_name(dev->device.name, data) == 0)
> +			return &dev->device;
> +	}
> +
> +	return NULL;
> +}
> +
> +static int
> +pci_remap_device(struct rte_device *dev)
> +{
> +	struct rte_pci_device *pdev;
> +	int ret;
> +
> +	if (dev == NULL)
> +		return -EINVAL;
> +
> +	pdev = RTE_DEV_TO_PCI(dev);
> +
> +	/* remap resources for devices that use igb_uio */
> +	ret = rte_pci_remap_device(pdev);
> +	if (ret != 0)
> +		RTE_LOG(ERR, EAL, "failed to remap device %s",
> +			dev->name);
> +	return ret;
> +}
> +
>  static int
>  pci_plug(struct rte_device *dev)
>  {
> @@ -528,10 +568,13 @@ struct rte_pci_bus rte_pci_bus = {
>  		.scan = rte_pci_scan,
>  		.probe = rte_pci_probe,
>  		.find_device = pci_find_device,
> +		.find_device_by_name = pci_find_device_by_name,
>  		.plug = pci_plug,
>  		.unplug = pci_unplug,
>  		.parse = pci_parse,
>  		.get_iommu_class = rte_pci_get_iommu_class,
> +		.remap_device = pci_remap_device,
> +		.bind_driver = rte_pci_dev_bind_driver,
>  	},
>  	.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
>  	.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
> diff --git a/drivers/bus/pci/pci_common_uio.c
> b/drivers/bus/pci/pci_common_uio.c
> index 0671131..8cb4009 100644
> --- a/drivers/bus/pci/pci_common_uio.c
> +++ b/drivers/bus/pci/pci_common_uio.c
> @@ -176,6 +176,34 @@ pci_uio_unmap(struct mapped_pci_resource
> *uio_res)
>  	}
>  }
> 
> +/* remap the PCI resource of a PCI device in private virtual memory */
> +int
> +pci_uio_remap_resource(struct rte_pci_device *dev)
> +{
> +	int i;
> +	uint64_t phaddr;
> +	void *map_address;
> +
> +	/* Map all BARs */
> +	for (i = 0; i != PCI_MAX_RESOURCE; i++) {
> +		/* skip empty BAR */
> +		phaddr = dev->mem_resource[i].phys_addr;
> +		if (phaddr == 0)
> +			continue;
> +		map_address = pci_map_private_resource(
> +				dev->mem_resource[i].addr, 0,
> +				(size_t)dev->mem_resource[i].len);
> +		if (map_address == MAP_FAILED)
> +			goto error;
> +		memset(map_address, 0xFF, (size_t)dev-
> >mem_resource[i].len);
> +		dev->mem_resource[i].addr = map_address;
> +	}
> +
> +	return 0;
> +error:
> +	return -1;
> +}
> +
>  static struct mapped_pci_resource *
>  pci_uio_find_resource(struct rte_pci_device *dev)
>  {
> diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
> index 2283f09..10baa1a 100644
> --- a/drivers/bus/pci/private.h
> +++ b/drivers/bus/pci/private.h
> @@ -202,6 +202,18 @@ void pci_uio_free_resource(struct rte_pci_device
> *dev,
>  		struct mapped_pci_resource *uio_res);
> 
>  /**
> + * remap the pci uio resource..
> + *
> + * @param dev
> + *   Point to the struct rte pci device.
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +int
> +pci_uio_remap_resource(struct rte_pci_device *dev);
> +
> +/**
>   * Map device memory to uio resource
>   *
>   * This function is private to EAL.
> diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
> index d4a2996..1662f3b 100644
> --- a/drivers/bus/pci/rte_bus_pci.h
> +++ b/drivers/bus/pci/rte_bus_pci.h
> @@ -52,6 +52,8 @@ extern "C" {
>  #include <sys/queue.h>
>  #include <stdint.h>
>  #include <inttypes.h>
> +#include <unistd.h>
> +#include <fcntl.h>
> 
>  #include <rte_debug.h>
>  #include <rte_interrupts.h>
> @@ -197,6 +199,15 @@ int rte_pci_map_device(struct rte_pci_device *dev);
>  void rte_pci_unmap_device(struct rte_pci_device *dev);
> 
>  /**
> + * Remap this device
> + *
> + * @param dev
> + *   A pointer to a rte_pci_device structure describing the device
> + *   to use
> + */
> +int rte_pci_remap_device(struct rte_pci_device *dev);
> +
> +/**
>   * Dump the content of the PCI bus.
>   *
>   * @param f
> @@ -333,6 +344,20 @@ void rte_pci_ioport_read(struct rte_pci_ioport *p,
>  void rte_pci_ioport_write(struct rte_pci_ioport *p,
>  		const void *data, size_t len, off_t offset);
> 
> +/**
> + * It can be used to bind a device to a specific type of driver.
> + *
> + * @param dev_name
> + *  The device name.
> + * @param drv_type
> + *  The specific driver's type.
> + *
> + * @return
> + *  - On success, zero.
> + *  - On failure, a negative value.
> + */
> +int rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/drivers/bus/vdev/vdev.c b/drivers/bus/vdev/vdev.c
> index fd7736d..773f6e0 100644
> --- a/drivers/bus/vdev/vdev.c
> +++ b/drivers/bus/vdev/vdev.c
> @@ -323,6 +323,39 @@ vdev_find_device(const struct rte_device *start,
> rte_dev_cmp_t cmp,
>  	return NULL;
>  }
> 
> +static struct rte_device *
> +vdev_find_device_by_name(const struct rte_device *start,
> +		rte_dev_cmp_name_t cmp_name,
> +		const void *data)
> +{
> +	struct rte_vdev_device *dev;
> +
> +	TAILQ_FOREACH(dev, &vdev_device_list, next) {
> +		if (start && &dev->device == start) {
> +			start = NULL;
> +			continue;
> +		}
> +		if (cmp_name(dev->device.name, data) == 0)
> +			return &dev->device;
> +	}
> +	return NULL;
> +}
> +
> +static int
> +vdev_remap_device(struct rte_device *dev)
> +{
> +	RTE_SET_USED(dev);
> +	return 0;
> +}
> +
> +static int
> +vdev_bind_driver(const char *dev_name, const char *drv_type)
> +{
> +	RTE_SET_USED(dev_name);
> +	RTE_SET_USED(drv_type);
> +	return 0;
> +}
> +
>  static int
>  vdev_plug(struct rte_device *dev)
>  {
> @@ -339,9 +372,12 @@ static struct rte_bus rte_vdev_bus = {
>  	.scan = vdev_scan,
>  	.probe = vdev_probe,
>  	.find_device = vdev_find_device,
> +	.find_device_by_name = vdev_find_device_by_name,
>  	.plug = vdev_plug,
>  	.unplug = vdev_unplug,
>  	.parse = vdev_parse,
> +	.remap_device = vdev_remap_device,
> +	.bind_driver = vdev_bind_driver,
>  };
> 
>  RTE_REGISTER_BUS(vdev, rte_vdev_bus);
> diff --git a/lib/librte_eal/bsdapp/eal/eal_dev.c
> b/lib/librte_eal/bsdapp/eal/eal_dev.c
> new file mode 100644
> index 0000000..6ea9a74
> --- /dev/null
> +++ b/lib/librte_eal/bsdapp/eal/eal_dev.c
> @@ -0,0 +1,64 @@
> +/*-
> + *   Copyright(c) 2010-2017 Intel Corporation.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <sys/queue.h>
> +#include <sys/signalfd.h>
> +#include <sys/ioctl.h>
> +#include <sys/socket.h>
> +#include <linux/netlink.h>
> +#include <sys/epoll.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +
> +#include <rte_malloc.h>
> +#include <rte_bus.h>
> +#include <rte_dev.h>
> +#include <rte_devargs.h>
> +#include <rte_debug.h>
> +#include <rte_log.h>
> +
> +#include "eal_thread.h"
> +
> +int
> +rte_dev_monitor_start(void)
> +{
> +	return -1;
> +}
> +
> +int
> +rte_dev_monitor_stop(void)
> +{
> +	return -1;
> +}
> diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> new file mode 100644
> index 0000000..6a6feb5
> --- /dev/null
> +++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> @@ -0,0 +1,106 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef _RTE_DEV_H_
> +#error "don't include this file directly, please include generic <rte_dev.h>"
> +#endif
> +
> +#ifndef _RTE_LINUXAPP_DEV_H_
> +#define _RTE_LINUXAPP_DEV_H_
> +
> +#include <stdio.h>
> +
> +#include <rte_dev.h>
> +
> +#define RTE_EAL_UEV_MSG_LEN 4096
> +#define RTE_EAL_UEV_MSG_ELEM_LEN 128
> +
> +enum uev_subsystem {
> +	UEV_SUBSYSTEM_UIO,
> +	UEV_SUBSYSTEM_VFIO,
> +	UEV_SUBSYSTEM_PCI,
> +	UEV_SUBSYSTEM_MAX
> +};
> +
> +enum uev_monitor_netlink_group {
> +	UEV_MONITOR_KERNEL,
> +	UEV_MONITOR_UDEV,
> +};
> +
> +/**
> + * The device event type.
> + */
> +enum rte_eal_dev_event_type {
> +	RTE_EAL_DEV_EVENT_UNKNOWN,	/**< unknown event type */
> +	RTE_EAL_DEV_EVENT_ADD,		/**< device adding event */
> +	RTE_EAL_DEV_EVENT_REMOVE,
> +					/**< device removing event */
> +	RTE_EAL_DEV_EVENT_CHANGE,
> +					/**< device status change event */
> +	RTE_EAL_DEV_EVENT_MOVE,		/**< device sys path move
> event */
> +	RTE_EAL_DEV_EVENT_ONLINE,	/**< device online event */
> +	RTE_EAL_DEV_EVENT_OFFLINE,	/**< device offline event */
> +	RTE_EAL_DEV_EVENT_MAX		/**< max value of this enum
> */
> +};
> +
> +struct rte_eal_uevent {
> +	enum rte_eal_dev_event_type type;	/**< device event type */
> +	int subsystem;				/**< subsystem id */
> +	char *devname;				/**< device name */
> +	enum uev_monitor_netlink_group group;	/**< device netlink
> group */
> +};
> +
> +/**
> + * Start the device uevent monitoring.
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +int
> +rte_dev_monitor_start(void);
> +
> +/**
> + * Stop the device uevent monitoring .
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +
> +int
> +rte_dev_monitor_stop(void);
> +
> +#endif /* _RTE_LINUXAPP_DEV_H_ */
> diff --git a/lib/librte_eal/common/eal_common_bus.c
> b/lib/librte_eal/common/eal_common_bus.c
> index 3e022d5..b7219c9 100644
> --- a/lib/librte_eal/common/eal_common_bus.c
> +++ b/lib/librte_eal/common/eal_common_bus.c
> @@ -51,8 +51,11 @@ rte_bus_register(struct rte_bus *bus)
>  	RTE_VERIFY(bus->scan);
>  	RTE_VERIFY(bus->probe);
>  	RTE_VERIFY(bus->find_device);
> +	RTE_VERIFY(bus->find_device_by_name);
>  	/* Buses supporting driver plug also require unplug. */
>  	RTE_VERIFY(!bus->plug || bus->unplug);
> +	RTE_VERIFY(bus->remap_device);
> +	RTE_VERIFY(bus->bind_driver);
> 
>  	TAILQ_INSERT_TAIL(&rte_bus_list, bus, next);
>  	RTE_LOG(DEBUG, EAL, "Registered [%s] bus.\n", bus->name);
> @@ -170,6 +173,14 @@ cmp_rte_device(const struct rte_device *dev1,
> const void *_dev2)
>  }
> 
>  static int
> +cmp_rte_device_name(const char *dev_name1, const void *_dev_name2)
> +{
> +	const char *dev_name2 = _dev_name2;
> +
> +	return strcmp(dev_name1, dev_name2);
> +}
> +
> +static int
>  bus_find_device(const struct rte_bus *bus, const void *_dev)
>  {
>  	struct rte_device *dev;
> @@ -178,6 +189,25 @@ bus_find_device(const struct rte_bus *bus, const
> void *_dev)
>  	return dev == NULL;
>  }
> 
> +static struct rte_device *
> +bus_find_device_by_name(const struct rte_bus *bus, const void
> *_dev_name)
> +{
> +	struct rte_device *dev;
> +
> +	dev = bus->find_device_by_name(NULL, cmp_rte_device_name,
> _dev_name);
> +	return dev;
> +}
> +
> +struct rte_device *
> +
> +rte_bus_find_device(const struct rte_bus *bus, const void *_dev_name)
> +{
> +	struct rte_device *dev;
> +
> +	dev = bus_find_device_by_name(bus, _dev_name);
> +	return dev;
> +}
> +
>  struct rte_bus *
>  rte_bus_find_by_device(const struct rte_device *dev)
>  {
> diff --git a/lib/librte_eal/common/eal_common_dev.c
> b/lib/librte_eal/common/eal_common_dev.c
> index dda8f58..47909e8 100644
> --- a/lib/librte_eal/common/eal_common_dev.c
> +++ b/lib/librte_eal/common/eal_common_dev.c
> @@ -42,9 +42,31 @@
>  #include <rte_devargs.h>
>  #include <rte_debug.h>
>  #include <rte_log.h>
> +#include <rte_spinlock.h>
> +#include <rte_malloc.h>
> 
>  #include "eal_private.h"
> 
> +/* spinlock for device callbacks */
> +static rte_spinlock_t rte_dev_cb_lock = RTE_SPINLOCK_INITIALIZER;
> +
> +/**
> + * The user application callback description.
> + *
> + * It contains callback address to be registered by user application,
> + * the pointer to the parameters for callback, and the event type.
> + */
> +struct rte_eal_dev_callback {
> +	TAILQ_ENTRY(rte_eal_dev_callback) next; /**< Callbacks list */
> +	rte_eal_dev_cb_fn cb_fn;                /**< Callback address */
> +	void *cb_arg;                           /**< Parameter for callback */
> +	void *ret_param;                        /**< Return parameter */
> +	enum rte_eal_dev_event_type event;      /**< device event type */
> +	uint32_t active;                        /**< Callback is executing */
> +};
> +
> +static struct rte_eal_dev_callback *dev_add_cb;
> +
>  static int cmp_detached_dev_name(const struct rte_device *dev,
>  	const void *_name)
>  {
> @@ -234,3 +256,150 @@ int rte_eal_hotplug_remove(const char *busname,
> const char *devname)
>  	rte_eal_devargs_remove(busname, devname);
>  	return ret;
>  }
> +
> +int
> +rte_eal_dev_monitor_enable(void)
> +{
> +	int ret;
> +
> +	ret = rte_dev_monitor_start();
> +	if (ret)
> +		RTE_LOG(ERR, EAL, "Can not init device monitor\n");
> +	return ret;
> +}
> +
> +int
> +rte_dev_callback_register(struct rte_device *device,
> +			enum rte_eal_dev_event_type event,
> +			rte_eal_dev_cb_fn cb_fn, void *cb_arg)
> +{
> +	struct rte_eal_dev_callback *user_cb;
> +
> +	if (!cb_fn)
> +		return -EINVAL;
> +

What about checking that the device pointer is not NULL?

> +	rte_spinlock_lock(&rte_dev_cb_lock);
> +
> +	if (TAILQ_EMPTY(&(device->uev_cbs)))
> +		TAILQ_INIT(&(device->uev_cbs));
> +
> +	if (event == RTE_EAL_DEV_EVENT_ADD) {
> +		user_cb = NULL;
> +	} else {
> +		TAILQ_FOREACH(user_cb, &(device->uev_cbs), next) {
> +			if (user_cb->cb_fn == cb_fn &&
> +				user_cb->cb_arg == cb_arg &&
> +				user_cb->event == event) {
> +				break;
> +			}
> +		}
> +	}
> +
> +	/* create a new callback. */
> +	if (user_cb == NULL) {
> +		/* allocate a new interrupt callback entity */
> +		user_cb = rte_zmalloc("eal device event",
> +					sizeof(*user_cb), 0);
> +		if (user_cb == NULL) {
> +			RTE_LOG(ERR, EAL, "Can not allocate memory\n");

Missing rte_spinlock_unlock(): rte_dev_cb_lock is still held when returning on this error path.

> +			return -ENOMEM;
> +		}
> +		user_cb->cb_fn = cb_fn;
> +		user_cb->cb_arg = cb_arg;
> +		user_cb->event = event;
> +		if (event == RTE_EAL_DEV_EVENT_ADD)
> +			dev_add_cb = user_cb;

So only one DPDK entity can register an ADD callback?

I suggest adding an option to register for all devices, maybe by using a dummy device that holds all the "ALL_DEVICES" callbacks per event.
"All" means past, present and future devices; this way one callback can be called for all the devices, and more than one DPDK entity can register for an ADD/NEW event.
What about NEW instead of ADD?

I also suggest adding the device pointer as a parameter to the callback (it will be managed by EAL).

> +		else
> +			TAILQ_INSERT_TAIL(&(device->uev_cbs), user_cb,
> next);
> +	}
> +
> +	rte_spinlock_unlock(&rte_dev_cb_lock);
> +	return 0;
> +}
> +
> +int
> +rte_dev_callback_unregister(struct rte_device *device,
> +			enum rte_eal_dev_event_type event,
> +			rte_eal_dev_cb_fn cb_fn, void *cb_arg)
> +{
> +	int ret;
> +	struct rte_eal_dev_callback *cb, *next;
> +
> +	if (!cb_fn)
> +		return -EINVAL;
> +
> +	rte_spinlock_lock(&rte_dev_cb_lock);
> +
> +	ret = 0;
> +	if (event == RTE_EAL_DEV_EVENT_ADD) {
> +		rte_free(dev_add_cb);
> +		dev_add_cb = NULL;
> +	} else {

Shouldn't the device pointer be checked for NULL here?

> +		for (cb = TAILQ_FIRST(&(device->uev_cbs)); cb != NULL;
> +		      cb = next) {
> +
> +			next = TAILQ_NEXT(cb, next);
> +
> +			if (cb->cb_fn != cb_fn || cb->event != event ||
> +					(cb->cb_arg != (void *)-1 &&
> +					cb->cb_arg != cb_arg))
> +				continue;
> +
> +			/*
> +			 * if this callback is not executing right now,
> +			 * then remove it.
> +			 */
> +			if (cb->active == 0) {
> +				TAILQ_REMOVE(&(device->uev_cbs), cb,
> next);
> +				rte_free(cb);
> +			} else {
> +				ret = -EAGAIN;
> +			}
> +		}
> +	}
> +	rte_spinlock_unlock(&rte_dev_cb_lock);
> +	return ret;
> +}
> +
> +int
> +_rte_dev_callback_process(struct rte_device *device,
> +			enum rte_eal_dev_event_type event,
> +			void *cb_arg, void *ret_param)
> +{
> +	struct rte_eal_dev_callback dev_cb;
> +	struct rte_eal_dev_callback *cb_lst;
> +	int rc = 0;
> +
> +	rte_spinlock_lock(&rte_dev_cb_lock);
> +	if (event == RTE_EAL_DEV_EVENT_ADD) {
> +		if (cb_arg != NULL)
> +			dev_add_cb->cb_arg = cb_arg;
> +
> +		if (ret_param != NULL)
> +			dev_add_cb->ret_param = ret_param;
> +
> +		rte_spinlock_unlock(&rte_dev_cb_lock);

Can't someone free it while it is running?
I suggest keeping the lock held during the callback.
To prevent deadlock, callbacks would then not be allowed to use this mechanism.

> +		rc = dev_add_cb->cb_fn(dev_add_cb->event,
> +				dev_add_cb->cb_arg, dev_add_cb-
> >ret_param);
> +		rte_spinlock_lock(&rte_dev_cb_lock);
> +	} else {
> +		TAILQ_FOREACH(cb_lst, &(device->uev_cbs), next) {
> +			if (cb_lst->cb_fn == NULL || cb_lst->event != event)
> +				continue;
> +			dev_cb = *cb_lst;
> +			cb_lst->active = 1;
> +			if (cb_arg != NULL)
> +				dev_cb.cb_arg = cb_arg;
> +			if (ret_param != NULL)
> +				dev_cb.ret_param = ret_param;
> +
> +			rte_spinlock_unlock(&rte_dev_cb_lock);

The current active flag does not make this thread safe; I suggest keeping the lock held.
Scenario:
	1. Thread A sees active = 0 in the unregister function.
	2. Context switch.
	3. Thread B starts the callback.
	4. Context switch.
	5. Thread A frees it.
	6. Context switch.
	7. Seg fault in Thread B.

> +			rc = dev_cb.cb_fn(dev_cb.event,
> +					dev_cb.cb_arg, dev_cb.ret_param);
> +			rte_spinlock_lock(&rte_dev_cb_lock);
> +			cb_lst->active = 0;
> +		}
> +	}
> +	rte_spinlock_unlock(&rte_dev_cb_lock);
> +	return rc;
> +}
> diff --git a/lib/librte_eal/common/include/rte_bus.h
> b/lib/librte_eal/common/include/rte_bus.h
> index 6fb0834..6c4ae31 100644
> --- a/lib/librte_eal/common/include/rte_bus.h
> +++ b/lib/librte_eal/common/include/rte_bus.h
> @@ -122,6 +122,34 @@ typedef struct rte_device *
>  			 const void *data);
> 
>  /**
> + * Device iterator to find a device on a bus.
> + *
> + * This function returns an rte_device if one of those held by the bus
> + * matches the data passed as parameter.
> + *
> + * If the comparison function returns zero this function should stop iterating
> + * over any more devices. To continue a search the device of a previous
> search
> + * can be passed via the start parameter.
> + *
> + * @param cmp
> + *	the device name comparison function.
> + *
> + * @param data
> + *	Data to compare each device against.
> + *
> + * @param start
> + *	starting point for the iteration
> + *
> + * @return
> + *	The first device matching the data, NULL if none exists.
> + */
> +typedef struct rte_device *
> +(*rte_bus_find_device_by_name_t)(const struct rte_device *start,
> +			 rte_dev_cmp_name_t cmp,
> +			 const void *data);
> +
> +
> +/**
>   * Implementation specific probe function which is responsible for linking
>   * devices on that bus with applicable drivers.
>   *
> @@ -168,6 +196,37 @@ typedef int (*rte_bus_unplug_t)(struct rte_device
> *dev);
>  typedef int (*rte_bus_parse_t)(const char *name, void *addr);
> 
>  /**
> + * Implementation specific remap function which is responsible for
> remmaping
> + * devices on that bus from original share memory resource to a private
> memory
> + * resource for the sake of device has been removal.
> + *
> + * @param dev
> + *	Device pointer that was returned by a previous call to find_device.
> + *
> + * @return
> + *	0 on success.
> + *	!0 on error.
> + */
> +typedef int (*rte_bus_remap_device_t)(struct rte_device *dev);
> +
> +/**
> + * Implementation specific bind driver function which is responsible for bind
> + * a explicit type of driver with a devices on that bus.
> + *
> + * @param dev_name
> + *	device textual description.
> + *
> + * @param drv_type
> + *	driver type textual description.
> + *
> + * @return
> + *	0 on success.
> + *	!0 on error.
> + */
> +typedef int (*rte_bus_bind_driver_t)(const char *dev_name,
> +				const char *drv_type);
> +
> +/**
>   * Bus scan policies
>   */
>  enum rte_bus_scan_mode {
> @@ -206,9 +265,13 @@ struct rte_bus {
>  	rte_bus_scan_t scan;         /**< Scan for devices attached to bus */
>  	rte_bus_probe_t probe;       /**< Probe devices on bus */
>  	rte_bus_find_device_t find_device; /**< Find a device on the bus */
> +	rte_bus_find_device_by_name_t find_device_by_name;
> +				     /**< Find a device on the bus */
>  	rte_bus_plug_t plug;         /**< Probe single device for drivers */
>  	rte_bus_unplug_t unplug;     /**< Remove single device from driver
> */
>  	rte_bus_parse_t parse;       /**< Parse a device name */
> +	rte_bus_remap_device_t remap_device;       /**< remap a device */
> +	rte_bus_bind_driver_t bind_driver; /**< bind a driver for bus device
> */
>  	struct rte_bus_conf conf;    /**< Bus configuration */
>  	rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu
> class */
>  };
> @@ -306,6 +369,12 @@ struct rte_bus *rte_bus_find(const struct rte_bus
> *start, rte_bus_cmp_t cmp,
>  struct rte_bus *rte_bus_find_by_device(const struct rte_device *dev);
> 
>  /**
> + * Find the registered bus for a particular device.
> + */
> +struct rte_device *rte_bus_find_device(const struct rte_bus *bus,
> +				const void *dev_name);
> +
> +/**
>   * Find the registered bus for a given name.
>   */
>  struct rte_bus *rte_bus_find_by_name(const char *busname);
> diff --git a/lib/librte_eal/common/include/rte_dev.h
> b/lib/librte_eal/common/include/rte_dev.h
> index 9342e0c..19971d0 100644
> --- a/lib/librte_eal/common/include/rte_dev.h
> +++ b/lib/librte_eal/common/include/rte_dev.h
> @@ -51,6 +51,15 @@ extern "C" {
> 
>  #include <rte_log.h>
> 
> +#include <exec-env/rte_dev.h>
> +
> +typedef int (*rte_eal_dev_cb_fn)(enum rte_eal_dev_event_type event,
> +					void *cb_arg, void *ret_param);
> +
> +struct rte_eal_dev_callback;
> +/** @internal Structure to keep track of registered callbacks */
> +TAILQ_HEAD(rte_eal_dev_cb_list, rte_eal_dev_callback);
> +
>  __attribute__((format(printf, 2, 0)))
>  static inline void
>  rte_pmd_debug_trace(const char *func_name, const char *fmt, ...)
> @@ -157,6 +166,13 @@ struct rte_driver {
>   */
>  #define RTE_DEV_NAME_MAX_LEN 64
> 
> +enum device_state {
> +	DEVICE_UNDEFINED,
> +	DEVICE_FAULT,
> +	DEVICE_PARSED,
> +	DEVICE_PROBED,
> +};
> +
>  /**
>   * A structure describing a generic device.
>   */
> @@ -166,6 +182,9 @@ struct rte_device {
>  	const struct rte_driver *driver;/**< Associated driver */
>  	int numa_node;                /**< NUMA node connection */
>  	struct rte_devargs *devargs;  /**< Device user arguments */
> +	enum device_state state;  /**< Device state */
> +	/** User application callbacks for device event */
> +	struct rte_eal_dev_cb_list uev_cbs;
>  };
> 
>  /**
> @@ -248,6 +267,8 @@ int rte_eal_hotplug_remove(const char *busname,
> const char *devname);
>   */
>  typedef int (*rte_dev_cmp_t)(const struct rte_device *dev, const void
> *data);
> 
> +typedef int (*rte_dev_cmp_name_t)(const char *dev_name, const void
> *data);
> +
>  #define RTE_PMD_EXPORT_NAME_ARRAY(n, idx) n##idx[]
> 
>  #define RTE_PMD_EXPORT_NAME(name, idx) \
> @@ -293,4 +314,72 @@ __attribute__((used)) = str
>  }
>  #endif
> 
> +/**
> + * It enable the device event monitoring for a specific event.
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +int
> +rte_eal_dev_monitor_enable(void);
> +/**
> + * It registers the callback for the specific event. Multiple
> + * callbacks cal be registered at the same time.
> + * @param event
> + *  The device event type.
> + * @param cb_fn
> + *  callback address.
> + * @param cb_arg
> + *  address of parameter for callback.
> + *
> + * @return
> + *  - On success, zero.
> + *  - On failure, a negative value.
> + */
> +int rte_dev_callback_register(struct rte_device *device,
> +			enum rte_eal_dev_event_type event,
> +			rte_eal_dev_cb_fn cb_fn, void *cb_arg);
> +
> +/**
> + * It unregisters the callback according to the specified event.
> + *
> + * @param event
> + *  The event type which corresponding to the callback.
> + * @param cb_fn
> + *  callback address.
> + *  address of parameter for callback, (void *)-1 means to remove all
> + *  registered which has the same callback address.
> + *
> + * @return
> + *  - On success, return the number of callback entities removed.
> + *  - On failure, a negative value.
> + */
> +int rte_dev_callback_unregister(struct rte_device *device,
> +			enum rte_eal_dev_event_type event,
> +			rte_eal_dev_cb_fn cb_fn, void *cb_arg);
> +
> +/**
> + * @internal Executes all the user application registered callbacks for
> + * the specific device. It is for DPDK internal user only. User
> + * application should not call it directly.
> + *
> + * @param event
> + *  The device event type.
> + * @param cb_arg
> + *  callback parameter.
> + * @param ret_param
> + *  To pass data back to user application.
> + *  This allows the user application to decide if a particular function
> + *  is permitted or not.
> + *
> + * @return
> + *  - On success, return zero.
> + *  - On failure, a negative value.
> + */
> +int
> +_rte_dev_callback_process(struct rte_device *device,
> +			enum rte_eal_dev_event_type event,
> +			void *cb_arg, void *ret_param);
>  #endif /* _RTE_DEV_H_ */
> diff --git a/lib/librte_eal/linuxapp/eal/Makefile
> b/lib/librte_eal/linuxapp/eal/Makefile
> index 5a7b8b2..05a2437 100644
> --- a/lib/librte_eal/linuxapp/eal/Makefile
> +++ b/lib/librte_eal/linuxapp/eal/Makefile
> @@ -67,6 +67,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) +=
> eal_lcore.c
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c
> +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c
> 
>  # from common dir
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c
> @@ -120,7 +121,7 @@ ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
>  CFLAGS_eal_thread.o += -Wno-return-type
>  endif
> 
> -INC := rte_kni_common.h
> +INC := rte_kni_common.h rte_dev.h
> 
>  SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \
>  	$(addprefix include/exec-env/,$(INC))
> diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c
> b/lib/librte_eal/linuxapp/eal/eal_alarm.c
> index 8e4a775..29e73a7 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_alarm.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_alarm.c
> @@ -209,6 +209,7 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn,
> void *cb_arg)
>  	int count = 0;
>  	int err = 0;
>  	int executing;
> +	int ret;
> 
>  	if (!cb_fn) {
>  		rte_errno = EINVAL;
> @@ -259,6 +260,10 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn,
> void *cb_arg)
>  			}
>  			ap_prev = ap;
>  		}
> +
> +		ret |= rte_intr_callback_unregister(&intr_handle,
> +				eal_alarm_callback, NULL);
> +
>  		rte_spinlock_unlock(&alarm_list_lk);
>  	} while (executing != 0);
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c
> b/lib/librte_eal/linuxapp/eal/eal_dev.c
> new file mode 100644
> index 0000000..49fd0dc
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
> @@ -0,0 +1,356 @@
> +/*-
> + *   Copyright(c) 2010-2017 Intel Corporation.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <sys/queue.h>
> +#include <sys/signalfd.h>
> +#include <sys/ioctl.h>
> +#include <sys/socket.h>
> +#include <linux/netlink.h>
> +#include <sys/epoll.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +
> +#include <rte_malloc.h>
> +#include <rte_bus.h>
> +#include <rte_dev.h>
> +#include <rte_devargs.h>
> +#include <rte_debug.h>
> +#include <rte_log.h>
> +
> +#include "eal_thread.h"
> +
> +/* uev monitoring thread */
> +static pthread_t uev_monitor_thread;
> +
> +bool udev_exit = true;
> +
> +bool no_request_thread = true;
> +
> +static void sig_handler(int signum)
> +{
> +	if (signum == SIGINT || signum == SIGTERM)
> +		rte_dev_monitor_stop();
> +}
> +
> +static int
> +dev_monitor_fd_new(void)
> +{
> +
> +	int uevent_fd;
> +
> +	uevent_fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC |
> +			SOCK_NONBLOCK,
> +			NETLINK_KOBJECT_UEVENT);
> +	if (uevent_fd < 0) {
> +		RTE_LOG(ERR, EAL, "create uevent fd failed\n");
> +		return -1;
> +	}
> +	return uevent_fd;
> +}
> +
> +static int
> +dev_monitor_enable(int netlink_fd)
> +{
> +	struct sockaddr_nl addr;
> +	int ret;
> +	int size = 64 * 1024;
> +	int nonblock = 1;
> +
> +	memset(&addr, 0, sizeof(addr));
> +	addr.nl_family = AF_NETLINK;
> +	addr.nl_pid = 0;
> +	addr.nl_groups = 0xffffffff;
> +
> +	if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
> +		RTE_LOG(ERR, EAL, "bind failed\n");
> +		goto err;
> +	}
> +
> +	setsockopt(netlink_fd, SOL_SOCKET, SO_PASSCRED, &size,
> sizeof(size));
> +
> +	ret = ioctl(netlink_fd, FIONBIO, &nonblock);
> +	if (ret != 0) {
> +		RTE_LOG(ERR, EAL, "ioctl(FIONBIO) failed\n");
> +		goto err;
> +	}
> +	return 0;
> +err:
> +	close(netlink_fd);
> +	return -1;
> +}
> +
> +static void
> +dev_uev_parse(const char *buf, struct rte_eal_uevent *event)
> +{
> +	char action[RTE_EAL_UEV_MSG_ELEM_LEN];
> +	char subsystem[RTE_EAL_UEV_MSG_ELEM_LEN];
> +	char dev_path[RTE_EAL_UEV_MSG_ELEM_LEN];
> +	char pci_slot_name[RTE_EAL_UEV_MSG_ELEM_LEN];
> +	int i = 0;
> +
> +	memset(action, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> +	memset(subsystem, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> +	memset(dev_path, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> +	memset(pci_slot_name, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> +
> +	while (i < RTE_EAL_UEV_MSG_LEN) {
> +		for (; i < RTE_EAL_UEV_MSG_LEN; i++) {
> +			if (*buf)
> +				break;
> +			buf++;
> +		}
> +		if (!strncmp(buf, "libudev", 7)) {
> +			buf += 7;
> +			i += 7;
> +			event->group = UEV_MONITOR_UDEV;
> +		}
> +		if (!strncmp(buf, "ACTION=", 7)) {
> +			buf += 7;
> +			i += 7;
> +			snprintf(action, sizeof(action), "%s", buf);
> +		} else if (!strncmp(buf, "DEVPATH=", 8)) {
> +			buf += 8;
> +			i += 8;
> +			snprintf(dev_path, sizeof(dev_path), "%s", buf);
> +		} else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
> +			buf += 10;
> +			i += 10;
> +			snprintf(subsystem, sizeof(subsystem), "%s", buf);
> +		} else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
> +			buf += 14;
> +			i += 14;
> +			snprintf(pci_slot_name, sizeof(subsystem), "%s",
> buf);
> +			event->devname = pci_slot_name;
> +		}
> +		for (; i < RTE_EAL_UEV_MSG_LEN; i++) {
> +			if (*buf == '\0')
> +				break;
> +			buf++;
> +		}
> +	}
> +
> +	if (!strncmp(subsystem, "pci", 3))
> +		event->subsystem = UEV_SUBSYSTEM_PCI;
> +	if (!strncmp(action, "add", 3))
> +		event->type = RTE_EAL_DEV_EVENT_ADD;
> +	if (!strncmp(action, "remove", 6))
> +		event->type = RTE_EAL_DEV_EVENT_REMOVE;
> +	event->devname = pci_slot_name;
> +}
> +
> +static int
> +dev_uev_receive(int fd, struct rte_eal_uevent *uevent)
> +{
> +	int ret;
> +	char buf[RTE_EAL_UEV_MSG_LEN];
> +
> +	memset(uevent, 0, sizeof(struct rte_eal_uevent));
> +	memset(buf, 0, RTE_EAL_UEV_MSG_LEN);
> +
> +	ret = recv(fd, buf, RTE_EAL_UEV_MSG_LEN - 1, MSG_DONTWAIT);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL,
> +		"Socket read error(%d): %s\n",
> +		errno, strerror(errno));
> +		return -1;
> +	} else if (ret == 0)
> +		/* connection closed */
> +		return -1;
> +
> +	dev_uev_parse(buf, uevent);
> +
> +	return 0;
> +}
> +
> +static int
> +dev_uev_process(struct epoll_event *events, int nfds)
> +{
> +	struct rte_bus *bus;
> +	struct rte_device *dev;
> +	struct rte_eal_uevent uevent;
> +	int ret;
> +	int i;
> +
> +	for (i = 0; i < nfds; i++) {
> +		/**
> +		 * check device uevent from kernel side, no need to check
> +		 * uevent from udev.
> +		 */
> +		if ((dev_uev_receive(events[i].data.fd, &uevent)) ||
> +			(uevent.group == UEV_MONITOR_UDEV))
> +			return 0;
> +
> +		/* default handle all pci devcie when is being hot plug */
> +		if (uevent.subsystem == UEV_SUBSYSTEM_PCI) {
> +			bus = rte_bus_find_by_name("pci");
> +			dev = rte_bus_find_device(bus, uevent.devname);
> +			if (uevent.type == RTE_EAL_DEV_EVENT_REMOVE) {
> +
> +				if ((!dev) || dev->state ==
> DEVICE_UNDEFINED)
> +					return 0;
> +				dev->state = DEVICE_FAULT;
> +
> +				/**
> +				 * remap the resource to be fake
> +				 * before user's removal processing
> +				 */
> +				ret = bus->remap_device(dev);
> +				if (!ret)
> +
> 	return(_rte_dev_callback_process(dev,
> +					  RTE_EAL_DEV_EVENT_REMOVE,
> +					  NULL, NULL));

What is the reason for keeping this device in the EAL device list after the removal?
I suggest removing it (driver remove, bus remove and EAL remove) after the callbacks have run.
This way, EAL can initiate all device removals.

> +			} else if (uevent.type == RTE_EAL_DEV_EVENT_ADD)
> {
> +				if (dev == NULL) {
> +					/**
> +					 * bind the driver to the device
> +					 * before user's add processing
> +					 */
> +					bus->bind_driver(
> +						uevent.devname,
> +						"igb_uio");
> +

Similar comment here:
EAL can initiate all device probe operations by adding the device and probing it here, before the callbacks run.
Then the device pointer can also be passed to the callbacks.

> 	return(_rte_dev_callback_process(NULL,
> +					  RTE_EAL_DEV_EVENT_ADD,
> +					  uevent.devname, NULL));
> +				}
> +			}
> +		}
> +	}
> +	return 0;
> +}
> +
> +/**
> + * It builds/rebuilds up the epoll file descriptor with all the
> + * file descriptors being waited on. Then handles the interrupts.
> + *
> + * @param arg
> + *  pointer. (unused)
> + *
> + * @return
> + *  never return;
> + */
> +static __attribute__((noreturn)) void *
> +dev_uev_monitoring(__rte_unused void *arg)
> +{
> +	struct sigaction act;
> +	sigset_t mask;
> +	int netlink_fd;
> +	struct epoll_event ep_kernel;
> +	int fd_ep;
> +
> +	udev_exit = false;
> +
> +	/* set signal handlers */
> +	memset(&act, 0x00, sizeof(struct sigaction));
> +	act.sa_handler = sig_handler;
> +	sigemptyset(&act.sa_mask);
> +	act.sa_flags = SA_RESTART;
> +	sigaction(SIGINT, &act, NULL);
> +	sigaction(SIGTERM, &act, NULL);
> +	sigemptyset(&mask);
> +	sigaddset(&mask, SIGINT);
> +	sigaddset(&mask, SIGTERM);
> +	sigprocmask(SIG_UNBLOCK, &mask, NULL);
> +
> +	fd_ep = epoll_create1(EPOLL_CLOEXEC);
> +	if (fd_ep < 0) {
> +		RTE_LOG(ERR, EAL, "error creating epoll fd: %m\n");
> +		goto out;
> +	}
> +
> +	netlink_fd = dev_monitor_fd_new();
> +
> +	if (dev_monitor_enable(netlink_fd) < 0) {
> +		RTE_LOG(ERR, EAL, "error subscribing to kernel events\n");
> +		goto out;
> +	}
> +
> +	memset(&ep_kernel, 0, sizeof(struct epoll_event));
> +	ep_kernel.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
> +	ep_kernel.data.fd = netlink_fd;
> +	if (epoll_ctl(fd_ep, EPOLL_CTL_ADD, netlink_fd,
> +		&ep_kernel) < 0) {
> +		RTE_LOG(ERR, EAL, "error addding fd to epoll: %m\n");
> +		goto out;
> +	}
> +
> +	while (!udev_exit) {
> +		int fdcount;
> +		struct epoll_event ev[1];
> +
> +		fdcount = epoll_wait(fd_ep, ev, 1, -1);
> +		if (fdcount < 0) {
> +			if (errno != EINTR)
> +				RTE_LOG(ERR, EAL, "error receiving uevent "
> +					"message: %m\n");
> +				continue;
> +			}
> +
> +		/* epoll_wait has at least one fd ready to read */
> +		if (dev_uev_process(ev, fdcount) < 0) {
> +			if (errno != EINTR)
> +				RTE_LOG(ERR, EAL, "error processing uevent
> "
> +					"message: %m\n");
> +		}
> +	}
> +out:
> +	if (fd_ep >= 0)
> +		close(fd_ep);
> +	if (netlink_fd >= 0)
> +		close(netlink_fd);
> +	rte_panic("uev monitoring fail\n");
> +}
> +
> +int
> +rte_dev_monitor_start(void)
> +{

Maybe add an option to also start it via a new EAL command-line parameter?

> +	int ret;
> +
> +	if (!no_request_thread)
> +		return 0;
> +	no_request_thread = false;
> +
> +	/* create the host thread to wait/handle the uevent from kernel */
> +	ret = pthread_create(&uev_monitor_thread, NULL,
> +		dev_uev_monitoring, NULL);

What is the reason for creating a new thread for hotplug?
Why not use the existing DPDK host thread via the alarm mechanism?

> +	return ret;
> +}
> +
> +int
> +rte_dev_monitor_stop(void)
> +{
> +	udev_exit = true;
> +	no_request_thread = true;
> +	return 0;
> +}
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> new file mode 100644
> index 0000000..6a6feb5
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> @@ -0,0 +1,106 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef _RTE_DEV_H_
> +#error "don't include this file directly, please include generic <rte_dev.h>"
> +#endif
> +
> +#ifndef _RTE_LINUXAPP_DEV_H_
> +#define _RTE_LINUXAPP_DEV_H_
> +
> +#include <stdio.h>
> +
> +#include <rte_dev.h>
> +
> +#define RTE_EAL_UEV_MSG_LEN 4096
> +#define RTE_EAL_UEV_MSG_ELEM_LEN 128
> +
> +enum uev_subsystem {
> +	UEV_SUBSYSTEM_UIO,
> +	UEV_SUBSYSTEM_VFIO,
> +	UEV_SUBSYSTEM_PCI,
> +	UEV_SUBSYSTEM_MAX
> +};
> +
> +enum uev_monitor_netlink_group {
> +	UEV_MONITOR_KERNEL,
> +	UEV_MONITOR_UDEV,
> +};
> +
> +/**
> + * The device event type.
> + */
> +enum rte_eal_dev_event_type {
> +	RTE_EAL_DEV_EVENT_UNKNOWN,	/**< unknown event type */
> +	RTE_EAL_DEV_EVENT_ADD,		/**< device adding event */
> +	RTE_EAL_DEV_EVENT_REMOVE,
> +					/**< device removing event */
> +	RTE_EAL_DEV_EVENT_CHANGE,
> +					/**< device status change event */
> +	RTE_EAL_DEV_EVENT_MOVE,		/**< device sys path move
> event */
> +	RTE_EAL_DEV_EVENT_ONLINE,	/**< device online event */
> +	RTE_EAL_DEV_EVENT_OFFLINE,	/**< device offline event */
> +	RTE_EAL_DEV_EVENT_MAX		/**< max value of this enum
> */
> +};
> +
> +struct rte_eal_uevent {
> +	enum rte_eal_dev_event_type type;	/**< device event type */
> +	int subsystem;				/**< subsystem id */
> +	char *devname;				/**< device name */
> +	enum uev_monitor_netlink_group group;	/**< device netlink
> group */
> +};
> +
> +/**
> + * Start the device uevent monitoring.
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +int
> +rte_dev_monitor_start(void);
> +
> +/**
> + * Stop the device uevent monitoring .
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +
> +int
> +rte_dev_monitor_stop(void);
> +
> +#endif /* _RTE_LINUXAPP_DEV_H_ */
> diff --git a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> index a3a98c1..d0e07b4 100644
> --- a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> +++ b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> @@ -354,6 +354,12 @@ igbuio_pci_release(struct uio_info *info, struct
> inode *inode)
>  	struct rte_uio_pci_dev *udev = info->priv;
>  	struct pci_dev *dev = udev->pdev;
> 
> +	/* check if device have been remove before release */
> +	if ((&dev->dev.kobj)->state_remove_uevent_sent == 1) {
> +		pr_info("The device have been removed\n");
> +		return -1;
> +	}
> +
>  	/* disable interrupts */
>  	igbuio_pci_disable_interrupts(udev);
> 
> diff --git a/lib/librte_pci/rte_pci.c b/lib/librte_pci/rte_pci.c
> index 0160fc1..feb5fd7 100644
> --- a/lib/librte_pci/rte_pci.c
> +++ b/lib/librte_pci/rte_pci.c
> @@ -172,6 +172,26 @@ rte_pci_addr_parse(const char *str, struct
> rte_pci_addr *addr)
>  	return -1;
>  }
> 
> +/* map a private resource from an address*/
> +void *
> +pci_map_private_resource(void *requested_addr, off_t offset, size_t size)
> +{
> +	void *mapaddr;
> +
> +	mapaddr = mmap(requested_addr, size,
> +			   PROT_READ | PROT_WRITE,
> +			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
> -1, 0);
> +	if (mapaddr == MAP_FAILED) {
> +		RTE_LOG(ERR, EAL, "%s(): cannot mmap(%p, 0x%lx, 0x%lx): "
> +			"%s (%p)\n",
> +			__func__, requested_addr,
> +			(unsigned long)size, (unsigned long)offset,
> +			strerror(errno), mapaddr);
> +	} else
> +		RTE_LOG(DEBUG, EAL, "  PCI memory mapped at %p\n",
> mapaddr);
> +
> +	return mapaddr;
> +}
> 
>  /* map a particular resource from a file */
>  void *
> diff --git a/lib/librte_pci/rte_pci.h b/lib/librte_pci/rte_pci.h
> index 4f2cd18..f6091a6 100644
> --- a/lib/librte_pci/rte_pci.h
> +++ b/lib/librte_pci/rte_pci.h
> @@ -227,6 +227,23 @@ int rte_pci_addr_cmp(const struct rte_pci_addr
> *addr,
>  int rte_pci_addr_parse(const char *str, struct rte_pci_addr *addr);
> 
>  /**
> + * @internal
> + * Map to a particular private resource.
> + *
> + * @param requested_addr
> + *      The starting address for the new mapping range.
> + * @param offset
> + *      The offset for the mapping range.
> + * @param size
> + *      The size for the mapping range.
> + * @return
> + *   - On success, the function returns a pointer to the mapped area.
> + *   - On error, the value MAP_FAILED is returned.
> + */
> +void *pci_map_private_resource(void *requested_addr, off_t offset,
> +		size_t size);
> +
> +/**
>   * Map a particular resource from a file.
>   *
>   * @param requested_addr
> --
> 2.7.4



More information about the dev mailing list