[dpdk-dev] [PATCH v3] net/mlx5: support device removal event

Adrien Mazarguil adrien.mazarguil at 6wind.com
Mon Sep 4 17:33:09 CEST 2017


Hi Matan,

One comment I have is that, while this patch adds support for RMV, it also
silently addresses a bug (see the large comment you added to
priv_link_status_update()).

This should be split into two commits: the fix part coming first, with
stable at dpdk.org in CC, and a second commit adding RMV support proper.

More below.

On Mon, Sep 04, 2017 at 04:55:53PM +0300, Matan Azrad wrote:
> Extend the LSC event handling to support the device removal as well.
> The Verbs library may send several related events, which are
> different from LSC event.
> 
> The mlx5 event handling has been made capable of receiving and
> signaling several event types at once.
> 
> This support includes next:
> 1. Removal event detection according to the user configuration.
> 2. Calling to all registered mlx5 removal callbacks.
> 3. Capabilities extension to include removal interrupt handling.
> 
> Signed-off-by: Matan Azrad <matan at mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c        |   2 +-
>  drivers/net/mlx5/mlx5_ethdev.c | 103 +++++++++++++++++++++++++++++------------
>  2 files changed, 74 insertions(+), 31 deletions(-)
> 
> Changes:
> V2:
> Replace link status update function name.
> add inconsistent link workaround comment.
> 
> V3:
> Fix indentations.
> Accurate inconsistent link comment.
> 
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index bd66a7c..1a3d7f1 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
>  	},
>  	.id_table = mlx5_pci_id_map,
>  	.probe = mlx5_pci_probe,
> -	.drv_flags = RTE_PCI_DRV_INTR_LSC,
> +	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
>  };
>  
>  /**
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
> index 57f6237..cdbd723 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -1112,47 +1112,84 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
>  }
>  
>  /**
> - * Link status handler.
> + * Update the link status.
>   *
>   * @param priv
>   *   Pointer to private structure.
> - * @param dev
> - *   Pointer to the rte_eth_dev structure.
>   *
>   * @return
> - *   Nonzero if the callback process can be called immediately.
> + *   Zero if the callback process can be called immediately.
>   */
>  static int
> -priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
> +priv_link_status_update(struct priv *priv)
> +{
> +	struct rte_eth_link *link = &priv->dev->data->dev_link;
> +
> +	mlx5_link_update(priv->dev, 0);
> +	if (((link->link_speed == 0) && link->link_status) ||
> +		((link->link_speed != 0) && !link->link_status)) {
> +		/*
> +		 * Inconsistent status.
> +		 * The link status is read from Ethtool through an IOCTL,
> +		 * but as the application may work in polling mode it
> +		 * may get the port event before the Kernel driver had
> +		 * time to process it. PMD then request the link from
> +		 * the kernel but the event is still not processed (due
> +		 * to more urgent interrupts) and finally the PMD may
> +		 * get an inconsistent link.
> +		 * Setting alarm for later checking.
> +		 */

While adding a comment is nice, there's too much info in there. From the PMD
standpoint, what happens is that the interrupt occurs well before the kernel
netdevice exposes the new status, so it needs to be checked again later. Can
you sum it up in fewer words?

> +		if (!priv->pending_alarm) {
> +			priv->pending_alarm = 1;
> +			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> +					  mlx5_dev_link_status_handler,
> +					  priv->dev);
> +		}
> +		return 1;
> +	} else if (unlikely(priv->pending_alarm)) {
> +		/* In case of link interrupt while link alarm was setting. */
> +		priv->pending_alarm = 0;
> +		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
> +	}
> +	return 0;
> +}
> +
> +/**
> + * Device status handler.
> + *
> + * @param priv
> + *   Pointer to private structure.
> + * @param events
> + *   Pointer to event flags holder.
> + *
> + * @return
> + *   Events bitmap of callback process which can be called immediately.
> + */
> +static uint32_t
> +priv_dev_status_handler(struct priv *priv)
>  {
>  	struct ibv_async_event event;
> -	struct rte_eth_link *link = &dev->data->dev_link;
> -	int ret = 0;
> +	uint32_t ret = 0;
>  
>  	/* Read all message and acknowledge them. */
>  	for (;;) {
>  		if (ibv_get_async_event(priv->ctx, &event))
>  			break;
> -
> -		if (event.event_type != IBV_EVENT_PORT_ACTIVE &&
> -		    event.event_type != IBV_EVENT_PORT_ERR)
> +		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
> +			event.event_type == IBV_EVENT_PORT_ERR) &&
> +			(priv->dev->data->dev_conf.intr_conf.lsc == 1))
> +			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
> +		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
> +			priv->dev->data->dev_conf.intr_conf.rmv == 1)
> +			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
> +		else
>  			DEBUG("event type %d on port %d not handled",
>  			      event.event_type, event.element.port_num);

What you also need to mention in the commit log of the fix is that splitting
the old handler into priv_dev_status_handler() and priv_link_status_update()
addresses another bug here: this loop consumed *all* events, even during
alarms. An alarm occurring for an LSC event could eat an RMV event that the
application would then never receive. This also affects mlx4, for which I
intend to submit a fix soon.

>  		ibv_ack_async_event(&event);
>  	}
> -	mlx5_link_update(dev, 0);
> -	if (((link->link_speed == 0) && link->link_status) ||
> -	    ((link->link_speed != 0) && !link->link_status)) {
> -		if (!priv->pending_alarm) {
> -			/* Inconsistent status, check again later. */
> -			priv->pending_alarm = 1;
> -			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> -					  mlx5_dev_link_status_handler,
> -					  dev);
> -		}
> -	} else {
> -		ret = 1;
> -	}
> +	if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
> +		if (priv_link_status_update(priv))
> +			ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
>  	return ret;
>  }
>  
> @@ -1172,9 +1209,9 @@ mlx5_dev_link_status_handler(void *arg)
>  	priv_lock(priv);
>  	assert(priv->pending_alarm == 1);
>  	priv->pending_alarm = 0;
> -	ret = priv_dev_link_status_handler(priv, dev);
> +	ret = priv_link_status_update(priv);
>  	priv_unlock(priv);
> -	if (ret)
> +	if (!ret)
>  		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
>  					      NULL);
>  }
> @@ -1192,14 +1229,17 @@ mlx5_dev_interrupt_handler(void *cb_arg)
>  {
>  	struct rte_eth_dev *dev = cb_arg;
>  	struct priv *priv = dev->data->dev_private;
> -	int ret;
> +	uint32_t events;
>  
>  	priv_lock(priv);
> -	ret = priv_dev_link_status_handler(priv, dev);
> +	events = priv_dev_status_handler(priv);
>  	priv_unlock(priv);
> -	if (ret)
> +	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
>  		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
>  					      NULL);
> +	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
> +		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
> +					      NULL);
>  }
>  
>  /**
> @@ -1213,7 +1253,8 @@ mlx5_dev_interrupt_handler(void *cb_arg)
>  void
>  priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
>  {
> -	if (!dev->data->dev_conf.intr_conf.lsc)
> +	if (!dev->data->dev_conf.intr_conf.lsc &&
> +		!dev->data->dev_conf.intr_conf.rmv)
>  		return;
>  	rte_intr_callback_unregister(&priv->intr_handle,
>  				     mlx5_dev_interrupt_handler,
> @@ -1238,7 +1279,8 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
>  {
>  	int rc, flags;
>  
> -	if (!dev->data->dev_conf.intr_conf.lsc)
> +	if (!dev->data->dev_conf.intr_conf.lsc &&
> +		!dev->data->dev_conf.intr_conf.rmv)
>  		return;
>  	assert(priv->ctx->async_fd > 0);
>  	flags = fcntl(priv->ctx->async_fd, F_GETFL);
> @@ -1246,6 +1288,7 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
>  	if (rc < 0) {
>  		INFO("failed to change file descriptor async event queue");
>  		dev->data->dev_conf.intr_conf.lsc = 0;
> +		dev->data->dev_conf.intr_conf.rmv = 0;
>  	} else {
>  		priv->intr_handle.fd = priv->ctx->async_fd;
>  		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND


More information about the dev mailing list