[dpdk-dev] Segfault when eal thread executing mlx5 nic‘s lsc event

Shahaf Shuler shahafs at mellanox.com
Wed Mar 6 07:02:51 CET 2019


Hi Yunjian,

Wednesday, March 6, 2019 5:06 AM, Zhaohui (zhaohui, Polestar):
> Subject: [dpdk-dev] Segfault when eal thread executing mlx5 nic‘s lsc event
> 
> Hi:
>     I think the flow list may be accessed in the same time by two different
> threads and may cause some errors. Do it need a lock to protect the flow list?

In general it is the application responsibility to synchronize the control between the different threads. The PMD does not support multi-thread control what so ever. 

On this specific case, when working on top of bond, the bond reacts to the LSC event while, I assume, your application is inserting/deleting rules using rte_flow APIs. 

The right fix should be for the bond PMD to mutex before calling rte_eth_dev_default_mac_addr_set. 

> 
>                                                                                                    Thanks
>                                                                                                               Yunjian
> (gdb) bt
> #0 0x00007f54c9641237 in raise () from /usr/lib64/libc.so.6
> #1 0x00007f54c9642928 in abort () from /usr/lib64/libc.so.6
> #2 0x00000000006a8749 in PAT_abort ()
> #3 0x00000000006a588d in patchIllInsHandler ()
> #4 <signal handler called>
> #5 0x00007f54c6acd2c8 in flow_list_destroy (dev=dev at entry=0xad8940
> <rte_eth_devices+16512>, flow=0x1444b1b00, list=0x14455e618) at
> /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2150
> #6 0x00007f54c6acfe1b in mlx5_flow_list_flush (dev=0xad8940
> <rte_eth_devices+16512>, list=0x14455e618) at /usr/src/debug/dpdk-mlx4-
> pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2170
> #7 0x00007f54c6ac5cc4 in mlx5_traffic_disable (dev=<optimized out>) at
> /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_trigger.c:384
> #8 0x00007f54c6ac637d in mlx5_traffic_restart (dev=0xad8940
> <rte_eth_devices+16512>) at /usr/src/debug/dpdk-mlx4-pmd-
> 18.11/drivers/net/mlx5/mlx5_trigger.c:400
> #9 0x00007f54d1db3bba in rte_eth_dev_default_mac_addr_set
> (port_id=<optimized out>, addr=0x140200f40) at /usr/src/debug/dpdk-
> 18.11/lib/librte_ethdev/rte_ethdev.c:3230
> #10 0x00007f54cd8dee81 in mac_address_slaves_update
> (bonded_eth_dev=bonded_eth_dev at entry=0xad48c0 <rte_eth_devices>)
> at /usr/src/debug/dpdk-
> 18.11/drivers/net/bonding/rte_eth_bond_pmd.c:1842
> #11 0x00007f54cd8e0c31 in bond_ethdev_lsc_event_callback
> (port_id=<optimized out>, type=<optimized out>, param=<optimized out>,
> ret_param=<optimized out>) at /usr/src/debug/dpdk-
> 18.11/drivers/net/bonding/rte_eth_bond_pmd.c:3070
> #12 0x00007f54cd8e117b in bond_ethdev_slave_lsc_delay (cb_arg=0xad48c0
> <rte_eth_devices>) at /usr/src/debug/dpdk-
> 18.11/drivers/net/bonding/rte_eth_bond_pmd.c:2298
> #13 0x00007f54d25ebe5f in eal_alarm_callback (arg=<optimized out>) at
> /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_alarm.c:90
> #14 0x00007f54d25ea8aa in eal_intr_process_interrupts (nfds=<optimized
> out>, events=<optimized out>) at /usr/src/debug/dpdk-
> 18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:838
> #15 eal_intr_handle_interrupts (totalfds=<optimized out>, pfd=21) at
> /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:885
> #16 eal_intr_thread_main (arg=<optimized out>) at /usr/src/debug/dpdk-
> 18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:965
> #17 0x00007f54cade6dd5 in start_thread () from /usr/lib64/libpthread.so.0
> #18 0x00007f54c970950d in clone () from /usr/lib64/libc.so.6
> 
> 
> In order to solve this problem(core dump). Something modified like this:(
> Looking forward to your reply)
> 
> From: zhaohui8 <zhaohui8 at huawei.com>
> ---
>  drivers/net/mlx5/mlx5.c         |  1 +
>  drivers/net/mlx5/mlx5.h         |  1 +
>  drivers/net/mlx5/mlx5_flow.c     |  33 ++++++++++++++++++++++++++------
> -
>  drivers/net/mlx5/mlx5_trigger.c   |  12 +++++++++++-
>  4 files changed, 39 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 9e5cab1..e8ae816 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -1195,6 +1195,7 @@
>  			priv->tcf_context = NULL;
>  		}
>  	}
> +	rte_rwlock_init(&priv->flows_rwlock);
>  	TAILQ_INIT(&priv->flows);
>  	TAILQ_INIT(&priv->ctrl_flows);
>  	/* Hint libmlx5 to use PMD allocator for data plane resources */ diff --
> git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> bc500b2..cb8657c 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -202,6 +202,7 @@ struct priv {
>  	unsigned int (*reta_idx)[]; /* RETA index table. */
>  	unsigned int reta_idx_n; /* RETA index size. */
>  	struct mlx5_drop drop_queue; /* Flow drop queues. */
> +	rte_rwlock_t flows_rwlock; /* flows Lock. */
>  	struct mlx5_flows flows; /* RTE Flow rules. */
>  	struct mlx5_flows ctrl_flows; /* Control flow rules. */
>  	LIST_HEAD(counters, mlx5_flow_counter) flow_counters; diff --git
> a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index
> 97dc3e1..2c18602 100644
> --- a/drivers/net/mlx5/mlx5_flow.c
> +++ b/drivers/net/mlx5/mlx5_flow.c
> @@ -2121,9 +2121,13 @@ struct rte_flow *
>  		 const struct rte_flow_action actions[],
>  		 struct rte_flow_error *error)
>  {
> -	return flow_list_create(dev,
> +	struct rte_flow * flow;
> +	rte_rwlock_write_lock(&((struct priv *)dev->data->dev_private)-
> >flows_rwlock);
> +	flow = flow_list_create(dev,
>  				&((struct priv *)dev->data->dev_private)-
> >flows,
>  				attr, items, actions, error);
> +	rte_rwlock_write_unlock(&((struct priv *)dev->data->dev_private)-
> >flows_rwlock);
> +	return flow;
>  }
> 
>  /**
> @@ -2235,12 +2239,13 @@ struct rte_flow *
>  	struct priv *priv = dev->data->dev_private;
>  	struct rte_flow *flow;
>  	int ret = 0;
> -
> +	rte_rwlock_read_lock(&priv->flows_rwlock);
>  	TAILQ_FOREACH(flow, &priv->flows, next) {
>  		DRV_LOG(DEBUG, "port %u flow %p still referenced",
>  			dev->data->port_id, (void *)flow);
>  		++ret;
>  	}
> +	rte_rwlock_read_unlock(&priv->flows_rwlock);
>  	return ret;
>  }
> 
> @@ -2320,10 +2325,14 @@ struct rte_flow *
>  	}
>  	for (i = 0; i != priv->reta_idx_n; ++i)
>  		queue[i] = (*priv->reta_idx)[i];
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	flow = flow_list_create(dev, &priv->ctrl_flows,
>  				&attr, items, actions, &error);
> -	if (!flow)
> +	if (!flow) {
> +		rte_rwlock_write_unlock(&priv->flows_rwlock);
>  		return -rte_errno;
> +	}
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  	return 0;
>  }
> 
> @@ -2360,8 +2369,9 @@ struct rte_flow *
>  		  struct rte_flow_error *error __rte_unused)  {
>  	struct priv *priv = dev->data->dev_private;
> -
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	flow_list_destroy(dev, &priv->flows, flow);
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  	return 0;
>  }
> 
> @@ -2376,8 +2386,9 @@ struct rte_flow *
>  		struct rte_flow_error *error __rte_unused)  {
>  	struct priv *priv = dev->data->dev_private;
> -
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	mlx5_flow_list_flush(dev, &priv->flows);
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  	return 0;
>  }
> 
> @@ -2729,17 +2740,22 @@ struct rte_flow *
>  	ret = flow_fdir_filter_convert(dev, fdir_filter, fdir_flow);
>  	if (ret)
>  		goto error;
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	flow = flow_fdir_filter_lookup(dev, fdir_flow);
>  	if (flow) {
>  		rte_errno = EEXIST;
> +		rte_rwlock_write_unlock(&priv->flows_rwlock);
>  		goto error;
>  	}
>  	flow = flow_list_create(dev, &priv->flows, &fdir_flow->attr,
>  				fdir_flow->items, fdir_flow->actions, NULL);
> -	if (!flow)
> +	if (!flow) {
> +		rte_rwlock_write_unlock(&priv->flows_rwlock);
>  		goto error;
> +	}
>  	assert(!flow->fdir);
>  	flow->fdir = fdir_flow;
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  	DRV_LOG(DEBUG, "port %u created FDIR flow %p",
>  		dev->data->port_id, (void *)flow);
>  	return 0;
> @@ -2773,6 +2789,7 @@ struct rte_flow *
>  	ret = flow_fdir_filter_convert(dev, fdir_filter, &fdir_flow);
>  	if (ret)
>  		return -rte_errno;
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	flow = flow_fdir_filter_lookup(dev, &fdir_flow);
>  	if (!flow) {
>  		rte_errno = ENOENT;
> @@ -2781,6 +2798,7 @@ struct rte_flow *
>  	flow_list_destroy(dev, &priv->flows, flow);
>  	DRV_LOG(DEBUG, "port %u deleted FDIR flow %p",
>  		dev->data->port_id, (void *)flow);
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  	return 0;
>  }
> 
> @@ -2817,8 +2835,9 @@ struct rte_flow *
>  flow_fdir_filter_flush(struct rte_eth_dev *dev)  {
>  	struct priv *priv = dev->data->dev_private;
> -
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	mlx5_flow_list_flush(dev, &priv->flows);
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  }
> 
>  /**
> diff --git a/drivers/net/mlx5/mlx5_trigger.c
> b/drivers/net/mlx5/mlx5_trigger.c index e2a9bb7..b95c7cf 100644
> --- a/drivers/net/mlx5/mlx5_trigger.c
> +++ b/drivers/net/mlx5/mlx5_trigger.c
> @@ -188,12 +188,15 @@
>  			dev->data->port_id);
>  		goto error;
>  	}
> +	rte_rwlock_read_lock(&priv->flows_rwlock);
>  	ret = mlx5_flow_start(dev, &priv->flows);
>  	if (ret) {
>  		DRV_LOG(DEBUG, "port %u failed to set flows",
>  			dev->data->port_id);
> +		rte_rwlock_read_unlock(&priv->flows_rwlock);
>  		goto error;
>  	}
> +	rte_rwlock_read_unlock(&priv->flows_rwlock);
>  	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
>  	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
>  	mlx5_dev_interrupt_handler_install(dev);
> @@ -202,7 +205,9 @@
>  	ret = rte_errno; /* Save rte_errno before cleanup. */
>  	/* Rollback. */
>  	dev->data->dev_started = 0;
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	mlx5_flow_stop(dev, &priv->flows);
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  	mlx5_traffic_disable(dev);
>  	mlx5_txq_stop(dev);
>  	mlx5_rxq_stop(dev);
> @@ -230,7 +235,9 @@
>  	rte_wmb();
>  	usleep(1000 * priv->rxqs_n);
>  	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	mlx5_flow_stop(dev, &priv->flows);
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  	mlx5_traffic_disable(dev);
>  	mlx5_rx_intr_vec_disable(dev);
>  	mlx5_dev_interrupt_handler_uninstall(dev);
> @@ -364,7 +371,9 @@
>  	return 0;
>  error:
>  	ret = rte_errno; /* Save rte_errno before cleanup. */
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	mlx5_flow_list_flush(dev, &priv->ctrl_flows);
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  	rte_errno = ret; /* Restore rte_errno. */
>  	return -rte_errno;
>  }
> @@ -380,8 +389,9 @@
>  mlx5_traffic_disable(struct rte_eth_dev *dev)  {
>  	struct priv *priv = dev->data->dev_private;
> -
> +	rte_rwlock_write_lock(&priv->flows_rwlock);
>  	mlx5_flow_list_flush(dev, &priv->ctrl_flows);
> +	rte_rwlock_write_unlock(&priv->flows_rwlock);
>  }
> 
> 
> -----邮件原件-----
> 发件人: wangyunjian
> 发送时间: 2019年2月22日 15:34
> 收件人: dev at dpdk.org; shahafs at mellanox.com; yskoh at mellanox.com
> 抄送: xudingke <xudingke at huawei.com>; Zhaohui (zhaohui, Polestar)
> <zhaohui8 at huawei.com>
> 主题: [dpdk-dev] Segfault when eal thread executing mlx5 nic‘s lsc event



More information about the dev mailing list