[dpdk-dev] Segfault when eal thread executing mlx5 nic‘s lsc event

Zhaohui (zhaohui, Polestar) zhaohui8 at huawei.com
Wed Mar 6 04:05:52 CET 2019


Hi:
    I think the flow list may be accessed at the same time by two different threads, which may cause some errors. Does it need a lock to protect the flow list?

                                                                                                   Thanks
                                                                                                              Yunjian
(gdb) bt
#0 0x00007f54c9641237 in raise () from /usr/lib64/libc.so.6
#1 0x00007f54c9642928 in abort () from /usr/lib64/libc.so.6
#2 0x00000000006a8749 in PAT_abort ()
#3 0x00000000006a588d in patchIllInsHandler ()
#4 <signal handler called>
#5 0x00007f54c6acd2c8 in flow_list_destroy (dev=dev at entry=0xad8940 <rte_eth_devices+16512>, flow=0x1444b1b00, list=0x14455e618) at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2150
#6 0x00007f54c6acfe1b in mlx5_flow_list_flush (dev=0xad8940 <rte_eth_devices+16512>, list=0x14455e618) at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_flow.c:2170
#7 0x00007f54c6ac5cc4 in mlx5_traffic_disable (dev=<optimized out>) at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_trigger.c:384
#8 0x00007f54c6ac637d in mlx5_traffic_restart (dev=0xad8940 <rte_eth_devices+16512>) at /usr/src/debug/dpdk-mlx4-pmd-18.11/drivers/net/mlx5/mlx5_trigger.c:400
#9 0x00007f54d1db3bba in rte_eth_dev_default_mac_addr_set (port_id=<optimized out>, addr=0x140200f40) at /usr/src/debug/dpdk-18.11/lib/librte_ethdev/rte_ethdev.c:3230
#10 0x00007f54cd8dee81 in mac_address_slaves_update (bonded_eth_dev=bonded_eth_dev at entry=0xad48c0 <rte_eth_devices>) at /usr/src/debug/dpdk-18.11/drivers/net/bonding/rte_eth_bond_pmd.c:1842
#11 0x00007f54cd8e0c31 in bond_ethdev_lsc_event_callback (port_id=<optimized out>, type=<optimized out>, param=<optimized out>, ret_param=<optimized out>) at /usr/src/debug/dpdk-18.11/drivers/net/bonding/rte_eth_bond_pmd.c:3070
#12 0x00007f54cd8e117b in bond_ethdev_slave_lsc_delay (cb_arg=0xad48c0 <rte_eth_devices>) at /usr/src/debug/dpdk-18.11/drivers/net/bonding/rte_eth_bond_pmd.c:2298
#13 0x00007f54d25ebe5f in eal_alarm_callback (arg=<optimized out>) at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_alarm.c:90
#14 0x00007f54d25ea8aa in eal_intr_process_interrupts (nfds=<optimized out>, events=<optimized out>) at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:838
#15 eal_intr_handle_interrupts (totalfds=<optimized out>, pfd=21) at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:885
#16 eal_intr_thread_main (arg=<optimized out>) at /usr/src/debug/dpdk-18.11/lib/librte_eal/linuxapp/eal/eal_interrupts.c:965
#17 0x00007f54cade6dd5 in start_thread () from /usr/lib64/libpthread.so.0
#18 0x00007f54c970950d in clone () from /usr/lib64/libc.so.6


In order to solve this problem (core dump), the code was modified as follows (looking forward to your reply):

From: zhaohui8 <zhaohui8 at huawei.com>
---
 drivers/net/mlx5/mlx5.c         |  1 +
 drivers/net/mlx5/mlx5.h         |  1 +
 drivers/net/mlx5/mlx5_flow.c     |  33 ++++++++++++++++++++++++++-------
 drivers/net/mlx5/mlx5_trigger.c   |  12 +++++++++++-
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 9e5cab1..e8ae816 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1195,6 +1195,7 @@
 			priv->tcf_context = NULL;
 		}
 	}
+	rte_rwlock_init(&priv->flows_rwlock);
 	TAILQ_INIT(&priv->flows);
 	TAILQ_INIT(&priv->ctrl_flows);
 	/* Hint libmlx5 to use PMD allocator for data plane resources */
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index bc500b2..cb8657c 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -202,6 +202,7 @@ struct priv {
 	unsigned int (*reta_idx)[]; /* RETA index table. */
 	unsigned int reta_idx_n; /* RETA index size. */
 	struct mlx5_drop drop_queue; /* Flow drop queues. */
+	rte_rwlock_t flows_rwlock; /* flows Lock. */
 	struct mlx5_flows flows; /* RTE Flow rules. */
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
 	LIST_HEAD(counters, mlx5_flow_counter) flow_counters;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 97dc3e1..2c18602 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2121,9 +2121,13 @@ struct rte_flow *
 		 const struct rte_flow_action actions[],
 		 struct rte_flow_error *error)
 {
-	return flow_list_create(dev,
+	struct rte_flow *flow;
+	rte_rwlock_write_lock(&((struct priv *)dev->data->dev_private)->flows_rwlock);
+	flow = flow_list_create(dev,
 				&((struct priv *)dev->data->dev_private)->flows,
 				attr, items, actions, error);
+	rte_rwlock_write_unlock(&((struct priv *)dev->data->dev_private)->flows_rwlock);
+	return flow;
 }
 
 /**
@@ -2235,12 +2239,13 @@ struct rte_flow *
 	struct priv *priv = dev->data->dev_private;
 	struct rte_flow *flow;
 	int ret = 0;
-
+	rte_rwlock_read_lock(&priv->flows_rwlock);
 	TAILQ_FOREACH(flow, &priv->flows, next) {
 		DRV_LOG(DEBUG, "port %u flow %p still referenced",
 			dev->data->port_id, (void *)flow);
 		++ret;
 	}
+	rte_rwlock_read_unlock(&priv->flows_rwlock);
 	return ret;
 }
 
@@ -2320,10 +2325,14 @@ struct rte_flow *
 	}
 	for (i = 0; i != priv->reta_idx_n; ++i)
 		queue[i] = (*priv->reta_idx)[i];
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow = flow_list_create(dev, &priv->ctrl_flows,
 				&attr, items, actions, &error);
-	if (!flow)
+	if (!flow) {
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
 		return -rte_errno;
+	}
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }
 
@@ -2360,8 +2369,9 @@ struct rte_flow *
 		  struct rte_flow_error *error __rte_unused)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow_list_destroy(dev, &priv->flows, flow);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }
 
@@ -2376,8 +2386,9 @@ struct rte_flow *
 		struct rte_flow_error *error __rte_unused)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }
 
@@ -2729,17 +2740,22 @@ struct rte_flow *
 	ret = flow_fdir_filter_convert(dev, fdir_filter, fdir_flow);
 	if (ret)
 		goto error;
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow = flow_fdir_filter_lookup(dev, fdir_flow);
 	if (flow) {
 		rte_errno = EEXIST;
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
 		goto error;
 	}
 	flow = flow_list_create(dev, &priv->flows, &fdir_flow->attr,
 				fdir_flow->items, fdir_flow->actions, NULL);
-	if (!flow)
+	if (!flow) {
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
 		goto error;
+	}
 	assert(!flow->fdir);
 	flow->fdir = fdir_flow;
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	DRV_LOG(DEBUG, "port %u created FDIR flow %p",
 		dev->data->port_id, (void *)flow);
 	return 0;
@@ -2773,6 +2789,8 @@ struct rte_flow *
 	ret = flow_fdir_filter_convert(dev, fdir_filter, &fdir_flow);
 	if (ret)
 		return -rte_errno;
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	flow = flow_fdir_filter_lookup(dev, &fdir_flow);
 	if (!flow) {
 		rte_errno = ENOENT;
+		rte_rwlock_write_unlock(&priv->flows_rwlock);
@@ -2781,6 +2799,7 @@ struct rte_flow *
 	flow_list_destroy(dev, &priv->flows, flow);
 	DRV_LOG(DEBUG, "port %u deleted FDIR flow %p",
 		dev->data->port_id, (void *)flow);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	return 0;
 }
 
@@ -2817,8 +2835,9 @@ struct rte_flow *
 flow_fdir_filter_flush(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index e2a9bb7..b95c7cf 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -188,12 +188,15 @@
 			dev->data->port_id);
 		goto error;
 	}
+	rte_rwlock_read_lock(&priv->flows_rwlock);
 	ret = mlx5_flow_start(dev, &priv->flows);
 	if (ret) {
 		DRV_LOG(DEBUG, "port %u failed to set flows",
 			dev->data->port_id);
+		rte_rwlock_read_unlock(&priv->flows_rwlock);
 		goto error;
 	}
+	rte_rwlock_read_unlock(&priv->flows_rwlock);
 	dev->tx_pkt_burst = mlx5_select_tx_function(dev);
 	dev->rx_pkt_burst = mlx5_select_rx_function(dev);
 	mlx5_dev_interrupt_handler_install(dev);
@@ -202,7 +205,9 @@
 	ret = rte_errno; /* Save rte_errno before cleanup. */
 	/* Rollback. */
 	dev->data->dev_started = 0;
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_stop(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	mlx5_traffic_disable(dev);
 	mlx5_txq_stop(dev);
 	mlx5_rxq_stop(dev);
@@ -230,7 +235,9 @@
 	rte_wmb();
 	usleep(1000 * priv->rxqs_n);
 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_stop(dev, &priv->flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	mlx5_traffic_disable(dev);
 	mlx5_rx_intr_vec_disable(dev);
 	mlx5_dev_interrupt_handler_uninstall(dev);
@@ -364,7 +371,9 @@
 	return 0;
 error:
 	ret = rte_errno; /* Save rte_errno before cleanup. */
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->ctrl_flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 	rte_errno = ret; /* Restore rte_errno. */
 	return -rte_errno;
 }
@@ -380,8 +389,9 @@
 mlx5_traffic_disable(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-
+	rte_rwlock_write_lock(&priv->flows_rwlock);
 	mlx5_flow_list_flush(dev, &priv->ctrl_flows);
+	rte_rwlock_write_unlock(&priv->flows_rwlock);
 }
 

-----邮件原件-----
发件人: wangyunjian 
发送时间: 2019年2月22日 15:34
收件人: dev at dpdk.org; shahafs at mellanox.com; yskoh at mellanox.com
抄送: xudingke <xudingke at huawei.com>; Zhaohui (zhaohui, Polestar) <zhaohui8 at huawei.com>
主题: [dpdk-dev] Segfault when eal thread executing mlx5 nic‘s lsc event



More information about the dev mailing list