[dpdk-dev] [RFC 2/6] ethdev: add simple power management API

Anatoly Burakov anatoly.burakov at intel.com
Wed May 27 19:02:02 CEST 2020


Add a simple on/off switch that enables saving power when no packets
are arriving. It is based on counting the number of empty polls and,
when that number exceeds a certain threshold, entering an
architecture-defined optimized power state that waits until either a
TSC timestamp expires or packets arrive.

This API is limited to the 1 core, 1 queue use case, as there is no
coordination between queues/cores in ethdev.

The TSC timestamp is automatically calculated using current link
speed and RX descriptor ring size, such that the sleep time is
not longer than it would take for a NIC to fill its entire RX
descriptor ring.

Signed-off-by: Liang J. Ma <liang.j.ma at intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
---
 lib/librte_ethdev/rte_ethdev.c           | 39 +++++++++++++
 lib/librte_ethdev/rte_ethdev.h           | 70 ++++++++++++++++++++++++
 lib/librte_ethdev/rte_ethdev_core.h      | 41 +++++++++++++-
 lib/librte_ethdev/rte_ethdev_version.map |  4 ++
 4 files changed, 152 insertions(+), 2 deletions(-)

diff --git a/lib/librte_ethdev/rte_ethdev.c b/lib/librte_ethdev/rte_ethdev.c
index 8e10a6fc36..0be5ecfc11 100644
--- a/lib/librte_ethdev/rte_ethdev.c
+++ b/lib/librte_ethdev/rte_ethdev.c
@@ -16,6 +16,7 @@
 #include <netinet/in.h>
 
 #include <rte_byteorder.h>
+#include <rte_cpuflags.h>
 #include <rte_log.h>
 #include <rte_debug.h>
 #include <rte_interrupts.h>
@@ -5053,6 +5054,61 @@ rte_eth_dev_pool_ops_supported(uint16_t port_id, const char *pool)
 	return (*dev->dev_ops->pool_ops_supported)(dev, pool);
 }
 
+int
+rte_eth_dev_power_mgmt_enable(uint16_t port_id)
+{
+	struct rte_eth_dev *dev;
+	struct rte_eth_ep_stat *stats;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+	dev = &rte_eth_devices[port_id];
+
+	/* a negative return means the flag is unknown; treat as unsupported */
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG) <= 0)
+		return -ENOTSUP;
+
+	/*
+	 * rte_eth_rx_burst calls next_rx_desc without a NULL check once power
+	 * management is enabled, so refuse drivers that do not provide it.
+	 */
+	if (dev->dev_ops->next_rx_desc == NULL)
+		return -ENOTSUP;
+
+	/* already enabled: keep the live counters instead of leaking them */
+	if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_ENABLED)
+		return 0;
+
+	/*
+	 * allocate zeroed memory for empty poll stats: the RX path increments
+	 * the counters as-is, so they must not start with stale heap contents.
+	 */
+	stats = rte_zmalloc_socket(NULL,
+		sizeof(*stats) * RTE_MAX_QUEUES_PER_PORT,
+		0, dev->data->numa_node);
+	if (stats == NULL)
+		return -ENOMEM;
+
+	/* set the pointer before the state that makes the RX path use it */
+	dev->empty_poll_stats = stats;
+	dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_ENABLED;
+	return 0;
+}
+
+int
+rte_eth_dev_power_mgmt_disable(uint16_t port_id)
+{
+	struct rte_eth_dev *dev;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+	dev = &rte_eth_devices[port_id];
+
+	/* state first: rx_burst only uses the counters while enabled */
+	dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_DISABLED;
+	rte_free(dev->empty_poll_stats); /* NULL is ignored by rte_free */
+	dev->empty_poll_stats = NULL; /* makes a repeated disable harmless */
+	return 0;
+}
+
 /**
  * A set of values to describe the possible states of a switch domain.
  */
diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h
index a49242bcd2..b8318f7e91 100644
--- a/lib/librte_ethdev/rte_ethdev.h
+++ b/lib/librte_ethdev/rte_ethdev.h
@@ -157,6 +157,7 @@ extern "C" {
 #include <rte_common.h>
 #include <rte_config.h>
 #include <rte_ether.h>
+#include <rte_power_intrinsics.h>
 
 #include "rte_ethdev_trace_fp.h"
 #include "rte_dev_info.h"
@@ -666,6 +667,7 @@ rte_eth_rss_hf_refine(uint64_t rss_hf)
 /** Maximum nb. of vlan per mirror rule */
 #define ETH_MIRROR_MAX_VLANS       64
 
+#define ETH_EMPTYPOLL_MAX          512 /**< Empty poll number threshold */
 #define ETH_MIRROR_VIRTUAL_POOL_UP     0x01  /**< Virtual Pool uplink Mirroring. */
 #define ETH_MIRROR_UPLINK_PORT         0x02  /**< Uplink Port Mirroring. */
 #define ETH_MIRROR_DOWNLINK_PORT       0x04  /**< Downlink Port Mirroring. */
@@ -1490,6 +1492,16 @@ enum rte_eth_dev_state {
 	RTE_ETH_DEV_REMOVED,
 };
 
+/**
+ * Possible power management states of an ethdev port.
+ */
+enum rte_eth_dev_power_mgmt_state {
+	/** Device power management is disabled. */
+	RTE_ETH_DEV_POWER_MGMT_DISABLED = 0,
+	/** Device power management is enabled. */
+	RTE_ETH_DEV_POWER_MGMT_ENABLED
+};
+
 struct rte_eth_dev_sriov {
 	uint8_t active;               /**< SRIOV is active with 16, 32 or 64 pools */
 	uint8_t nb_q_per_pool;        /**< rx queue number per pool */
@@ -4302,6 +4314,38 @@ __rte_experimental
 int rte_eth_dev_hairpin_capability_get(uint16_t port_id,
 				       struct rte_eth_hairpin_cap *cap);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Enable empty-poll based power management on the RX path of a device.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ *
+ * @return
+ *   0 on success, -EINVAL if port_id is invalid, -ENOTSUP if unsupported
+ *   (no WAITPKG CPU flag), -ENOMEM if poll counters cannot be allocated.
+ */
+__rte_experimental
+int rte_eth_dev_power_mgmt_enable(uint16_t port_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Disable device power management and free its empty poll counters.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ *
+ * @return
+ *   0 on success
+ *   -EINVAL if port_id is invalid
+ */
+__rte_experimental
+int rte_eth_dev_power_mgmt_disable(uint16_t port_id);
+
 #include <rte_ethdev_core.h>
 
 /**
@@ -4417,6 +4461,32 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
 		} while (cb != NULL);
 	}
 #endif
+	if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_ENABLED) {
+		if (unlikely(nb_rx == 0)) {
+			dev->empty_poll_stats[queue_id].num++;
+			if (unlikely(dev->empty_poll_stats[queue_id].num >
+					ETH_EMPTYPOLL_MAX)) {
+				volatile void *target_addr;
+				uint64_t expected, mask;
+				int ret;
+
+				/*
+				 * get address of next descriptor in the RX
+				 * ring for this queue, as well as expected
+				 * value and a mask.
+				 */
+				ret = (*dev->dev_ops->next_rx_desc)
+					(dev->data->rx_queues[queue_id],
+					 &target_addr, &expected, &mask);
+				if (ret == 0)
+					/* -1ULL is maximum value for TSC */
+					rte_power_monitor(target_addr,
+							  expected, mask,
+							  0, -1ULL);
+			}
+		} else
+			dev->empty_poll_stats[queue_id].num = 0;
+	}
 
 	rte_ethdev_trace_rx_burst(port_id, queue_id, (void **)rx_pkts, nb_rx);
 	return nb_rx;
diff --git a/lib/librte_ethdev/rte_ethdev_core.h b/lib/librte_ethdev/rte_ethdev_core.h
index 32407dd418..4e23d465f0 100644
--- a/lib/librte_ethdev/rte_ethdev_core.h
+++ b/lib/librte_ethdev/rte_ethdev_core.h
@@ -603,6 +603,27 @@ typedef int (*eth_tx_hairpin_queue_setup_t)
 	 uint16_t nb_tx_desc,
 	 const struct rte_eth_hairpin_conf *hairpin_conf);
 
+/**
+ * @internal
+ * Get the address of the next RX ring descriptor of a queue, together with
+ * the expected value and mask that the caller hands to rte_power_monitor().
+ *
+ * @param rxq
+ *   ethdev queue pointer.
+ * @param tail_desc_addr
+ *   Output: pointer set to the address of the next RX descriptor.
+ * @param expected
+ *   Output: expected value for that descriptor.
+ * @param mask
+ *   Output: mask to go with the expected value.
+ *
+ * @return
+ *   0 on success, -EINVAL if the descriptor address cannot be obtained.
+ */
+typedef int (*eth_next_rx_desc_t)
+	(void *rxq, volatile void **tail_desc_addr,
+	 uint64_t *expected, uint64_t *mask);
+
 /**
  * @internal A structure containing the functions exported by an Ethernet driver.
  */
@@ -752,6 +773,8 @@ struct eth_dev_ops {
 	/**< Set up device RX hairpin queue. */
 	eth_tx_hairpin_queue_setup_t tx_hairpin_queue_setup;
 	/**< Set up device TX hairpin queue. */
+	eth_next_rx_desc_t next_rx_desc;
+	/**< Get next RX ring descriptor address. */
 };
 
 /**
@@ -768,6 +791,14 @@ struct rte_eth_rxtx_callback {
 	void *param;
 };
 
+/**
+ * @internal
+ * Empty poll counter of one RX queue, cache aligned (one entry per queue).
+ */
+struct rte_eth_ep_stat {
+	uint64_t num; /**< Empty polls in a row; zeroed by a non-empty poll */
+} __rte_cache_aligned;
+
 /**
  * @internal
  * The generic data structure associated with each ethernet device.
@@ -807,8 +838,14 @@ struct rte_eth_dev {
 	enum rte_eth_dev_state state; /**< Flag indicating the port state */
 	void *security_ctx; /**< Context for security ops */
 
-	uint64_t reserved_64s[4]; /**< Reserved for future fields */
-	void *reserved_ptrs[4];   /**< Reserved for future fields */
+	/** Flag indicating the port power management state */
+	enum rte_eth_dev_power_mgmt_state pwr_mgmt_state;
+	uint32_t reserved_32;
+	uint64_t reserved_64s[3]; /**< Reserved for future fields */
+
+	/** Empty poll counters, one entry per RX queue */
+	struct rte_eth_ep_stat *empty_poll_stats;
+	void *reserved_ptrs[3];   /**< Reserved for future fields */
 } __rte_cache_aligned;
 
 struct rte_eth_dev_sriov;
diff --git a/lib/librte_ethdev/rte_ethdev_version.map b/lib/librte_ethdev/rte_ethdev_version.map
index 7155056045..141361823d 100644
--- a/lib/librte_ethdev/rte_ethdev_version.map
+++ b/lib/librte_ethdev/rte_ethdev_version.map
@@ -241,4 +241,8 @@ EXPERIMENTAL {
 	__rte_ethdev_trace_rx_burst;
 	__rte_ethdev_trace_tx_burst;
 	rte_flow_get_aged_flows;
+
+	# added in 20.08
+	rte_eth_dev_power_mgmt_disable;
+	rte_eth_dev_power_mgmt_enable;
 };
-- 
2.17.1


More information about the dev mailing list