[dpdk-dev] [RFC v2 2/5] ethdev: add simple power management API and callback

Liang Ma liang.j.ma at intel.com
Tue Aug 11 12:27:43 CEST 2020


Add a simple on/off switch that enables power saving when no packets
are arriving. It counts the number of consecutive empty polls and,
when that number exceeds a threshold, enters an architecture-defined
optimized power state that waits until either a TSC timestamp expires
or packets arrive.

This API is limited to 1 core 1 queue use case as there is no
coordination between queues/cores in ethdev.

This design leverages the RX callback mechanism, which allows three
different power management methods to coexist.

1. umwait/umonitor:

   The TSC timestamp is automatically calculated using current
   link speed and RX descriptor ring size, such that the sleep
   time is not longer than it would take for a NIC to fill its
   entire RX descriptor ring.

2. Pause instruction

   Instead of moving the core into a deeper C state, this lightweight
   method uses the Pause instruction to relieve the processor from
   busy polling.

3. Frequency Scaling
   Reuse the existing rte_power library to scale the core frequency
   up or down depending on traffic volume.

Signed-off-by: Liang Ma <liang.j.ma at intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
---
 config/common_base                       |   4 +-
 lib/Makefile                             |   1 +
 lib/librte_ethdev/Makefile               |   2 +-
 lib/librte_ethdev/meson.build            |   2 +-
 lib/librte_ethdev/rte_ethdev.c           | 198 +++++++++++++++++++++++
 lib/librte_ethdev/rte_ethdev.h           |  59 +++++++
 lib/librte_ethdev/rte_ethdev_core.h      |  43 ++++-
 lib/librte_ethdev/rte_ethdev_version.map |   4 +
 lib/meson.build                          |   5 +-
 mk/rte.app.mk                            |   2 +-
 10 files changed, 311 insertions(+), 9 deletions(-)

diff --git a/config/common_base b/config/common_base
index f76585f16..e0948f0cb 100644
--- a/config/common_base
+++ b/config/common_base
@@ -155,7 +155,7 @@ CONFIG_RTE_MAX_ETHPORTS=32
 CONFIG_RTE_MAX_QUEUES_PER_PORT=1024
 CONFIG_RTE_LIBRTE_IEEE1588=n
 CONFIG_RTE_ETHDEV_QUEUE_STAT_CNTRS=16
-CONFIG_RTE_ETHDEV_RXTX_CALLBACKS=y
+CONFIG_RTE_ETHDEV_RXTX_CALLBACKS=n
 CONFIG_RTE_ETHDEV_PROFILE_WITH_VTUNE=n
 
 #
@@ -978,7 +978,7 @@ CONFIG_RTE_LIBRTE_ACL_DEBUG=n
 #
 # Compile librte_power
 #
-CONFIG_RTE_LIBRTE_POWER=n
+CONFIG_RTE_LIBRTE_POWER=y
 CONFIG_RTE_LIBRTE_POWER_DEBUG=n
 CONFIG_RTE_MAX_LCORE_FREQS=64
 
diff --git a/lib/Makefile b/lib/Makefile
index 8f5b68a2d..87646698a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -28,6 +28,7 @@ DEPDIRS-librte_ethdev := librte_net librte_eal librte_mempool librte_ring
 DEPDIRS-librte_ethdev += librte_mbuf
 DEPDIRS-librte_ethdev += librte_kvargs
 DEPDIRS-librte_ethdev += librte_meter
+DEPDIRS-librte_ethdev += librte_power
 DIRS-$(CONFIG_RTE_LIBRTE_BBDEV) += librte_bbdev
 DEPDIRS-librte_bbdev := librte_eal librte_mempool librte_mbuf
 DIRS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += librte_cryptodev
diff --git a/lib/librte_ethdev/Makefile b/lib/librte_ethdev/Makefile
index 47747150b..6a4ce14cf 100644
--- a/lib/librte_ethdev/Makefile
+++ b/lib/librte_ethdev/Makefile
@@ -11,7 +11,7 @@ LIB = librte_ethdev.a
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_net -lrte_eal -lrte_mempool -lrte_ring
-LDLIBS += -lrte_mbuf -lrte_kvargs -lrte_meter -lrte_telemetry
+LDLIBS += -lrte_mbuf -lrte_kvargs -lrte_meter -lrte_telemetry -lrte_power
 
 EXPORT_MAP := rte_ethdev_version.map
 
diff --git a/lib/librte_ethdev/meson.build b/lib/librte_ethdev/meson.build
index 8fc24e8c8..e09e2395e 100644
--- a/lib/librte_ethdev/meson.build
+++ b/lib/librte_ethdev/meson.build
@@ -27,4 +27,4 @@ headers = files('rte_ethdev.h',
 	'rte_tm.h',
 	'rte_tm_driver.h')
 
-deps += ['net', 'kvargs', 'meter', 'telemetry']
+deps += ['net', 'kvargs', 'meter', 'telemetry', 'power']
diff --git a/lib/librte_ethdev/rte_ethdev.c b/lib/librte_ethdev/rte_ethdev.c
index 7858ad5f1..b43de88ce 100644
--- a/lib/librte_ethdev/rte_ethdev.c
+++ b/lib/librte_ethdev/rte_ethdev.c
@@ -16,6 +16,7 @@
 #include <netinet/in.h>
 
 #include <rte_byteorder.h>
+#include <rte_cpuflags.h>
 #include <rte_log.h>
 #include <rte_debug.h>
 #include <rte_interrupts.h>
@@ -39,6 +40,7 @@
 #include <rte_class.h>
 #include <rte_ether.h>
 #include <rte_telemetry.h>
+#include <rte_power.h>
 
 #include "rte_ethdev_trace.h"
 #include "rte_ethdev.h"
@@ -185,6 +187,100 @@ enum {
 	STAT_QMAP_RX
 };
 
+
+/*
+ * RX callback: once more than ETH_EMPTYPOLL_MAX consecutive polls were empty,
+ * wait in rte_power_monitor() on the address given by the next_rx_desc op.
+ * Returns nb_rx unchanged, because the value an RX callback returns is the
+ * packet count reported to the application; returning 0 would hide packets.
+ */
+static uint16_t
+rte_ethdev_pmgmt_umait(uint16_t port_id, uint16_t qidx,
+		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+		uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+	volatile void *target_addr;
+	uint64_t expected, mask;
+
+	if (dev->pwr_mgmt_state != RTE_ETH_DEV_POWER_MGMT_ENABLED)
+		return nb_rx;
+
+	if (nb_rx != 0) {
+		/* traffic seen: restart the empty-poll streak */
+		dev->empty_poll_stats[qidx].num = 0;
+		return nb_rx;
+	}
+
+	dev->empty_poll_stats[qidx].num++;
+	if (dev->empty_poll_stats[qidx].num <= ETH_EMPTYPOLL_MAX)
+		return nb_rx;
+
+	/* drivers without a next_rx_desc op are skipped, not called via NULL */
+	if (dev->dev_ops->next_rx_desc != NULL &&
+	    (*dev->dev_ops->next_rx_desc)(dev->data->rx_queues[qidx],
+			&target_addr, &expected, &mask) == 0)
+		/* -1ULL is maximum value for TSC */
+		rte_power_monitor(target_addr, expected, mask, 0, -1ULL);
+
+	return nb_rx;
+}
+
+/* RX callback: call rte_pause() RTE_ETH_PAUSE_NUM times once the empty-poll
+ * streak exceeds ETH_EMPTYPOLL_MAX; returns nb_rx, the packet count the
+ * application receives (returning 0 would hide every received packet). */
+static uint16_t
+rte_ethdev_pmgmt_pause(uint16_t port_id, uint16_t qidx,
+		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+		uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+	int i;
+
+	if (dev->pwr_mgmt_state != RTE_ETH_DEV_POWER_MGMT_ENABLED)
+		return nb_rx;
+
+	if (nb_rx != 0) {
+		/* traffic seen: restart the empty-poll streak */
+		dev->empty_poll_stats[qidx].num = 0;
+		return nb_rx;
+	}
+
+	dev->empty_poll_stats[qidx].num++;
+	if (unlikely(dev->empty_poll_stats[qidx].num > ETH_EMPTYPOLL_MAX))
+		for (i = 0; i < RTE_ETH_PAUSE_NUM; i++)
+			rte_pause();
+
+	return nb_rx;
+}
+
+/* RX callback: call rte_power_freq_min() for the current lcore once the
+ * empty-poll streak exceeds ETH_EMPTYPOLL_MAX and rte_power_freq_max() on any
+ * non-empty poll; returns nb_rx, the packet count the application receives. */
+static uint16_t
+rte_ethdev_pmgmt_scalefreq(uint16_t port_id, uint16_t qidx,
+		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+		uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+
+	if (dev->pwr_mgmt_state != RTE_ETH_DEV_POWER_MGMT_ENABLED)
+		return nb_rx;
+
+	if (nb_rx != 0) {
+		dev->empty_poll_stats[qidx].num = 0;
+		/* scale up freq */
+		rte_power_freq_max(rte_lcore_id());
+		return nb_rx;
+	}
+
+	dev->empty_poll_stats[qidx].num++;
+	if (unlikely(dev->empty_poll_stats[qidx].num > ETH_EMPTYPOLL_MAX))
+		rte_power_freq_min(rte_lcore_id()); /* scale down freq */
+
+	return nb_rx;
+}
+
 int
 rte_eth_iterator_init(struct rte_dev_iterator *iter, const char *devargs_str)
 {
@@ -5113,6 +5209,108 @@ rte_eth_dev_pool_ops_supported(uint16_t port_id, const char *pool)
 	return (*dev->dev_ops->pool_ops_supported)(dev, pool);
 }
 
+/* Install the power-management RX callback for @p mode on queue 0 of the port;
+ * counters are allocated zeroed, and every failure path frees what it acquired
+ * and leaves the port disabled. Returns 0, or a negative errno value. */
+int
+rte_eth_dev_power_mgmt_enable(unsigned int lcore_id, uint16_t port_id,
+			      enum rte_eth_dev_power_mgmt_cb_mode mode)
+{
+	struct rte_eth_dev *dev;
+	rte_rx_callback_fn cb_fn;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+	dev = &rte_eth_devices[port_id];
+
+	/* must precede the allocation, else a re-enable leaks the live stats */
+	if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_ENABLED)
+		return -EINVAL;
+
+	switch (mode) {
+	case RTE_ETH_DEV_POWER_MGMT_CB_UMWAIT:
+		if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG))
+			return -ENOTSUP;
+		cb_fn = rte_ethdev_pmgmt_umait;
+		break;
+	case RTE_ETH_DEV_POWER_MGMT_CB_SCALE:
+		cb_fn = rte_ethdev_pmgmt_scalefreq;
+		break;
+	case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE:
+		cb_fn = rte_ethdev_pmgmt_pause;
+		break;
+	default: /* unknown mode: never enable without a callback */
+		return -EINVAL;
+	}
+
+	dev->empty_poll_stats = rte_zmalloc_socket(NULL,
+			sizeof(struct rte_eth_ep_stat) * RTE_MAX_QUEUES_PER_PORT,
+			0, dev->data->numa_node);
+	if (dev->empty_poll_stats == NULL)
+		return -ENOMEM;
+
+	if (mode == RTE_ETH_DEV_POWER_MGMT_CB_SCALE && rte_power_init(lcore_id))
+		goto err_free;
+	dev->cb_mode = mode;
+	dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0, cb_fn, NULL);
+	if (dev->cur_pwr_cb == NULL) {
+		if (mode == RTE_ETH_DEV_POWER_MGMT_CB_SCALE)
+			rte_power_exit(lcore_id);
+		goto err_free;
+	}
+	dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_ENABLED;
+	return 0;
+
+err_free:
+	rte_free(dev->empty_poll_stats);
+	dev->empty_poll_stats = NULL;
+	return -EINVAL;
+}
+
+/*
+ * Undo rte_eth_dev_power_mgmt_enable() for a port.
+ *
+ * The RX callback is detached from queue 0 and the port is marked disabled
+ * before the empty-poll counters are freed, so later RX bursts cannot run
+ * the callback against freed memory. The port always ends up disabled, even
+ * when rte_power_exit() fails, so a repeated call cannot free the counters
+ * twice. NOTE(review): a burst already executing the callback on another
+ * lcore is not waited for here - confirm callers quiesce RX first.
+ *
+ * Returns 0 on success or when already disabled, -EINVAL on failure.
+ */
+int
+rte_eth_dev_power_mgmt_disable(unsigned int lcore_id,
+			       uint16_t port_id)
+{
+	struct rte_eth_dev *dev;
+	int ret = 0;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+	dev = &rte_eth_devices[port_id];
+
+	/* disabling a port that is not enabled is a successful no-op */
+	if (dev->pwr_mgmt_state != RTE_ETH_DEV_POWER_MGMT_ENABLED)
+		return 0;
+
+	/* detach first: the callback dereferences empty_poll_stats */
+	dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_DISABLED;
+	rte_eth_remove_rx_callback(port_id, 0, dev->cur_pwr_cb);
+	dev->cur_pwr_cb = NULL;
+
+	if (dev->cb_mode == RTE_ETH_DEV_POWER_MGMT_CB_SCALE) {
+		/* restore full frequency before releasing the power library */
+		rte_power_freq_max(lcore_id);
+		if (rte_power_exit(lcore_id))
+			ret = -EINVAL;
+	}
+
+	/* rte_free ignores NULL so safe to call without checks */
+	rte_free(dev->empty_poll_stats);
+	dev->empty_poll_stats = NULL;
+
+	return ret;
+}
+
 /**
  * A set of values to describe the possible states of a switch domain.
  */
diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h
index 57e4a6ca5..6858c0338 100644
--- a/lib/librte_ethdev/rte_ethdev.h
+++ b/lib/librte_ethdev/rte_ethdev.h
@@ -157,6 +157,7 @@ extern "C" {
 #include <rte_common.h>
 #include <rte_config.h>
 #include <rte_ether.h>
+#include <rte_power_intrinsics.h>
 
 #include "rte_ethdev_trace_fp.h"
 #include "rte_dev_info.h"
@@ -775,6 +776,7 @@ rte_eth_rss_hf_refine(uint64_t rss_hf)
 /** Maximum nb. of vlan per mirror rule */
 #define ETH_MIRROR_MAX_VLANS       64
 
+#define ETH_EMPTYPOLL_MAX          512 /**< Empty poll number threshold */
 #define ETH_MIRROR_VIRTUAL_POOL_UP     0x01  /**< Virtual Pool uplink Mirroring. */
 #define ETH_MIRROR_UPLINK_PORT         0x02  /**< Uplink Port Mirroring. */
 #define ETH_MIRROR_DOWNLINK_PORT       0x04  /**< Downlink Port Mirroring. */
@@ -1603,6 +1605,25 @@ enum rte_eth_dev_state {
 	RTE_ETH_DEV_REMOVED,
 };
 
+#define  RTE_ETH_PAUSE_NUM  64    /* How many times to pause */
+/**
+ * Possible power management states of an ethdev port.
+ */
+enum rte_eth_dev_power_mgmt_state {
+	/** Device power management is disabled. */
+	RTE_ETH_DEV_POWER_MGMT_DISABLED = 0,
+	/** Device power management is enabled. */
+	RTE_ETH_DEV_POWER_MGMT_ENABLED,
+};
+
+/** Power-saving method the RX callback applies once the empty-poll count
+ *  exceeds ETH_EMPTYPOLL_MAX. */
+enum rte_eth_dev_power_mgmt_cb_mode {
+	RTE_ETH_DEV_POWER_MGMT_CB_UMWAIT = 0, /**< Wait in rte_power_monitor(). */
+	RTE_ETH_DEV_POWER_MGMT_CB_PAUSE, /**< Issue RTE_ETH_PAUSE_NUM rte_pause(). */
+	RTE_ETH_DEV_POWER_MGMT_CB_SCALE, /**< Scale lcore frequency via rte_power. */
+};
+
 struct rte_eth_dev_sriov {
 	uint8_t active;               /**< SRIOV is active with 16, 32 or 64 pools */
 	uint8_t nb_q_per_pool;        /**< rx queue number per pool */
@@ -4415,6 +4436,40 @@ __rte_experimental
 int rte_eth_dev_hairpin_capability_get(uint16_t port_id,
 				       struct rte_eth_hairpin_cap *cap);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Enable device power management (RX callback is installed on queue 0).
+ *
+ * @param lcore_id  lcore given to rte_power_init() when mode is CB_SCALE.
+ * @param port_id   The port identifier of the Ethernet device.
+ * @param mode      Power-saving method to install as the RX callback.
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+__rte_experimental
+int rte_eth_dev_power_mgmt_enable(unsigned int lcore_id,
+				  uint16_t port_id,
+				  enum rte_eth_dev_power_mgmt_cb_mode mode);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Disable device power management.
+ *
+ * @param lcore_id  lcore given to rte_power_exit() when the mode was CB_SCALE.
+ * @param port_id   The port identifier of the Ethernet device.
+ *
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+__rte_experimental
+int rte_eth_dev_power_mgmt_disable(unsigned int lcore_id, uint16_t port_id);
+
 #include <rte_ethdev_core.h>
 
 /**
@@ -4535,6 +4590,7 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
 	return nb_rx;
 }
 
+
 /**
  * Get the number of used descriptors of a rx queue
  *
@@ -4993,6 +5049,9 @@ rte_eth_tx_buffer(uint16_t port_id, uint16_t queue_id,
 	return rte_eth_tx_buffer_flush(port_id, queue_id, buffer);
 }
 
+
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_ethdev/rte_ethdev_core.h b/lib/librte_ethdev/rte_ethdev_core.h
index 32407dd41..7d6d85ddc 100644
--- a/lib/librte_ethdev/rte_ethdev_core.h
+++ b/lib/librte_ethdev/rte_ethdev_core.h
@@ -603,6 +603,27 @@ typedef int (*eth_tx_hairpin_queue_setup_t)
 	 uint16_t nb_tx_desc,
 	 const struct rte_eth_hairpin_conf *hairpin_conf);
 
+/**
+ * @internal
+ * Get the next RX ring descriptor address.
+ *
+ * @param rxq
+ *   ethdev queue pointer.
+ * @param tail_desc_addr
+ *   Output: address of the next RX descriptor, used as the monitor target.
+ * @param expected
+ *   Output: value passed to rte_power_monitor() as the expected value.
+ * @param mask
+ *   Output: mask passed to rte_power_monitor() with the expected value.
+ *
+ * @return
+ *   0 on success; negative errno value on error (-EINVAL when the
+ *   descriptor address cannot be obtained).
+ */
+typedef int (*eth_next_rx_desc_t)
+	(void *rxq, volatile void **tail_desc_addr,
+	 uint64_t *expected, uint64_t *mask);
+
 /**
  * @internal A structure containing the functions exported by an Ethernet driver.
  */
@@ -752,6 +773,8 @@ struct eth_dev_ops {
 	/**< Set up device RX hairpin queue. */
 	eth_tx_hairpin_queue_setup_t tx_hairpin_queue_setup;
 	/**< Set up device TX hairpin queue. */
+	eth_next_rx_desc_t next_rx_desc;
+	/**< Get next RX ring descriptor address. */
 };
 
 /**
@@ -768,6 +791,14 @@ struct rte_eth_rxtx_callback {
 	void *param;
 };
 
+/**
+ * @internal
+ * Per-queue empty poll counter; each entry is cache aligned.
+ */
+struct rte_eth_ep_stat {
+	uint64_t num; /**< Empty polls seen since the last non-empty poll. */
+} __rte_cache_aligned;
+
 /**
  * @internal
  * The generic data structure associated with each ethernet device.
@@ -807,8 +838,16 @@ struct rte_eth_dev {
 	enum rte_eth_dev_state state; /**< Flag indicating the port state */
 	void *security_ctx; /**< Context for security ops */
 
-	uint64_t reserved_64s[4]; /**< Reserved for future fields */
-	void *reserved_ptrs[4];   /**< Reserved for future fields */
+	/** Power management state and callback mode of the port */
+	enum rte_eth_dev_power_mgmt_state pwr_mgmt_state;
+	enum rte_eth_dev_power_mgmt_cb_mode cb_mode;
+	uint32_t reserved_32;
+	uint64_t reserved_64s[3]; /**< Reserved for future fields */
+
+	/** Per-queue empty poll counters */
+	struct rte_eth_ep_stat *empty_poll_stats;
+	const struct rte_eth_rxtx_callback *cur_pwr_cb;
+	void *reserved_ptrs[3];   /**< Reserved for future fields */
 } __rte_cache_aligned;
 
 struct rte_eth_dev_sriov;
diff --git a/lib/librte_ethdev/rte_ethdev_version.map b/lib/librte_ethdev/rte_ethdev_version.map
index 1212a17d3..4d5b63a5b 100644
--- a/lib/librte_ethdev/rte_ethdev_version.map
+++ b/lib/librte_ethdev/rte_ethdev_version.map
@@ -241,6 +241,10 @@ EXPERIMENTAL {
 	__rte_ethdev_trace_rx_burst;
 	__rte_ethdev_trace_tx_burst;
 	rte_flow_get_aged_flows;
+
+	# added in 20.08
+	rte_eth_dev_power_mgmt_disable;
+	rte_eth_dev_power_mgmt_enable;
 };
 
 INTERNAL {
diff --git a/lib/meson.build b/lib/meson.build
index 3852c0156..54cc0db7d 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,17 +14,18 @@ libraries = [
 	'eal', # everything depends on eal
 	'ring',
 	'rcu', # rcu depends on ring
+	'timer',   # eventdev depends on this
+	'power',   # ethdev depends on this
 	'mempool', 'mbuf', 'net', 'meter', 'ethdev', 'pci', # core
 	'cmdline',
 	'metrics', # bitrate/latency stats depends on this
 	'hash',    # efd depends on this
-	'timer',   # eventdev depends on this
 	'acl', 'bbdev', 'bitratestats', 'cfgfile',
 	'compressdev', 'cryptodev',
 	'distributor', 'efd', 'eventdev',
 	'gro', 'gso', 'ip_frag', 'jobstats',
 	'kni', 'latencystats', 'lpm', 'member',
-	'power', 'pdump', 'rawdev', 'regexdev',
+	'pdump', 'rawdev', 'regexdev',
 	'rib', 'reorder', 'sched', 'security', 'stack', 'vhost',
 	# ipsec lib depends on net, crypto and security
 	'ipsec',
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index a54425997..b87abb26e 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -58,7 +58,6 @@ endif
 _LDLIBS-$(CONFIG_RTE_LIBRTE_METRICS)        += --no-whole-archive
 _LDLIBS-$(CONFIG_RTE_LIBRTE_BITRATE)        += -lrte_bitratestats
 _LDLIBS-$(CONFIG_RTE_LIBRTE_LATENCY_STATS)  += -lrte_latencystats
-_LDLIBS-$(CONFIG_RTE_LIBRTE_POWER)          += -lrte_power
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 _LDLIBS-$(CONFIG_RTE_LIBRTE_BPF)            += -lrte_bpf
@@ -80,6 +79,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_KVARGS)         += -lrte_kvargs
 _LDLIBS-y                                   += -lrte_telemetry
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MBUF)           += -lrte_mbuf
 _LDLIBS-$(CONFIG_RTE_LIBRTE_NET)            += -lrte_net
+_LDLIBS-$(CONFIG_RTE_LIBRTE_POWER)          += -lrte_power
 _LDLIBS-$(CONFIG_RTE_LIBRTE_ETHER)          += -lrte_ethdev
 _LDLIBS-$(CONFIG_RTE_LIBRTE_BBDEV)          += -lrte_bbdev
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CRYPTODEV)      += -lrte_cryptodev
-- 
2.17.1



More information about the dev mailing list