[dpdk-dev] [RFC 2/6] ethdev: add simple power management API

Ananyev, Konstantin konstantin.ananyev at intel.com
Thu May 28 14:15:03 CEST 2020


> 
> Add a simple on/off switch that will enable saving power when no
> packets are arriving. It is based on counting the number of empty
> polls and, when the number reaches a certain threshold, entering an
> architecture-defined optimized power state that will either wait
> until a TSC timestamp expires, or when packets arrive.
> 
> This API is limited to 1 core 1 queue use case as there is no
> coordination between queues/cores in ethdev.
> 
> The TSC timestamp is automatically calculated using current link
> speed and RX descriptor ring size, such that the sleep time is
> not longer than it would take for a NIC to fill its entire RX
> descriptor ring.
> 
> Signed-off-by: Liang J. Ma <liang.j.ma at intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
> ---
>  lib/librte_ethdev/rte_ethdev.c           | 39 +++++++++++++
>  lib/librte_ethdev/rte_ethdev.h           | 70 ++++++++++++++++++++++++
>  lib/librte_ethdev/rte_ethdev_core.h      | 41 +++++++++++++-
>  lib/librte_ethdev/rte_ethdev_version.map |  4 ++
>  4 files changed, 152 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/librte_ethdev/rte_ethdev.c b/lib/librte_ethdev/rte_ethdev.c
> index 8e10a6fc36..0be5ecfc11 100644
> --- a/lib/librte_ethdev/rte_ethdev.c
> +++ b/lib/librte_ethdev/rte_ethdev.c
> @@ -16,6 +16,7 @@
>  #include <netinet/in.h>
> 
>  #include <rte_byteorder.h>
> +#include <rte_cpuflags.h>
>  #include <rte_log.h>
>  #include <rte_debug.h>
>  #include <rte_interrupts.h>
> @@ -5053,6 +5054,44 @@ rte_eth_dev_pool_ops_supported(uint16_t port_id, const char *pool)
>  	return (*dev->dev_ops->pool_ops_supported)(dev, pool);
>  }
> 
> +int
> +rte_eth_dev_power_mgmt_enable(uint16_t port_id)
> +{
> +	struct rte_eth_dev *dev;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> +	dev = &rte_eth_devices[port_id];
> +
> +	if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG))
> +		return -ENOTSUP;
> +
> +	/* allocate memory for empty poll stats */
> +	dev->empty_poll_stats = rte_malloc_socket(NULL,
> +		sizeof(struct rte_eth_ep_stat) * RTE_MAX_QUEUES_PER_PORT,
> +		0, dev->data->numa_node);
> +
> +	if (dev->empty_poll_stats == NULL)
> +		return -ENOMEM;
> +
> +	dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_ENABLED;
> +	return 0;
> +}
> +
> +int
> +rte_eth_dev_power_mgmt_disable(uint16_t port_id)
> +{
> +	struct rte_eth_dev *dev;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> +	dev = &rte_eth_devices[port_id];
> +
> +	/* rte_free ignores NULL so safe to call without checks */
> +	rte_free(dev->empty_poll_stats);
> +
> +	dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_DISABLED;
> +	return 0;
> +}
> +
>  /**
>   * A set of values to describe the possible states of a switch domain.
>   */
> diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h
> index a49242bcd2..b8318f7e91 100644
> --- a/lib/librte_ethdev/rte_ethdev.h
> +++ b/lib/librte_ethdev/rte_ethdev.h
> @@ -157,6 +157,7 @@ extern "C" {
>  #include <rte_common.h>
>  #include <rte_config.h>
>  #include <rte_ether.h>
> +#include <rte_power_intrinsics.h>
> 
>  #include "rte_ethdev_trace_fp.h"
>  #include "rte_dev_info.h"
> @@ -666,6 +667,7 @@ rte_eth_rss_hf_refine(uint64_t rss_hf)
>  /** Maximum nb. of vlan per mirror rule */
>  #define ETH_MIRROR_MAX_VLANS       64
> 
> +#define ETH_EMPTYPOLL_MAX          512 /**< Empty poll number threshlold */
>  #define ETH_MIRROR_VIRTUAL_POOL_UP     0x01  /**< Virtual Pool uplink Mirroring. */
>  #define ETH_MIRROR_UPLINK_PORT         0x02  /**< Uplink Port Mirroring. */
>  #define ETH_MIRROR_DOWNLINK_PORT       0x04  /**< Downlink Port Mirroring. */
> @@ -1490,6 +1492,16 @@ enum rte_eth_dev_state {
>  	RTE_ETH_DEV_REMOVED,
>  };
> 
> +/**
> + * Possible power managment states of an ethdev port.
> + */
> +enum rte_eth_dev_power_mgmt_state {
> +	/** Device power management is disabled. */
> +	RTE_ETH_DEV_POWER_MGMT_DISABLED = 0,
> +	/** Device power management is enabled. */
> +	RTE_ETH_DEV_POWER_MGMT_ENABLED
> +};
> +
>  struct rte_eth_dev_sriov {
>  	uint8_t active;               /**< SRIOV is active with 16, 32 or 64 pools */
>  	uint8_t nb_q_per_pool;        /**< rx queue number per pool */
> @@ -4302,6 +4314,38 @@ __rte_experimental
>  int rte_eth_dev_hairpin_capability_get(uint16_t port_id,
>  				       struct rte_eth_hairpin_cap *cap);
> 
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Enable device power management.
> + *
> + * @param port_id
> + *   The port identifier of the Ethernet device.
> + *
> + * @return
> + *   0 on success
> + *   <0 on error
> + */
> +__rte_experimental
> +int rte_eth_dev_power_mgmt_enable(uint16_t port_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Disable device power management.
> + *
> + * @param port_id
> + *   The port identifier of the Ethernet device.
> + *
> + * @return
> + *   0 on success
> + *   <0 on error
> + */
> +__rte_experimental
> +int rte_eth_dev_power_mgmt_disable(uint16_t port_id);
> +
>  #include <rte_ethdev_core.h>
> 
>  /**
> @@ -4417,6 +4461,32 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
>  		} while (cb != NULL);
>  	}
>  #endif
> +	if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_ENABLED) {
> +		if (unlikely(nb_rx == 0)) {
> +			dev->empty_poll_stats[queue_id].num++;
> +			if (unlikely(dev->empty_poll_stats[queue_id].num >
> +					ETH_EMPTYPOLL_MAX)) {
> +				volatile void *target_addr;
> +				uint64_t expected, mask;
> +				int ret;
> +
> +				/*
> +				 * get address of next descriptor in the RX
> +				 * ring for this queue, as well as expected
> +				 * value and a mask.
> +				 */
> +				ret = (*dev->dev_ops->next_rx_desc)
> +					(dev->data->rx_queues[queue_id],
> +					 &target_addr, &expected, &mask);

That will make every PMD that doesn't implement the next_rx_desc op crash.
One simple way to avoid it: check in rte_eth_dev_power_mgmt_enable() that the PMD
does implement ops->next_rx_desc.
That said, I don't think introducing such a new op is the best approach, as it implies
that the PMD has its HW RX descriptors mapped into WB-type memory, and it dictates
to the PMD what it should sleep on.
Depending on HW/SW capabilities and implementation, a PMD might choose to
sleep on a different thing (HW doorbell, SW cond var, etc.).
Another thing: I doubt it is a good idea to pollute the generic RX function with power-specific
code (again, as I said above, it probably wouldn't be that generic for all possible PMDs).
From my perspective we have 2 alternatives to implement such functionality:
1. Keep rte_eth_dev_power_mgmt_enable/disable(port, queue) and move the actual
    *wait_on* code into the PMD RX implementations (we can probably still have some common
    logic about the allowed number of empty polls, max timeout to sleep, etc.).
2. Drop rte_eth_dev_power_mgmt_enable/disable and introduce explicit:
    rte_eth_dev_wait_for_packet(port, queue, timeout)  API function.
    
In both cases the PMD will have full freedom to implement the *wait_on_packet* functionality
in the most convenient way.
For 2) the user would have to do some extra work themselves
(count the number of consecutive empty polls, call the *wait_on_packet* function explicitly).
Though I think it can be easily hidden inside some wrapper API on top
of rte_eth_rx_burst()/rte_eth_dev_wait_for_packet().
Something like rte_eth_rx_burst_wait() or so.
We can keep the logic about the allowed number of empty polls,
and maybe some other conditions, in that top-level function.
In that case changes in the user app will still be minimal.
On the other hand, 2) gives the user explicit control over where and when to sleep,
so from my perspective it seems more straightforward and flexible.

> +				if (ret == 0)
> +					/* -1ULL is maximum value for TSC */
> +					rte_power_monitor(target_addr,
> +							  expected, mask,
> +							  0, -1ULL);
> +			}
> +		} else
> +			dev->empty_poll_stats[queue_id].num = 0;
> +	}
> 
>  	rte_ethdev_trace_rx_burst(port_id, queue_id, (void **)rx_pkts, nb_rx);
>  	return nb_rx;
> diff --git a/lib/librte_ethdev/rte_ethdev_core.h b/lib/librte_ethdev/rte_ethdev_core.h
> index 32407dd418..4e23d465f0 100644
> --- a/lib/librte_ethdev/rte_ethdev_core.h
> +++ b/lib/librte_ethdev/rte_ethdev_core.h
> @@ -603,6 +603,27 @@ typedef int (*eth_tx_hairpin_queue_setup_t)
>  	 uint16_t nb_tx_desc,
>  	 const struct rte_eth_hairpin_conf *hairpin_conf);
> 
> +/**
> + * @internal
> + * Get the next RX ring descriptor address.
> + *
> + * @param rxq
> + *   ethdev queue pointer.
> + * @param tail_desc_addr
> + *   the pointer point to descriptor address var.
> + *
> + * @return
> + *   Negative errno value on error, 0 on success.
> + *
> + * @retval 0
> + *   Success.
> + * @retval -EINVAL
> + *   Failed to get descriptor address.
> + */
> +typedef int (*eth_next_rx_desc_t)
> +	(void *rxq, volatile void **tail_desc_addr,
> +	 uint64_t *expected, uint64_t *mask);
> +
>  /**
>   * @internal A structure containing the functions exported by an Ethernet driver.
>   */
> @@ -752,6 +773,8 @@ struct eth_dev_ops {
>  	/**< Set up device RX hairpin queue. */
>  	eth_tx_hairpin_queue_setup_t tx_hairpin_queue_setup;
>  	/**< Set up device TX hairpin queue. */
> +	eth_next_rx_desc_t next_rx_desc;
> +	/**< Get next RX ring descriptor address. */
>  };
> 
>  /**
> @@ -768,6 +791,14 @@ struct rte_eth_rxtx_callback {
>  	void *param;
>  };
> 
> +/**
> + * @internal
> + * Structure used to hold counters for empty poll
> + */
> +struct rte_eth_ep_stat {
> +	uint64_t num;
> +} __rte_cache_aligned;
> +
>  /**
>   * @internal
>   * The generic data structure associated with each ethernet device.
> @@ -807,8 +838,14 @@ struct rte_eth_dev {
>  	enum rte_eth_dev_state state; /**< Flag indicating the port state */
>  	void *security_ctx; /**< Context for security ops */
> 
> -	uint64_t reserved_64s[4]; /**< Reserved for future fields */
> -	void *reserved_ptrs[4];   /**< Reserved for future fields */
> +	/**< Empty poll number */
> +	enum rte_eth_dev_power_mgmt_state pwr_mgmt_state;
> +	uint32_t reserved_32;
> +	uint64_t reserved_64s[3]; /**< Reserved for future fields */
> +
> +	/**< Flag indicating the port power state */
> +	struct rte_eth_ep_stat *empty_poll_stats;
> +	void *reserved_ptrs[3];   /**< Reserved for future fields */
>  } __rte_cache_aligned;
> 
>  struct rte_eth_dev_sriov;
> diff --git a/lib/librte_ethdev/rte_ethdev_version.map b/lib/librte_ethdev/rte_ethdev_version.map
> index 7155056045..141361823d 100644
> --- a/lib/librte_ethdev/rte_ethdev_version.map
> +++ b/lib/librte_ethdev/rte_ethdev_version.map
> @@ -241,4 +241,8 @@ EXPERIMENTAL {
>  	__rte_ethdev_trace_rx_burst;
>  	__rte_ethdev_trace_tx_burst;
>  	rte_flow_get_aged_flows;
> +
> +	# added in 20.08
> +	rte_eth_dev_power_mgmt_disable;
> +	rte_eth_dev_power_mgmt_enable;
>  };
> --
> 2.17.1


More information about the dev mailing list