[dpdk-dev] [PATCH v4 04/10] power: add simple power management API and callback

Burakov, Anatoly anatoly.burakov at intel.com
Fri Oct 9 18:47:36 CEST 2020


On 09-Oct-20 5:38 PM, Ananyev, Konstantin wrote:
>> Add a simple on/off switch that will enable saving power when no
>> packets are arriving. It is based on counting the number of empty
>> polls and, when the number reaches a certain threshold, entering an
>> architecture-defined optimized power state that will either wait
>> until a TSC timestamp expires, or when packets arrive.
>>
>> This API support 1 port to multiple core use case.
>>
>> This design leverage RX Callback mechnaism which allow three
>> different power management methodology co exist.
>>
>> 1. umwait/umonitor:
>>
>>     The TSC timestamp is automatically calculated using current
>>     link speed and RX descriptor ring size, such that the sleep
>>     time is not longer than it would take for a NIC to fill its
>>     entire RX descriptor ring.
>>
>> 2. Pause instruction
>>
>>     Instead of move the core into deeper C state, this lightweight
>>     method use Pause instruction to relief the processor from
>>     busy polling.
>>
>> 3. Frequency Scaling
>>     Reuse exist rte power library to scale up/down core frequency
>>     depend on traffic volume.
>>
>> Signed-off-by: Liang Ma <liang.j.ma at intel.com>
>> Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
>> ---
>>   lib/librte_power/meson.build           |   5 +-
>>   lib/librte_power/pmd_mgmt.h            |  49 ++++++
>>   lib/librte_power/rte_power_pmd_mgmt.c  | 208 +++++++++++++++++++++++++
>>   lib/librte_power/rte_power_pmd_mgmt.h  |  88 +++++++++++
>>   lib/librte_power/rte_power_version.map |   4 +
>>   5 files changed, 352 insertions(+), 2 deletions(-)
>>   create mode 100644 lib/librte_power/pmd_mgmt.h
>>   create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
>>   create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h
>>
>> diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
>> index 78c031c943..cc3c7a8646 100644
>> --- a/lib/librte_power/meson.build
>> +++ b/lib/librte_power/meson.build
>> @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
>>   'power_kvm_vm.c', 'guest_channel.c',
>>   'rte_power_empty_poll.c',
>>   'power_pstate_cpufreq.c',
>> +'rte_power_pmd_mgmt.c',
>>   'power_common.c')
>> -headers = files('rte_power.h','rte_power_empty_poll.h')
>> -deps += ['timer']
>> +headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
>> +deps += ['timer' ,'ethdev']
>> diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h
>> new file mode 100644
>> index 0000000000..756fbe20f7
>> --- /dev/null
>> +++ b/lib/librte_power/pmd_mgmt.h
>> @@ -0,0 +1,49 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2020 Intel Corporation
>> + */
>> +
>> +#ifndef _PMD_MGMT_H
>> +#define _PMD_MGMT_H
>> +
>> +/**
>> + * @file
>> + * Power Management
>> + */
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +/**
>> + * Possible power management states of an ethdev port.
>> + */
>> +enum pmd_mgmt_state {
>> +/** Device power management is disabled. */
>> +PMD_MGMT_DISABLED = 0,
>> +/** Device power management is enabled. */
>> +PMD_MGMT_ENABLED,
>> +};
>> +
>> +struct pmd_queue_cfg {
>> +enum pmd_mgmt_state pwr_mgmt_state;
>> +/**< Power mgmt Callback mode */
>> +enum rte_power_pmd_mgmt_type cb_mode;
>> +/**< Empty poll number */
>> +uint16_t empty_poll_stats;
>> +/**< Callback instance  */
>> +const struct rte_eth_rxtx_callback *cur_cb;
>> +} __rte_cache_aligned;
>> +
>> +struct pmd_port_cfg {
>> +int  ref_cnt;
>> +struct pmd_queue_cfg *queue_cfg;
>> +} __rte_cache_aligned;
>> +
>> +
>> +
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif
>> diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
>> new file mode 100644
>> index 0000000000..35d2af46a4
>> --- /dev/null
>> +++ b/lib/librte_power/rte_power_pmd_mgmt.c
>> @@ -0,0 +1,208 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2020 Intel Corporation
>> + */
>> +
>> +#include <rte_lcore.h>
>> +#include <rte_cycles.h>
>> +#include <rte_malloc.h>
>> +#include <rte_ethdev.h>
>> +#include <rte_power_intrinsics.h>
>> +
>> +#include "rte_power_pmd_mgmt.h"
>> +#include "pmd_mgmt.h"
>> +
>> +
>> +#define EMPTYPOLL_MAX  512
>> +#define PAUSE_NUM  64
>> +
>> +static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS];
>> +
>> +static uint16_t
>> +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
>> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
>> +{
>> +
>> +struct pmd_queue_cfg *q_conf;
>> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
>> +
>> +if (unlikely(nb_rx == 0)) {
>> +q_conf->empty_poll_stats++;
>> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> 
> 
> Here and in other places - wouldn't it be better to empty_poll_max as configurable
> parameter, instead of constant value?

It would be more flexible, but i don't think it's "better" in the sense 
that providing additional options will only make using this (already 
under-utilized!) API harder than it needs to be.

> 
>> +volatile void *target_addr;
>> +uint64_t expected, mask;
>> +uint16_t ret;
>> +
>> +/*
>> + * get address of next descriptor in the RX
>> + * ring for this queue, as well as expected
>> + * value and a mask.
>> + */
>> +ret = rte_eth_get_wake_addr(port_id, qidx,
>> +    &target_addr, &expected,
>> +    &mask);
>> +if (ret == 0)
>> +/* -1ULL is maximum value for TSC */
>> +rte_power_monitor(target_addr,
>> +  expected, mask,
>> +  0, -1ULL);
> 
> 
> Why not make timeout a user specified parameter?

This is meant to be an "easy to use" API, we were trying to keep the 
amount of configuration to an absolute minimum. If the user wants to use 
custom timeouts, they can do so with using rte_power_monitor API explicitly.

> 
>> +}
>> +} else
>> +q_conf->empty_poll_stats = 0;
>> +
>> +return nb_rx;
>> +}
>> +
>> +static uint16_t
>> +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
>> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
>> +{
>> +struct pmd_queue_cfg *q_conf;
>> +int i;
>> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
>> +
>> +if (unlikely(nb_rx == 0)) {
>> +q_conf->empty_poll_stats++;
>> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
>> +for (i = 0; i < PAUSE_NUM; i++)
>> +rte_pause();
> 
> Just rte_delay_us(timeout) instead of this loop?

Yes, seems better, thanks.

> 
>> +}
>> +} else
>> +q_conf->empty_poll_stats = 0;
>> +
>> +return nb_rx;
>> +}
>> +
>> +static uint16_t
>> +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
>> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
>> +{
>> +struct pmd_queue_cfg *q_conf;
>> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
>> +
>> +if (unlikely(nb_rx == 0)) {
>> +q_conf->empty_poll_stats++;
>> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
>> +/*scale down freq */
>> +rte_power_freq_min(rte_lcore_id());
>> +
>> +}
>> +} else {
>> +q_conf->empty_poll_stats = 0;
>> +/* scal up freq */
>> +rte_power_freq_max(rte_lcore_id());
>> +}
>> +
>> +return nb_rx;
>> +}
>> +
> 
> Probably worth to mention in comments that these functions enable/disable
> are not MT safe.

Will do in v6.

> 
>> +int
>> +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
>> +uint16_t port_id,
>> +uint16_t queue_id,
>> +enum rte_power_pmd_mgmt_type mode)
>> +{
>> +struct rte_eth_dev *dev;
>> +struct pmd_queue_cfg *queue_cfg;
>> +int ret = 0;
>> +
>> +RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
>> +dev = &rte_eth_devices[port_id];
>> +
>> +if (port_cfg[port_id].queue_cfg == NULL) {
>> +port_cfg[port_id].ref_cnt = 0;
>> +/* allocate memory for empty poll stats */
>> +port_cfg[port_id].queue_cfg  = rte_malloc_socket(NULL,
>> +sizeof(struct pmd_queue_cfg)
>> +* RTE_MAX_QUEUES_PER_PORT,
>> +0, dev->data->numa_node);
>> +if (port_cfg[port_id].queue_cfg == NULL)
>> +return -ENOMEM;
>> +}
>> +
>> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
>> +
>> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) {
>> +ret = -EINVAL;
>> +goto failure_handler;
>> +}
>> +
>> +switch (mode) {
>> +case RTE_POWER_MGMT_TYPE_WAIT:
>> +if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) {
>> +ret = -ENOTSUP;
>> +goto failure_handler;
>> +}
>> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
>> +rte_power_mgmt_umwait, NULL);
>> +break;
>> +case RTE_POWER_MGMT_TYPE_SCALE:
>> +/* init scale freq */
>> +if (rte_power_init(lcore_id)) {
>> +ret = -EINVAL;
>> +goto failure_handler;
>> +}
>> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
>> +rte_power_mgmt_scalefreq, NULL);
>> +break;
>> +case RTE_POWER_MGMT_TYPE_PAUSE:
>> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
>> +rte_power_mgmt_pause, NULL);
>> +break;
> 
> default:
> ....

Will add in v6.

> 
>> +}
>> +queue_cfg->cb_mode = mode;
>> +port_cfg[port_id].ref_cnt++;
>> +queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
>> +return ret;
>> +
>> +failure_handler:
>> +if (port_cfg[port_id].ref_cnt == 0) {
>> +rte_free(port_cfg[port_id].queue_cfg);
>> +port_cfg[port_id].queue_cfg = NULL;
>> +}
>> +return ret;
>> +}
>> +
>> +int
>> +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
>> +uint16_t port_id,
>> +uint16_t queue_id)
>> +{
>> +struct pmd_queue_cfg *queue_cfg;
>> +
>> +if (port_cfg[port_id].ref_cnt <= 0)
>> +return -EINVAL;
>> +
>> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
>> +
>> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED)
>> +return -EINVAL;
>> +
>> +switch (queue_cfg->cb_mode) {
>> +case RTE_POWER_MGMT_TYPE_WAIT:
> 
> Think we need wakeup(lcore_id) here.

Not sure what you mean? Could you please elaborate?

> 
>> +case RTE_POWER_MGMT_TYPE_PAUSE:
>> +rte_eth_remove_rx_callback(port_id, queue_id,
>> +   queue_cfg->cur_cb);
>> +break;
>> +case RTE_POWER_MGMT_TYPE_SCALE:
>> +rte_power_freq_max(lcore_id);
>> +rte_eth_remove_rx_callback(port_id, queue_id,
>> +   queue_cfg->cur_cb);
>> +rte_power_exit(lcore_id);
>> +break;
>> +}
>> +/* it's not recommend to free callback instance here.
>> + * it cause memory leak which is a known issue.
>> + */
>> +queue_cfg->cur_cb = NULL;
>> +queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
>> +port_cfg[port_id].ref_cnt--;
>> +
>> +if (port_cfg[port_id].ref_cnt == 0) {
>> +rte_free(port_cfg[port_id].queue_cfg);
> 
> It is not safe to do so, unless device is already stopped.
> Otherwise you need some sync mechanism here (hand-made as bpf lib, or rcu online/offline, or...)

Not sure what you mean. We're not freeing the callback structure, we're 
freeing the local data structure holding the per-port status.

> 
>> +port_cfg[port_id].queue_cfg = NULL;
>> +}
>> +return 0;
>> +}


-- 
Thanks,
Anatoly


More information about the dev mailing list