[dpdk-dev] [PATCH v4 04/10] power: add simple power management API and callback

Ananyev, Konstantin konstantin.ananyev at intel.com
Fri Oct 9 18:51:53 CEST 2020


> On 09-Oct-20 5:38 PM, Ananyev, Konstantin wrote:
> >> Add a simple on/off switch that will enable saving power when no
> >> packets are arriving. It is based on counting the number of empty
> >> polls and, when the number reaches a certain threshold, entering an
> >> architecture-defined optimized power state that will either wait
> >> until a TSC timestamp expires, or when packets arrive.
> >>
> >> This API support 1 port to multiple core use case.
> >>
> >> This design leverage RX Callback mechnaism which allow three
> >> different power management methodology co exist.
> >>
> >> 1. umwait/umonitor:
> >>
> >>     The TSC timestamp is automatically calculated using current
> >>     link speed and RX descriptor ring size, such that the sleep
> >>     time is not longer than it would take for a NIC to fill its
> >>     entire RX descriptor ring.
> >>
> >> 2. Pause instruction
> >>
> >>     Instead of move the core into deeper C state, this lightweight
> >>     method use Pause instruction to relief the processor from
> >>     busy polling.
> >>
> >> 3. Frequency Scaling
> >>     Reuse exist rte power library to scale up/down core frequency
> >>     depend on traffic volume.
> >>
> >> Signed-off-by: Liang Ma <liang.j.ma at intel.com>
> >> Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
> >> ---
> >>   lib/librte_power/meson.build           |   5 +-
> >>   lib/librte_power/pmd_mgmt.h            |  49 ++++++
> >>   lib/librte_power/rte_power_pmd_mgmt.c  | 208 +++++++++++++++++++++++++
> >>   lib/librte_power/rte_power_pmd_mgmt.h  |  88 +++++++++++
> >>   lib/librte_power/rte_power_version.map |   4 +
> >>   5 files changed, 352 insertions(+), 2 deletions(-)
> >>   create mode 100644 lib/librte_power/pmd_mgmt.h
> >>   create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
> >>   create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h
> >>
> >> diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
> >> index 78c031c943..cc3c7a8646 100644
> >> --- a/lib/librte_power/meson.build
> >> +++ b/lib/librte_power/meson.build
> >> @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
> >>   'power_kvm_vm.c', 'guest_channel.c',
> >>   'rte_power_empty_poll.c',
> >>   'power_pstate_cpufreq.c',
> >> +'rte_power_pmd_mgmt.c',
> >>   'power_common.c')
> >> -headers = files('rte_power.h','rte_power_empty_poll.h')
> >> -deps += ['timer']
> >> +headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
> >> +deps += ['timer' ,'ethdev']
> >> diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h
> >> new file mode 100644
> >> index 0000000000..756fbe20f7
> >> --- /dev/null
> >> +++ b/lib/librte_power/pmd_mgmt.h
> >> @@ -0,0 +1,49 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2020 Intel Corporation
> >> + */
> >> +
> >> +#ifndef _PMD_MGMT_H
> >> +#define _PMD_MGMT_H
> >> +
> >> +/**
> >> + * @file
> >> + * Power Management
> >> + */
> >> +
> >> +#ifdef __cplusplus
> >> +extern "C" {
> >> +#endif
> >> +
> >> +/**
> >> + * Possible power management states of an ethdev port.
> >> + */
> >> +enum pmd_mgmt_state {
> >> +/** Device power management is disabled. */
> >> +PMD_MGMT_DISABLED = 0,
> >> +/** Device power management is enabled. */
> >> +PMD_MGMT_ENABLED,
> >> +};
> >> +
> >> +struct pmd_queue_cfg {
> >> +enum pmd_mgmt_state pwr_mgmt_state;
> >> +/**< Power mgmt Callback mode */
> >> +enum rte_power_pmd_mgmt_type cb_mode;
> >> +/**< Empty poll number */
> >> +uint16_t empty_poll_stats;
> >> +/**< Callback instance  */
> >> +const struct rte_eth_rxtx_callback *cur_cb;
> >> +} __rte_cache_aligned;
> >> +
> >> +struct pmd_port_cfg {
> >> +int  ref_cnt;
> >> +struct pmd_queue_cfg *queue_cfg;
> >> +} __rte_cache_aligned;
> >> +
> >> +
> >> +
> >> +
> >> +#ifdef __cplusplus
> >> +}
> >> +#endif
> >> +
> >> +#endif
> >> diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
> >> new file mode 100644
> >> index 0000000000..35d2af46a4
> >> --- /dev/null
> >> +++ b/lib/librte_power/rte_power_pmd_mgmt.c
> >> @@ -0,0 +1,208 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2020 Intel Corporation
> >> + */
> >> +
> >> +#include <rte_lcore.h>
> >> +#include <rte_cycles.h>
> >> +#include <rte_malloc.h>
> >> +#include <rte_ethdev.h>
> >> +#include <rte_power_intrinsics.h>
> >> +
> >> +#include "rte_power_pmd_mgmt.h"
> >> +#include "pmd_mgmt.h"
> >> +
> >> +
> >> +#define EMPTYPOLL_MAX  512
> >> +#define PAUSE_NUM  64
> >> +
> >> +static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS];
> >> +
> >> +static uint16_t
> >> +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
> >> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> >> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> >> +{
> >> +
> >> +struct pmd_queue_cfg *q_conf;
> >> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
> >> +
> >> +if (unlikely(nb_rx == 0)) {
> >> +q_conf->empty_poll_stats++;
> >> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> >
> >
> > Here and in other places - wouldn't it be better to empty_poll_max as configurable
> > parameter, instead of constant value?
> 
> It would be more flexible, but i don't think it's "better" in the sense
> that providing additional options will only make using this (already
> under-utilized!) API harder than it needs to be.
> 
> >
> >> +volatile void *target_addr;
> >> +uint64_t expected, mask;
> >> +uint16_t ret;
> >> +
> >> +/*
> >> + * get address of next descriptor in the RX
> >> + * ring for this queue, as well as expected
> >> + * value and a mask.
> >> + */
> >> +ret = rte_eth_get_wake_addr(port_id, qidx,
> >> +    &target_addr, &expected,
> >> +    &mask);
> >> +if (ret == 0)
> >> +/* -1ULL is maximum value for TSC */
> >> +rte_power_monitor(target_addr,
> >> +  expected, mask,
> >> +  0, -1ULL);
> >
> >
> > Why not make timeout a user specified parameter?
> 
> This is meant to be an "easy to use" API, we were trying to keep the
> amount of configuration to an absolute minimum. If the user wants to use
> custom timeouts, they can do so with using rte_power_monitor API explicitly.
> 
> >
> >> +}
> >> +} else
> >> +q_conf->empty_poll_stats = 0;
> >> +
> >> +return nb_rx;
> >> +}
> >> +
> >> +static uint16_t
> >> +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
> >> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> >> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> >> +{
> >> +struct pmd_queue_cfg *q_conf;
> >> +int i;
> >> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
> >> +
> >> +if (unlikely(nb_rx == 0)) {
> >> +q_conf->empty_poll_stats++;
> >> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> >> +for (i = 0; i < PAUSE_NUM; i++)
> >> +rte_pause();
> >
> > Just rte_delay_us(timeout) instead of this loop?
> 
> Yes, seems better, thanks.
> 
> >
> >> +}
> >> +} else
> >> +q_conf->empty_poll_stats = 0;
> >> +
> >> +return nb_rx;
> >> +}
> >> +
> >> +static uint16_t
> >> +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
> >> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> >> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> >> +{
> >> +struct pmd_queue_cfg *q_conf;
> >> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
> >> +
> >> +if (unlikely(nb_rx == 0)) {
> >> +q_conf->empty_poll_stats++;
> >> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> >> +/*scale down freq */
> >> +rte_power_freq_min(rte_lcore_id());
> >> +
> >> +}
> >> +} else {
> >> +q_conf->empty_poll_stats = 0;
> >> +/* scal up freq */
> >> +rte_power_freq_max(rte_lcore_id());
> >> +}
> >> +
> >> +return nb_rx;
> >> +}
> >> +
> >
> > Probably worth to mention in comments that these functions enable/disable
> > are not MT safe.
> 
> Will do in v6.
> 
> >
> >> +int
> >> +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
> >> +uint16_t port_id,
> >> +uint16_t queue_id,
> >> +enum rte_power_pmd_mgmt_type mode)
> >> +{
> >> +struct rte_eth_dev *dev;
> >> +struct pmd_queue_cfg *queue_cfg;
> >> +int ret = 0;
> >> +
> >> +RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> >> +dev = &rte_eth_devices[port_id];
> >> +
> >> +if (port_cfg[port_id].queue_cfg == NULL) {
> >> +port_cfg[port_id].ref_cnt = 0;
> >> +/* allocate memory for empty poll stats */
> >> +port_cfg[port_id].queue_cfg  = rte_malloc_socket(NULL,
> >> +sizeof(struct pmd_queue_cfg)
> >> +* RTE_MAX_QUEUES_PER_PORT,
> >> +0, dev->data->numa_node);
> >> +if (port_cfg[port_id].queue_cfg == NULL)
> >> +return -ENOMEM;
> >> +}
> >> +
> >> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> >> +
> >> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) {
> >> +ret = -EINVAL;
> >> +goto failure_handler;
> >> +}
> >> +
> >> +switch (mode) {
> >> +case RTE_POWER_MGMT_TYPE_WAIT:
> >> +if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) {
> >> +ret = -ENOTSUP;
> >> +goto failure_handler;
> >> +}
> >> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> >> +rte_power_mgmt_umwait, NULL);
> >> +break;
> >> +case RTE_POWER_MGMT_TYPE_SCALE:
> >> +/* init scale freq */
> >> +if (rte_power_init(lcore_id)) {
> >> +ret = -EINVAL;
> >> +goto failure_handler;
> >> +}
> >> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> >> +rte_power_mgmt_scalefreq, NULL);
> >> +break;
> >> +case RTE_POWER_MGMT_TYPE_PAUSE:
> >> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> >> +rte_power_mgmt_pause, NULL);
> >> +break;
> >
> > default:
> > ....
> 
> Will add in v6.
> 
> >
> >> +}
> >> +queue_cfg->cb_mode = mode;
> >> +port_cfg[port_id].ref_cnt++;
> >> +queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> >> +return ret;
> >> +
> >> +failure_handler:
> >> +if (port_cfg[port_id].ref_cnt == 0) {
> >> +rte_free(port_cfg[port_id].queue_cfg);
> >> +port_cfg[port_id].queue_cfg = NULL;
> >> +}
> >> +return ret;
> >> +}
> >> +
> >> +int
> >> +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
> >> +uint16_t port_id,
> >> +uint16_t queue_id)
> >> +{
> >> +struct pmd_queue_cfg *queue_cfg;
> >> +
> >> +if (port_cfg[port_id].ref_cnt <= 0)
> >> +return -EINVAL;
> >> +
> >> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> >> +
> >> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED)
> >> +return -EINVAL;
> >> +
> >> +switch (queue_cfg->cb_mode) {
> >> +case RTE_POWER_MGMT_TYPE_WAIT:
> >
> > Think we need wakeup(lcore_id) here.
> 
> Not sure what you mean? Could you please elaborate?
> 
> >
> >> +case RTE_POWER_MGMT_TYPE_PAUSE:
> >> +rte_eth_remove_rx_callback(port_id, queue_id,
> >> +   queue_cfg->cur_cb);
> >> +break;
> >> +case RTE_POWER_MGMT_TYPE_SCALE:
> >> +rte_power_freq_max(lcore_id);
> >> +rte_eth_remove_rx_callback(port_id, queue_id,
> >> +   queue_cfg->cur_cb);
> >> +rte_power_exit(lcore_id);
> >> +break;
> >> +}
> >> +/* it's not recommend to free callback instance here.
> >> + * it cause memory leak which is a known issue.
> >> + */
> >> +queue_cfg->cur_cb = NULL;
> >> +queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
> >> +port_cfg[port_id].ref_cnt--;
> >> +
> >> +if (port_cfg[port_id].ref_cnt == 0) {
> >> +rte_free(port_cfg[port_id].queue_cfg);
> >
> > It is not safe to do so, unless device is already stopped.
> > Otherwise you need some sync mechanism here (hand-made as bpf lib, or rcu online/offline, or...)
> 
> Not sure what you mean. We're not freeing the callback structure, we're
> freeing the local data structure holding the per-port status.

What is the difference?
You still trying to free memory that might be used by your DP thread
that still executes the callback.

> 
> >
> >> +port_cfg[port_id].queue_cfg = NULL;
> >> +}
> >> +return 0;
> >> +}
> 
> 
> --
> Thanks,
> Anatoly


More information about the dev mailing list