[dpdk-dev] [PATCH] libs/power: add p-state driver compatibility

Liang, Ma liang.j.ma at intel.com
Thu Dec 13 11:58:48 CET 2018


On 10 Dec 16:08, Burakov, Anatoly wrote:
<snip> 
> Hi Liang,
> 
> General comment: please do not break up log strings onto multiple lines. It
> makes it harder to grep the codebase. Checkpatch will not warn about log
> strings going over 80 characters.
agree, I will update in next version
> 
> > ---
> >   lib/librte_power/Makefile               |   2 +
> >   lib/librte_power/meson.build            |   4 +-
> >   lib/librte_power/power_pstate_cpufreq.c | 778 ++++++++++++++++++++++++++++++++
> >   lib/librte_power/power_pstate_cpufreq.h | 218 +++++++++
> >   lib/librte_power/rte_power.c            |  43 +-
> >   lib/librte_power/rte_power.h            |   3 +-
> >   6 files changed, 1039 insertions(+), 9 deletions(-)
> >   create mode 100644 lib/librte_power/power_pstate_cpufreq.c
> >   create mode 100644 lib/librte_power/power_pstate_cpufreq.h
> 
> <snip>
> 
> >   sources = files('rte_power.c', 'power_acpi_cpufreq.c',
> >   		'power_kvm_vm.c', 'guest_channel.c',
> > -		'rte_power_empty_poll.c')
> > +		'rte_power_empty_poll.c',
> > +		'power_pstate_cpufreq.c')
> >   headers = files('rte_power.h','rte_power_empty_poll.h')
> > -deps += ['timer']
> > diff --git a/lib/librte_power/power_pstate_cpufreq.c b/lib/librte_power/power_pstate_cpufreq.c
> > new file mode 100644
> > index 0000000..f1dada8
> > --- /dev/null
> > +++ b/lib/librte_power/power_pstate_cpufreq.c
> > @@ -0,0 +1,778 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2010-2018 Intel Corporation
> 
> I believe it should be 2018.
I didn't see anything wrong here.
> 
> > + */
> > +
> > +#include <stdio.h>
> > +#include <sys/types.h>
> > +#include <sys/stat.h>
> > +#include <fcntl.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +#include <unistd.h>
> > +#include <signal.h>
> > +#include <limits.h>
> > +#include <errno.h>
> > +
> > +#include <rte_memcpy.h>
> > +#include <rte_atomic.h>
> > +
> > +#include "power_pstate_cpufreq.h"
> > +#include "power_common.h"
> > +
> <snip>
> 
> > +
> > +static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
> > +
> > +/**
> > + * It is to read the specific MSR.
> > + */
> > +
> > +static int32_t
> > +power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id)
> > +{
> > +	int fd;
> > +	int ret;
> 
> int fd, ret?a
should be fine.
> 
> > +	char fullpath[PATH_MAX];
> > +
> > +	snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id);
> > +
> > +	fd = open(fullpath, O_RDONLY);
> > +
> > +	if (fd < 0) {
> > +
> > +		if (errno == EACCES)
> > +			RTE_LOG(ERR, POWER, "No access to %s\n", fullpath);
> > +
> > +		if (errno == ENXIO)
> > +			RTE_LOG(ERR, POWER, "%s Not Exist!\n", fullpath);
> 
> How about:
> 
> RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath, strerror(errno));
agree, I will update in next version 
> 
> ?
> 
> > +
> > +		return fd;
> > +	}
> > +
> > +	ret = pread(fd, val, sizeof(uint64_t), msr);
> 
> Can pread fail?
agree, I will add error checking here.
> 
> > +
> > +	close(fd);
> > +
> > +	POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n",
> > +			fullpath, msr, lcore_id);
> > +
> > +	POWER_DEBUG_TRACE("Ret value %d, content is 0x%lx\n", ret, *val);
> > +
> > +	return ret;
> > +}
> > +
> 
> <snip>
> 
> > +	if (fseek(pi->f_cur_max, 0, SEEK_SET) < 0) {
> > +		RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
> > +				"for setting frequency for lcore %u\n",
> > +				pi->lcore_id);
> > +		return -1;
> > +	}
> > +
> > +	/* Turbo is available and enabled, first freq bucket is sys max freq */
> > +	if (pi->turbo_available && pi->turbo_enable && (idx == 0))
> > +
> > +		target_freq = pi->sys_max_freq;
> > +
> > +	else
> > +
> > +		target_freq = pi->freqs[idx];
> 
> Unneeded whitespace?
agree, I will remove it. 
> > +
> > +
> > +	/* Decrease freq, the min freq should be updated first */
> > +	if (idx  >  pi->curr_idx) {
> > +
> > +		if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
> > +			RTE_LOG(ERR, POWER, "Fail to write new frequency for "
> > +					"lcore %u\n", pi->lcore_id);
> > +			return -1;
> > +		}
> > +
> > +		if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
> > +			RTE_LOG(ERR, POWER, "Fail to write new frequency for "
> > +					"lcore %u\n", pi->lcore_id);
> > +			return -1;
> > +		}
> > +
> > +		POWER_DEBUG_TRACE("Freqency[%u] to be set for lcore %u\n",
> > +				  target_freq, pi->lcore_id);
> 
> Frequency :)
> 
> Also, i believe "Frequency[%u]" is misleading in this case, because the
> value is not an index into an array, but is an actual target frequency. I
> think "Frequency '%u'" would be more descriptive.
agree I will update in next version
> 
> > +
> > +		fflush(pi->f_cur_min);
> > +		fflush(pi->f_cur_max);
> > +
> > +	}
> > +
> > +	/* Increase freq, the max freq should be updated first */
> > +	if (idx  <  pi->curr_idx) {
> 
> Else if?
no need here.
> 
> > +
> > +		if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
> > +			RTE_LOG(ERR, POWER, "Fail to write new frequency for "
> > +					"lcore %u\n", pi->lcore_id);
> > +			return -1;
> > +		}
> > +
> > +		if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
> > +			RTE_LOG(ERR, POWER, "Fail to write new frequency for "
> > +					"lcore %u\n", pi->lcore_id);
> > +			return -1;
> > +		}
> 
> Forgot POWER_DEBUG_TRACE?
> 
agree, I will add here.
> > +
> > +		fflush(pi->f_cur_max);
> > +		fflush(pi->f_cur_min);
> > +	}
> > +
> > +	pi->curr_idx = idx;
> > +
> > +	return 1;
> > +}
> > +
> 
> <snip>
> 
> > +	snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR,
> > +			pi->lcore_id);
> > +	f = fopen(fullpath, "rw+");
> > +	FOPEN_OR_ERR_RET(f, ret);
> > +
> > +	s = fgets(buf, sizeof(buf), f);
> > +	FOPS_OR_NULL_GOTO(s, out);
> > +
> > +	/* Check if current governor is performance */
> > +	if (strncmp(buf, POWER_GOVERNOR_PERF,
> > +			sizeof(POWER_GOVERNOR_PERF)) == 0) {
> 
> Nitpick, but probably should be strlen, not sizeof?
sizeof should be fine
> 
> > +		ret = 0;
> > +		POWER_DEBUG_TRACE("Power management governor of lcore %u is "
> > +				"already performance\n", pi->lcore_id);
> > +		goto out;
> > +	}
> > +	/* Save the original governor */
> > +	snprintf(pi->governor_ori, sizeof(pi->governor_ori), "%s", buf);
> > +
> > +	/* Write 'performance' to the governor */
> > +	val = fseek(f, 0, SEEK_SET);
> > +	FOPS_OR_ERR_GOTO(val, out);
> > +
> > +	val = fputs(POWER_GOVERNOR_PERF, f);
> > +	FOPS_OR_ERR_GOTO(val, out);
> > +
> > +	ret = 0;
> > +	RTE_LOG(INFO, POWER, "Power management governor of lcore %u has been "
> > +			"set to performance successfully\n", pi->lcore_id);
> 
> Do we want this as INFO? (it's OK if we do, just asking!)
just keep it as same as acpi driver
> 
> > +out:
> > +	fclose(f);
> > +
> > +	return ret;
> > +}
> > +
> > +/**
> > + * It is to check the governor and then set the original governor back if
> > + * needed by writing the sys file.
> > + */
> > +static int
> > +power_set_governor_original(struct pstate_power_info *pi)
> > +{
> 
> <snip>
> 
> > +	/* Write back the original governor */
> > +	val = fseek(f, 0, SEEK_SET);
> > +	FOPS_OR_ERR_GOTO(val, out);
> > +
> > +	val = fputs(pi->governor_ori, f);
> > +	FOPS_OR_ERR_GOTO(val, out);
> > +
> > +	ret = 0;
> > +	RTE_LOG(INFO, POWER, "Power management governor of lcore %u "
> > +			"has been set back to %s successfully\n",
> > +			pi->lcore_id, pi->governor_ori);
> 
> Same - do we want this as INFO?
keep it as same as acpi driver
> 
> > +out:
> > +	fclose(f);
> > +
> > +	return ret;
> > +}
> > +
> > +/**
> > + * It is to get the available frequencies of the specific lcore by reading the
> > + * sys file.
> > + */
> > +static int
> > +power_get_available_freqs(struct pstate_power_info *pi)
> > +{
> > +	FILE *f_min, *f_max;
> > +	int ret = -1;
> > +	char *p_min, *p_max;
> > +	char buf_min[BUFSIZ];
> > +	char buf_max[BUFSIZ];
> > +	char fullpath_min[PATH_MAX];
> > +	char fullpath_max[PATH_MAX];
> > +	char *s_min, *s_max;
> > +	uint32_t sys_min_freq = 0, sys_max_freq = 0, base_max_freq = 0;
> > +	uint32_t i, num_freqs = 0;
> > +
> > +	snprintf(fullpath_max, sizeof(fullpath_max),
> > +			POWER_SYSFILE_BASE_MAX_FREQ,
> > +			pi->lcore_id);
> > +	snprintf(fullpath_min, sizeof(fullpath_min),
> > +			POWER_SYSFILE_BASE_MIN_FREQ,
> > +			pi->lcore_id);
> > +
> > +	f_min = fopen(fullpath_min, "r");
> > +	FOPEN_OR_ERR_RET(f_min, ret);
> > +
> > +	s_min = fgets(buf_min, sizeof(buf_min), f_min);
> > +	FOPS_OR_NULL_GOTO(s_min, out);
> > +
> > +	f_max = fopen(fullpath_max, "r");
> > +	FOPEN_OR_ERR_RET(f_max, ret);
> > +
> > +	s_max = fgets(buf_max, sizeof(buf_max), f_max);
> > +	FOPS_OR_NULL_GOTO(s_max, out);
> > +
> > +
> > +	/* Strip the line break if there is */
> > +	p_min = strchr(buf_min, '\n');
> > +	if (p_min != NULL)
> > +		*p_min = 0;
> > +
> > +	p_max = strchr(buf_max, '\n');
> > +	if (p_max != NULL)
> > +		*p_max = 0;
> 
> Probably '\0' would be more correct.
should be fine
> 
> > +
> > +	sys_min_freq = strtoul(buf_min, &p_min, POWER_CONVERT_TO_DECIMAL);
> > +	sys_max_freq = strtoul(buf_max, &p_max, POWER_CONVERT_TO_DECIMAL);
> > +
> > +	if (sys_max_freq < sys_min_freq)
> > +		goto out;
> 
> This leaks fds. See below.
agree, I will adjust the postion of label "out"
> 
> > +
> > +
> > +	pi->sys_max_freq = sys_max_freq;
> > +
> > +	base_max_freq = pi->non_turbo_max_ratio*BUS_FREQ;
> 
> spacing around multiplication :)
agree. I will remove that
> 
> > +
> > +	POWER_DEBUG_TRACE("sys min %u, sys max %u, base_max %u\n",
> > +			sys_min_freq,
> > +			sys_max_freq,
> > +			base_max_freq);
> > +
> > +	if (base_max_freq < sys_max_freq)
> > +
> > +		pi->turbo_available = 1;
> 
> Extra whitespace.
agree
> 
> > +	else
> > +		pi->turbo_available = 0;
> > +
> > +
> > +	/* If turbo is available then there is one extra freq bucket
> > +	 * to store the sys max freq which value is base_max +1
> > +	 */
> > +	num_freqs = (base_max_freq - sys_min_freq)/BUS_FREQ + 1
> > +		+ pi->turbo_available;
> 
> The '+' should be on the preceding line, also spacing.
agree
> 
> > +
> > +	/* Generate the freq bucket array.
> > +	 * If turbo is available the freq bucket[0] value is base_max +1
> > +	 * the bucket[1] is base_max, bucket[2] is base_max - BUS_FREQ
> > +	 * and so on.
> > +	 * If turbo is not available bucket[0] is base_max and so on
> > +	 */
> > +	for (i = 0, pi->nb_freqs = 0; i < num_freqs; i++) {
> > +
> 
> Extra whitespace.
agree
> 
> > +		if ((i == 0) && pi->turbo_available)
> > +			pi->freqs[pi->nb_freqs++] = base_max_freq + 1;
> > +		else
> > +			pi->freqs[pi->nb_freqs++] =
> > +			base_max_freq - (i - pi->turbo_available)*BUS_FREQ;
> 
> Misleading indentation, also spacing around multiplication operator.
agree 
> > +	}
> > +
> > +	ret = 0;
> > +
> > +	POWER_DEBUG_TRACE("%d frequency(s) of lcore %u are available\n",
> > +			num_freqs, pi->lcore_id);
> > +
> > +	fclose(f_min);
> > +	fclose(f_max);
> > +
> > +
> > +out:
> 
> fclose should be after out, otherwise error path above will leak fd.
agree 
> > +	return ret;
> > +}
> > +
> > +int
> > +power_pstate_cpufreq_init(unsigned int lcore_id)
> > +{
> > +	struct pstate_power_info *pi;
> > +
> > +	if (lcore_id >= RTE_MAX_LCORE) {
> > +		RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n",
> 
> Cannot exceed.
>
agree
> > +				lcore_id, RTE_MAX_LCORE - 1U);
> > +		return -1;
> > +	}
> > +
> > +	pi = &lcore_power_info[lcore_id];
> > +	if (rte_atomic32_cmpset(&(pi->state), POWER_IDLE, POWER_ONGOING)
> > +			== 0) {
> > +		RTE_LOG(INFO, POWER, "Power management of lcore %u is "
> > +				"in use\n", lcore_id);
> > +		return -1;
> > +	}
> > +
> > +	pi->lcore_id = lcore_id;
> 
> Can we not set this until we're sure we didn't fail? Or do any of the below
> calls rely on pi->lcore_id to be set?
> 
power_set_governor_performance will sue lcore_id
> > +	/* Check and set the governor */
> > +	if (power_set_governor_performance(pi) < 0) {
> > +		RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to "
> > +				"performance\n", lcore_id);
> > +		goto fail;
> > +	}
> > +	/* Init for setting lcore frequency */
> > +	if (power_init_for_setting_freq(pi) < 0) {
> > +		RTE_LOG(ERR, POWER, "Cannot init for setting frequency for "
> > +				"lcore %u\n", lcore_id);
> > +		goto fail;
> > +	}
> > +
> > +	/* Get the available frequencies */
> > +	if (power_get_available_freqs(pi) < 0) {
> > +		RTE_LOG(ERR, POWER, "Cannot get available frequencies of "
> > +				"lcore %u\n", lcore_id);
> > +		goto fail;
> > +	}
> > +
> > +
> > +	/* Set freq to max by default */
> > +	if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
> > +		RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u "
> > +				"to max\n", lcore_id);
> > +		goto fail;
> > +	}
> > +
> > +	RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u "
> > +			"power management\n", lcore_id);
> 
> Do we want this as INFO?
> 
> > +	rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_USED);
> > +
> > +	return 0;
> > +
> > +fail:
> > +	rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN);
> > +
> > +	return -1;
> > +}
> > +
> > +int
> > +power_pstate_cpufreq_exit(unsigned int lcore_id)
> > +{
> > +	struct pstate_power_info *pi;
> > +
> 
> <snip>
> 
> > +	/* Set the governor back to the original */
> > +	if (power_set_governor_original(pi) < 0) {
> > +		RTE_LOG(ERR, POWER, "Cannot set the governor of %u back "
> > +				"to the original\n", lcore_id);
> > +		goto fail;
> > +	}
> > +
> > +	RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from "
> > +			"'performance' mode and been set back to the "
> > +			"original\n", lcore_id);
> 
> Perhaps print out the original governor name? Also, i think it's better to
> use the performance string define, rather than having it as part of
> formatted message.
original governor name is alread print out by power_set_governor_original
> > +	rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_IDLE);
> > +
> > +	return 0;
> > +
> > +fail:
> > +	rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN);
> > +
> > +	return -1;
> > +}
> > +
> > +
> > +uint32_t
> > +power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num)
> > +{
> > +	struct pstate_power_info *pi;
> > +
> > +	if (lcore_id >= RTE_MAX_LCORE) {
> > +		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
> > +		return -1;
> > +	}
> > +
> > +	pi = &lcore_power_info[lcore_id];
> > +	if (num < pi->nb_freqs) {
> > +		RTE_LOG(ERR, POWER, "Buffer size is not enough\n");
> > +		return 0;
> > +	}
> > +	rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t));
> 
> Why rte_memcpy?
the table can be large
> > +
> > +	return pi->nb_freqs;
> > +}
> > +
> > +uint32_t
> > +power_pstate_cpufreq_get_freq(unsigned int lcore_id)
> > +{
> > +	if (lcore_id >= RTE_MAX_LCORE) {
> > +		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
> > +		return RTE_POWER_INVALID_FREQ_INDEX;
> > +	}
> > +
> > +	return lcore_power_info[lcore_id].curr_idx;
> > +}
> > +
> 
> <snip>
> 
> > +	caps->turbo = !!(pi->turbo_available);
> > +
> > +	return 0;
> > +}
> > diff --git a/lib/librte_power/power_pstate_cpufreq.h b/lib/librte_power/power_pstate_cpufreq.h
> > new file mode 100644
> > index 0000000..0fc917a
> > --- /dev/null
> > +++ b/lib/librte_power/power_pstate_cpufreq.h
> > @@ -0,0 +1,218 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2010-2018 Intel Corporation
> 
> I believe it should be just 2018.
> 
> > + */
> > +
> > +#ifndef _POWER_PSTATE_CPUFREQ_H
> > +#define _POWER_PSTATE_CPUFREQ_H
> > +
> > +/**
> > + * @file
> > + * RTE Power Management via Intel Pstate driver
> > + */
> > +
> > +#include <rte_common.h>
> > +#include <rte_byteorder.h>
> > +#include <rte_log.h>
> > +#include <rte_string_fns.h>
> > +#include "rte_power.h"
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> 
> I don't think this is necessary. These extern declarations are only for
> public API headers, so that C++ compilers could parse them. Since this is
> not a public header, there's no need for extern C declaration.
just keep as same as other headers 
> 
> > +
> > +/**
> > + * Initialize power management for a specific lcore. It will check and set the
> > + * governor to performance for the lcore, get the available frequencies, and
> > + * prepare to set new lcore frequency.
> > + *
> > + * @param lcore_id
> > + *  lcore id.
> > + *
> > + * @return
> > + *  - 0 on success.
> > + *  - Negative on error.
> > + */
> 
> <snip>
> 
> > + *
> > + * @return
> > + *  The number of available frequencies.
> > + */
> > +uint32_t power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs,
> > +		uint32_t num);
> > +
> > +/**
> > + * Return the current index of available frequencies of a specific lcore. It
> > + * will return 'RTE_POWER_INVALID_FREQ_INDEX = (~0)' if error.
> 
> This isn't a public API, so it's nitpicking, but any notes about return
> value should be in the @return section.
agree
> 
> > + * It should be protected outside of this function for threadsafe.
> > + *
> > + * @param lcore_id
> > + *  lcore id.
> > + *
> > + * @return
> > + *  The current index of available frequencies.
> > + */
> > +uint32_t power_pstate_cpufreq_get_freq(unsigned int lcore_id);
> > +
> > +/**
> 
> <snip>
> 
> >   #include "rte_power.h"
> >   #include "power_acpi_cpufreq.h"
> >   #include "power_kvm_vm.h"
> > +#include "power_pstate_cpufreq.h"
> >   #include "power_common.h"
> >   enum power_management_env global_default_env = PM_ENV_NOT_SET;
> > @@ -29,6 +30,8 @@ rte_power_get_capabilities_t rte_power_get_capabilities;
> >   int
> >   rte_power_set_env(enum power_management_env env)
> >   {
> > +
> > +
> 
> This is probably unintended whitespace change.
> 
agree
> >   	if (rte_atomic32_cmpset(&global_env_cfg_status, 0, 1) == 0) {
> >   		return 0;
> >   	}
> > @@ -56,6 +59,19 @@ rte_power_set_env(enum power_management_env env)
> >   		rte_power_freq_enable_turbo = power_kvm_vm_enable_turbo;
> >   		rte_power_freq_disable_turbo = power_kvm_vm_disable_turbo;
> >   		rte_power_get_capabilities = power_kvm_vm_get_capabilities;
> > +	} else if (env == PM_ENV_PSTATE_CPUFREQ) {
> > +		rte_power_freqs = power_pstate_cpufreq_freqs;
> > +		rte_power_get_freq = power_pstate_cpufreq_get_freq;
> 
> <snip>
> 
> > +	if (global_default_env == PM_ENV_KVM_VM)
> >   		return power_kvm_vm_init(lcore_id);
> > -	}
> > +
> > +	if (global_default_env == PM_ENV_PSTATE_CPUFREQ)
> > +		return power_pstate_cpufreq_init(lcore_id);
> > +
> 
> switch?
should be fine for only 3 enum type
> 
> >   	/* Auto detect Environment */
> >   	RTE_LOG(INFO, POWER, "Attempting to initialise ACPI cpufreq power "
> >   			"management...\n");
> > @@ -99,13 +118,23 @@ rte_power_init(unsigned int lcore_id)
> >   		goto out;
> >   	}
> > -	RTE_LOG(INFO, POWER, "Attempting to initialise VM power management...\n");
> > +	RTE_LOG(INFO, POWER, "Attempting to initialise PSTAT power "
> > +			"management...\n");
> > +	ret = power_pstate_cpufreq_init(lcore_id);
> > +	if (ret == 0) {
> > +		rte_power_set_env(PM_ENV_PSTATE_CPUFREQ);
> > +		goto out;
> > +	}
> > +
> > +	RTE_LOG(INFO, POWER, "Attempting to initialise VM power "
> > +			"management...\n");
> >   	ret = power_kvm_vm_init(lcore_id);
> >   	if (ret == 0) {
> >   		rte_power_set_env(PM_ENV_KVM_VM);
> >   		goto out;
> >   	}
> > -	RTE_LOG(ERR, POWER, "Unable to set Power Management Environment for lcore "
> > +	RTE_LOG(ERR, POWER, "Unable to set Power Management "
> > +			"Environment for lcore "
> >   			"%u\n", lcore_id);
> 
> Perhaps this could be moved to a separate function, so that the code could
> look like:
> 
> switch (global_default_env) {
> case PM_ENV_ACPI:
> 	return acpi_init();
> case PM_ENV_KVM:
> 	return kvm_init();
> case PM_ENV_PSTATE:
> 	return pstate_init();
> default:
> 	if (autodetect() < 0)
> 		RTE_LOG("error");
> 	return 0;
> }
> 
> Or something like that.

I can move to switch
> 
> >   out:
> >   	return ret;
> > @@ -118,6 +147,8 @@ rte_power_exit(unsigned int lcore_id)
> >   		return power_acpi_cpufreq_exit(lcore_id);
> >   	if (global_default_env == PM_ENV_KVM_VM)
> >   		return power_kvm_vm_exit(lcore_id);
> > +	if (global_default_env == PM_ENV_PSTATE_CPUFREQ)
> > +		return power_pstate_cpufreq_exit(lcore_id);
> 
> switch?
> 
> >   	RTE_LOG(ERR, POWER, "Environment has not been set, unable to exit "
> >   				"gracefully\n");
> > diff --git a/lib/librte_power/rte_power.h b/lib/librte_power/rte_power.h
> > index d70bc0b..c5e8f6b 100644
> > --- a/lib/librte_power/rte_power.h
> > +++ b/lib/librte_power/rte_power.h
> > @@ -20,7 +20,8 @@ extern "C" {
> >   #endif
> >   /* Power Management Environment State */
> > -enum power_management_env {PM_ENV_NOT_SET, PM_ENV_ACPI_CPUFREQ, PM_ENV_KVM_VM};
> > +enum power_management_env {PM_ENV_NOT_SET, PM_ENV_ACPI_CPUFREQ, PM_ENV_KVM_VM,
> > +		PM_ENV_PSTATE_CPUFREQ};
> 
> I don't like this approach. While it can be argued that application needs to
> be explicitly aware of whether it's using native (ACPI or pstate) vs. KVM
> power management, there's no real reason for an application to differentiate
> between ACPI and pstate modes.
> 
> Changing this would of course require a deprecation notice, so for now, can
> we hide all of this behind ACPI mode, and auto-detect whether we want ACPI
> or pstate mode? IMO it would be better for the user application to use a
> somewhat misnamed ACPI option for both ACPI and pstate modes, than to
> needlessly care about whether one or the other is in use.
> 
> What do you think?
acpi has significant difference with hwp. sysfs/xxxx/setspeed  sysfs/xxxx/scaling_driver is different
I would like to make  application aware the driver type. 
> 
> >   /**
> >    * Set the default power management implementation. If this is not called prior
> > 
> 
> -- 
> Thanks,
> Anatoly


More information about the dev mailing list