[PATCH v9 1/2] power: introduce PM QoS API on CPU wide
lihuisong (C)
lihuisong at huawei.com
Tue Sep 10 11:32:11 CEST 2024
Hi Chengwen,
Thanks for your review.
在 2024/9/10 10:00, fengchengwen 写道:
> Hi Huisong
>
> Please see comments inline.
>
> Thanks
>
> On 2024/8/9 17:50, Huisong Li wrote:
>> The deeper the idle state, the lower the power consumption, but the longer
>> the resume time. Some services are delay sensitive and expect a very low
>> resume time, such as interrupt packet receiving mode.
>>
>> And the "/sys/devices/system/cpu/cpuX/power/pm_qos_resume_latency_us" sysfs
>> interface is used to set and get the resume latency limit on the cpuX for
>> userspace. Each cpuidle governor in Linux selects which idle state to enter
>> based on this CPU resume latency in its idle task.
>>
>> The per-CPU PM QoS API can be used to control this CPU's idle state
>> selection and restrict it to the shallowest idle state, lowering the delay
>> after sleep, by setting a strict resume latency (zero value).
>>
>> Signed-off-by: Huisong Li <lihuisong at huawei.com>
>> Acked-by: Morten Brørup <mb at smartsharesystems.com>
>> ---
> ...
>
>> diff --git a/lib/power/rte_power_qos.c b/lib/power/rte_power_qos.c
>> new file mode 100644
>> index 0000000000..375746f832
>> --- /dev/null
>> +++ b/lib/power/rte_power_qos.c
>> @@ -0,0 +1,114 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2024 HiSilicon Limited
>> + */
>> +
>> +#include <errno.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +
>> +#include <rte_lcore.h>
>> +#include <rte_log.h>
>> +
>> +#include "power_common.h"
>> +#include "rte_power_qos.h"
>> +
>> +#define PM_QOS_SYSFILE_RESUME_LATENCY_US \
>> + "/sys/devices/system/cpu/cpu%u/power/pm_qos_resume_latency_us"
>> +
>> +int
>> +rte_power_qos_set_cpu_resume_latency(uint16_t lcore_id, int latency)
>> +{
>> + char buf[LINE_MAX];
> No need for LINE_MAX; [32] would be enough.
Ack
>
>> + FILE *f;
>> + int ret;
>> +
>> + if (!rte_lcore_is_enabled(lcore_id)) {
>> + POWER_LOG(ERR, "lcore id %u is not enabled", lcore_id);
>> + return -EINVAL;
>> + }
>> +
>> + if (latency < 0) {
>> + POWER_LOG(ERR, "latency should be greater than and equal to 0");
>> + return -EINVAL;
>> + }
>> +
>> + ret = open_core_sysfs_file(&f, "w", PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id);
>> + if (ret != 0) {
>> + POWER_LOG(ERR, "Failed to open "PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id);
>> + return ret;
>> + }
>> +
>> + /*
>> + * Based on the sysfs interface pm_qos_resume_latency_us under
>> + * @PM_QOS_SYSFILE_RESUME_LATENCY_US directory in kernel, their meanning
> meanning -> meaning
Ack
>
>> + * is as follows for different input string.
>> + * 1> the resume latency is 0 if the input is "n/a".
>> + * 2> the resume latency is no constraint if the input is "0".
>> + * 3> the resume latency is the actual value to be set.
>> + */
>> + if (latency == 0)
>> + snprintf(buf, sizeof(buf), "%s", "n/a");
>> + else if (latency == RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT)
>> + snprintf(buf, sizeof(buf), "%u", 0);
>> + else
>> + snprintf(buf, sizeof(buf), "%u", latency);
>> +
>> + ret = write_core_sysfs_s(f, buf);
>> + if (ret != 0) {
>> + POWER_LOG(ERR, "Failed to write "PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id);
>> + goto out;
> no need of goto
Ack
>
>> + }
>> +
>> +out:
>> + if (f != NULL)
>> + fclose(f);
> just fclose(f) because f is valid here.
Ack
>> +
>> + return ret;
>> +}
>> +
>> +int
>> +rte_power_qos_get_cpu_resume_latency(uint16_t lcore_id)
>> +{
>> + char buf[LINE_MAX];
>> + int latency = -1;
>> + FILE *f;
>> + int ret;
>> +
>> + if (!rte_lcore_is_enabled(lcore_id)) {
>> + POWER_LOG(ERR, "lcore id %u is not enabled", lcore_id);
>> + return -EINVAL;
>> + }
>> +
>> + ret = open_core_sysfs_file(&f, "r", PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id);
>> + if (ret != 0) {
>> + POWER_LOG(ERR, "Failed to open "PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id);
>> + return ret;
>> + }
>> +
>> + ret = read_core_sysfs_s(f, buf, sizeof(buf));
>> + if (ret != 0) {
>> + POWER_LOG(ERR, "Failed to read "PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id);
>> + goto out;
>> + }
>> +
>> + /*
>> + * Based on the sysfs interface pm_qos_resume_latency_us under
>> + * @PM_QOS_SYSFILE_RESUME_LATENCY_US directory in kernel, their meanning
> meanning -> meaning
Ack
>
>> + * is as follows for different output string.
>> + * 1> the resume latency is 0 if the output is "n/a".
>> + * 2> the resume latency is no constraint if the output is "0".
>> + * 3> the resume latency is the actual value in used for other string.
>> + */
>> + if (strcmp(buf, "n/a") == 0)
>> + latency = 0;
>> + else {
>> + latency = strtoul(buf, NULL, 10);
>> + latency = latency == 0 ? RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT : latency;
>> + }
>> +
>> +out:
>> + if (f != NULL)
>> + fclose(f);
> just fclose(f) because f is valid here.
Ack
>
>> +
>> + return latency != -1 ? latency : ret;
>> +}
>> diff --git a/lib/power/rte_power_qos.h b/lib/power/rte_power_qos.h
>> new file mode 100644
>> index 0000000000..990c488373
>> --- /dev/null
>> +++ b/lib/power/rte_power_qos.h
>> @@ -0,0 +1,73 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2024 HiSilicon Limited
>> + */
>> +
>> +#ifndef RTE_POWER_QOS_H
>> +#define RTE_POWER_QOS_H
>> +
>> +#include <stdint.h>
>> +
>> +#include <rte_compat.h>
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +/**
>> + * @file rte_power_qos.h
>> + *
>> + * PM QoS API.
>> + *
>> + * The CPU-wide resume latency limit has a positive impact on this CPU's idle
>> + * state selection in each cpuidle governor.
>> + * Please see the PM QoS on CPU wide in the following link:
>> + * https://www.kernel.org/doc/html/latest/admin-guide/abi-testing.html?highlight=pm_qos_resume_latency_us#abi-sys-devices-power-pm-qos-resume-latency-us
>> + *
>> + * The deeper the idle state, the lower the power consumption, but the
>> + * longer the resume time. Some services are delay sensitive and expect a very
>> + * low resume time, such as interrupt packet receiving mode.
>> + *
>> + * In these cases, the per-CPU PM QoS API can be used to control this CPU's idle
>> + * state selection and restrict it to the shallowest idle state, lowering the
>> + * delay after sleep, by setting a strict resume latency (zero value).
>> + */
>> +
>> +#define RTE_POWER_QOS_STRICT_LATENCY_VALUE 0
>> +#define RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT ((int)(UINT32_MAX >> 1))
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * @param lcore_id
>> + * target logical core id
>> + *
>> + * @param latency
>> + * The latency should be greater than or equal to zero, in microseconds.
>> + *
>> + * @return
>> + * 0 on success. Otherwise negative value is returned.
>> + */
>> +__rte_experimental
>> +int rte_power_qos_set_cpu_resume_latency(uint16_t lcore_id, int latency);
>> +
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice.
>> + *
>> + * Get the current resume latency of this logical core.
>> + * The default value in the kernel is @see RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT
>> + * if it has not been set.
>> + *
>> + * @return
>> + * Negative value on failure.
>> + * >= 0 means the actual resume latency limit on this core.
>> + */
>> +__rte_experimental
>> +int rte_power_qos_get_cpu_resume_latency(uint16_t lcore_id);
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* RTE_POWER_QOS_H */
>> diff --git a/lib/power/version.map b/lib/power/version.map
>> index c9a226614e..4e4955a4cf 100644
>> --- a/lib/power/version.map
>> +++ b/lib/power/version.map
>> @@ -51,4 +51,8 @@ EXPERIMENTAL {
>> rte_power_set_uncore_env;
>> rte_power_uncore_freqs;
>> rte_power_unset_uncore_env;
>> +
>> + # added in 24.11
>> + rte_power_qos_set_cpu_resume_latency;
>> + rte_power_qos_get_cpu_resume_latency;
> Please order these alphabetically.
Ack
>
> Another question: I think renaming cpu to core may be more accurate. Although sysfs exports it as cpu, in DPDK it means core.
> There are also some rte_power_core_xxx names in the rte_power library, so I think it is better to stay consistent with them.
Firstly, the rte_power_qos_set/get_cpu_resume_latency naming is simply consistent
with the Linux sysfs interface. Having the same name makes it easier for users to relate the two.
In addition, Sivaprasad Tummala is reworking the power library, and the
rte_power_core_xxx names might also be changed.
>
>> };
>>
> .
More information about the dev
mailing list