[dpdk-dev] [RFC v2 1/5] eal: add power management intrinsics

Liang Ma liang.j.ma at intel.com
Tue Aug 11 12:27:42 CEST 2020


Add two new power management intrinsics, and provide an implementation
in eal/x86 based on UMONITOR/UMWAIT instructions. The instructions
are implemented as raw byte opcodes because there is not yet widespread
compiler support for these instructions.

The power management instructions provide an architecture-specific
function to either wait until a specified TSC timestamp is reached, or
optionally wait until either a TSC timestamp is reached or a memory
location is written to. The monitor function also provides an optional
comparison, to avoid sleeping when the expected write has already
happened, and no more writes are expected.

Signed-off-by: Liang Ma <liang.j.ma at intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
---
 .../include/generic/rte_power_intrinsics.h    |  64 ++++++++
 lib/librte_eal/include/meson.build            |   1 +
 lib/librte_eal/x86/include/meson.build        |   1 +
 lib/librte_eal/x86/include/rte_cpuflags.h     |   1 +
 .../x86/include/rte_power_intrinsics.h        | 138 ++++++++++++++++++
 lib/librte_eal/x86/rte_cpuflags.c             |   2 +
 6 files changed, 207 insertions(+)
 create mode 100644 lib/librte_eal/include/generic/rte_power_intrinsics.h
 create mode 100644 lib/librte_eal/x86/include/rte_power_intrinsics.h

diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h b/lib/librte_eal/include/generic/rte_power_intrinsics.h
new file mode 100644
index 000000000..8646c4ac1
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_INTRINSIC_H_
+#define _RTE_POWER_INTRINSIC_H_
+
+#include <inttypes.h>
+
+/**
+ * @file
+ * Advanced power management operations.
+ *
+ * This file define APIs for advanced power management,
+ * which are architecture-dependent.
+ */
+
+/**
+ * Monitor specific address for changes. This will cause the CPU to enter an
+ * architecture-defined optimized power state until either the specified
+ * memory address is written to, or a certain TSC timestamp is reached.
+ *
+ * Additionally, an `expected` 64-bit value and 64-bit mask are provided. If
+ * mask is non-zero, the current value pointed to by the `p` pointer will be
+ * checked against the expected value, and if they match, the entering of
+ * optimized power state may be aborted.
+ *
+ * @param p
+ *   Address to monitor for changes. Must be aligned on an 8-byte boundary.
+ * @param expected_value
+ *   Before attempting the monitoring, the `p` address may be read and compared
+ *   against this value. If `value_mask` is zero, this step will be skipped.
+ * @param value_mask
+ *   The 64-bit mask to use to extract current value from `p`.
+ * @param state
+ *   Architecture-dependent optimized power state number
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   Architecture-dependent return value.
+ */
+static inline int rte_power_monitor(const volatile void *p,
+		const uint64_t expected_value, const uint64_t value_mask,
+		const uint32_t state, const uint64_t tsc_timestamp);
+
+/**
+ * Enter an architecture-defined optimized power state until a certain TSC
+ * timestamp is reached.
+ *
+ * @param state
+ *   Architecture-dependent optimized power state number
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   Architecture-dependent return value.
+ */
+static inline int rte_power_pause(const uint32_t state,
+		const uint64_t tsc_timestamp);
+
+#endif /* _RTE_POWER_INTRINSIC_H_ */
diff --git a/lib/librte_eal/include/meson.build b/lib/librte_eal/include/meson.build
index cd0902795..3a12e87e1 100644
--- a/lib/librte_eal/include/meson.build
+++ b/lib/librte_eal/include/meson.build
@@ -60,6 +60,7 @@ generic_headers = files(
 	'generic/rte_memcpy.h',
 	'generic/rte_pause.h',
 	'generic/rte_prefetch.h',
+	'generic/rte_power_intrinsics.h',
 	'generic/rte_rwlock.h',
 	'generic/rte_spinlock.h',
 	'generic/rte_ticketlock.h',
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index f0e998c2f..494a8142a 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -13,6 +13,7 @@ arch_headers = files(
 	'rte_io.h',
 	'rte_memcpy.h',
 	'rte_prefetch.h',
+	'rte_power_intrinsics.h',
 	'rte_pause.h',
 	'rte_rtm.h',
 	'rte_rwlock.h',
diff --git a/lib/librte_eal/x86/include/rte_cpuflags.h b/lib/librte_eal/x86/include/rte_cpuflags.h
index c1d20364d..94d6a4376 100644
--- a/lib/librte_eal/x86/include/rte_cpuflags.h
+++ b/lib/librte_eal/x86/include/rte_cpuflags.h
@@ -110,6 +110,7 @@ enum rte_cpu_flag_t {
 	RTE_CPUFLAG_RDTSCP,                 /**< RDTSCP */
 	RTE_CPUFLAG_EM64T,                  /**< EM64T */
 
+	RTE_CPUFLAG_WAITPKG,                 /**< UMINITOR/UMWAIT/TPAUSE */
 	/* (EAX 80000007h) EDX features */
 	RTE_CPUFLAG_INVTSC,                 /**< INVTSC */
 
diff --git a/lib/librte_eal/x86/include/rte_power_intrinsics.h b/lib/librte_eal/x86/include/rte_power_intrinsics.h
new file mode 100644
index 000000000..af8aa9459
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_power_intrinsics.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_INTRINSIC_X86_64_H_
+#define _RTE_POWER_INTRINSIC_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_atomic.h>
+#include <rte_common.h>
+
+#include "generic/rte_power_intrinsics.h"
+
+/**
+ * Monitor specific address for changes. This will cause the CPU to enter an
+ * architecture-defined optimized power state until either the specified
+ * memory address is written to, or a certain TSC timestamp is reached.
+ *
+ * Additionally, an `expected` 64-bit value and 64-bit mask are provided. If
+ * mask is non-zero, the current value pointed to by the `p` pointer will be
+ * checked against the expected value, and if they match, the entering of
+ * optimized power state may be aborted.
+ *
+ * This function uses UMONITOR/UMWAIT instructions. For more information about
+ * their usage, please refer to Intel(R) 64 and IA-32 Architectures Software
+ * Developer's Manual.
+ *
+ * @param p
+ *   Address to monitor for changes. Must be aligned on an 8-byte boundary.
+ * @param expected_value
+ *   Before attempting the monitoring, the `p` address may be read and compared
+ *   against this value. If `value_mask` is zero, this step will be skipped.
+ * @param value_mask
+ *   The 64-bit mask to use to extract current value from `p`.
+ * @param state
+ *   Architecture-dependent optimized power state number. Can be 0 (C0.2) or
+ *   1 (C0.1).
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for.
+ *
+ * @return
+ *   - 1 if wakeup was due to TSC timeout expiration.
+ *   - 0 if wakeup was due to memory write or other reasons.
+ */
+static inline int rte_power_monitor(const volatile void *p,
+		const uint64_t expected_value, const uint64_t value_mask,
+		const uint32_t state, const uint64_t tsc_timestamp)
+{
+	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
+	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
+
+#ifdef RTE_ARCH_I686
+	uint32_t rflags;
+#else
+	uint64_t rflags;
+#endif
+	/*
+	 * we're using raw byte codes for now as only the newest compiler
+	 * versions support this instruction natively.
+	 */
+
+	/* set address for UMONITOR */
+	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
+			:
+			: "D"(p));
+	rte_mb();
+	if (value_mask) {
+		const uint64_t cur_value = *(const volatile uint64_t *)p;
+		const uint64_t masked = cur_value & value_mask;
+		/* if the masked value is already matching, abort */
+		if (masked == expected_value)
+			return 0;
+	}
+	/* execute UMWAIT */
+	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;\n"
+		/*
+		 * UMWAIT sets CF flag in RFLAGS, so PUSHF to push them
+		 * onto the stack, then pop them back into `rflags` so that
+		 * we can read it.
+		 */
+		"pushf;\n"
+		"pop %0;\n"
+		: "=r"(rflags)
+		: "D"(state), "a"(tsc_l), "d"(tsc_h));
+
+	/* we're interested in the first bit (the carry flag) */
+	return rflags & 0x1;
+}
+
+/**
+ * Enter an architecture-defined optimized power state until a certain TSC
+ * timestamp is reached.
+ *
+ * This function uses TPAUSE instruction. For more information about its usage,
+ * please refer to Intel(R) 64 and IA-32 Architectures Software Developer's
+ * Manual.
+ *
+ * @param state
+ *   Architecture-dependent optimized power state number. Can be 0 (C0.2) or
+ *   1 (C0.1).
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for.
+ *
+ * @return
+ *   - 1 if wakeup was due to TSC timeout expiration.
+ *   - 0 if wakeup was due to other reasons.
+ */
+static inline int rte_power_pause(const uint32_t state,
+		const uint64_t tsc_timestamp)
+{
+	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
+	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
+	uint64_t rflags;
+
+	/* execute TPAUSE */
+	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;\n"
+		     /*
+		      * TPAUSE sets CF flag in RFLAGS, so PUSHF to push them
+		      * onto the stack, then pop them back into `rflags` so that
+		      * we can read it.
+		      */
+		     "pushf;\n"
+		     "pop %0;\n"
+		     : "=r"(rflags)
+		     : "D"(state), "a"(tsc_l), "d"(tsc_h));
+
+	/* we're interested in the first bit (the carry flag) */
+	return rflags & 0x1;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_POWER_INTRINSIC_X86_64_H_ */
diff --git a/lib/librte_eal/x86/rte_cpuflags.c b/lib/librte_eal/x86/rte_cpuflags.c
index 30439e795..0325c4b93 100644
--- a/lib/librte_eal/x86/rte_cpuflags.c
+++ b/lib/librte_eal/x86/rte_cpuflags.c
@@ -110,6 +110,8 @@ const struct feature_entry rte_cpu_feature_table[] = {
 	FEAT_DEF(AVX512F, 0x00000007, 0, RTE_REG_EBX, 16)
 	FEAT_DEF(RDSEED, 0x00000007, 0, RTE_REG_EBX, 18)
 
+	FEAT_DEF(WAITPKG, 0x00000007, 0, RTE_REG_ECX, 5)
+
 	FEAT_DEF(LAHF_SAHF, 0x80000001, 0, RTE_REG_ECX,  0)
 	FEAT_DEF(LZCNT, 0x80000001, 0, RTE_REG_ECX,  4)
 
-- 
2.17.1



More information about the dev mailing list