[PATCH v14 3/6] ptr_compress: add pointer compression library
Paul Szczepanek
paul.szczepanek at arm.com
Fri Jun 7 17:09:57 CEST 2024
Add a new utility header for compressing pointers. The provided
functions can store pointers as 32-bit or 16-bit offsets.
The compression takes advantage of the fact that pointers are
usually located in a limited memory region (like a mempool).
We can compress them by converting them to offsets from a base
memory address. Offsets can then be stored in fewer bytes (how many
is dictated by the size of the memory region and the alignment of
the pointer). For example: an 8-byte-aligned pointer that is part of
a 32GB memory pool can be stored in 4 bytes.
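The arithmetic, informally (an illustrative sketch; ptr is assumed to
be an 8-byte-aligned pointer into a pool of at most 32GB starting at
base):

  /* compress: offset from base, dropping 3 alignment bits */
  uint32_t offset = (uint32_t)(((uintptr_t)ptr - (uintptr_t)base) >> 3);
  /* decompress: restore the dropped bits and re-add the base */
  void *restored = (void *)((uintptr_t)base + ((uintptr_t)offset << 3));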
Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli at arm.com>
Signed-off-by: Paul Szczepanek <paul.szczepanek at arm.com>
Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.aligeri at arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli at arm.com>
Reviewed-by: Nathan Brown <nathan.brown at arm.com>
Reviewed-by: Jack Bond-Preston <jack.bond-preston at foss.arm.com>
Acked-by: Morten Brørup <mb at smartsharesystems.com>
---
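A minimal usage sketch (illustrative only, not part of the patch; base,
objs and n are assumed here, with n <= 64 and objects 8-byte aligned so
3 bits can be dropped from each offset):

  uint32_t offsets[64];
  void *restored[64];

  rte_ptr_compress_32_shift(base, objs, offsets, n, 3);
  /* pass the 32-bit offsets between cores instead of 64-bit pointers */
  rte_ptr_decompress_32_shift(base, offsets, restored, n, 3);

See "Pointer Compression" in doc/guides/prog_guide/ptr_compress_lib.rst
for the full example.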
MAINTAINERS | 4 +
doc/api/doxy-api-index.md | 1 +
doc/api/doxy-api.conf.in | 1 +
doc/guides/rel_notes/release_24_07.rst | 5 +
lib/meson.build | 1 +
lib/ptr_compress/meson.build | 4 +
lib/ptr_compress/rte_ptr_compress.h | 324 +++++++++++++++++++++++++
7 files changed, 340 insertions(+)
create mode 100644 lib/ptr_compress/meson.build
create mode 100644 lib/ptr_compress/rte_ptr_compress.h
diff --git a/MAINTAINERS b/MAINTAINERS
index c9adff9846..27b2f03e6c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1694,6 +1694,10 @@ M: Chenbo Xia <chenbox at nvidia.com>
M: Gaetan Rivet <grive at u256.net>
F: lib/pci/
+Pointer Compression
+M: Paul Szczepanek <paul.szczepanek at arm.com>
+F: lib/ptr_compress/
+
Power management
M: Anatoly Burakov <anatoly.burakov at intel.com>
M: David Hunt <david.hunt at intel.com>
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 8c1eb8fafa..f9283154f8 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -222,6 +222,7 @@ The public API headers are grouped by topics:
[config file](@ref rte_cfgfile.h),
[key/value args](@ref rte_kvargs.h),
[argument parsing](@ref rte_argparse.h),
+ [ptr_compress](@ref rte_ptr_compress.h),
[string](@ref rte_string_fns.h),
[thread](@ref rte_thread.h)
diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
index 27afec8b3b..a8823c046f 100644
--- a/doc/api/doxy-api.conf.in
+++ b/doc/api/doxy-api.conf.in
@@ -71,6 +71,7 @@ INPUT = @TOPDIR@/doc/api/doxy-api-index.md \
@TOPDIR@/lib/pipeline \
@TOPDIR@/lib/port \
@TOPDIR@/lib/power \
+ @TOPDIR@/lib/ptr_compress \
@TOPDIR@/lib/rawdev \
@TOPDIR@/lib/rcu \
@TOPDIR@/lib/regexdev \
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index a69f24cf99..4711792e61 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -55,6 +55,11 @@ New Features
Also, make sure to start the actual text at the margin.
=======================================================
+* **Introduced pointer compression library.**
+
+ The library provides functions to compress and decompress arrays of
+ pointers, which can improve application performance under certain
+ conditions. A performance test was added to help users evaluate
+ performance on their setup.
Removed Items
-------------
diff --git a/lib/meson.build b/lib/meson.build
index 7c90602bf5..63becee142 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,6 +14,7 @@ libraries = [
'argparse',
'telemetry', # basic info querying
'eal', # everything depends on eal
+ 'ptr_compress',
'ring',
'rcu', # rcu depends on ring
'mempool',
diff --git a/lib/ptr_compress/meson.build b/lib/ptr_compress/meson.build
new file mode 100644
index 0000000000..e92706a45f
--- /dev/null
+++ b/lib/ptr_compress/meson.build
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2024 Arm Limited
+
+headers = files('rte_ptr_compress.h')
diff --git a/lib/ptr_compress/rte_ptr_compress.h b/lib/ptr_compress/rte_ptr_compress.h
new file mode 100644
index 0000000000..bf9cfb0661
--- /dev/null
+++ b/lib/ptr_compress/rte_ptr_compress.h
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Arm Limited
+ */
+
+#ifndef RTE_PTR_COMPRESS_H
+#define RTE_PTR_COMPRESS_H
+
+/**
+ * @file
+ * Pointer compression and decompression functions.
+ *
+ * When passing arrays full of pointers between threads, memory containing
+ * the pointers is copied multiple times, which is especially costly between
+ * cores. These functions allow us to compress the pointers.
+ *
+ * Compression takes advantage of the fact that pointers are usually located in
+ * a limited memory region. We compress them by converting them to offsets from
+ * a base memory address. Offsets can be stored in fewer bytes.
+ *
+ * The compression functions come in two varieties: 32-bit and 16-bit.
+ *
+ * To determine how many bits are needed to compress the pointer, calculate
+ * the biggest offset possible (highest value pointer - base pointer)
+ * and shift the value right according to alignment (shift by exponent of the
+ * power of 2 of alignment: aligned by 4 - shift by 2, aligned by 8 - shift by
+ * 3, etc.). The resulting value must fit in either 32 or 16 bits.
+ *
+ * For usage example and further explanation please see "Pointer Compression" in
+ * doc/guides/prog_guide/ptr_compress_lib.rst
+ */
+
+#include <stdint.h>
+#include <inttypes.h>
+#include <limits.h> /* CHAR_BIT */
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_debug.h>
+#include <rte_vect.h>
+#include <rte_mempool.h>
+#include <rte_bitops.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Calculate how many bits are required to store a value within a given range.
+ * This is to help decide which pointer compression functions can be used to
+ * store pointers contained within a memory range.
+ *
+ * @param range
+ * The size of the range the value belongs to.
+ * @return
+ * Number of bits required to store a value.
+ **/
+#define RTE_PTR_COMPRESS_BITS_REQUIRED_TO_STORE_VALUE_IN_RANGE(range) \
+	(((uint64_t)(range)) < 2 ? 1 : \
+	(sizeof(uint64_t) * CHAR_BIT - rte_clz64((uint64_t)(range) - 1)))
+
+/**
+ * Calculate how many bits in the address can be dropped without losing any
+ * information thanks to the alignment of the address.
+ *
+ * @param alignment
+ * Memory alignment.
+ * @return
+ * Size of shift allowed without dropping any information from the pointer.
+ **/
+#define RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(alignment) \
+	((alignment) == 0 ? 0 : rte_ctz64((uint64_t)(alignment)))
+
+/**
+ * Determine if rte_ptr_compress_16_shift can be used to compress pointers
+ * that contain addresses of memory objects whose memory is aligned by
+ * a given amount and contained in a given memory range.
+ *
+ * @param mem_range
+ * The size of the memory region that contains the objects pointed to.
+ * @param obj_alignment
+ * The alignment of objects pointed to.
+ * @return
+ * 1 if function can be used, 0 otherwise.
+ **/
+#define RTE_PTR_COMPRESS_CAN_COMPRESS_16_SHIFT(mem_range, obj_alignment) \
+ ((RTE_PTR_COMPRESS_BITS_REQUIRED_TO_STORE_VALUE_IN_RANGE(mem_range) - \
+ RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_alignment)) <= 16 ? 1 : 0)
+
+/**
+ * Determine if rte_ptr_compress_32_shift can be used to compress pointers
+ * that contain addresses of memory objects whose memory is aligned by
+ * a given amount and contained in a given memory range.
+ *
+ * @param mem_range
+ * The size of the memory region that contains the objects pointed to.
+ * @param obj_alignment
+ * The alignment of objects pointed to.
+ * @return
+ * 1 if function can be used, 0 otherwise.
+ **/
+#define RTE_PTR_COMPRESS_CAN_COMPRESS_32_SHIFT(mem_range, obj_alignment) \
+ ((RTE_PTR_COMPRESS_BITS_REQUIRED_TO_STORE_VALUE_IN_RANGE(mem_range) - \
+ RTE_PTR_COMPRESS_BIT_SHIFT_FROM_ALIGNMENT(obj_alignment)) <= 32 ? 1 : 0)
+
+/**
+ * Compress pointers into 32-bit offsets from base pointer.
+ *
+ * @note It is the programmer's responsibility to ensure the resulting offsets fit
+ * into 32 bits. Alignment of the structures pointed to by the pointers allows
+ * us to drop bits from the offsets. This is controlled by the bit_shift
+ * parameter. This means that if structures are aligned by 8 bytes they must be
+ * within 32GB of the base pointer. If there is no such alignment guarantee they
+ * must be within 4GB.
+ *
+ * @param ptr_base
+ * A pointer used to calculate offsets of pointers in src_table.
+ * @param src_table
+ * A pointer to an array of pointers.
+ * @param dest_table
+ * A pointer to an array of compressed pointers returned by this function.
+ * @param n
+ * The number of objects to compress, must be strictly positive.
+ * @param bit_shift
+ * Byte alignment of memory pointed to by the pointers allows for
+ * bits to be dropped from the offset and hence widen the memory region that
+ * can be covered. This controls how many bits are right shifted.
+ **/
+static __rte_always_inline void
+rte_ptr_compress_32_shift(void *ptr_base, void * const *src_table,
+ uint32_t *dest_table, size_t n, uint8_t bit_shift)
+{
+ size_t i = 0;
+#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
+ svuint64_t v_ptr_table;
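+	/* predicated loop: svwhilelt_b64 masks lanes beyond n, no tail needed */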
+ do {
+ svbool_t pg = svwhilelt_b64(i, n);
+ v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
+ v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
+ v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
+ svst1w(pg, &dest_table[i], v_ptr_table);
+ i += svcntd();
+ } while (i < n);
+#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
+ uint64_t ptr_diff;
+ uint64x2_t v_ptr_table;
+ /* right shift is done by left shifting by negative int */
+ int64x2_t v_shift = vdupq_n_s64(-bit_shift);
+ uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
+ const size_t n_even = n & ~0x1;
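+	/* two pointers per iteration; vqmovn_u64 narrows each offset to 32 bits */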
+ for (; i < n_even; i += 2) {
+ v_ptr_table = vld1q_u64((const uint64_t *)src_table + i);
+ v_ptr_table = vsubq_u64(v_ptr_table, v_ptr_base);
+ v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
+ vst1_u32(dest_table + i, vqmovn_u64(v_ptr_table));
+ }
+	/* process the leftover single item if n is odd */
+ if (unlikely(n & 0x1)) {
+ ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
+ dest_table[i] = (uint32_t) (ptr_diff >> bit_shift);
+ }
+#else
+ uintptr_t ptr_diff;
+ for (; i < n; i++) {
+ ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
+ ptr_diff = ptr_diff >> bit_shift;
+ RTE_ASSERT(ptr_diff <= UINT32_MAX);
+ dest_table[i] = (uint32_t) ptr_diff;
+ }
+#endif
+}
+
+/**
+ * Decompress pointers from 32-bit offsets from base pointer.
+ *
+ * @param ptr_base
+ * A pointer which was used to calculate offsets in src_table.
+ * @param src_table
+ * A pointer to an array of compressed pointers.
+ * @param dest_table
+ * A pointer to an array of decompressed pointers returned by this function.
+ * @param n
+ * The number of objects to decompress, must be strictly positive.
+ * @param bit_shift
+ * Byte alignment of memory pointed to by the pointers allows for
+ * bits to be dropped from the offset and hence widen the memory region that
+ * can be covered. This controls how many bits are left shifted when pointers
+ * are recovered from the offsets.
+ **/
+static __rte_always_inline void
+rte_ptr_decompress_32_shift(void *ptr_base, uint32_t const *src_table,
+ void **dest_table, size_t n, uint8_t bit_shift)
+{
+ size_t i = 0;
+#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
+ svuint64_t v_ptr_table;
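+	/* svld1uw_u64 zero-extends each stored 32-bit offset to 64 bits on load */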
+ do {
+ svbool_t pg = svwhilelt_b64(i, n);
+ v_ptr_table = svld1uw_u64(pg, &src_table[i]);
+ v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
+ v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
+ svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
+ i += svcntd();
+ } while (i < n);
+#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
+ uint64_t ptr_diff;
+ uint64x2_t v_ptr_table;
+ int64x2_t v_shift = vdupq_n_s64(bit_shift);
+ uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
+ const size_t n_even = n & ~0x1;
+ for (; i < n_even; i += 2) {
+ v_ptr_table = vmovl_u32(vld1_u32(src_table + i));
+ v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
+ v_ptr_table = vaddq_u64(v_ptr_table, v_ptr_base);
+ vst1q_u64((uint64_t *)dest_table + i, v_ptr_table);
+ }
+	/* process the leftover single item if n is odd */
+ if (unlikely(n & 0x1)) {
+ ptr_diff = ((uint64_t) src_table[i]) << bit_shift;
+ dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
+ }
+#else
+ uintptr_t ptr_diff;
+ for (; i < n; i++) {
+ ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
+ dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
+ }
+#endif
+}
+
+/**
+ * Compress pointers into 16-bit offsets from base pointer.
+ *
+ * @note It is the programmer's responsibility to ensure the resulting offsets fit
+ * into 16 bits. Alignment of the structures pointed to by the pointers allows
+ * us to drop bits from the offsets. This is controlled by the bit_shift
+ * parameter. This means that if structures are aligned by 8 bytes they must be
+ * within 256KB of the base pointer. If there is no such alignment guarantee
+ * they must be within 64KB.
+ *
+ * @param ptr_base
+ * A pointer used to calculate offsets of pointers in src_table.
+ * @param src_table
+ * A pointer to an array of pointers.
+ * @param dest_table
+ * A pointer to an array of compressed pointers returned by this function.
+ * @param n
+ * The number of objects to compress, must be strictly positive.
+ * @param bit_shift
+ * Byte alignment of memory pointed to by the pointers allows for
+ * bits to be dropped from the offset and hence widen the memory region that
+ * can be covered. This controls how many bits are right shifted.
+ **/
+static __rte_always_inline void
+rte_ptr_compress_16_shift(void *ptr_base, void * const *src_table,
+ uint16_t *dest_table, size_t n, uint8_t bit_shift)
+{
+ size_t i = 0;
+#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
+ svuint64_t v_ptr_table;
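+	/* as in the 32-bit variant, but svst1h stores only the low 16 bits */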
+ do {
+ svbool_t pg = svwhilelt_b64(i, n);
+ v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
+ v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
+ v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
+ svst1h(pg, &dest_table[i], v_ptr_table);
+ i += svcntd();
+ } while (i < n);
+#else
+ uintptr_t ptr_diff;
+ for (; i < n; i++) {
+ ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
+ ptr_diff = ptr_diff >> bit_shift;
+ RTE_ASSERT(ptr_diff <= UINT16_MAX);
+ dest_table[i] = (uint16_t) ptr_diff;
+ }
+#endif
+}
+
+/**
+ * Decompress pointers from 16-bit offsets from base pointer.
+ *
+ * @param ptr_base
+ * A pointer which was used to calculate offsets in src_table.
+ * @param src_table
+ * A pointer to an array of compressed pointers.
+ * @param dest_table
+ * A pointer to an array of decompressed pointers returned by this function.
+ * @param n
+ * The number of objects to decompress, must be strictly positive.
+ * @param bit_shift
+ * Byte alignment of memory pointed to by the pointers allows for
+ * bits to be dropped from the offset and hence widen the memory region that
+ * can be covered. This controls how many bits are left shifted when pointers
+ * are recovered from the offsets.
+ **/
+static __rte_always_inline void
+rte_ptr_decompress_16_shift(void *ptr_base, uint16_t const *src_table,
+ void **dest_table, size_t n, uint8_t bit_shift)
+{
+ size_t i = 0;
+#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
+ svuint64_t v_ptr_table;
+ do {
+ svbool_t pg = svwhilelt_b64(i, n);
+ v_ptr_table = svld1uh_u64(pg, &src_table[i]);
+ v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
+ v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
+ svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
+ i += svcntd();
+ } while (i < n);
+#else
+ uintptr_t ptr_diff;
+ for (; i < n; i++) {
+ ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
+ dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
+ }
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_PTR_COMPRESS_H */
--
2.25.1