[RFC v4 3/3] app/test: add fastmem test suite
Mattias Rönnblom
hofors at lysator.liu.se
Sat May 30 11:26:34 CEST 2026
Add functional, performance, and profiling test suites for the
fastmem library.
--
RFC v4:
* Add tests for handle alloc/free from uncached lcores and
non-EAL threads.
* Add tests that statistics survive cache flush.
* Add test for shared-cache statistics.
* Refactor tests to use per-test setup/teardown.
RFC v3:
* Add realloc test cases (same class, grow, shrink, NULL ptr,
zero size, too big, invalid align).
* Merge lifecycle and functional test suites into one.
* Suppress -Wuse-after-free in test_alloc_reuse (intentional
pointer comparison after free).
RFC v2:
* Add test_alloc_cross_socket_deinit exercising cross-socket
teardown path.
* Remove trailing double blank lines in test_fastmem.c.
Signed-off-by: Mattias Rönnblom <hofors at lysator.liu.se>
---
app/test/meson.build | 3 +
app/test/test_fastmem.c | 2111 +++++++++++++++++++++++++++++++
app/test/test_fastmem_perf.c | 1040 +++++++++++++++
app/test/test_fastmem_profile.c | 157 +++
4 files changed, 3311 insertions(+)
create mode 100644 app/test/test_fastmem.c
create mode 100644 app/test/test_fastmem_perf.c
create mode 100644 app/test/test_fastmem_profile.c
diff --git a/app/test/meson.build b/app/test/meson.build
index 3f9340f2f5..fe375e97f3 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -82,6 +82,9 @@ source_file_deps = {
'test_event_vector_adapter.c': ['eventdev', 'bus_vdev'],
'test_eventdev.c': ['eventdev', 'bus_vdev'],
'test_external_mem.c': [],
+ 'test_fastmem.c': ['fastmem'],
+ 'test_fastmem_perf.c': ['fastmem', 'mempool'],
+ 'test_fastmem_profile.c': ['fastmem'],
'test_fbarray.c': [],
'test_fib.c': ['net', 'fib'],
'test_fib6.c': ['rib', 'fib'],
diff --git a/app/test/test_fastmem.c b/app/test/test_fastmem.c
new file mode 100644
index 0000000000..24ba1e671a
--- /dev/null
+++ b/app/test/test_fastmem.c
@@ -0,0 +1,2111 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Ericsson AB
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdalign.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_thread.h>
+
+#include <rte_fastmem.h>
+
+#include "test.h"
+
+#define FASTMEM_MEMZONE_SIZE (128U << 20)
+
+/*
+ * Count memzones whose names begin with the fastmem prefix.
+ * Used to verify that rte_fastmem_reserve() really did reserve
+ * backing memzones.
+ */
+static int fastmem_memzone_count;
+
+/*
+ * rte_memzone_walk() callback: increment fastmem_memzone_count for
+ * each memzone whose name starts with "fastmem_". The walk argument
+ * is not used.
+ */
+static void
+count_fastmem_memzones_walk(const struct rte_memzone *mz, void *arg)
+{
+	static const char prefix[] = "fastmem_";
+
+	RTE_SET_USED(arg);
+
+	/* sizeof() includes the terminating NUL; compare the prefix only. */
+	if (strncmp(mz->name, prefix, sizeof(prefix) - 1) != 0)
+		return;
+
+	fastmem_memzone_count++;
+}
+
+/*
+ * Walk every memzone and return the number whose name carries the
+ * fastmem prefix. The tally is accumulated in the file-scope
+ * counter by the walk callback.
+ */
+static unsigned int
+count_fastmem_memzones(void)
+{
+	unsigned int n_zones;
+
+	fastmem_memzone_count = 0;
+
+	rte_memzone_walk(count_fastmem_memzones_walk, NULL);
+
+	n_zones = fastmem_memzone_count;
+
+	return n_zones;
+}
+
+/* Two consecutive init/deinit cycles must both succeed. */
+static int
+test_init_deinit(void)
+{
+	int ret = rte_fastmem_init();
+
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_init() failed: %d", ret);
+	rte_fastmem_deinit();
+
+	/* The library must be usable again after a deinit. */
+	ret = rte_fastmem_init();
+	TEST_ASSERT_EQUAL(ret, 0, "second rte_fastmem_init() failed: %d", ret);
+	rte_fastmem_deinit();
+
+	return TEST_SUCCESS;
+}
+
+/* Initializing twice without a deinit in between must yield -EBUSY. */
+static int
+test_init_is_not_idempotent(void)
+{
+	int first, second;
+
+	first = rte_fastmem_init();
+	TEST_ASSERT_EQUAL(first, 0, "rte_fastmem_init() failed: %d", first);
+
+	second = rte_fastmem_init();
+	TEST_ASSERT_EQUAL(second, -EBUSY,
+		"expected -EBUSY on re-init, got %d", second);
+
+	rte_fastmem_deinit();
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Call rte_fastmem_deinit() and report success. Presumably run with
+ * the library uninitialized, as the test name suggests; the suite
+ * setup is not visible here -- TODO confirm.
+ */
+static int
+test_deinit_without_init(void)
+{
+ /* Must be a no-op, not a crash. */
+ rte_fastmem_deinit();
+
+ return TEST_SUCCESS;
+}
+
+/* rte_fastmem_max_size() must report no less than 1 MiB. */
+static int
+test_max_size(void)
+{
+	const size_t largest = rte_fastmem_max_size();
+
+	TEST_ASSERT(largest >= (1U << 20),
+		"max_size=%zu below required 1 MiB minimum", largest);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Reserving a single byte must add exactly one backing memzone, and
+ * a deinit must leave no fastmem memzones behind.
+ */
+static int
+test_reserve_small(void)
+{
+	unsigned int n_before, n_after;
+	int sid, ret;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	n_before = count_fastmem_memzones();
+
+	/* The request is rounded up to memzone granularity. */
+	ret = rte_fastmem_reserve(1, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_reserve() failed: %d", ret);
+
+	n_after = count_fastmem_memzones();
+	TEST_ASSERT_EQUAL(n_after - n_before, 1,
+		"expected 1 new memzone, got %u", n_after - n_before);
+
+	rte_fastmem_deinit();
+
+	/* Deinit is expected to release every reserved memzone. */
+	TEST_ASSERT_EQUAL(count_fastmem_memzones(), 0,
+		"%u fastmem memzones leaked after deinit",
+		count_fastmem_memzones());
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Reserving one byte more than a single memzone holds must result in
+ * two new memzones.
+ */
+static int
+test_reserve_multiple_memzones(void)
+{
+	const size_t reserve_size = FASTMEM_MEMZONE_SIZE + 1;
+	unsigned int n_before, n_after;
+	int sid, ret;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	n_before = count_fastmem_memzones();
+
+	ret = rte_fastmem_reserve(reserve_size, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_reserve(%zu) failed: %d",
+		reserve_size, ret);
+
+	n_after = count_fastmem_memzones();
+	TEST_ASSERT_EQUAL(n_after - n_before, 2,
+		"expected 2 new memzones for %zu-byte reserve, got %u",
+		reserve_size, n_after - n_before);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Asking for an amount that is already reserved must not reserve
+ * any further memzones.
+ */
+static int
+test_reserve_cumulative(void)
+{
+	unsigned int n_first, n_second;
+	int sid, ret;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	ret = rte_fastmem_reserve(FASTMEM_MEMZONE_SIZE, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "first reserve failed: %d", ret);
+	n_first = count_fastmem_memzones();
+
+	/* Same size again: already covered by the first call. */
+	ret = rte_fastmem_reserve(FASTMEM_MEMZONE_SIZE, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "second reserve failed: %d", ret);
+	n_second = count_fastmem_memzones();
+
+	TEST_ASSERT_EQUAL(n_first, n_second,
+		"reserve of already-reserved amount added memzones (%u -> %u)",
+		n_first, n_second);
+
+	return TEST_SUCCESS;
+}
+
+/* Out-of-range and negative socket ids must be rejected with -EINVAL. */
+static int
+test_reserve_invalid_socket(void)
+{
+	int ret;
+
+	/* One past the highest valid NUMA node index. */
+	ret = rte_fastmem_reserve(1, RTE_MAX_NUMA_NODES);
+	TEST_ASSERT_EQUAL(ret, -EINVAL,
+		"expected -EINVAL for out-of-range socket, got %d", ret);
+
+	ret = rte_fastmem_reserve(1, -2);
+	TEST_ASSERT_EQUAL(ret, -EINVAL,
+		"expected -EINVAL for negative socket, got %d", ret);
+
+	return TEST_SUCCESS;
+}
+
+/* A reserve request is expected to fail with a negative return code. */
+static int
+test_reserve_without_init(void)
+{
+	const int ret = rte_fastmem_reserve(1, SOCKET_ID_ANY);
+
+	TEST_ASSERT(ret < 0,
+		"expected failure without init, got %d", ret);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A one-byte reserve with SOCKET_ID_ANY must succeed and add exactly
+ * one memzone.
+ */
+static int
+test_reserve_any_socket(void)
+{
+	unsigned int n_before, n_after;
+	int ret;
+
+	n_before = count_fastmem_memzones();
+
+	/*
+	 * Expected to work on any system with at least one configured
+	 * socket: the allocator tries the caller's socket first and
+	 * falls back to the others when needed.
+	 */
+	ret = rte_fastmem_reserve(1, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(ret, 0,
+		"rte_fastmem_reserve(SOCKET_ID_ANY) failed: %d", ret);
+
+	n_after = count_fastmem_memzones();
+	TEST_ASSERT_EQUAL(n_after - n_before, 1,
+		"expected 1 new memzone, got %u", n_after - n_before);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Stage 2 tests: allocation and free.
+ */
+
+/* A request one byte above the maximum size must fail with E2BIG. */
+static int
+test_alloc_too_big(void)
+{
+	size_t too_big;
+	void *obj;
+
+	rte_errno = 0;
+	too_big = rte_fastmem_max_size() + 1;
+	obj = rte_fastmem_alloc(too_big, 0, 0);
+
+	TEST_ASSERT_NULL(obj, "alloc above max_size returned non-NULL");
+	TEST_ASSERT_EQUAL(rte_errno, E2BIG,
+		"expected rte_errno=E2BIG, got %d", rte_errno);
+
+	return TEST_SUCCESS;
+}
+
+/* An alignment that is not a power of two must fail with EINVAL. */
+static int
+test_alloc_invalid_align(void)
+{
+	enum { BAD_ALIGN = 3 };
+	void *obj;
+
+	rte_errno = 0;
+	obj = rte_fastmem_alloc(16, BAD_ALIGN, 0);
+
+	TEST_ASSERT_NULL(obj, "alloc with align=3 returned non-NULL");
+	TEST_ASSERT_EQUAL(rte_errno, EINVAL,
+		"expected rte_errno=EINVAL, got %d", rte_errno);
+
+	return TEST_SUCCESS;
+}
+
+/* Allocate an 8-byte object, write to all of it and free it. */
+static int
+test_alloc_free_small(void)
+{
+	enum { SIZE = 8 };
+	void *obj = rte_fastmem_alloc(SIZE, 0, 0);
+
+	TEST_ASSERT_NOT_NULL(obj, "alloc(8) failed: rte_errno=%d", rte_errno);
+
+	/* The whole object must be writable. */
+	memset(obj, 0xa5, SIZE);
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate one object for each of a range of sizes from 1 byte to
+ * 1 MiB, fill each one, then free them all.
+ */
+static int
+test_alloc_free_various_sizes(void)
+{
+	static const size_t sizes[] = {
+		1, 8, 16, 17, 63, 64, 128, 1024, 4096,
+		64 * 1024, 256 * 1024, 1024 * 1024,
+	};
+	void *objs[RTE_DIM(sizes)];
+	unsigned int idx;
+
+	for (idx = 0; idx < RTE_DIM(sizes); idx++) {
+		const size_t size = sizes[idx];
+
+		objs[idx] = rte_fastmem_alloc(size, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[idx],
+			"alloc(%zu) failed: rte_errno=%d",
+			size, rte_errno);
+		memset(objs[idx], 0x5a, size);
+	}
+
+	for (idx = 0; idx < RTE_DIM(sizes); idx++)
+		rte_fastmem_free(objs[idx]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Check that the returned pointer honours each requested alignment,
+ * and that align=0 yields a cache-line-aligned pointer.
+ */
+static int
+test_alloc_alignment(void)
+{
+	static const size_t aligns[] = {
+		8, 16, 64, 256, 4096, 65536,
+	};
+	unsigned int idx;
+	void *obj;
+
+	for (idx = 0; idx < RTE_DIM(aligns); idx++) {
+		const size_t align = aligns[idx];
+
+		obj = rte_fastmem_alloc(1, align, 0);
+		TEST_ASSERT_NOT_NULL(obj,
+			"alloc(1, align=%zu) failed: rte_errno=%d",
+			align, rte_errno);
+		TEST_ASSERT((uintptr_t)obj % align == 0,
+			"pointer %p not aligned on %zu",
+			obj, align);
+		rte_fastmem_free(obj);
+	}
+
+	/* The default alignment is at least RTE_CACHE_LINE_SIZE. */
+	obj = rte_fastmem_alloc(1, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj,
+		"alloc(1, align=0) failed: rte_errno=%d", rte_errno);
+	TEST_ASSERT((uintptr_t)obj % RTE_CACHE_LINE_SIZE == 0,
+		"default-align pointer %p not cache-line aligned",
+		obj);
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * RTE_FASTMEM_F_ZERO must hand out zeroed memory even when the
+ * underlying slab was dirtied by an earlier allocation.
+ */
+static int
+test_alloc_zero_flag(void)
+{
+	enum { SIZE = 128 };
+	unsigned int off;
+	uint8_t *obj;
+
+	/* Prime: allocate without F_ZERO, scribble, and free. */
+	obj = rte_fastmem_alloc(SIZE, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "priming alloc failed");
+	memset(obj, 0xff, SIZE);
+	rte_fastmem_free(obj);
+
+	obj = rte_fastmem_alloc(SIZE, 0, RTE_FASTMEM_F_ZERO);
+	TEST_ASSERT_NOT_NULL(obj, "F_ZERO alloc failed");
+
+	/* Stop at the first non-zero byte; off == SIZE means all zero. */
+	for (off = 0; off < SIZE && obj[off] == 0; off++)
+		;
+	TEST_ASSERT(off == SIZE,
+		"F_ZERO returned non-zero byte at offset %u", off);
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * -Wuse-after-free only exists from GCC 12 onwards. Older GCC
+ * releases diagnose the unknown option name in the pragma (under
+ * -Wpragmas), so the suppression is limited to compilers that know it.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 12)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuse-after-free"
+#endif
+/*
+ * Free an object and allocate again in the same class; the second
+ * allocation must return the pointer that was just freed. The freed
+ * pointer is only compared, never dereferenced.
+ */
+static int
+test_alloc_reuse(void)
+{
+	void *first, *second;
+
+	first = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(first, "first alloc failed");
+	rte_fastmem_free(first);
+
+	second = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(second, "second alloc failed");
+
+	/*
+	 * The slab's free list is LIFO, so the most recently freed
+	 * object is at the head of the list. A subsequent alloc in
+	 * the same class returns it.
+	 */
+	TEST_ASSERT_EQUAL(first, second,
+		"free + alloc did not reuse: first=%p second=%p",
+		first, second);
+
+	rte_fastmem_free(second);
+
+	return TEST_SUCCESS;
+}
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 12)
+#pragma GCC diagnostic pop
+#endif
+
+/*
+ * Allocate more objects in one class than fit in a single slab,
+ * forcing the bin to pull a second block. This exercises the
+ * partial->full transition and the cross-slab allocation path.
+ *
+ * Everything obtained so far (the objects and the heap-allocated
+ * pointer array) is released before the result is asserted, so a
+ * failed allocation does not leak the bookkeeping array.
+ */
+static int
+test_alloc_many_in_class(void)
+{
+	enum { CLASS_SIZE = 8, COUNT = 300000 };
+	unsigned int i, n_allocated;
+	int saved_errno = 0;
+	void **ptrs;
+
+	ptrs = calloc(COUNT, sizeof(*ptrs));
+	TEST_ASSERT_NOT_NULL(ptrs, "calloc for test ptrs failed");
+
+	for (n_allocated = 0; n_allocated < COUNT; n_allocated++) {
+		ptrs[n_allocated] = rte_fastmem_alloc(CLASS_SIZE, 0, 0);
+		if (ptrs[n_allocated] == NULL) {
+			/* Capture before the frees below can change it. */
+			saved_errno = rte_errno;
+			break;
+		}
+	}
+
+	for (i = 0; i < n_allocated; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	free(ptrs);
+
+	/* n_allocated is the index of the failing allocation, if any. */
+	TEST_ASSERT_EQUAL(n_allocated, (unsigned int)COUNT,
+		"alloc[%u] failed: rte_errno=%d",
+		n_allocated, saved_errno);
+
+	return TEST_SUCCESS;
+}
+
+/* Allocate on the first socket through the socket-aware API. */
+static int
+test_alloc_socket(void)
+{
+	const int sid = rte_socket_id_by_idx(0);
+	void *obj;
+
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	obj = rte_fastmem_alloc_socket(64, 0, 0, sid);
+	TEST_ASSERT_NOT_NULL(obj,
+		"alloc_socket(%d) failed: rte_errno=%d",
+		sid, rte_errno);
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate and free a small object, then allocate in a much larger
+ * class. The block first assigned to the small class and handed
+ * back to the free-block pool must be usable by the other class.
+ */
+static int
+test_alloc_block_repurposing(void)
+{
+	void *obj;
+
+	obj = rte_fastmem_alloc(8, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "small alloc failed");
+	rte_fastmem_free(obj);
+
+	obj = rte_fastmem_alloc(256 * 1024, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "large alloc failed");
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Stricter variant of test_alloc_block_repurposing: the cross-class
+ * allocation must leave bytes_backing unchanged. The free-block
+ * pool is shared across size classes rather than partitioned per
+ * class, so the block released by the small class has to serve the
+ * large allocation without a new memzone reservation.
+ */
+static int
+test_alloc_block_repurposing_no_growth(void)
+{
+	struct rte_fastmem_stats st;
+	uint64_t backing_after_small;
+	void *small_obj, *large_obj;
+	int ret;
+
+	/* Nothing may be backed before the first allocation. */
+	ret = rte_fastmem_stats(&st);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_stats() failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.bytes_backing, (uint64_t)0,
+		"unexpected pre-alloc bytes_backing: %" PRIu64,
+		st.bytes_backing);
+
+	small_obj = rte_fastmem_alloc(8, 0, 0);
+	TEST_ASSERT_NOT_NULL(small_obj, "small alloc failed");
+
+	ret = rte_fastmem_stats(&st);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_stats() failed: %d", ret);
+	TEST_ASSERT(st.bytes_backing > 0,
+		"bytes_backing did not grow on first alloc");
+	backing_after_small = st.bytes_backing;
+
+	/* Flush so the freed object leaves the cache. */
+	rte_fastmem_free(small_obj);
+	rte_fastmem_cache_flush();
+
+	large_obj = rte_fastmem_alloc(256 * 1024, 0, 0);
+	TEST_ASSERT_NOT_NULL(large_obj,
+		"large alloc failed: rte_errno=%d", rte_errno);
+
+	ret = rte_fastmem_stats(&st);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_stats() failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.bytes_backing, backing_after_small,
+		"cross-class alloc grew backing memory from %" PRIu64
+		" to %" PRIu64,
+		backing_after_small, st.bytes_backing);
+
+	rte_fastmem_free(large_obj);
+
+	return TEST_SUCCESS;
+}
+
+/* Pass NULL to rte_fastmem_free() and report success. */
+static int
+test_free_null(void)
+{
+ /* Must be a no-op, not a crash. */
+ rte_fastmem_free(NULL);
+
+ return TEST_SUCCESS;
+}
+
+/*
+ * Fill a batch of objects with a distinct byte value each, then
+ * check that every object still holds its own value. This detects
+ * slab-header overwrites caused by object access and pointers that
+ * refer to overlapping slots.
+ */
+static int
+test_alloc_content_integrity(void)
+{
+	enum { N = 256, SIZE = 128 };
+	uint8_t *objs[N];
+	unsigned int obj, off;
+
+	for (obj = 0; obj < N; obj++) {
+		objs[obj] = rte_fastmem_alloc(SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[obj], "alloc[%u] failed", obj);
+		memset(objs[obj], (int)obj, SIZE);
+	}
+
+	for (obj = 0; obj < N; obj++) {
+		const uint8_t want = (uint8_t)obj;
+
+		for (off = 0; off < SIZE; off++) {
+			TEST_ASSERT_EQUAL(objs[obj][off], want,
+				"corruption at ptrs[%u][%u]: got 0x%x, want 0x%x",
+				obj, off, objs[obj][off], want);
+		}
+	}
+
+	for (obj = 0; obj < N; obj++)
+		rte_fastmem_free(objs[obj]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A tiny object with an alignment beyond the largest size class
+ * cannot be served: the chosen class would have to be at least as
+ * large as the alignment, and no such class exists. E2BIG expected.
+ */
+static int
+test_alloc_align_too_big(void)
+{
+	size_t huge_align;
+	void *obj;
+
+	rte_errno = 0;
+	huge_align = rte_fastmem_max_size() * 2;
+	obj = rte_fastmem_alloc(1, huge_align, 0);
+
+	TEST_ASSERT_NULL(obj,
+		"alloc with align>max_size returned non-NULL");
+	TEST_ASSERT_EQUAL(rte_errno, E2BIG,
+		"expected rte_errno=E2BIG, got %d", rte_errno);
+
+	return TEST_SUCCESS;
+}
+
+/* An alignment of 1 is a power of two and has to be accepted. */
+static int
+test_alloc_align_one(void)
+{
+	void *obj = rte_fastmem_alloc(8, 1, 0);
+
+	TEST_ASSERT_NOT_NULL(obj, "alloc(8, 1) failed: rte_errno=%d",
+		rte_errno);
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate on the first socket and, when the memseg backing the
+ * object can be looked up, verify that it belongs to that socket.
+ */
+static int
+test_alloc_socket_numa_placement(void)
+{
+	struct rte_memseg *seg;
+	void *obj;
+	int sid;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	obj = rte_fastmem_alloc_socket(64, 0, 0, sid);
+	TEST_ASSERT_NOT_NULL(obj,
+		"alloc_socket(%d) failed: rte_errno=%d",
+		sid, rte_errno);
+
+	/*
+	 * The socket check is skipped when the lookup fails; e.g.
+	 * --no-huge mode may not populate memsegs for fastmem's
+	 * allocations in a way rte_mem_virt2memseg() can find.
+	 */
+	seg = rte_mem_virt2memseg(obj, NULL);
+	if (seg != NULL) {
+		TEST_ASSERT_EQUAL(seg->socket_id, sid,
+			"alloc on socket %d landed on socket %d",
+			sid, seg->socket_id);
+	}
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate from a socket other than the calling lcore's own, which
+ * triggers a cross-socket cache allocation, then deinit to exercise
+ * teardown of a cache whose backing memory lives on a different
+ * socket than the one it serves. Skipped when fewer than two
+ * sockets are available.
+ */
+static int
+test_alloc_cross_socket_deinit(void)
+{
+	unsigned int idx, n_sockets;
+	int own_sid, other_sid = -1;
+	void *obj;
+
+	own_sid = (int)rte_socket_id();
+	if (own_sid < 0 || (unsigned int)own_sid >= RTE_MAX_NUMA_NODES)
+		own_sid = rte_socket_id_by_idx(0);
+
+	n_sockets = rte_socket_count();
+	if (n_sockets < 2)
+		return TEST_SKIPPED;
+
+	/* Pick the first valid socket that is not the local one. */
+	for (idx = 0; idx < n_sockets && other_sid < 0; idx++) {
+		const int candidate = rte_socket_id_by_idx(idx);
+
+		if (candidate >= 0 && candidate != own_sid)
+			other_sid = candidate;
+	}
+	if (other_sid < 0)
+		return TEST_SKIPPED;
+
+	obj = rte_fastmem_alloc_socket(64, 0, 0, other_sid);
+	TEST_ASSERT_NOT_NULL(obj,
+		"cross-socket alloc(socket %d) failed: rte_errno=%d",
+		other_sid, rte_errno);
+
+	rte_fastmem_free(obj);
+
+	/* Tear down with the cross-socket cache in place, then re-init. */
+	rte_fastmem_deinit();
+
+	TEST_ASSERT_EQUAL(rte_fastmem_init(), 0,
+		"re-init after cross-socket deinit failed");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Stage 3 tests: per-lcore caches.
+ */
+
+/*
+ * Leave one freed object in the cache, flush twice, and check that
+ * allocation still works afterwards. Whether the follow-up alloc
+ * returns the same pointer is deliberately not asserted.
+ */
+static int
+test_cache_flush(void)
+{
+	void *obj;
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "first alloc failed");
+	rte_fastmem_free(obj);
+
+	/* The second flush verifies that flushing is idempotent. */
+	rte_fastmem_cache_flush();
+	rte_fastmem_cache_flush();
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "post-flush alloc failed");
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Call rte_fastmem_cache_flush() and report success. Presumably run
+ * with the library uninitialized, as the test name suggests; the
+ * suite setup is not visible here -- TODO confirm.
+ */
+static int
+test_cache_flush_without_init(void)
+{
+ /* Must be a no-op, not a crash. */
+ rte_fastmem_cache_flush();
+
+ return TEST_SUCCESS;
+}
+
+/*
+ * Free more objects of a single size class than the cache can hold
+ * (capacity is 64 for classes <= 4 KiB), forcing the cache-drain
+ * slow path, then repeat the cycle to check nothing was corrupted.
+ */
+static int
+test_cache_exceeds_capacity(void)
+{
+	enum { COUNT = 200, SIZE = 64 };
+	void *objs[COUNT];
+	unsigned int n;
+
+	for (n = 0; n < COUNT; n++) {
+		objs[n] = rte_fastmem_alloc(SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[n],
+			"alloc[%u] failed: rte_errno=%d", n, rte_errno);
+	}
+	for (n = 0; n < COUNT; n++)
+		rte_fastmem_free(objs[n]);
+
+	/* A second round of the same size must still be served. */
+	for (n = 0; n < COUNT; n++) {
+		objs[n] = rte_fastmem_alloc(SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[n],
+			"re-alloc[%u] failed: rte_errno=%d", n, rte_errno);
+	}
+	for (n = 0; n < COUNT; n++)
+		rte_fastmem_free(objs[n]);
+
+	return TEST_SUCCESS;
+}
+
+/* Argument block passed to the helper threads spawned by the tests. */
+struct non_eal_args {
+ int ok; /* set to 1 by the thread once its alloc/free sequence completed */
+ char pad[64]; /* NOTE(review): not accessed in the visible code; presumably padding -- confirm */
+};
+
+/*
+ * Thread entry point: allocate a 128-byte object, fill it, free it
+ * and flag success in the struct non_eal_args passed as argument.
+ * Returns 0 on success, or 1 when the allocation fails.
+ */
+static uint32_t
+non_eal_thread_main(void *arg)
+{
+	enum { SIZE = 128 };
+	struct non_eal_args *result = arg;
+	uint8_t *obj = rte_fastmem_alloc(SIZE, 0, 0);
+
+	if (obj == NULL)
+		return 1;
+
+	memset(obj, 0x7e, SIZE);
+	rte_fastmem_free(obj);
+
+	result->ok = 1;
+
+	return 0;
+}
+
+/*
+ * Run non_eal_thread_main() in a thread created with
+ * rte_thread_create() and verify that it reported success.
+ */
+static int
+test_non_eal_thread(void)
+{
+	struct non_eal_args result = { 0 };
+	rte_thread_t tid;
+	int ret;
+
+	ret = rte_thread_create(&tid, NULL, non_eal_thread_main, &result);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_thread_create() failed: %d", ret);
+
+	ret = rte_thread_join(tid, NULL);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_thread_join() failed: %d", ret);
+
+	/* The thread's outcome is carried by the ok flag. */
+	TEST_ASSERT_EQUAL(result.ok, 1,
+		"non-EAL thread did not complete alloc/free successfully");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * When a whole slab's worth of objects is freed, the slab's block
+ * goes back to the free-block pool and can be reassigned to another
+ * size class. Check that the cache does not hold on to objects in a
+ * way that prevents this.
+ *
+ * Enough objects of one class are allocated to force multiple
+ * slabs; they are all freed and the cache is flushed, which drains
+ * cached objects to their bins and returns empty slabs to the block
+ * pool.
+ */
+static int
+test_cache_flush_returns_memory(void)
+{
+	enum { N = 200, SIZE = 64 };
+	void *objs[N];
+	void *other;
+	unsigned int n;
+
+	for (n = 0; n < N; n++) {
+		objs[n] = rte_fastmem_alloc(SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[n], "alloc[%u] failed", n);
+	}
+	for (n = 0; n < N; n++)
+		rte_fastmem_free(objs[n]);
+
+	rte_fastmem_cache_flush();
+
+	/*
+	 * A completely different class must now be served, with
+	 * access to whatever blocks the flush released.
+	 */
+	other = rte_fastmem_alloc(65536, 0, 0);
+	TEST_ASSERT_NOT_NULL(other,
+		"post-flush cross-class alloc failed");
+	rte_fastmem_free(other);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A bulk allocation must fill the array with non-NULL, pairwise
+ * distinct pointers.
+ */
+static int
+test_alloc_bulk_basic(void)
+{
+	enum { N = 32 };
+	void *objs[N];
+	unsigned int cur, prev;
+	int ret;
+
+	ret = rte_fastmem_alloc_bulk(objs, N, 64, 0, 0);
+	TEST_ASSERT_EQUAL(ret, 0, "alloc_bulk failed: %d", ret);
+
+	for (cur = 0; cur < N; cur++) {
+		TEST_ASSERT_NOT_NULL(objs[cur], "ptrs[%u] is NULL", cur);
+
+		/* Compare against every pointer seen earlier. */
+		for (prev = 0; prev < cur; prev++) {
+			TEST_ASSERT(objs[cur] != objs[prev],
+				"ptrs[%u] == ptrs[%u]", cur, prev);
+		}
+	}
+
+	rte_fastmem_free_bulk(objs, N);
+
+	return TEST_SUCCESS;
+}
+
+/* Every object of an F_ZERO bulk allocation must be fully zeroed. */
+static int
+test_alloc_bulk_zero_flag(void)
+{
+	enum { N = 8, SIZE = 128 };
+	void *objs[N];
+	unsigned int obj, off;
+	int ret;
+
+	ret = rte_fastmem_alloc_bulk(objs, N, SIZE, 0, RTE_FASTMEM_F_ZERO);
+	TEST_ASSERT_EQUAL(ret, 0, "alloc_bulk failed: %d", ret);
+
+	for (obj = 0; obj < N; obj++) {
+		const uint8_t *bytes = objs[obj];
+
+		for (off = 0; off < SIZE; off++) {
+			TEST_ASSERT_EQUAL(bytes[off], 0,
+				"ptrs[%u][%u] != 0", obj, off);
+		}
+	}
+
+	rte_fastmem_free_bulk(objs, N);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Request more objects in a single bulk call than the cache
+ * capacity (64), then free them in bulk.
+ */
+static int
+test_alloc_bulk_exceeds_cache(void)
+{
+	enum { N = 128 };
+	void *objs[N];
+	int ret = rte_fastmem_alloc_bulk(objs, N, 64, 0, 0);
+
+	TEST_ASSERT_EQUAL(ret, 0, "alloc_bulk(%u) failed: %d", N, ret);
+
+	rte_fastmem_free_bulk(objs, N);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Bulk-allocate on an explicit socket and then with SOCKET_ID_ANY;
+ * both calls must succeed.
+ */
+static int
+test_alloc_bulk_socket(void)
+{
+	enum { N = 16 };
+	void *objs[N];
+	int sid, ret;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no sockets");
+
+	/* Explicit socket. */
+	ret = rte_fastmem_alloc_bulk_socket(objs, N, 64, 0, 0, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "alloc_bulk_socket failed: %d", ret);
+	rte_fastmem_free_bulk(objs, N);
+
+	/* No socket preference. */
+	ret = rte_fastmem_alloc_bulk_socket(objs, N, 64, 0, 0, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(ret, 0, "alloc_bulk_socket(ANY) failed: %d", ret);
+	rte_fastmem_free_bulk(objs, N);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Objects allocated one at a time must be releasable with a single
+ * bulk free, after which the same number can be allocated again.
+ */
+static int
+test_free_bulk(void)
+{
+	enum { N = 64 };
+	void *objs[N];
+	unsigned int n;
+
+	for (n = 0; n < N; n++) {
+		objs[n] = rte_fastmem_alloc(64, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[n], "alloc[%u] failed", n);
+	}
+	rte_fastmem_free_bulk(objs, N);
+
+	/* The released memory must be available again. */
+	for (n = 0; n < N; n++) {
+		objs[n] = rte_fastmem_alloc(64, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[n], "re-alloc[%u] failed", n);
+	}
+	rte_fastmem_free_bulk(objs, N);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * The class table must have 18 entries running from 8 bytes to
+ * 1 MiB, each a power of two and strictly larger than the previous.
+ */
+static int
+test_classes(void)
+{
+	size_t class_sizes[32];
+	unsigned int n_classes, idx;
+
+	/* With a NULL array only the count is checked. */
+	n_classes = rte_fastmem_classes(NULL);
+	TEST_ASSERT_EQUAL(n_classes, 18u,
+		"expected 18 classes, got %u", n_classes);
+
+	n_classes = rte_fastmem_classes(class_sizes);
+	TEST_ASSERT_EQUAL(n_classes, 18u,
+		"expected 18 classes, got %u", n_classes);
+	TEST_ASSERT_EQUAL(class_sizes[0], (size_t)8, "class 0 != 8");
+	TEST_ASSERT_EQUAL(class_sizes[n_classes - 1], (size_t)(1 << 20),
+		"last class != 1 MiB");
+
+	for (idx = 0; idx < n_classes; idx++) {
+		const size_t size = class_sizes[idx];
+
+		TEST_ASSERT(size != 0 && (size & (size - 1)) == 0,
+			"class %u size %zu not power of 2", idx, size);
+		if (idx > 0) {
+			TEST_ASSERT(size > class_sizes[idx - 1],
+				"classes not ascending at %u", idx);
+		}
+	}
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Per-class statistics must count N allocations and N objects in
+ * use, drop in_use back to zero after the frees, and reject a size
+ * that is not a class size with -EINVAL.
+ */
+static int
+test_stats_class(void)
+{
+	enum { N = 10 };
+	struct rte_fastmem_class_stats st;
+	void *objs[N];
+	unsigned int n;
+	int ret;
+
+	for (n = 0; n < N; n++) {
+		objs[n] = rte_fastmem_alloc(64, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[n], "alloc[%u] failed", n);
+	}
+
+	ret = rte_fastmem_stats_class(64, &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_class failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.class_size, (size_t)64, "wrong class_size");
+	/* Each allocation is either a cache hit or a cache miss. */
+	TEST_ASSERT(st.alloc_cache_hits + st.alloc_cache_misses == N,
+		"alloc count != N: hits=%" PRIu64 " misses=%" PRIu64,
+		st.alloc_cache_hits, st.alloc_cache_misses);
+	TEST_ASSERT_EQUAL(st.in_use, (uint64_t)N, "in_use != N");
+
+	for (n = 0; n < N; n++)
+		rte_fastmem_free(objs[n]);
+
+	ret = rte_fastmem_stats_class(64, &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_class after free failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.in_use, (uint64_t)0, "in_use != 0 after free");
+
+	/* 13 is not a class size. */
+	ret = rte_fastmem_stats_class(13, &st);
+	TEST_ASSERT_EQUAL(ret, -EINVAL, "expected -EINVAL for bad size");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Per-lcore statistics must show alloc and free activity for the
+ * calling lcore, and an out-of-range lcore id must yield -EINVAL.
+ */
+static int
+test_stats_lcore(void)
+{
+	struct rte_fastmem_lcore_stats st;
+	void *obj;
+	int ret;
+
+	obj = rte_fastmem_alloc(128, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+
+	ret = rte_fastmem_stats_lcore(rte_lcore_id(), &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore failed: %d", ret);
+	TEST_ASSERT(st.alloc_cache_hits + st.alloc_cache_misses > 0,
+		"no alloc activity on this lcore");
+
+	rte_fastmem_free(obj);
+
+	ret = rte_fastmem_stats_lcore(rte_lcore_id(), &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore after free failed: %d", ret);
+	TEST_ASSERT(st.free_cache_hits + st.free_cache_misses > 0,
+		"no free activity on this lcore");
+
+	/* RTE_MAX_LCORE is one past the last valid lcore id. */
+	ret = rte_fastmem_stats_lcore(RTE_MAX_LCORE, &st);
+	TEST_ASSERT_EQUAL(ret, -EINVAL, "expected -EINVAL for bad lcore");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Per-lcore, per-class statistics must report the requested class
+ * size and show alloc activity after one allocation.
+ */
+static int
+test_stats_lcore_class(void)
+{
+	enum { SIZE = 256 };
+	struct rte_fastmem_lcore_class_stats st;
+	void *obj;
+	int ret;
+
+	obj = rte_fastmem_alloc(SIZE, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+
+	ret = rte_fastmem_stats_lcore_class(rte_lcore_id(), SIZE, &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore_class failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.class_size, (size_t)256, "wrong class_size");
+	TEST_ASSERT(st.alloc_cache_hits + st.alloc_cache_misses > 0,
+		"no alloc activity");
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * After rte_fastmem_stats_reset() the global alloc and free totals
+ * must read zero, even though an alloc/free pair preceded the reset.
+ */
+static int
+test_stats_reset(void)
+{
+	struct rte_fastmem_stats totals;
+	void *obj;
+	int ret;
+
+	/* Generate some activity to be wiped by the reset. */
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+	rte_fastmem_free(obj);
+
+	rte_fastmem_stats_reset();
+
+	ret = rte_fastmem_stats(&totals);
+	TEST_ASSERT_EQUAL(ret, 0, "stats failed: %d", ret);
+	TEST_ASSERT_EQUAL(totals.alloc_total, (uint64_t)0,
+		"alloc_total not zero after reset");
+	TEST_ASSERT_EQUAL(totals.free_total, (uint64_t)0,
+		"free_total not zero after reset");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Counters live apart from the per-lcore caches, so flushing a
+ * cache (which frees the cache structs) must leave the accumulated
+ * statistics untouched. Snapshot the counters, flush, and compare.
+ */
+static int
+test_stats_survive_cache_flush(void)
+{
+	enum { N = 10 };
+	struct rte_fastmem_class_stats cls_pre, cls_post;
+	struct rte_fastmem_lcore_stats lc_pre, lc_post;
+	void *objs[N];
+	unsigned int n;
+	int ret;
+
+	for (n = 0; n < N; n++) {
+		objs[n] = rte_fastmem_alloc(64, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[n], "alloc[%u] failed", n);
+	}
+	for (n = 0; n < N; n++)
+		rte_fastmem_free(objs[n]);
+
+	/* Snapshot before the flush. */
+	ret = rte_fastmem_stats_class(64, &cls_pre);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_class failed: %d", ret);
+	ret = rte_fastmem_stats_lcore(rte_lcore_id(), &lc_pre);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore failed: %d", ret);
+
+	TEST_ASSERT(cls_pre.alloc_cache_hits + cls_pre.alloc_cache_misses == N,
+		"expected %d allocs before flush", N);
+
+	rte_fastmem_cache_flush();
+
+	/* Snapshot after the flush. */
+	ret = rte_fastmem_stats_class(64, &cls_post);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_class after flush failed: %d", ret);
+	ret = rte_fastmem_stats_lcore(rte_lcore_id(), &lc_post);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore after flush failed: %d", ret);
+
+	TEST_ASSERT_EQUAL(cls_post.alloc_cache_hits, cls_pre.alloc_cache_hits,
+		"alloc_cache_hits lost across flush: %" PRIu64 " -> %" PRIu64,
+		cls_pre.alloc_cache_hits, cls_post.alloc_cache_hits);
+	TEST_ASSERT_EQUAL(cls_post.alloc_cache_misses,
+		cls_pre.alloc_cache_misses,
+		"alloc_cache_misses lost across flush: %" PRIu64 " -> %" PRIu64,
+		cls_pre.alloc_cache_misses, cls_post.alloc_cache_misses);
+	TEST_ASSERT_EQUAL(cls_post.free_cache_hits, cls_pre.free_cache_hits,
+		"free_cache_hits lost across flush: %" PRIu64 " -> %" PRIu64,
+		cls_pre.free_cache_hits, cls_post.free_cache_hits);
+	TEST_ASSERT_EQUAL(lc_post.alloc_cache_hits + lc_post.alloc_cache_misses,
+		lc_pre.alloc_cache_hits + lc_pre.alloc_cache_misses,
+		"per-lcore alloc counters lost across flush");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocations made by a non-EAL thread cannot be attributed to an
+ * lcore, but must still be reflected in the global and per-class
+ * statistics.
+ *
+ * Thread entry point used by the statistics tests: allocate
+ * RTE_DIM(ptrs) 64-byte objects and free them again. On success the
+ * ok flag of the struct non_eal_args argument is set and 0 is
+ * returned. If an allocation fails, the objects obtained so far are
+ * freed and 1 is returned with the flag left untouched.
+ */
+static uint32_t
+stats_non_eal_main(void *arg)
+{
+	struct non_eal_args *args = arg;
+	void *ptrs[8];
+	unsigned int i, n_allocated;
+
+	for (n_allocated = 0; n_allocated < RTE_DIM(ptrs); n_allocated++) {
+		ptrs[n_allocated] = rte_fastmem_alloc(64, 0, 0);
+		if (ptrs[n_allocated] == NULL)
+			break;
+	}
+
+	/* Release whatever was obtained, also on the failure path. */
+	for (i = 0; i < n_allocated; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	if (n_allocated != RTE_DIM(ptrs))
+		return 1;
+
+	args->ok = 1;
+	return 0;
+}
+
+/*
+ * Run stats_non_eal_main() in a thread created with
+ * rte_thread_create() and check that the global alloc and free
+ * totals each advanced by N, the number of objects that thread
+ * handles.
+ */
+static int
+test_stats_count_non_eal(void)
+{
+	enum { N = 8 };
+	struct rte_fastmem_stats pre, post;
+	struct non_eal_args result = { 0 };
+	rte_thread_t tid;
+	int ret;
+
+	rte_fastmem_stats_reset();
+
+	ret = rte_fastmem_stats(&pre);
+	TEST_ASSERT_EQUAL(ret, 0, "stats failed: %d", ret);
+
+	/* Run the helper thread to completion. */
+	ret = rte_thread_create(&tid, NULL, stats_non_eal_main, &result);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_thread_create() failed: %d", ret);
+	ret = rte_thread_join(tid, NULL);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_thread_join() failed: %d", ret);
+	TEST_ASSERT_EQUAL(result.ok, 1, "non-EAL thread alloc/free failed");
+
+	ret = rte_fastmem_stats(&post);
+	TEST_ASSERT_EQUAL(ret, 0, "stats failed: %d", ret);
+
+	TEST_ASSERT_EQUAL(post.alloc_total - pre.alloc_total, (uint64_t)N,
+		"non-EAL allocs not counted globally: delta=%" PRIu64,
+		post.alloc_total - pre.alloc_total);
+	TEST_ASSERT_EQUAL(post.free_total - pre.free_total, (uint64_t)N,
+		"non-EAL frees not counted globally: delta=%" PRIu64,
+		post.free_total - pre.free_total);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Traffic generated by a non-EAL thread (which has no lcore id) is
+ * expected to show up in the shared-cache statistics returned by
+ * rte_fastmem_stats_shared() and rte_fastmem_stats_shared_class(),
+ * and not in the per-lcore statistics of the calling lcore. The
+ * -EINVAL error paths of the two shared-stats calls are checked too.
+ */
+static int
+test_stats_shared_non_eal(void)
+{
+	/* Must match the number of objects stats_non_eal_main() cycles. */
+	enum { EXPECTED_OPS = 8 };
+	struct rte_fastmem_lcore_class_stats class_stats;
+	struct rte_fastmem_lcore_stats shared;
+	struct rte_fastmem_lcore_stats local;
+	struct non_eal_args args = { 0 };
+	rte_thread_t thread_id;
+	uint64_t total;
+	int rc;
+
+	rte_fastmem_stats_reset();
+
+	rc = rte_thread_create(&thread_id, NULL, stats_non_eal_main, &args);
+	TEST_ASSERT_EQUAL(rc, 0, "rte_thread_create() failed: %d", rc);
+	rc = rte_thread_join(thread_id, NULL);
+	TEST_ASSERT_EQUAL(rc, 0, "rte_thread_join() failed: %d", rc);
+	TEST_ASSERT_EQUAL(args.ok, 1, "non-EAL thread alloc/free failed");
+
+	/* Aggregate shared-cache counters. */
+	rc = rte_fastmem_stats_shared(&shared);
+	TEST_ASSERT_EQUAL(rc, 0, "stats_shared failed: %d", rc);
+
+	total = shared.alloc_cache_hits + shared.alloc_cache_misses;
+	TEST_ASSERT_EQUAL(total, (uint64_t)EXPECTED_OPS,
+		"shared allocs not counted: %" PRIu64, total);
+
+	total = shared.free_cache_hits + shared.free_cache_misses;
+	TEST_ASSERT_EQUAL(total, (uint64_t)EXPECTED_OPS,
+		"shared frees not counted: %" PRIu64, total);
+
+	/* Per-class view; stats_non_eal_main() uses 64-byte objects. */
+	rc = rte_fastmem_stats_shared_class(64, &class_stats);
+	TEST_ASSERT_EQUAL(rc, 0, "stats_shared_class failed: %d", rc);
+	TEST_ASSERT_EQUAL(class_stats.class_size, (size_t)64,
+		"wrong class_size");
+
+	total = class_stats.alloc_cache_hits + class_stats.alloc_cache_misses;
+	TEST_ASSERT_EQUAL(total, (uint64_t)EXPECTED_OPS,
+		"shared class allocs not counted: %" PRIu64, total);
+
+	/* The calling lcore must not have been charged for the traffic. */
+	rc = rte_fastmem_stats_lcore(rte_lcore_id(), &local);
+	TEST_ASSERT_EQUAL(rc, 0, "stats_lcore failed: %d", rc);
+	TEST_ASSERT_EQUAL(local.alloc_cache_hits + local.alloc_cache_misses,
+		(uint64_t)0, "shared traffic leaked into lcore stats");
+
+	/* Error paths: NULL output and a size of 13. */
+	rc = rte_fastmem_stats_shared(NULL);
+	TEST_ASSERT_EQUAL(rc, -EINVAL, "expected -EINVAL for NULL stats");
+	rc = rte_fastmem_stats_shared_class(13, &class_stats);
+	TEST_ASSERT_EQUAL(rc, -EINVAL, "expected -EINVAL for bad size");
+
+	return TEST_SUCCESS;
+}
+
+
+/*
+ * Parameters of the mixed-lifetime multi-lcore test.
+ */
+/* Objects each worker keeps allocated for the whole run. */
+#define MIXED_LONG_LIVED_COUNT 25
+/* Alloc/verify/free cycles each worker performs on short-lived objects. */
+#define MIXED_SHORT_LIVED_ITERS 1000
+/* The test is skipped when fewer worker lcores than this are available. */
+#define MIXED_MIN_LCORES 3
+
+/* Sizes of the long-lived objects, used round-robin by object index. */
+static const size_t mixed_long_sizes[] = { 64, 256, 4096 };
+/* Sizes of the short-lived objects, picked pseudo-randomly per cycle. */
+static const size_t mixed_short_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024 };
+
+/* Per-worker input (seed) and output (result) of mixed_worker(). */
+struct mixed_worker_args {
+ uint32_t seed;
+ int result;
+};
+
+/*
+ * One step of a 32-bit xorshift generator using the 13/17/5 shift
+ * triple. *state is updated in place and the new value is returned.
+ * A zero state maps to zero, so callers must seed with a non-zero
+ * value.
+ */
+static uint32_t
+xorshift32(uint32_t *state)
+{
+	uint32_t v = *state;
+
+	v ^= v << 13;
+	v ^= v >> 17;
+	v ^= v << 5;
+
+	return *state = v;
+}
+
+/*
+ * Worker body for test_mixed_lifetimes_multi_lcore().
+ *
+ * 1. Allocates MIXED_LONG_LIVED_COUNT objects (sizes taken round-robin
+ *    from mixed_long_sizes) and fills object i with the byte i + 1.
+ * 2. Runs MIXED_SHORT_LIVED_ITERS cycles of allocate / fill / verify /
+ *    free with sizes chosen by xorshift32() from mixed_short_sizes.
+ * 3. Verifies that the long-lived objects still hold their pattern.
+ *
+ * arg points to a struct mixed_worker_args; args->seed seeds the
+ * generator and args->result receives TEST_SUCCESS or TEST_FAILED.
+ * Returns 0 on success and -1 on failure. All objects obtained are
+ * released on every path, including the failure paths.
+ */
+static int
+mixed_worker(void *arg)
+{
+	struct mixed_worker_args *args = arg;
+	uint32_t seed = args->seed;
+	void *long_lived[MIXED_LONG_LIVED_COUNT];
+	size_t long_sizes[MIXED_LONG_LIVED_COUNT];
+	unsigned int n_long = 0;
+	unsigned int i;
+	int result = TEST_FAILED;
+
+	/* Allocate long-lived objects of mixed sizes. */
+	for (i = 0; i < MIXED_LONG_LIVED_COUNT; i++) {
+		long_sizes[i] = mixed_long_sizes[i % RTE_DIM(mixed_long_sizes)];
+		long_lived[i] = rte_fastmem_alloc(long_sizes[i], 0, 0);
+		if (long_lived[i] == NULL)
+			goto out;
+		n_long++;
+		memset(long_lived[i], (int)(i + 1), long_sizes[i]);
+	}
+
+	/* Rapidly cycle short-lived objects. */
+	for (i = 0; i < MIXED_SHORT_LIVED_ITERS; i++) {
+		size_t sz = mixed_short_sizes[xorshift32(&seed) %
+				RTE_DIM(mixed_short_sizes)];
+		uint8_t pattern = (uint8_t)(i & 0xff);
+		int intact = 1;
+		uint8_t *p;
+
+		p = rte_fastmem_alloc(sz, 0, 0);
+		if (p == NULL)
+			goto out;
+		memset(p, pattern, sz);
+
+		/* Verify before freeing. */
+		for (size_t j = 0; j < sz; j++) {
+			if (p[j] != pattern) {
+				intact = 0;
+				break;
+			}
+		}
+
+		/* Free first, so a mismatch does not leak the object. */
+		rte_fastmem_free(p);
+		if (!intact)
+			goto out;
+	}
+
+	/* Verify long-lived objects are still intact. */
+	for (i = 0; i < MIXED_LONG_LIVED_COUNT; i++) {
+		const uint8_t *bytes = long_lived[i];
+		const uint8_t expected = (uint8_t)(i + 1);
+
+		for (size_t j = 0; j < long_sizes[i]; j++) {
+			if (bytes[j] != expected)
+				goto out;
+		}
+	}
+
+	result = TEST_SUCCESS;
+out:
+	/* Only the first n_long slots hold valid pointers. */
+	for (i = 0; i < n_long; i++)
+		rte_fastmem_free(long_lived[i]);
+
+	args->result = result;
+	return result == TEST_SUCCESS ? 0 : -1;
+}
+
+/*
+ * Launch mixed_worker() on every worker lcore, each with its own
+ * seed, wait for all of them and check that every worker reported
+ * TEST_SUCCESS and that bytes_in_use is zero afterwards. Skipped when
+ * fewer than MIXED_MIN_LCORES worker lcores are available.
+ */
+static int
+test_mixed_lifetimes_multi_lcore(void)
+{
+	struct mixed_worker_args args[RTE_MAX_LCORE];
+	struct rte_fastmem_stats stats;
+	uint32_t next_seed = 0xdeadbeef;
+	unsigned int n_workers = 0;
+	unsigned int lcore_id;
+	int rc;
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id)
+		n_workers++;
+
+	if (n_workers < MIXED_MIN_LCORES) {
+		printf("Not enough worker lcores (%u < %u), skipping\n",
+			n_workers, MIXED_MIN_LCORES);
+		return TEST_SKIPPED;
+	}
+
+	/* Start the workers; the slot is indexed by lcore id. */
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		struct mixed_worker_args *slot = &args[lcore_id];
+
+		slot->seed = next_seed;
+		slot->result = TEST_FAILED;
+		next_seed += 0x12345678;
+		rte_eal_remote_launch(mixed_worker, slot, lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+	/* Every worker must have overwritten the TEST_FAILED default. */
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		TEST_ASSERT_EQUAL(args[lcore_id].result, TEST_SUCCESS,
+			"worker on lcore %u failed", lcore_id);
+	}
+
+	/* Nothing may remain allocated once all workers are done. */
+	rc = rte_fastmem_stats(&stats);
+	TEST_ASSERT_EQUAL(rc, 0, "stats failed: %d", rc);
+	TEST_ASSERT_EQUAL(stats.bytes_in_use, (uint64_t)0,
+		"bytes_in_use not zero after test: %" PRIu64,
+		stats.bytes_in_use);
+
+	return TEST_SUCCESS;
+}
+
+
+/*
+ * Memory limit tests.
+ *
+ * FASTMEM_MEMZONE_SIZE is 128 MiB. We use a limit of 128 MiB
+ * (one memzone) for most tests, and large objects (256 KiB) to
+ * exhaust slabs quickly.
+ *
+ * NOTE(review): FASTMEM_MEMZONE_SIZE is defined by the library, not
+ * in this file; LIMIT_ONE_MZ must be kept in sync with it by hand.
+ */
+
+/* Per-socket limit used by the tests: 128 MiB, expressed in bytes. */
+#define LIMIT_ONE_MZ ((size_t)128 << 20)
+/* Size of the objects used to run into the limit: 256 KiB. */
+#define LIMIT_OBJ_SIZE ((size_t)256 * 1024)
+
+/*
+ * Set a limit of LIMIT_ONE_MZ on all sockets, read it back for the
+ * first socket, and check that a reservation of exactly the limit
+ * succeeds while a reservation of one byte more is refused.
+ *
+ * The socket queried is the one at index 0 rather than the literal
+ * id 0, matching how the other tests in this file pick a socket.
+ */
+static int
+test_memory_limit_basic(void)
+{
+	size_t got;
+	int socket_id;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	socket_id = rte_socket_id_by_idx(0);
+	TEST_ASSERT(socket_id >= 0, "no socket at index 0");
+
+	got = rte_fastmem_get_limit(socket_id);
+	TEST_ASSERT_EQUAL(got, LIMIT_ONE_MZ,
+		"get_memory_limit mismatch: %zu", got);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(rc, 0, "first reserve failed: %d", rc);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ + 1, SOCKET_ID_ANY);
+	TEST_ASSERT(rc < 0, "second reserve should have failed");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * With a limit of LIMIT_ONE_MZ in place, allocate LIMIT_OBJ_SIZE
+ * objects until allocation fails, check that the failure reports
+ * ENOMEM, and check that freeing one object makes one more
+ * allocation possible.
+ *
+ * The pointer array has a constant size (no VLA), and rte_errno is
+ * cleared before the loop and sampled right after it so that a stale
+ * ENOMEM from an earlier test cannot satisfy the check.
+ */
+static int
+test_memory_limit_alloc_exhaustion(void)
+{
+	enum { MAX_PTRS = 1024 };
+	void *ptrs[MAX_PTRS];
+	unsigned int count;
+	int alloc_errno;
+	void *p;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_limit failed: %d", rc);
+
+	rte_errno = 0;
+	for (count = 0; count < MAX_PTRS; count++) {
+		ptrs[count] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (ptrs[count] == NULL)
+			break;
+	}
+	alloc_errno = rte_errno;
+
+	TEST_ASSERT(count > 0, "should have allocated at least one");
+	TEST_ASSERT(count < MAX_PTRS, "should have hit the limit");
+	TEST_ASSERT_EQUAL(alloc_errno, ENOMEM, "expected ENOMEM, got %d",
+		alloc_errno);
+
+	/* Give one object back; the next allocation must then succeed. */
+	rte_fastmem_free(ptrs[count - 1]);
+	p = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+	TEST_ASSERT_NOT_NULL(p, "alloc after free should succeed");
+	rte_fastmem_free(p);
+
+	for (unsigned int i = 0; i < count - 1; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * With the limit set to 0 on all sockets, both an explicit one-byte
+ * reservation and a 64-byte allocation are expected to fail.
+ */
+static int
+test_memory_limit_zero_blocks_growth(void)
+{
+	void *obj;
+	int rc;
+
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 0);
+
+	rc = rte_fastmem_reserve(1, SOCKET_ID_ANY);
+	TEST_ASSERT(rc < 0, "reserve with limit=0 should fail");
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NULL(obj, "alloc with limit=0 should fail");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Reserve LIMIT_ONE_MZ first and only then lower the limit to one
+ * byte. Allocating from the backing memory that already exists is
+ * expected to keep working, while a larger reservation is expected
+ * to be refused.
+ */
+static int
+test_memory_limit_below_current(void)
+{
+	void *obj;
+	int rc;
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(rc, 0, "reserve failed: %d", rc);
+
+	/* Limit is now below what has already been reserved. */
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 1);
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc from existing backing should work");
+	rte_fastmem_free(obj);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ * 2, SOCKET_ID_ANY);
+	TEST_ASSERT(rc < 0, "growth beyond limit should fail");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A limit set with SOCKET_ID_ANY must be readable back from every
+ * socket reported by rte_socket_count() / rte_socket_id_by_idx().
+ */
+static int
+test_memory_limit_socket_id_any(void)
+{
+	const unsigned int n_sockets = rte_socket_count();
+	unsigned int idx;
+
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 42);
+
+	for (idx = 0; idx < n_sockets; idx++) {
+		const int socket_id = rte_socket_id_by_idx(idx);
+		const size_t limit = rte_fastmem_get_limit(socket_id);
+
+		TEST_ASSERT_EQUAL(limit, (size_t)42,
+			"socket %d limit mismatch: %zu", socket_id, limit);
+	}
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Set the limit to 0 and then to SIZE_MAX; a LIMIT_ONE_MZ reservation
+ * is expected to succeed afterwards, i.e. the earlier zero limit must
+ * not stick.
+ */
+static int
+test_memory_limit_unlimited(void)
+{
+	int rc;
+
+	/* Block growth first, then lift the restriction again. */
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 0);
+	rte_fastmem_set_limit(SOCKET_ID_ANY, SIZE_MAX);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(rc, 0, "reserve after reset failed: %d", rc);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Fill N_OBJS small objects with a per-object pattern, then allocate
+ * LIMIT_OBJ_SIZE objects until allocation fails (or EXTRA_MAX objects
+ * were obtained), and verify the small objects were not disturbed.
+ *
+ * The arrays have constant sizes (no VLAs) and the result of
+ * rte_fastmem_set_limit() is checked, since the rest of the test is
+ * meaningless without the limit in place.
+ */
+static int
+test_memory_limit_alloc_integrity_under_oom(void)
+{
+	enum { N_OBJS = 128, EXTRA_MAX = 1024 };
+	const size_t obj_size = 1024;
+	uint8_t *ptrs[N_OBJS];
+	void *extra[EXTRA_MAX];
+	unsigned int n_extra;
+	unsigned int i;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_limit failed: %d", rc);
+
+	for (i = 0; i < N_OBJS; i++) {
+		ptrs[i] = rte_fastmem_alloc(obj_size, 0, 0);
+		TEST_ASSERT_NOT_NULL(ptrs[i], "alloc[%u] failed", i);
+		memset(ptrs[i], (int)(i & 0xff), obj_size);
+	}
+
+	/* Exhaust remaining backing with large objects. */
+	for (n_extra = 0; n_extra < EXTRA_MAX; n_extra++) {
+		extra[n_extra] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (extra[n_extra] == NULL)
+			break;
+	}
+
+	/* Verify original objects are intact. */
+	for (i = 0; i < N_OBJS; i++) {
+		const uint8_t expected = (uint8_t)(i & 0xff);
+
+		for (unsigned int j = 0; j < obj_size; j++)
+			TEST_ASSERT_EQUAL(ptrs[i][j], expected,
+				"corruption at [%u][%u]", i, j);
+	}
+
+	for (i = 0; i < N_OBJS; i++)
+		rte_fastmem_free(ptrs[i]);
+	for (i = 0; i < n_extra; i++)
+		rte_fastmem_free(extra[i]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Drain the backing memory with LIMIT_OBJ_SIZE objects, give a few
+ * back, and check that a bulk allocation of BULK_N objects is refused
+ * because fewer than BULK_N objects are available.
+ *
+ * The arrays have constant sizes (no VLAs), the result of
+ * rte_fastmem_set_limit() is checked, and all objects - including the
+ * bulk ones, should the bulk call unexpectedly succeed - are released
+ * before the verdict is asserted, so a failing test leaks nothing.
+ */
+static int
+test_memory_limit_bulk_alloc_oom(void)
+{
+	enum { BULK_N = 64, DRAIN_MAX = 512 };
+	void *ptrs[BULK_N];
+	void *drain[DRAIN_MAX];
+	unsigned int drained;
+	unsigned int freed;
+	unsigned int i;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_limit failed: %d", rc);
+
+	for (drained = 0; drained < DRAIN_MAX; drained++) {
+		drain[drained] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (drain[drained] == NULL)
+			break;
+	}
+
+	/* Free a few: enough for some, but not for BULK_N objects. */
+	freed = RTE_MIN(drained, 4u);
+	for (i = 0; i < freed; i++)
+		rte_fastmem_free(drain[--drained]);
+
+	rc = rte_fastmem_alloc_bulk(ptrs, BULK_N, LIMIT_OBJ_SIZE, 0, 0);
+
+	/* Clean up before asserting. */
+	if (rc >= 0)
+		rte_fastmem_free_bulk(ptrs, BULK_N);
+	for (i = 0; i < drained; i++)
+		rte_fastmem_free(drain[i]);
+
+	TEST_ASSERT(rc < 0, "bulk alloc should fail");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate LIMIT_OBJ_SIZE objects until the limit is hit, free the
+ * first half of them and check that the same number of objects can
+ * be allocated again.
+ *
+ * The pointer array has a constant size (no VLA) and the result of
+ * rte_fastmem_set_limit() is checked.
+ */
+static int
+test_memory_limit_recovery_after_free(void)
+{
+	enum { MAX_PTRS = 512 };
+	void *ptrs[MAX_PTRS];
+	unsigned int count;
+	unsigned int half;
+	unsigned int i;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_limit failed: %d", rc);
+
+	for (count = 0; count < MAX_PTRS; count++) {
+		ptrs[count] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (ptrs[count] == NULL)
+			break;
+	}
+	TEST_ASSERT(count > 0 && count < MAX_PTRS,
+		"expected partial fill, got %u", count);
+
+	half = count / 2;
+	for (i = 0; i < half; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	/* The freed half must be available again. */
+	for (i = 0; i < half; i++) {
+		ptrs[i] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(ptrs[i], "recovery alloc[%u] failed", i);
+	}
+
+	for (i = 0; i < count; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Output of limit_worker(): the number of objects the worker managed
+ * to allocate, and TEST_SUCCESS or TEST_FAILED.
+ */
+struct limit_worker_args {
+ unsigned int alloc_count;
+ int result;
+};
+
+/*
+ * Worker body for test_memory_limit_multi_lcore_oom().
+ *
+ * Allocates up to MAX_PTRS objects of LIMIT_OBJ_SIZE bytes, stopping
+ * at the first failed allocation, and fills each with 0xab. Every
+ * object is then verified and freed. args->alloc_count receives the
+ * number of objects obtained and args->result TEST_SUCCESS or
+ * TEST_FAILED. Returns 0 on success and -1 if an object did not hold
+ * the expected pattern.
+ *
+ * The pointer array has a constant size (no VLA), and all objects are
+ * freed even after a mismatch has been found.
+ */
+static int
+limit_worker(void *arg)
+{
+	enum { MAX_PTRS = 128 };
+	struct limit_worker_args *args = arg;
+	void *ptrs[MAX_PTRS];
+	unsigned int n_allocated = 0;
+	int result = TEST_SUCCESS;
+
+	while (n_allocated < MAX_PTRS) {
+		void *obj = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+
+		if (obj == NULL)
+			break;
+		memset(obj, 0xab, LIMIT_OBJ_SIZE);
+		ptrs[n_allocated++] = obj;
+	}
+	args->alloc_count = n_allocated;
+
+	for (unsigned int j = 0; j < n_allocated; j++) {
+		const uint8_t *bytes = ptrs[j];
+
+		/* Stop scanning after the first mismatch, but keep freeing. */
+		for (size_t k = 0;
+				k < LIMIT_OBJ_SIZE && result == TEST_SUCCESS; k++) {
+			if (bytes[k] != 0xab)
+				result = TEST_FAILED;
+		}
+		rte_fastmem_free(ptrs[j]);
+	}
+
+	args->result = result;
+	return result == TEST_SUCCESS ? 0 : -1;
+}
+
+/*
+ * Run limit_worker() on all worker lcores concurrently with a limit
+ * of LIMIT_ONE_MZ in place, then check that every worker succeeded
+ * and that bytes_in_use is zero. Skipped with fewer than two worker
+ * lcores.
+ *
+ * The results of rte_fastmem_set_limit() and rte_fastmem_stats() are
+ * checked; in particular, stats is not read unless it was filled in.
+ */
+static int
+test_memory_limit_multi_lcore_oom(void)
+{
+	struct limit_worker_args args[RTE_MAX_LCORE];
+	struct rte_fastmem_stats stats;
+	unsigned int worker_count = 0;
+	unsigned int lcore_id;
+	int rc;
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id)
+		worker_count++;
+
+	if (worker_count < 2) {
+		printf("Not enough workers (%u < 2), skipping\n", worker_count);
+		return TEST_SKIPPED;
+	}
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_limit failed: %d", rc);
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		/* Default to failure; the worker overwrites it when done. */
+		args[lcore_id].result = TEST_FAILED;
+		rte_eal_remote_launch(limit_worker, &args[lcore_id], lcore_id);
+	}
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		TEST_ASSERT_EQUAL(args[lcore_id].result, TEST_SUCCESS,
+			"worker on lcore %u failed", lcore_id);
+	}
+
+	rc = rte_fastmem_stats(&stats);
+	TEST_ASSERT_EQUAL(rc, 0, "stats failed: %d", rc);
+	TEST_ASSERT_EQUAL(stats.bytes_in_use, (uint64_t)0,
+		"bytes_in_use not zero: %" PRIu64, stats.bytes_in_use);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Resizing an object to sizes the test expects to fall into the same
+ * size class (32 -> 33 -> 64 bytes; the 64 B class per the test's
+ * assumption) must hand back the very same pointer each time.
+ */
+static int
+test_realloc_same_class(void)
+{
+	void *first;
+	void *second;
+	void *third;
+
+	first = rte_fastmem_alloc(32, 0, 0);
+	TEST_ASSERT_NOT_NULL(first, "alloc failed");
+
+	/* One byte more than the original request. */
+	second = rte_fastmem_realloc(first, 33, 0);
+	TEST_ASSERT_NOT_NULL(second, "realloc failed");
+	TEST_ASSERT_EQUAL(first, second,
+		"realloc returned different pointer for same class");
+
+	/* Exactly at the expected class boundary. */
+	third = rte_fastmem_realloc(second, 64, 0);
+	TEST_ASSERT_NOT_NULL(third, "realloc failed");
+	TEST_ASSERT_EQUAL(second, third,
+		"realloc returned different pointer for same class");
+
+	rte_fastmem_free(third);
+	return TEST_SUCCESS;
+}
+
+/*
+ * Growing an object from 16 to 128 bytes must preserve its first
+ * 16 bytes.
+ */
+static int
+test_realloc_grow(void)
+{
+	enum { OLD_SIZE = 16, NEW_SIZE = 128 };
+	const uint8_t fill = 0xab;
+	const uint8_t *bytes;
+	unsigned int off;
+	void *before;
+	void *grown;
+
+	before = rte_fastmem_alloc(OLD_SIZE, 0, 0);
+	TEST_ASSERT_NOT_NULL(before, "alloc failed");
+	memset(before, fill, OLD_SIZE);
+
+	grown = rte_fastmem_realloc(before, NEW_SIZE, 0);
+	TEST_ASSERT_NOT_NULL(grown, "realloc grow failed");
+
+	/* The old contents must have been carried over. */
+	bytes = grown;
+	for (off = 0; off < OLD_SIZE; off++)
+		TEST_ASSERT_EQUAL(bytes[off], fill,
+			"content corrupted at byte %u", off);
+
+	rte_fastmem_free(grown);
+	return TEST_SUCCESS;
+}
+
+/*
+ * Shrinking an object from 256 to 16 bytes must preserve the first
+ * 16 bytes.
+ */
+static int
+test_realloc_shrink(void)
+{
+	enum { OLD_SIZE = 256, NEW_SIZE = 16 };
+	const uint8_t fill = 0xcd;
+	const uint8_t *bytes;
+	unsigned int off;
+	void *before;
+	void *shrunk;
+
+	before = rte_fastmem_alloc(OLD_SIZE, 0, 0);
+	TEST_ASSERT_NOT_NULL(before, "alloc failed");
+	memset(before, fill, OLD_SIZE);
+
+	shrunk = rte_fastmem_realloc(before, NEW_SIZE, 0);
+	TEST_ASSERT_NOT_NULL(shrunk, "realloc shrink failed");
+
+	/* Everything up to the new size must be unchanged. */
+	bytes = shrunk;
+	for (off = 0; off < NEW_SIZE; off++)
+		TEST_ASSERT_EQUAL(bytes[off], fill,
+			"content corrupted at byte %u", off);
+
+	rte_fastmem_free(shrunk);
+	return TEST_SUCCESS;
+}
+
+/*
+ * rte_fastmem_realloc() with a NULL pointer is expected to act as a
+ * plain allocation and return a usable object.
+ */
+static int
+test_realloc_null_ptr(void)
+{
+	void *obj;
+
+	obj = rte_fastmem_realloc(NULL, 64, 0);
+	TEST_ASSERT_NOT_NULL(obj, "realloc(NULL) failed");
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * rte_fastmem_realloc() with size 0 is expected to free the object
+ * and return NULL; the test therefore does not free it again.
+ */
+static int
+test_realloc_zero_size(void)
+{
+	void *obj;
+	void *resized;
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+
+	resized = rte_fastmem_realloc(obj, 0, 0);
+	TEST_ASSERT_NULL(resized, "realloc(size=0) should return NULL");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A resize request one byte above rte_fastmem_max_size() must fail
+ * with rte_errno set to E2BIG. The original object is freed
+ * afterwards, which exercises that it is still valid.
+ *
+ * rte_errno is cleared before the call so that a stale E2BIG left
+ * behind by an earlier test cannot satisfy the check.
+ */
+static int
+test_realloc_too_big(void)
+{
+	void *ptr;
+	void *ptr2;
+
+	ptr = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(ptr, "alloc failed");
+
+	rte_errno = 0;
+	ptr2 = rte_fastmem_realloc(ptr, rte_fastmem_max_size() + 1, 0);
+	TEST_ASSERT_NULL(ptr2, "realloc should fail for oversized request");
+	TEST_ASSERT_EQUAL(rte_errno, E2BIG, "expected E2BIG");
+
+	/* Original pointer should still be valid. */
+	rte_fastmem_free(ptr);
+	return TEST_SUCCESS;
+}
+
+/*
+ * A resize request with an alignment of 3 (not a power of two) must
+ * fail with rte_errno set to EINVAL. The original object is freed
+ * afterwards.
+ *
+ * rte_errno is cleared before the call so that a stale EINVAL left
+ * behind by an earlier test cannot satisfy the check.
+ */
+static int
+test_realloc_invalid_align(void)
+{
+	void *ptr;
+	void *ptr2;
+
+	ptr = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(ptr, "alloc failed");
+
+	rte_errno = 0;
+	ptr2 = rte_fastmem_realloc(ptr, 64, 3);
+	TEST_ASSERT_NULL(ptr2, "realloc should fail for non-power-of-2 align");
+	TEST_ASSERT_EQUAL(rte_errno, EINVAL, "expected EINVAL");
+
+	rte_fastmem_free(ptr);
+	return TEST_SUCCESS;
+}
+
+/*
+ * Handle-based allocation API.
+ */
+
+/*
+ * Look up a handle for 64-byte objects on the first socket and run
+ * the single-object and bulk handle calls once each on the calling
+ * lcore, including a NULL free.
+ */
+static int
+test_halloc_basic(void)
+{
+	rte_fastmem_handle_t handle;
+	void *bulk[16];
+	unsigned int idx;
+	void *obj;
+	int rc;
+
+	rc = rte_fastmem_hlookup(64, 0, rte_socket_id_by_idx(0), &handle);
+	TEST_ASSERT_EQUAL(rc, 0, "hlookup failed: %d", rc);
+
+	/* Single object: allocate, touch all 64 bytes, release. */
+	obj = rte_fastmem_halloc(handle, RTE_FASTMEM_F_ZERO);
+	TEST_ASSERT_NOT_NULL(obj, "halloc failed: rte_errno=%d", rte_errno);
+	memset(obj, 0x5a, 64);
+	rte_fastmem_hfree(handle, obj);
+
+	/* Freeing NULL through a handle is expected to be a no-op. */
+	rte_fastmem_hfree(handle, NULL);
+
+	/* Bulk: every slot must have been filled in. */
+	rc = rte_fastmem_halloc_bulk(handle, bulk, RTE_DIM(bulk), 0);
+	TEST_ASSERT_EQUAL(rc, 0, "halloc_bulk failed: %d", rc);
+	for (idx = 0; idx < RTE_DIM(bulk); idx++)
+		TEST_ASSERT_NOT_NULL(bulk[idx], "halloc_bulk[%u] NULL", idx);
+	rte_fastmem_hfree_bulk(handle, bulk, RTE_DIM(bulk));
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Argument block for halloc_worker(): the handle to allocate through
+ * (input) and TEST_SUCCESS or TEST_FAILED (output).
+ */
+struct halloc_worker_args {
+ rte_fastmem_handle_t handle;
+ int result;
+};
+
+/*
+ * Exercise a handle on a thread other than the one that looked it
+ * up: one halloc/hfree pair followed by one bulk alloc/free of eight
+ * objects. The calling thread has not used the handle's size class
+ * before, so the allocator cannot rely on state prepared by
+ * hlookup on this thread.
+ *
+ * args->result is TEST_FAILED until all steps have succeeded.
+ * Returns 0 on success and -1 on the first failure.
+ */
+static int
+halloc_worker(void *arg)
+{
+	struct halloc_worker_args *args = arg;
+	void *bulk[8];
+	unsigned int idx;
+	uint8_t *obj;
+
+	args->result = TEST_FAILED;
+
+	obj = rte_fastmem_halloc(args->handle, 0);
+	if (obj == NULL)
+		return -1;
+	/* The callers look the handle up for 64-byte objects. */
+	memset(obj, 0x3c, 64);
+	rte_fastmem_hfree(args->handle, obj);
+
+	if (rte_fastmem_halloc_bulk(args->handle, bulk, RTE_DIM(bulk), 0) < 0)
+		return -1;
+
+	for (idx = 0; idx < RTE_DIM(bulk); idx++) {
+		if (bulk[idx] == NULL)
+			return -1;
+	}
+
+	rte_fastmem_hfree_bulk(args->handle, bulk, RTE_DIM(bulk));
+
+	args->result = TEST_SUCCESS;
+	return 0;
+}
+
+/*
+ * Look a handle up on the main lcore and use it from the first
+ * worker lcore via halloc_worker(). Skipped when there is no worker
+ * lcore.
+ */
+static int
+test_halloc_other_lcore(void)
+{
+	struct halloc_worker_args args;
+	unsigned int worker_id;
+	int rc;
+
+	/* First lcore after "none", skipping main, without wrap-around. */
+	worker_id = rte_get_next_lcore(-1, 1, 0);
+	if (worker_id == RTE_MAX_LCORE)
+		return TEST_SKIPPED;
+
+	/* The lookup happens here, on the main lcore, only. */
+	rc = rte_fastmem_hlookup(64, 0, rte_socket_id_by_idx(0), &args.handle);
+	TEST_ASSERT_EQUAL(rc, 0, "hlookup failed: %d", rc);
+
+	args.result = TEST_FAILED;
+
+	rte_eal_remote_launch(halloc_worker, &args, worker_id);
+	rc = rte_eal_wait_lcore(worker_id);
+	TEST_ASSERT_EQUAL(rc, 0, "worker returned %d", rc);
+	TEST_ASSERT_EQUAL(args.result, TEST_SUCCESS,
+		"halloc/hfree failed on a lcore that did not call hlookup");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * rte_thread_create() entry point wrapping halloc_worker(): maps its
+ * 0 / non-zero result onto a thread exit value of 0 / 1.
+ */
+static uint32_t
+halloc_non_eal_main(void *arg)
+{
+	struct halloc_worker_args *args = arg;
+
+	if (halloc_worker(args) == 0)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Look a handle up on the main lcore and use it from a thread created
+ * with rte_thread_create(), i.e. one that is not an EAL lcore thread.
+ */
+static int
+test_halloc_non_eal_thread(void)
+{
+	struct halloc_worker_args args;
+	rte_thread_t thread_id;
+	int rc;
+
+	rc = rte_fastmem_hlookup(64, 0, rte_socket_id_by_idx(0), &args.handle);
+	TEST_ASSERT_EQUAL(rc, 0, "hlookup failed: %d", rc);
+
+	/* Stays TEST_FAILED unless the thread completes all steps. */
+	args.result = TEST_FAILED;
+
+	rc = rte_thread_create(&thread_id, NULL, halloc_non_eal_main, &args);
+	TEST_ASSERT_EQUAL(rc, 0, "rte_thread_create() failed: %d", rc);
+	rc = rte_thread_join(thread_id, NULL);
+	TEST_ASSERT_EQUAL(rc, 0, "rte_thread_join() failed: %d", rc);
+
+	TEST_ASSERT_EQUAL(args.result, TEST_SUCCESS,
+		"halloc/hfree failed on a non-EAL thread");
+
+	return TEST_SUCCESS;
+}
+
+/* Per-test setup: bring the library up; returns rte_fastmem_init()'s result. */
+static int
+fastmem_setup(void)
+{
+	const int rc = rte_fastmem_init();
+
+	return rc;
+}
+
+/* Per-test teardown: shut the library down again after each test case. */
+static void
+fastmem_teardown(void)
+{
+	rte_fastmem_deinit();
+}
+
+/*
+ * The functional test suite.
+ *
+ * There is no suite-level setup or teardown. The first seven cases
+ * are registered with TEST_CASE() and so run without the per-test
+ * fixture (presumably because they manage or deliberately omit
+ * library initialization themselves - their bodies are defined
+ * earlier in this file). All remaining cases are registered with
+ * TEST_CASE_ST() and run between fastmem_setup() and
+ * fastmem_teardown(), i.e. each gets a freshly initialized library.
+ */
+static struct unit_test_suite fastmem_testsuite = {
+ .suite_name = "fastmem tests",
+ .setup = NULL,
+ .teardown = NULL,
+ .unit_test_cases = {
+ /* Cases run without the init/deinit fixture. */
+ TEST_CASE(test_init_deinit),
+ TEST_CASE(test_init_is_not_idempotent),
+ TEST_CASE(test_deinit_without_init),
+ TEST_CASE(test_max_size),
+ TEST_CASE(test_reserve_without_init),
+ TEST_CASE(test_cache_flush_without_init),
+ TEST_CASE(test_classes),
+ /* Reservation. */
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_reserve_small),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_reserve_multiple_memzones),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_reserve_cumulative),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_reserve_invalid_socket),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_reserve_any_socket),
+ /* Single-object allocation and free. */
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_too_big),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_invalid_align),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_free_small),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_free_various_sizes),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_alignment),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_zero_flag),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_reuse),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_many_in_class),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_socket),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_block_repurposing),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_block_repurposing_no_growth),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_free_null),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_content_integrity),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_align_too_big),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_align_one),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_socket_numa_placement),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_cross_socket_deinit),
+ /* Caches and non-EAL threads. */
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_cache_flush),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_cache_exceeds_capacity),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_non_eal_thread),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_cache_flush_returns_memory),
+ /* Bulk allocation and free. */
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_bulk_basic),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_bulk_zero_flag),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_bulk_exceeds_cache),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_alloc_bulk_socket),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_free_bulk),
+ /* Statistics. */
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_stats_class),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_stats_lcore),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_stats_lcore_class),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_stats_reset),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_stats_survive_cache_flush),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_stats_count_non_eal),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_stats_shared_non_eal),
+ /* Multi-lcore stress. */
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_mixed_lifetimes_multi_lcore),
+ /* Memory limits. */
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_basic),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_alloc_exhaustion),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_zero_blocks_growth),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_below_current),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_socket_id_any),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_unlimited),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_alloc_integrity_under_oom),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_bulk_alloc_oom),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_recovery_after_free),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_memory_limit_multi_lcore_oom),
+ /* Realloc. */
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_realloc_same_class),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_realloc_grow),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_realloc_shrink),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_realloc_null_ptr),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_realloc_zero_size),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_realloc_too_big),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_realloc_invalid_align),
+ /* Handle-based API. */
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_halloc_basic),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_halloc_other_lcore),
+ TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+ test_halloc_non_eal_thread),
+ TEST_CASES_END()
+ }
+};
+
+/*
+ * Entry point of the functional test: runs every case listed in
+ * fastmem_testsuite and returns the suite runner's verdict.
+ */
+static int
+test_fastmem(void)
+{
+	const int rc = unit_test_suite_runner(&fastmem_testsuite);
+
+	return rc;
+}
+
+/* Make the suite available as the "fastmem_autotest" command. */
+REGISTER_FAST_TEST(fastmem_autotest, NOHUGE_SKIP, ASAN_OK, test_fastmem);
diff --git a/app/test/test_fastmem_perf.c b/app/test/test_fastmem_perf.c
new file mode 100644
index 0000000000..73c0a4c6ce
--- /dev/null
+++ b/app/test/test_fastmem_perf.c
@@ -0,0 +1,1040 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Ericsson AB
+ */
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_malloc.h>
+#include <rte_mempool.h>
+#include <rte_stdatomic.h>
+
+#include <rte_fastmem.h>
+
+#include "test.h"
+
+/* All benchmark output goes to stdout through printf(). */
+#define TEST_LOG(...) printf(__VA_ARGS__)
+
+/* Object sizes, in bytes, that the scenarios are run for. */
+static const size_t SIZES[] = { 8, 64, 256, 1024, 4096 };
+#define N_SIZES RTE_DIM(SIZES)
+
+/*
+ * Number of ops for warmup and measurement. Warmup iterations run
+ * the same loop body as the measurement but are not timed.
+ */
+#define WARMUP_OPS 20000u
+#define MEASURE_OPS 2000000u
+
+/* Buffer for scenarios that allocate N then free N. */
+#define BATCH_N 256
+
+/*
+ * Allocator vtable: a thin adapter exposing alloc / free /
+ * per-allocator setup/teardown. Each scenario calls these
+ * indirectly so the same timing loop serves all allocators.
+ *
+ * The object size is fixed by setup(); alloc() and alloc_bulk()
+ * take no size argument. The field order matters: the adapter table
+ * in this file initializes entries positionally.
+ */
+struct allocator {
+ /* Label of the allocator. */
+ const char *name;
+ /* Prepare for objects of the given size; < 0 is treated as failure. */
+ int (*setup)(size_t size, unsigned int n_max);
+ /* Undo setup(). */
+ void (*teardown)(void);
+ /* Allocate one object; NULL is treated as failure. */
+ void *(*alloc)(void);
+ /* Free one object obtained from alloc(). */
+ void (*free_obj)(void *ptr);
+ /* Allocate n objects into ptrs[]. */
+ int (*alloc_bulk)(void **ptrs, unsigned int n);
+ /* Free the n objects in ptrs[]. */
+ void (*free_bulk)(void **ptrs, unsigned int n);
+};
+
+/* Fastmem adapter -------------------------------------------------- */
+
+/* Object size used by fastmem_alloc(); set by fastmem_setup(). */
+static size_t fastmem_size;
+
+/* Remember the object size; there is nothing else to prepare. */
+static int
+fastmem_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	fastmem_size = size;
+
+	return 0;
+}
+
+/* Flush the fastmem cache once a scenario is finished. */
+static void
+fastmem_teardown(void)
+{
+	rte_fastmem_cache_flush();
+}
+
+/* Allocate one object of the configured size, no alignment or flags. */
+static void * __rte_noinline
+fastmem_alloc(void)
+{
+	void *obj = rte_fastmem_alloc(fastmem_size, 0, 0);
+
+	return obj;
+}
+
+/* Free one object obtained from fastmem_alloc(). */
+static void __rte_noinline
+fastmem_free(void *ptr)
+{
+	rte_fastmem_free(ptr);
+}
+
+/* Mempool adapter -------------------------------------------------- */
+
+/* Pool created by mempool_setup() and released by mempool_teardown(). */
+static struct rte_mempool *mempool_pool;
+
+/*
+ * Create a pool of `size`-byte objects named after the object size.
+ *
+ * The per-lcore cache is set to RTE_MEMPOOL_CACHE_MAX_SIZE, and the
+ * pool holds the n_max objects a scenario may keep outstanding plus
+ * twice the cache size as headroom for objects held in caches.
+ * Returns 0 on success and -1 (after logging) if creation fails.
+ */
+static int
+mempool_setup(size_t size, unsigned int n_max)
+{
+	const unsigned int cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
+	const unsigned int n_objs = n_max + cache_size * 2;
+	char name[RTE_MEMPOOL_NAMESIZE];
+
+	snprintf(name, sizeof(name), "fmperf_mp_%zu", size);
+
+	mempool_pool = rte_mempool_create(name, n_objs, size, cache_size, 0,
+			NULL, NULL, NULL, NULL, SOCKET_ID_ANY, 0);
+	if (mempool_pool != NULL)
+		return 0;
+
+	TEST_LOG("mempool_create(%zu) failed\n", size);
+	return -1;
+}
+
+/* Release the pool and forget the pointer. */
+static void
+mempool_teardown(void)
+{
+	rte_mempool_free(mempool_pool);
+	mempool_pool = NULL;
+}
+
+/* Get one object from the pool; NULL if the pool cannot supply one. */
+static void * __rte_noinline
+mempool_alloc_one(void)
+{
+	void *obj = NULL;
+
+	return rte_mempool_get(mempool_pool, &obj) < 0 ? NULL : obj;
+}
+
+/* Return one object to the pool. */
+static void __rte_noinline
+mempool_free_one(void *ptr)
+{
+	rte_mempool_put(mempool_pool, ptr);
+}
+
+/* rte_malloc adapter ----------------------------------------------- */
+
+/* Object size used by malloc_alloc(); set by malloc_setup(). */
+static size_t malloc_size;
+
+/* Remember the object size; there is nothing else to prepare. */
+static int
+malloc_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	malloc_size = size;
+
+	return 0;
+}
+
+/* Nothing to release for rte_malloc. */
+static void
+malloc_teardown(void)
+{
+}
+
+/* Allocate one object with rte_malloc(), untyped, alignment 0. */
+static void * __rte_noinline
+malloc_alloc(void)
+{
+	void *obj = rte_malloc(NULL, malloc_size, 0);
+
+	return obj;
+}
+
+/* Free one object obtained from malloc_alloc(). */
+static void __rte_noinline
+malloc_free(void *ptr)
+{
+	rte_free(ptr);
+}
+
+/* libc (glibc) malloc adapter -------------------------------------- */
+
+/* Rounded-up object size used by libc_alloc(); set by libc_setup(). */
+static size_t libc_size;
+
+/*
+ * Store the object size rounded up to a multiple of the cache line
+ * size. libc_alloc() requests cache-line aligned memory to keep the
+ * comparison with the other allocators fair, and aligned_alloc()
+ * requires the size to be a multiple of the alignment.
+ */
+static int
+libc_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	const size_t rounded = RTE_ALIGN_CEIL(size, RTE_CACHE_LINE_SIZE);
+
+	libc_size = rounded;
+	return 0;
+}
+
+/* Nothing to release for libc. */
+static void
+libc_teardown(void)
+{
+}
+
+/* Allocate one cache-line aligned object from the C library heap. */
+static void * __rte_noinline
+libc_alloc(void)
+{
+	void *obj = aligned_alloc(RTE_CACHE_LINE_SIZE, libc_size);
+
+	return obj;
+}
+
+/* Free one object obtained from libc_alloc(). */
+static void __rte_noinline
+libc_free(void *ptr)
+{
+	free(ptr);
+}
+
+/* Bulk adapters ---------------------------------------------------- */
+
+/* Allocate n objects of the size stored by fastmem_setup() in one call. */
+static int __rte_noinline
+fastmem_alloc_bulk(void **ptrs, unsigned int n)
+{
+	const int rc = rte_fastmem_alloc_bulk(ptrs, n, fastmem_size, 0, 0);
+
+	return rc;
+}
+
+/* Free n objects in one call. */
+static void __rte_noinline
+fastmem_free_bulk(void **ptrs, unsigned int n)
+{
+	rte_fastmem_free_bulk(ptrs, n);
+}
+
+/* Fastmem handle adapter ------------------------------------------- */
+
+/* Handle obtained by fastmem_h_setup() and used by all calls below. */
+static rte_fastmem_handle_t fastmem_handle;
+
+/*
+ * Look up the handle for `size`-byte objects on the caller's socket;
+ * returns rte_fastmem_hlookup()'s result.
+ */
+static int
+fastmem_h_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	const int rc = rte_fastmem_hlookup(size, 0, rte_socket_id(),
+			&fastmem_handle);
+
+	return rc;
+}
+
+/* Flush the fastmem cache once a scenario is finished. */
+static void
+fastmem_h_teardown(void)
+{
+	rte_fastmem_cache_flush();
+}
+
+/* Allocate one object through the handle, no flags. */
+static void * __rte_noinline
+fastmem_h_alloc(void)
+{
+	void *obj = rte_fastmem_halloc(fastmem_handle, 0);
+
+	return obj;
+}
+
+/* Free one object through the handle. */
+static void __rte_noinline
+fastmem_h_free(void *ptr)
+{
+	rte_fastmem_hfree(fastmem_handle, ptr);
+}
+
+/* Allocate n objects through the handle in one call, no flags. */
+static int __rte_noinline
+fastmem_h_alloc_bulk(void **ptrs, unsigned int n)
+{
+	const int rc = rte_fastmem_halloc_bulk(fastmem_handle, ptrs, n, 0);
+
+	return rc;
+}
+
+/* Free n objects through the handle in one call. */
+static void __rte_noinline
+fastmem_h_free_bulk(void **ptrs, unsigned int n)
+{
+	rte_fastmem_hfree_bulk(fastmem_handle, ptrs, n);
+}
+
+/* Mempool bulk adapter --------------------------------------------- */
+
+/* Get n objects from the pool in one call. */
+static int __rte_noinline
+mempool_alloc_bulk(void **ptrs, unsigned int n)
+{
+	const int rc = rte_mempool_get_bulk(mempool_pool, ptrs, n);
+
+	return rc;
+}
+
+/* Return n objects to the pool in one call. */
+static void __rte_noinline
+mempool_free_bulk(void **ptrs, unsigned int n)
+{
+	rte_mempool_put_bulk(mempool_pool, ptrs, n);
+}
+
+/*
+ * Bulk allocation for allocators without a native bulk call: invoke
+ * alloc_fn() n times, storing the results in ptrs[]. Returns 0 when
+ * all n allocations succeeded and -1 at the first NULL result; in
+ * that case the objects already stored in ptrs[] are not released
+ * here.
+ */
+static int __rte_noinline
+generic_alloc_bulk(void **ptrs, unsigned int n, void *(*alloc_fn)(void))
+{
+	void **slot = ptrs;
+	void **const end = ptrs + n;
+
+	while (slot != end) {
+		*slot = alloc_fn();
+		if (*slot == NULL)
+			return -1;
+		slot++;
+	}
+
+	return 0;
+}
+
+/* rte_malloc bulk allocation, emulated with n single allocations. */
+static int __rte_noinline
+malloc_alloc_bulk(void **ptrs, unsigned int n)
+{
+	const int rc = generic_alloc_bulk(ptrs, n, malloc_alloc);
+
+	return rc;
+}
+
+/* rte_malloc bulk free, emulated with n single frees. */
+static void __rte_noinline
+malloc_free_bulk(void **ptrs, unsigned int n)
+{
+	unsigned int idx = 0;
+
+	while (idx < n)
+		malloc_free(ptrs[idx++]);
+}
+
+/* libc bulk allocation, emulated with n single allocations. */
+static int __rte_noinline
+libc_alloc_bulk(void **ptrs, unsigned int n)
+{
+	const int rc = generic_alloc_bulk(ptrs, n, libc_alloc);
+
+	return rc;
+}
+
+/* libc bulk free, emulated with n single frees. */
+static void __rte_noinline
+libc_free_bulk(void **ptrs, unsigned int n)
+{
+	unsigned int idx = 0;
+
+	while (idx < n)
+		libc_free(ptrs[idx++]);
+}
+
+/* Adapter table ---------------------------------------------------- */
+
+/* One entry per allocator under test, in the order they are run. */
+static const struct allocator allocators[] = {
+	{
+		.name = "fastmem",
+		.setup = fastmem_setup,
+		.teardown = fastmem_teardown,
+		.alloc = fastmem_alloc,
+		.free_obj = fastmem_free,
+		.alloc_bulk = fastmem_alloc_bulk,
+		.free_bulk = fastmem_free_bulk,
+	},
+	{
+		.name = "fastmem_h",
+		.setup = fastmem_h_setup,
+		.teardown = fastmem_h_teardown,
+		.alloc = fastmem_h_alloc,
+		.free_obj = fastmem_h_free,
+		.alloc_bulk = fastmem_h_alloc_bulk,
+		.free_bulk = fastmem_h_free_bulk,
+	},
+	{
+		.name = "mempool",
+		.setup = mempool_setup,
+		.teardown = mempool_teardown,
+		.alloc = mempool_alloc_one,
+		.free_obj = mempool_free_one,
+		.alloc_bulk = mempool_alloc_bulk,
+		.free_bulk = mempool_free_bulk,
+	},
+	{
+		.name = "rte_malloc",
+		.setup = malloc_setup,
+		.teardown = malloc_teardown,
+		.alloc = malloc_alloc,
+		.free_obj = malloc_free,
+		.alloc_bulk = malloc_alloc_bulk,
+		.free_bulk = malloc_free_bulk,
+	},
+	{
+		.name = "libc",
+		.setup = libc_setup,
+		.teardown = libc_teardown,
+		.alloc = libc_alloc,
+		.free_obj = libc_free,
+		.alloc_bulk = libc_alloc_bulk,
+		.free_bulk = libc_free_bulk,
+	},
+};
+#define N_ALLOCATORS RTE_DIM(allocators)
+
+/*
+ * Scenario 1: tight alloc+free loop. One object is allocated and
+ * immediately freed, over and over; the comment on the original
+ * scenario describes this as the best case, with the LIFO path
+ * keeping the same pointer hot.
+ *
+ * The allocator is set up for a single object of 'size' bytes, run
+ * for WARMUP_OPS untimed rounds, and then timed over MEASURE_OPS
+ * rounds.
+ *
+ * Returns the mean TSC cycles per (alloc + free) pair, or -1.0 if
+ * setup or any allocation fails. Teardown runs on every path that
+ * got past a successful setup.
+ */
+static double
+run_tight(const struct allocator *alloc, size_t size)
+{
+	double result = -1.0;
+	uint64_t begin;
+	uint64_t elapsed;
+	unsigned int round;
+	void *obj;
+
+	if (alloc->setup(size, 1) < 0)
+		return result;
+
+	/* Untimed rounds first. */
+	for (round = 0; round < WARMUP_OPS; round++) {
+		obj = alloc->alloc();
+		if (obj == NULL)
+			goto out;
+		alloc->free_obj(obj);
+	}
+
+	/* Timed rounds. */
+	begin = rte_rdtsc_precise();
+	for (round = 0; round < MEASURE_OPS; round++) {
+		obj = alloc->alloc();
+		if (obj == NULL)
+			goto out;
+		alloc->free_obj(obj);
+	}
+	elapsed = rte_rdtsc_precise() - begin;
+
+	result = (double)elapsed / MEASURE_OPS;
+out:
+	alloc->teardown();
+	return result;
+}
+
+/*
+ * Scenario 2: allocate N, free N (FIFO free order). Intended to
+ * exercise cache refill and drain paths when N exceeds cache
+ * capacity.
+ *
+ * Each iteration allocates BATCH_N objects and then frees them in
+ * allocation order. The alloc and free phases are timed separately
+ * and accumulated across MEASURE_OPS / BATCH_N iterations, after
+ * WARMUP_OPS / BATCH_N untimed iterations.
+ *
+ * On success *cycles_alloc and *cycles_free receive the mean TSC
+ * cycles per object for the respective phase. If setup or any
+ * allocation fails both are left at -1.0.
+ */
+static void
+run_batch(const struct allocator *alloc, size_t size,
+	  double *cycles_alloc, double *cycles_free)
+{
+	void *ptrs[BATCH_N];
+	uint64_t tsc_alloc = 0, tsc_free = 0;
+	unsigned int iter, i;
+	unsigned int iters;
+
+	*cycles_alloc = -1.0;
+	*cycles_free = -1.0;
+
+	if (alloc->setup(size, BATCH_N) < 0)
+		return;
+
+	/* Pick iteration count so total ops ~= MEASURE_OPS. */
+	iters = MEASURE_OPS / BATCH_N;
+
+	/* Warmup. */
+	for (iter = 0; iter < WARMUP_OPS / BATCH_N; iter++) {
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		for (i = 0; i < BATCH_N; i++)
+			alloc->free_obj(ptrs[i]);
+	}
+
+	for (iter = 0; iter < iters; iter++) {
+		uint64_t t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		tsc_alloc += rte_rdtsc_precise() - t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = 0; i < BATCH_N; i++)
+			alloc->free_obj(ptrs[i]);
+		tsc_free += rte_rdtsc_precise() - t0;
+	}
+
+	alloc->teardown();
+
+	*cycles_alloc = (double)tsc_alloc / (iters * BATCH_N);
+	*cycles_free = (double)tsc_free / (iters * BATCH_N);
+	return;
+err:
+	/*
+	 * Only reached from inside an allocation loop, where 'i' is the
+	 * index of the failed slot: ptrs[0..i-1] are still live and must
+	 * be returned before the allocator is torn down.
+	 */
+	while (i > 0)
+		alloc->free_obj(ptrs[--i]);
+	alloc->teardown();
+}
+
+/*
+ * Scenario 3: allocate N, free N in reverse order.
+ *
+ * Same structure as run_batch(), except that each batch of BATCH_N
+ * objects is freed last-allocated-first. The alloc and free phases
+ * are timed separately over MEASURE_OPS / BATCH_N iterations, after
+ * WARMUP_OPS / BATCH_N untimed iterations.
+ *
+ * On success *cycles_alloc and *cycles_free receive the mean TSC
+ * cycles per object for the respective phase. If setup or any
+ * allocation fails both are left at -1.0.
+ */
+static void
+run_batch_reverse(const struct allocator *alloc, size_t size,
+		  double *cycles_alloc, double *cycles_free)
+{
+	void *ptrs[BATCH_N];
+	uint64_t tsc_alloc = 0, tsc_free = 0;
+	unsigned int iter, i;
+	unsigned int iters;
+
+	*cycles_alloc = -1.0;
+	*cycles_free = -1.0;
+
+	if (alloc->setup(size, BATCH_N) < 0)
+		return;
+
+	iters = MEASURE_OPS / BATCH_N;
+
+	/* Warmup. */
+	for (iter = 0; iter < WARMUP_OPS / BATCH_N; iter++) {
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		for (i = BATCH_N; i > 0; i--)
+			alloc->free_obj(ptrs[i - 1]);
+	}
+
+	for (iter = 0; iter < iters; iter++) {
+		uint64_t t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		tsc_alloc += rte_rdtsc_precise() - t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = BATCH_N; i > 0; i--)
+			alloc->free_obj(ptrs[i - 1]);
+		tsc_free += rte_rdtsc_precise() - t0;
+	}
+
+	alloc->teardown();
+
+	*cycles_alloc = (double)tsc_alloc / (iters * BATCH_N);
+	*cycles_free = (double)tsc_free / (iters * BATCH_N);
+	return;
+err:
+	/*
+	 * Only reached from inside an allocation loop, where 'i' is the
+	 * index of the failed slot: ptrs[0..i-1] are still live and must
+	 * be returned before the allocator is torn down.
+	 */
+	while (i > 0)
+		alloc->free_obj(ptrs[--i]);
+	alloc->teardown();
+}
+
+/*
+ * Scenario 4: multi-lcore alloc/work/free with a dummy-work
+ * baseline. Each worker runs a tight alloc → touch → free loop
+ * on its own lcore. A second run with the same dummy work but
+ * no allocator traffic establishes a baseline; the per-op
+ * allocator cost is reported as (alloc_run - baseline_run).
+ *
+ * Fixed size class and a fixed amount of dummy work per op —
+ * this scenario sweeps lcore count rather than size.
+ */
+/* Object size handed to the allocator's setup() by run_multi_lcore(). */
+#define MULTI_SIZE 256u
+/* Number of bytes of each object that touch_buffer() is asked to work on. */
+#define MULTI_WORK_BYTES 64u
+#define MULTI_WORK_PASSES 8u /* RMW passes over the work region. */
+/* Timed iterations per worker (worker_args.iters). */
+#define MULTI_OPS 200000u
+/* Untimed iterations per worker before measurement (worker_args.warmup). */
+#define MULTI_WARMUP 2000u
+/* Largest worker count a multi-lcore run uses; sizes the per-worker arrays. */
+#define MAX_MULTI_LCORES 32u
+
+/*
+ * Per-worker volatile sink. Each worker XORs its accumulator into
+ * its own slot (indexed by lcore id) once, after its measured loop.
+ * The volatile store keeps the compiler from discarding the
+ * touch_buffer() results that feed it.
+ *
+ * The type is cache-line aligned so that neighboring workers' slots
+ * do not share a cache line. The alignment attribute sits between
+ * the struct keyword and the tag, which is the placement DPDK asks
+ * for so the declaration works across supported toolchains.
+ */
+struct __rte_cache_aligned worker_sink {
+	volatile uint64_t value;
+};
+
+static struct worker_sink worker_sinks[RTE_MAX_LCORE];
+
+/*
+ * Out-of-line dummy workload used by the multi-lcore scenarios.
+ *
+ * Treats the first 'bytes' of 'buf' as an array of 64-bit words
+ * (any tail shorter than one word is ignored), seeds every word
+ * with an index-derived pattern and then makes MULTI_WORK_PASSES
+ * read-modify-write passes over it. Every pass consumes the values
+ * stored by the previous one, so the passes form a dependency chain
+ * and the cost is meant to grow linearly with MULTI_WORK_PASSES.
+ *
+ * Returns the XOR of every value written during the passes. Callers
+ * feed it into a volatile sink so the work cannot be optimized away.
+ *
+ * Kept __rte_noinline so the same code is executed whether the
+ * buffer is a pre-allocated scratch object (baseline run) or a
+ * freshly allocated one (alloc-path run); the baseline subtraction
+ * relies on that. The work is deliberately tunable so that
+ * per-iteration cost can be kept high relative to the allocator's
+ * critical section, letting the measured per-op cost of serialized
+ * allocators such as rte_malloc reflect their own work rather than
+ * lock queueing.
+ */
+static uint64_t __rte_noinline
+touch_buffer(void *buf, size_t bytes)
+{
+	uint64_t *const words = buf;
+	const size_t n_words = bytes / sizeof(uint64_t);
+	uint64_t digest = 0;
+	unsigned int round;
+	size_t w;
+
+	/* Seed: a known, index-dependent starting pattern. */
+	for (w = 0; w < n_words; w++)
+		words[w] = w * 0x9E3779B97F4A7C15ULL;
+
+	/*
+	 * Chained passes: load what the previous pass stored, mix in
+	 * the pass number, store it back and fold it into the digest.
+	 */
+	for (round = 0; round < MULTI_WORK_PASSES; round++) {
+		for (w = 0; w < n_words; w++) {
+			uint64_t mixed = words[w] * 0xC2B2AE3D27D4EB4FULL + round;
+
+			mixed ^= mixed >> 33;
+			words[w] = mixed;
+			digest ^= mixed;
+		}
+	}
+
+	return digest;
+}
+
+/*
+ * Per-worker argument block for worker_run() and worker_run_bulk().
+ * The launcher fills in the input fields before starting the worker;
+ * the worker resets and then writes the fields marked "out".
+ */
+struct worker_args {
+ /* Allocator whose callbacks the worker invokes. */
+ const struct allocator *alloc;
+ void *scratch; /* baseline only; NULL => alloc path */
+ /* Number of timed loop iterations. */
+ unsigned int iters;
+ /* Number of untimed iterations run before timing starts. */
+ unsigned int warmup;
+ unsigned int bulk_n; /* 0 = single-object, >0 = bulk */
+ RTE_ATOMIC(bool) start_flag; /* barrier at worker entry */
+ /* TSC cycles spent in the timed loop. */
+ uint64_t cycles; /* out */
+ /* Operations completed in the timed loop; divisor for 'cycles'. */
+ unsigned int ops; /* out */
+ /* 0 on success, -1 if an allocation failed. */
+ int err; /* out */
+};
+
+/*
+ * Single-object worker body, launched on a worker lcore with a
+ * struct worker_args as argument.
+ *
+ * After resetting the out fields it spins until start_flag is set,
+ * runs wa->warmup untimed iterations and then wa->iters timed ones.
+ * An iteration runs touch_buffer() on either wa->scratch (baseline
+ * run, no allocator calls) or on an object that is allocated before
+ * and freed after the touch (alloc-path run).
+ *
+ * An allocation failure during warmup sets wa->err and returns -1
+ * right away. A failure in the timed loop sets wa->err, stops the
+ * loop, and still records cycles/ops for the iterations completed;
+ * the function then returns 0.
+ */
+static int
+worker_run(void *arg)
+{
+	struct worker_args *wa = arg;
+	const unsigned int self = rte_lcore_id();
+	uint64_t digest = 0;
+	uint64_t begin;
+	unsigned int n;
+
+	wa->cycles = 0;
+	wa->ops = 0;
+	wa->err = 0;
+
+	/* Hold here until the launcher raises our start flag. */
+	while (!rte_atomic_load_explicit(&wa->start_flag,
+					 rte_memory_order_acquire))
+		rte_pause();
+
+	/* Untimed iterations. */
+	for (n = 0; n < wa->warmup; n++) {
+		void *obj = wa->scratch;
+		const bool from_alloc = (obj == NULL);
+
+		if (from_alloc) {
+			obj = wa->alloc->alloc();
+			if (obj == NULL) {
+				wa->err = -1;
+				return -1;
+			}
+		}
+		digest ^= touch_buffer(obj, MULTI_WORK_BYTES);
+		if (from_alloc)
+			wa->alloc->free_obj(obj);
+	}
+
+	/* Timed iterations. */
+	begin = rte_rdtsc_precise();
+	for (n = 0; n < wa->iters; n++) {
+		void *obj = wa->scratch;
+		const bool from_alloc = (obj == NULL);
+
+		if (from_alloc) {
+			obj = wa->alloc->alloc();
+			if (obj == NULL) {
+				wa->err = -1;
+				break;
+			}
+		}
+		digest ^= touch_buffer(obj, MULTI_WORK_BYTES);
+		if (from_alloc)
+			wa->alloc->free_obj(obj);
+	}
+	wa->cycles = rte_rdtsc_precise() - begin;
+	wa->ops = n;
+
+	/* Make the digest observable so the touches are not elided. */
+	worker_sinks[self].value ^= digest;
+
+	return 0;
+}
+
+/*
+ * Bulk worker body, launched on a worker lcore with a struct
+ * worker_args as argument.
+ *
+ * Each iteration obtains wa->bulk_n objects with alloc_bulk(), runs
+ * touch_buffer() on every one of them and returns them with
+ * free_bulk(). wa->warmup untimed iterations are followed by
+ * wa->iters timed ones. wa->ops counts objects, not iterations, so
+ * cycles / ops is a per-object figure.
+ *
+ * A bulk size larger than the on-stack pointer array is rejected
+ * with wa->err = -1 before any allocator call is made. A bulk
+ * allocation failure during warmup sets wa->err and returns -1; a
+ * failure in the timed loop sets wa->err, stops the loop, and still
+ * records cycles/ops for the iterations completed before returning 0.
+ */
+static int
+worker_run_bulk(void *arg)
+{
+	struct worker_args *wa = arg;
+	unsigned int lcore = rte_lcore_id();
+	void *ptrs[BATCH_N];
+	uint64_t acc = 0;
+	uint64_t t0;
+	unsigned int i, j;
+	unsigned int bulk_n = wa->bulk_n;
+
+	wa->err = 0;
+	wa->ops = 0;
+	wa->cycles = 0;
+
+	/* ptrs[] is the only storage for a bulk; never write past it. */
+	if (bulk_n > RTE_DIM(ptrs)) {
+		wa->err = -1;
+		return -1;
+	}
+
+	/* Wait for start flag (spin-barrier set by main). */
+	while (!rte_atomic_load_explicit(&wa->start_flag,
+					 rte_memory_order_acquire))
+		rte_pause();
+
+	/* Warmup. */
+	for (i = 0; i < wa->warmup; i++) {
+		if (wa->alloc->alloc_bulk(ptrs, bulk_n) < 0) {
+			wa->err = -1;
+			return -1;
+		}
+		for (j = 0; j < bulk_n; j++)
+			acc ^= touch_buffer(ptrs[j], MULTI_WORK_BYTES);
+		wa->alloc->free_bulk(ptrs, bulk_n);
+	}
+
+	/* Measured loop. */
+	t0 = rte_rdtsc_precise();
+	for (i = 0; i < wa->iters; i++) {
+		if (wa->alloc->alloc_bulk(ptrs, bulk_n) < 0) {
+			wa->err = -1;
+			break;
+		}
+		for (j = 0; j < bulk_n; j++)
+			acc ^= touch_buffer(ptrs[j], MULTI_WORK_BYTES);
+		wa->alloc->free_bulk(ptrs, bulk_n);
+	}
+	wa->cycles = rte_rdtsc_precise() - t0;
+	wa->ops = i * bulk_n;
+
+	/* Publish accumulator to defeat dead-code elimination. */
+	worker_sinks[lcore].value ^= acc;
+
+	return 0;
+}
+
+/*
+ * Launch workers on the first 'n_workers' worker lcores, run
+ * either the baseline (scratches != NULL, one scratch buffer per
+ * worker) or the alloc path (scratches == NULL), and return the
+ * mean per-op cycle cost averaged across the workers.
+ *
+ * bulk_n selects the worker body: 0 runs worker_run(), anything
+ * else runs worker_run_bulk() with that bulk size.
+ *
+ * Returns -1.0 when n_workers is 0 or exceeds MAX_MULTI_LCORES,
+ * when fewer than n_workers worker lcores exist, when a worker
+ * could not be launched, or when any worker reports an error or
+ * completed no operations.
+ */
+static double
+run_multi_workers(const struct allocator *alloc, unsigned int n_workers,
+		  void *const *scratches, unsigned int bulk_n)
+{
+	struct worker_args wargs[RTE_MAX_LCORE];
+	unsigned int worker_lcores[MAX_MULTI_LCORES];
+	lcore_function_t *fn = bulk_n > 0 ? worker_run_bulk : worker_run;
+	double sum_cycles_per_op = 0.0;
+	unsigned int n_launched = 0;
+	unsigned int n = 0;
+	unsigned int lcore_id;
+	unsigned int i;
+
+	/* Guards worker_lcores[] and the final division. */
+	if (n_workers == 0 || n_workers > RTE_DIM(worker_lcores))
+		return -1.0;
+
+	/* Collect the first n_workers worker lcores. */
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		if (n >= n_workers)
+			break;
+		worker_lcores[n++] = lcore_id;
+	}
+	if (n < n_workers)
+		return -1.0;
+
+	/* Prepare per-worker args. */
+	for (i = 0; i < n_workers; i++) {
+		struct worker_args *wa = &wargs[worker_lcores[i]];
+
+		wa->alloc = alloc;
+		wa->scratch = scratches != NULL ? scratches[i] : NULL;
+		wa->iters = MULTI_OPS;
+		wa->warmup = MULTI_WARMUP;
+		wa->bulk_n = bulk_n;
+		/*
+		 * Give the out fields defined values up front, so they
+		 * are never read uninitialized for a worker that did
+		 * not get to run.
+		 */
+		wa->cycles = 0;
+		wa->ops = 0;
+		wa->err = 0;
+		rte_atomic_store_explicit(&wa->start_flag, false,
+					  rte_memory_order_relaxed);
+	}
+
+	/* Launch workers. They spin on start_flag until released. */
+	for (i = 0; i < n_workers; i++) {
+		if (rte_eal_remote_launch(fn, &wargs[worker_lcores[i]],
+					  worker_lcores[i]) != 0)
+			break;
+		n_launched++;
+	}
+
+	/*
+	 * Release and reap exactly the workers that were started, also
+	 * when a later launch failed: they are spinning on their flag
+	 * and reference wargs[], which lives on this stack frame.
+	 */
+	for (i = 0; i < n_launched; i++)
+		rte_atomic_store_explicit(
+			&wargs[worker_lcores[i]].start_flag, true,
+			rte_memory_order_release);
+
+	/* Wait for completion. */
+	for (i = 0; i < n_launched; i++)
+		rte_eal_wait_lcore(worker_lcores[i]);
+
+	if (n_launched < n_workers)
+		return -1.0;
+
+	/* Aggregate: mean cycles per op across workers. */
+	for (i = 0; i < n_workers; i++) {
+		const struct worker_args *wa = &wargs[worker_lcores[i]];
+
+		if (wa->err != 0 || wa->ops == 0)
+			return -1.0;
+		sum_cycles_per_op += (double)wa->cycles / (double)wa->ops;
+	}
+
+	return sum_cycles_per_op / n_workers;
+}
+
+/*
+ * One sub-run of the multi-lcore scenarios: given an allocator and
+ * a worker count, produce (baseline, alloc_path) mean cycles per op.
+ *
+ * The allocator is set up for MULTI_SIZE-byte objects. The baseline
+ * run gives each worker one pre-allocated scratch object and always
+ * uses the single-object worker body; the alloc-path run has the
+ * workers allocate and free on every iteration, in bulks of
+ * 'bulk_n' when bulk_n > 0.
+ *
+ * Both outputs are -1.0 when n_workers is 0 or exceeds
+ * MAX_MULTI_LCORES, or when setup or a scratch allocation fails.
+ * Otherwise each output carries whatever run_multi_workers()
+ * returned for its run, which is -1.0 on a worker error.
+ */
+static void
+run_multi_lcore(const struct allocator *alloc, unsigned int n_workers,
+		unsigned int bulk_n, double *baseline, double *alloc_path)
+{
+	void *scratches[MAX_MULTI_LCORES] = {0};
+	unsigned int n_alloced = 0;
+	unsigned int i;
+
+	*baseline = -1.0;
+	*alloc_path = -1.0;
+
+	/* scratches[] holds one slot per worker; refuse to overrun it. */
+	if (n_workers == 0 || n_workers > RTE_DIM(scratches))
+		return;
+
+	if (alloc->setup(MULTI_SIZE, n_workers * 64) < 0)
+		return;
+
+	/* Baseline: pre-allocate one scratch per worker. */
+	for (i = 0; i < n_workers; i++) {
+		scratches[i] = alloc->alloc();
+		if (scratches[i] == NULL)
+			goto out;
+		n_alloced++;
+	}
+
+	*baseline = run_multi_workers(alloc, n_workers, scratches, 0);
+
+	for (i = 0; i < n_alloced; i++)
+		alloc->free_obj(scratches[i]);
+	n_alloced = 0;
+
+	/* Alloc path: workers alloc+free each iter. */
+	*alloc_path = run_multi_workers(alloc, n_workers, NULL, bulk_n);
+out:
+	/* Non-empty only when a scratch allocation failed part-way. */
+	for (i = 0; i < n_alloced; i++)
+		alloc->free_obj(scratches[i]);
+	alloc->teardown();
+}
+
+/* Reporting -------------------------------------------------------- */
+
+/*
+ * Print a titled table header for the per-size scenarios: the title
+ * line, then an "allocator" column followed by one column per entry
+ * in SIZES.
+ */
+static void
+print_header(const char *title)
+{
+	size_t col = 0;
+
+	TEST_LOG("\n=== %s ===\n", title);
+	TEST_LOG("%-12s", "allocator");
+	while (col < N_SIZES) {
+		TEST_LOG(" %10zu B", SIZES[col]);
+		col++;
+	}
+	TEST_LOG("\n");
+}
+
+/*
+ * Print one table row for the per-size scenarios: the allocator
+ * name followed by N_SIZES values. A negative value marks a failed
+ * measurement and is rendered as "--".
+ */
+static void
+print_row(const char *name, const double *values)
+{
+	size_t col;
+
+	TEST_LOG("%-12s", name);
+	for (col = 0; col < N_SIZES; col++) {
+		const double v = values[col];
+
+		if (v < 0) {
+			TEST_LOG(" %12s", "--");
+			continue;
+		}
+		TEST_LOG(" %12.1f", v);
+	}
+	TEST_LOG("\n");
+}
+
+/*
+ * Print a titled table header for the multi-lcore scenarios: the
+ * title line, then an "allocator" column followed by one column per
+ * entry in 'lcore_counts' ("lcore" for a count of 1, "lcores"
+ * otherwise).
+ */
+static void
+print_multi_header(const char *title, const unsigned int *lcore_counts,
+		   unsigned int n_counts)
+{
+	unsigned int col = 0;
+
+	TEST_LOG("\n=== %s ===\n", title);
+	TEST_LOG("%-12s", "allocator");
+	while (col < n_counts) {
+		const unsigned int count = lcore_counts[col];
+		const char plural = (count == 1) ? ' ' : 's';
+
+		TEST_LOG(" %8u lcore%c", count, plural);
+		col++;
+	}
+	TEST_LOG("\n");
+}
+
+/*
+ * Print one table row for the multi-lcore scenarios: the allocator
+ * name followed by 'n_counts' values. A negative value marks a
+ * failed measurement and is rendered as "--".
+ */
+static void
+print_multi_row(const char *name, const double *values, unsigned int n_counts)
+{
+	unsigned int col;
+
+	TEST_LOG("%-12s", name);
+	for (col = 0; col < n_counts; col++) {
+		const double v = values[col];
+
+		if (v < 0) {
+			TEST_LOG(" %14s", "--");
+			continue;
+		}
+		TEST_LOG(" %14.1f", v);
+	}
+	TEST_LOG("\n");
+}
+
+/* Driver ----------------------------------------------------------- */
+
+static int
+test_fastmem_perf(void)
+{
+ size_t i;
+ size_t a;
+ int rc;
+
+ rc = rte_fastmem_init();
+ if (rc < 0) {
+ TEST_LOG("rte_fastmem_init() failed: %d\n", rc);
+ return -1;
+ }
+
+ rc = rte_fastmem_reserve(128 * 1024 * 1024, SOCKET_ID_ANY);
+ if (rc < 0) {
+ TEST_LOG("rte_fastmem_reserve() failed: %d\n", rc);
+ rte_fastmem_deinit();
+ return -1;
+ }
+
+ TEST_LOG("\nfastmem performance — single-lcore, fixed-size\n");
+ TEST_LOG("All numbers are TSC cycles.\n");
+
+ /* Scenario 1: tight alloc+free. */
+ print_header("Scenario 1: Single-object hot path — cycles per (alloc + free)");
+ for (a = 0; a < N_ALLOCATORS; a++) {
+ double vals[N_SIZES];
+
+ for (i = 0; i < N_SIZES; i++)
+ vals[i] = run_tight(&allocators[a], SIZES[i]);
+ print_row(allocators[a].name, vals);
+ }
+
+ /* Scenario 2: batched, FIFO free. */
+ print_header("Scenario 2: Batch alloc, FIFO free — cycles per alloc");
+ for (a = 0; a < N_ALLOCATORS; a++) {
+ double vals_alloc[N_SIZES], vals_free[N_SIZES];
+
+ for (i = 0; i < N_SIZES; i++)
+ run_batch(&allocators[a], SIZES[i],
+ &vals_alloc[i], &vals_free[i]);
+ print_row(allocators[a].name, vals_alloc);
+ }
+ print_header("Scenario 2: Batch alloc, FIFO free — cycles per free");
+ for (a = 0; a < N_ALLOCATORS; a++) {
+ double vals_alloc[N_SIZES], vals_free[N_SIZES];
+
+ for (i = 0; i < N_SIZES; i++)
+ run_batch(&allocators[a], SIZES[i],
+ &vals_alloc[i], &vals_free[i]);
+ print_row(allocators[a].name, vals_free);
+ }
+
+ /* Scenario 3: batched, reverse free. */
+ print_header("Scenario 3: Batch alloc, LIFO free — cycles per alloc");
+ for (a = 0; a < N_ALLOCATORS; a++) {
+ double vals_alloc[N_SIZES], vals_free[N_SIZES];
+
+ for (i = 0; i < N_SIZES; i++)
+ run_batch_reverse(&allocators[a], SIZES[i],
+ &vals_alloc[i], &vals_free[i]);
+ print_row(allocators[a].name, vals_alloc);
+ }
+ print_header("Scenario 3: Batch alloc, LIFO free — cycles per free");
+ for (a = 0; a < N_ALLOCATORS; a++) {
+ double vals_alloc[N_SIZES], vals_free[N_SIZES];
+
+ for (i = 0; i < N_SIZES; i++)
+ run_batch_reverse(&allocators[a], SIZES[i],
+ &vals_alloc[i], &vals_free[i]);
+ print_row(allocators[a].name, vals_free);
+ }
+
+ /* Scenario 4: multi-lcore alloc/work/free with baseline. */
+ {
+ unsigned int max_workers = rte_lcore_count() - 1;
+ unsigned int lcore_counts[8];
+ unsigned int n_counts = 0;
+ unsigned int w;
+ double base_vals[N_ALLOCATORS][8];
+ double alloc_vals[N_ALLOCATORS][8];
+ double delta_vals[N_ALLOCATORS][8];
+
+ if (max_workers > MAX_MULTI_LCORES)
+ max_workers = MAX_MULTI_LCORES;
+
+ /* Sweep lcore counts: 1, 2, 4, 8, ... up to max_workers. */
+ for (w = 1; w <= max_workers && n_counts < RTE_DIM(lcore_counts); w *= 2)
+ lcore_counts[n_counts++] = w;
+ /* Ensure max_workers is the final column if not power of two. */
+ if (n_counts > 0 && lcore_counts[n_counts - 1] != max_workers &&
+ n_counts < RTE_DIM(lcore_counts) && max_workers >= 1)
+ lcore_counts[n_counts++] = max_workers;
+
+ if (n_counts == 0) {
+ TEST_LOG("\nScenario 4 (Multi-lcore contention) skipped: no worker lcores available.\n");
+ } else {
+ TEST_LOG("\nScenario 4 parameters: size=%u B\n",
+ MULTI_SIZE);
+
+ for (a = 0; a < N_ALLOCATORS; a++) {
+ unsigned int c;
+
+ for (c = 0; c < n_counts; c++)
+ run_multi_lcore(&allocators[a], lcore_counts[c],
+ 0, &base_vals[a][c],
+ &alloc_vals[a][c]);
+ for (c = 0; c < n_counts; c++) {
+ if (base_vals[a][c] < 0 || alloc_vals[a][c] < 0)
+ delta_vals[a][c] = -1.0;
+ else
+ delta_vals[a][c] = alloc_vals[a][c] -
+ base_vals[a][c];
+ }
+ }
+
+ TEST_LOG("Baseline (domain logic only): %.1f cycles/op\n",
+ base_vals[0][0]);
+
+ print_multi_header("Scenario 4: Multi-lcore contention — allocator overhead (cycles/op)",
+ lcore_counts, n_counts);
+ for (a = 0; a < N_ALLOCATORS; a++)
+ print_multi_row(allocators[a].name,
+ delta_vals[a], n_counts);
+ }
+ }
+
+ /* Scenario 5: multi-lcore bulk alloc/work/free. */
+ {
+ unsigned int max_workers = rte_lcore_count() - 1;
+ unsigned int lcore_counts[8];
+ unsigned int n_counts = 0;
+ unsigned int w;
+ double base_vals[N_ALLOCATORS][8];
+ double alloc_vals[N_ALLOCATORS][8];
+ double delta_vals[N_ALLOCATORS][8];
+ unsigned int bulk_n = 8;
+
+ if (max_workers > MAX_MULTI_LCORES)
+ max_workers = MAX_MULTI_LCORES;
+
+ for (w = 1; w <= max_workers && n_counts < RTE_DIM(lcore_counts); w *= 2)
+ lcore_counts[n_counts++] = w;
+ if (n_counts > 0 && lcore_counts[n_counts - 1] != max_workers &&
+ n_counts < RTE_DIM(lcore_counts) && max_workers >= 1)
+ lcore_counts[n_counts++] = max_workers;
+
+ if (n_counts == 0) {
+ TEST_LOG("\nScenario 5 (Multi-lcore bulk contention) skipped: no worker lcores available.\n");
+ } else {
+ TEST_LOG("\nScenario 5 parameters: size=%u B, "
+ "bulk=%u\n",
+ MULTI_SIZE, bulk_n);
+
+ for (size_t a = 0; a < N_ALLOCATORS; a++) {
+ unsigned int c;
+
+ for (c = 0; c < n_counts; c++)
+ run_multi_lcore(&allocators[a],
+ lcore_counts[c], bulk_n,
+ &base_vals[a][c],
+ &alloc_vals[a][c]);
+ for (c = 0; c < n_counts; c++) {
+ if (base_vals[a][c] < 0 || alloc_vals[a][c] < 0)
+ delta_vals[a][c] = -1.0;
+ else
+ delta_vals[a][c] = alloc_vals[a][c] -
+ base_vals[a][c];
+ }
+ }
+
+ TEST_LOG("Baseline (domain logic only): %.1f cycles/op\n",
+ base_vals[0][0]);
+
+ print_multi_header("Scenario 5: Multi-lcore bulk contention — allocator overhead (cycles/op)",
+ lcore_counts, n_counts);
+ for (size_t a = 0; a < N_ALLOCATORS; a++)
+ print_multi_row(allocators[a].name,
+ delta_vals[a], n_counts);
+ }
+ }
+
+ TEST_LOG("\n");
+ rte_fastmem_deinit();
+ return 0;
+}
+
+REGISTER_PERF_TEST(fastmem_perf_autotest, test_fastmem_perf);
diff --git a/app/test/test_fastmem_profile.c b/app/test/test_fastmem_profile.c
new file mode 100644
index 0000000000..9a5dc94018
--- /dev/null
+++ b/app/test/test_fastmem_profile.c
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Ericsson AB
+ */
+
+/*
+ * A minimal fastmem workload intended for use with perf record /
+ * perf report. Runs a tight alloc/free loop for a fixed duration
+ * so that sampling profilers can attribute cycles to individual
+ * functions and instructions within the fastmem hot path.
+ *
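+ * Usage:
+ *   perf record -g -- dpdk-test --no-huge --no-pci -m 8192 \
+ *       -l 0 <<< fastmem_profile_cache_hit_autotest
+ *   perf report
+ *
+ * Substitute fastmem_profile_cache_miss_autotest to profile the
+ * cache refill/drain workload instead.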
+ */
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_lcore.h>
+#include <rte_memory.h>
+
+#include <rte_fastmem.h>
+
+#include "test.h"
+
+/*
+ * Duration of each sub-test in TSC cycles: three times the TSC
+ * frequency reported by rte_get_tsc_hz(), i.e. about 3 seconds of
+ * run time independent of the actual clock rate. Evaluated at each
+ * use, since it expands to a function call.
+ */
+#define PROFILE_DURATION_CYCLES (3ULL * rte_get_tsc_hz())
+
+/* Allocation size for the profiling workload. */
+#define PROFILE_SIZE 256u
+
+/*
+ * Sub-test 1: tight alloc+free of a single PROFILE_SIZE object,
+ * repeated until PROFILE_DURATION_CYCLES have elapsed. Intended to
+ * stay within the per-lcore cache (no bin interaction after
+ * warmup).
+ *
+ * Prints the number of completed alloc+free pairs and returns 0, or
+ * returns -1 without printing if an allocation fails.
+ */
+static int
+profile_cache_hit(void)
+{
+	const uint64_t stop_at = rte_rdtsc() + PROFILE_DURATION_CYCLES;
+	uint64_t completed;
+
+	for (completed = 0; rte_rdtsc() < stop_at; completed++) {
+		void *obj = rte_fastmem_alloc(PROFILE_SIZE, 0, 0);
+
+		if (obj == NULL)
+			return -1;
+		rte_fastmem_free(obj);
+	}
+
+	printf(" cache_hit: %" PRIu64 " ops\n", completed);
+	return 0;
+}
+
+/*
+ * Sub-test 2: alloc N then free N, where N (PROFILE_BATCH) is
+ * chosen to exceed the cache capacity. The intent is to force
+ * repeated cache refills and drains, exercising the bin lock and
+ * slab free-list traversal.
+ *
+ * Runs whole batches until PROFILE_DURATION_CYCLES have elapsed,
+ * then prints the number of objects cycled and returns 0. If an
+ * allocation fails, the objects already obtained in the current
+ * batch are freed and -1 is returned without printing.
+ */
+#define PROFILE_BATCH 256u
+
+static int
+profile_cache_miss(void)
+{
+	void *ptrs[PROFILE_BATCH];
+	uint64_t deadline;
+	uint64_t ops = 0;
+	unsigned int i;
+
+	deadline = rte_rdtsc() + PROFILE_DURATION_CYCLES;
+
+	while (rte_rdtsc() < deadline) {
+		for (i = 0; i < PROFILE_BATCH; i++) {
+			ptrs[i] = rte_fastmem_alloc(PROFILE_SIZE, 0, 0);
+			if (ptrs[i] == NULL) {
+				/* Hand back ptrs[0..i-1] before bailing out. */
+				while (i > 0)
+					rte_fastmem_free(ptrs[--i]);
+				return -1;
+			}
+		}
+		for (i = 0; i < PROFILE_BATCH; i++)
+			rte_fastmem_free(ptrs[i]);
+		ops += PROFILE_BATCH;
+	}
+
+	printf(" cache_miss: %" PRIu64 " ops\n", ops);
+	return 0;
+}
+
+/*
+ * Common driver for the profiling tests: initialize fastmem,
+ * reserve 128 MiB, announce the workload named by 'label', run
+ * 'workload' and deinitialize fastmem again.
+ *
+ * Returns 0 when the workload returned a non-negative value, -1 if
+ * initialization, the reservation or the workload failed. fastmem
+ * is deinitialized on every path that got past a successful init.
+ */
+static int
+run_profile_workload(const char *label, int (*workload)(void))
+{
+	int rc;
+
+	rc = rte_fastmem_init();
+	if (rc < 0) {
+		printf("rte_fastmem_init() failed: %d\n", rc);
+		return -1;
+	}
+
+	rc = rte_fastmem_reserve(128 * 1024 * 1024, SOCKET_ID_ANY);
+	if (rc < 0) {
+		printf("rte_fastmem_reserve() failed: %d\n", rc);
+		rte_fastmem_deinit();
+		return -1;
+	}
+
+	printf("fastmem profile: %s workload (size=%u, ~%u s)\n",
+	       label, PROFILE_SIZE, 3u);
+
+	rc = workload() < 0 ? -1 : 0;
+
+	rte_fastmem_deinit();
+	return rc;
+}
+
+/* Profiling test: tight alloc+free workload (profile_cache_hit()). */
+static int
+test_fastmem_profile_cache_hit(void)
+{
+	return run_profile_workload("cache-hit", profile_cache_hit);
+}
+
+/* Profiling test: batched alloc/free workload (profile_cache_miss()). */
+static int
+test_fastmem_profile_cache_miss(void)
+{
+	return run_profile_workload("cache-miss", profile_cache_miss);
+}
+
+REGISTER_PERF_TEST(fastmem_profile_cache_hit_autotest,
+		test_fastmem_profile_cache_hit);
+REGISTER_PERF_TEST(fastmem_profile_cache_miss_autotest,
+		test_fastmem_profile_cache_miss);
--
2.43.0
More information about the dev
mailing list