[RFC v4 3/3] app/test: add fastmem test suite

Mattias Rönnblom hofors at lysator.liu.se
Sat May 30 11:26:34 CEST 2026
Previous message (by thread): [RFC v4 2/3] lib: add fastmem library
Next message (by thread): [RFC v3 2/3] lib: add fastmem library
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
Add functional, performance, and profiling test suites for the
fastmem library.

--

RFC v4:
 * Add tests for handle alloc/free from uncached lcores and
   non-EAL threads.
 * Add tests that statistics survive cache flush.
 * Add test for shared-cache statistics.
 * Refactor tests to use per-test setup/teardown.

RFC v3:
 * Add realloc test cases (same class, grow, shrink, NULL ptr,
   zero size, too big, invalid align).
 * Merge lifecycle and functional test suites into one.
 * Suppress -Wuse-after-free in test_alloc_reuse (intentional
   pointer comparison after free).

RFC v2:
 * Add test_alloc_cross_socket_deinit exercising cross-socket
   teardown path.
 * Remove trailing double blank lines in test_fastmem.c.

Signed-off-by: Mattias Rönnblom <hofors at lysator.liu.se>
---
 app/test/meson.build            |    3 +
 app/test/test_fastmem.c         | 2111 +++++++++++++++++++++++++++++++
 app/test/test_fastmem_perf.c    | 1040 +++++++++++++++
 app/test/test_fastmem_profile.c |  157 +++
 4 files changed, 3311 insertions(+)
 create mode 100644 app/test/test_fastmem.c
 create mode 100644 app/test/test_fastmem_perf.c
 create mode 100644 app/test/test_fastmem_profile.c

diff --git a/app/test/meson.build b/app/test/meson.build
index 3f9340f2f5..fe375e97f3 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -82,6 +82,9 @@ source_file_deps = {
     'test_event_vector_adapter.c': ['eventdev', 'bus_vdev'],
     'test_eventdev.c': ['eventdev', 'bus_vdev'],
     'test_external_mem.c': [],
+    'test_fastmem.c': ['fastmem'],
+    'test_fastmem_perf.c': ['fastmem', 'mempool'],
+    'test_fastmem_profile.c': ['fastmem'],
     'test_fbarray.c': [],
     'test_fib.c': ['net', 'fib'],
     'test_fib6.c': ['rib', 'fib'],
diff --git a/app/test/test_fastmem.c b/app/test/test_fastmem.c
new file mode 100644
index 0000000000..24ba1e671a
--- /dev/null
+++ b/app/test/test_fastmem.c
@@ -0,0 +1,2111 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Ericsson AB
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdalign.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_thread.h>
+
+#include <rte_fastmem.h>
+
+#include "test.h"
+
+#define FASTMEM_MEMZONE_SIZE (128U << 20)
+
+/*
+ * Count memzones whose names begin with the fastmem prefix.
+ * Used to verify that rte_fastmem_reserve() really did reserve
+ * backing memzones.
+ *
+ * This file-scope variable is the accumulator for that count: it is
+ * zeroed by count_fastmem_memzones() and incremented by the walk
+ * callback count_fastmem_memzones_walk() for each matching memzone.
+ */
+static int fastmem_memzone_count;
+
+/*
+ * rte_memzone_walk() callback: bump fastmem_memzone_count when the
+ * memzone's name starts with "fastmem_". The user argument is unused.
+ */
+static void
+count_fastmem_memzones_walk(const struct rte_memzone *mz, void *arg)
+{
+	static const char prefix[] = "fastmem_";
+
+	RTE_SET_USED(arg);
+
+	/* sizeof() includes the terminating NUL, so compare one less. */
+	if (strncmp(mz->name, prefix, sizeof(prefix) - 1) != 0)
+		return;
+
+	fastmem_memzone_count++;
+}
+
+/*
+ * Walk all memzones and return how many have a name starting with
+ * "fastmem_". The tally is rebuilt from zero on every call.
+ */
+static unsigned int
+count_fastmem_memzones(void)
+{
+	fastmem_memzone_count = 0;
+
+	rte_memzone_walk(count_fastmem_memzones_walk, NULL);
+
+	return (unsigned int)fastmem_memzone_count;
+}
+
+/*
+ * Run two back-to-back init/deinit cycles; both inits must return 0.
+ */
+static int
+test_init_deinit(void)
+{
+	int ret = rte_fastmem_init();
+
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_init() failed: %d", ret);
+	rte_fastmem_deinit();
+
+	/* The library must be usable again once it has been torn down. */
+	ret = rte_fastmem_init();
+	TEST_ASSERT_EQUAL(ret, 0, "second rte_fastmem_init() failed: %d", ret);
+	rte_fastmem_deinit();
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A second rte_fastmem_init() while the library is already
+ * initialized must be rejected with -EBUSY.
+ */
+static int
+test_init_is_not_idempotent(void)
+{
+	int first, second;
+
+	first = rte_fastmem_init();
+	TEST_ASSERT_EQUAL(first, 0, "rte_fastmem_init() failed: %d", first);
+
+	second = rte_fastmem_init();
+	TEST_ASSERT_EQUAL(second, -EBUSY,
+		"expected -EBUSY on re-init, got %d", second);
+
+	rte_fastmem_deinit();
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Call rte_fastmem_deinit() with no rte_fastmem_init() in this test;
+ * the test passes as long as the call simply returns.
+ */
+static int
+test_deinit_without_init(void)
+{
+	rte_fastmem_deinit();	/* expected to be a harmless no-op */
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * rte_fastmem_max_size() must report at least 1 MiB.
+ */
+static int
+test_max_size(void)
+{
+	const size_t reported = rte_fastmem_max_size();
+
+	TEST_ASSERT(reported >= (1U << 20),
+		"max_size=%zu below required 1 MiB minimum", reported);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Reserving a single byte must create exactly one fastmem memzone,
+ * and rte_fastmem_deinit() must leave no fastmem memzone behind.
+ */
+static int
+test_reserve_small(void)
+{
+	unsigned int baseline, added, leaked;
+	int sid, ret;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	baseline = count_fastmem_memzones();
+
+	/*
+	 * Requests are rounded up to memzone granularity, so even a
+	 * one-byte reserve costs one whole memzone, and only one.
+	 */
+	ret = rte_fastmem_reserve(1, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_reserve() failed: %d", ret);
+
+	added = count_fastmem_memzones() - baseline;
+	TEST_ASSERT_EQUAL(added, 1,
+		"expected 1 new memzone, got %u", added);
+
+	rte_fastmem_deinit();
+
+	/* Nothing carrying the fastmem prefix may outlive deinit. */
+	leaked = count_fastmem_memzones();
+	TEST_ASSERT_EQUAL(leaked, 0,
+		"%u fastmem memzones leaked after deinit", leaked);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A reserve one byte larger than FASTMEM_MEMZONE_SIZE does not fit
+ * in a single memzone and must therefore add two.
+ */
+static int
+test_reserve_multiple_memzones(void)
+{
+	const size_t request = FASTMEM_MEMZONE_SIZE + 1;
+	unsigned int baseline, added;
+	int sid, ret;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	baseline = count_fastmem_memzones();
+
+	ret = rte_fastmem_reserve(request, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_reserve(%zu) failed: %d",
+		request, ret);
+
+	added = count_fastmem_memzones() - baseline;
+	TEST_ASSERT_EQUAL(added, 2,
+		"expected 2 new memzones for %zu-byte reserve, got %u",
+		request, added);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Reserving an amount that is already reserved must not add any
+ * memzone: the memzone count is the same after both calls.
+ */
+static int
+test_reserve_cumulative(void)
+{
+	unsigned int count_first, count_second;
+	int sid, ret;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	ret = rte_fastmem_reserve(FASTMEM_MEMZONE_SIZE, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "first reserve failed: %d", ret);
+	count_first = count_fastmem_memzones();
+
+	/* Same size, same socket: already covered by the first call. */
+	ret = rte_fastmem_reserve(FASTMEM_MEMZONE_SIZE, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "second reserve failed: %d", ret);
+	count_second = count_fastmem_memzones();
+
+	TEST_ASSERT_EQUAL(count_first, count_second,
+		"reserve of already-reserved amount added memzones (%u -> %u)",
+		count_first, count_second);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * rte_fastmem_reserve() must return -EINVAL for a socket id equal to
+ * RTE_MAX_NUMA_NODES and for the negative socket id -2.
+ */
+static int
+test_reserve_invalid_socket(void)
+{
+	int ret;
+
+	/* One past the largest valid socket id. */
+	ret = rte_fastmem_reserve(1, RTE_MAX_NUMA_NODES);
+	TEST_ASSERT_EQUAL(ret, -EINVAL,
+		"expected -EINVAL for out-of-range socket, got %d", ret);
+
+	/* A negative id other than the one used for SOCKET_ID_ANY elsewhere in this file. */
+	ret = rte_fastmem_reserve(1, -2);
+	TEST_ASSERT_EQUAL(ret, -EINVAL,
+		"expected -EINVAL for negative socket, got %d", ret);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * rte_fastmem_reserve() is expected to fail (negative return) when
+ * the library has not been initialized.
+ */
+static int
+test_reserve_without_init(void)
+{
+	const int ret = rte_fastmem_reserve(1, SOCKET_ID_ANY);
+
+	TEST_ASSERT(ret < 0,
+		"expected failure without init, got %d", ret);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A one-byte reserve with SOCKET_ID_ANY must succeed and add exactly
+ * one fastmem memzone.
+ */
+static int
+test_reserve_any_socket(void)
+{
+	unsigned int baseline, added;
+	int ret;
+
+	baseline = count_fastmem_memzones();
+
+	/*
+	 * Per the library's intended behaviour, SOCKET_ID_ANY tries the
+	 * caller's socket first and then the others, so this works on
+	 * any system with at least one configured socket.
+	 */
+	ret = rte_fastmem_reserve(1, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(ret, 0,
+		"rte_fastmem_reserve(SOCKET_ID_ANY) failed: %d", ret);
+
+	added = count_fastmem_memzones() - baseline;
+	TEST_ASSERT_EQUAL(added, 1,
+		"expected 1 new memzone, got %u", added);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Stage 2 tests: allocation and free.
+ */
+
+/*
+ * A request one byte above rte_fastmem_max_size() must return NULL
+ * and set rte_errno to E2BIG.
+ */
+static int
+test_alloc_too_big(void)
+{
+	size_t oversize;
+	void *obj;
+
+	rte_errno = 0;
+	oversize = rte_fastmem_max_size() + 1;
+	obj = rte_fastmem_alloc(oversize, 0, 0);
+
+	TEST_ASSERT_NULL(obj, "alloc above max_size returned non-NULL");
+	TEST_ASSERT_EQUAL(rte_errno, E2BIG,
+		"expected rte_errno=E2BIG, got %d", rte_errno);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * An alignment that is not a power of two (3) must be refused:
+ * NULL is returned and rte_errno is set to EINVAL.
+ */
+static int
+test_alloc_invalid_align(void)
+{
+	void *obj;
+
+	rte_errno = 0;
+	obj = rte_fastmem_alloc(16, 3, 0);
+
+	TEST_ASSERT_NULL(obj, "alloc with align=3 returned non-NULL");
+	TEST_ASSERT_EQUAL(rte_errno, EINVAL,
+		"expected rte_errno=EINVAL, got %d", rte_errno);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Smoke test: allocate 8 bytes, write to all of them, free.
+ */
+static int
+test_alloc_free_small(void)
+{
+	enum { OBJ_SIZE = 8 };
+	void *obj = rte_fastmem_alloc(OBJ_SIZE, 0, 0);
+
+	TEST_ASSERT_NOT_NULL(obj, "alloc(8) failed: rte_errno=%d", rte_errno);
+
+	/* The whole object must be writable. */
+	memset(obj, 0xa5, OBJ_SIZE);
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate one object for each of a range of sizes from 1 byte to
+ * 1 MiB, fill every object completely, then free them all.
+ */
+static int
+test_alloc_free_various_sizes(void)
+{
+	static const size_t sizes[] = {
+		1, 8, 16, 17, 63, 64, 128, 1024, 4096,
+		64 * 1024, 256 * 1024, 1024 * 1024,
+	};
+	void *objs[RTE_DIM(sizes)];
+	unsigned int k;
+
+	/* All objects are live at the same time before any is freed. */
+	for (k = 0; k < RTE_DIM(sizes); k++) {
+		const size_t len = sizes[k];
+
+		objs[k] = rte_fastmem_alloc(len, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[k],
+			"alloc(%zu) failed: rte_errno=%d",
+			len, rte_errno);
+		memset(objs[k], 0x5a, len);
+	}
+
+	for (k = 0; k < RTE_DIM(sizes); k++)
+		rte_fastmem_free(objs[k]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Check that a one-byte allocation honours each requested alignment,
+ * and that align=0 yields a cache-line-aligned pointer.
+ */
+static int
+test_alloc_alignment(void)
+{
+	static const size_t aligns[] = {
+		8, 16, 64, 256, 4096, 65536,
+	};
+	void *obj;
+
+	for (unsigned int k = 0; k < RTE_DIM(aligns); k++) {
+		const size_t align = aligns[k];
+
+		obj = rte_fastmem_alloc(1, align, 0);
+		TEST_ASSERT_NOT_NULL(obj,
+			"alloc(1, align=%zu) failed: rte_errno=%d",
+			align, rte_errno);
+		TEST_ASSERT((uintptr_t)obj % align == 0,
+			"pointer %p not aligned on %zu",
+			obj, align);
+		rte_fastmem_free(obj);
+	}
+
+	/* align=0 selects the default, which must be cache-line aligned. */
+	obj = rte_fastmem_alloc(1, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj,
+		"alloc(1, align=0) failed: rte_errno=%d", rte_errno);
+	TEST_ASSERT((uintptr_t)obj % RTE_CACHE_LINE_SIZE == 0,
+		"default-align pointer %p not cache-line aligned",
+		obj);
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * RTE_FASTMEM_F_ZERO must hand back zeroed memory even when the
+ * object was previously used and left dirty.
+ */
+static int
+test_alloc_zero_flag(void)
+{
+	enum { OBJ_SIZE = 128 };
+	unsigned int off;
+	uint8_t *obj;
+
+	/*
+	 * Prime: take an object without F_ZERO, fill it with 0xff and
+	 * give it back, so that the F_ZERO allocation below is likely
+	 * to be served from memory that is known to be non-zero.
+	 */
+	obj = rte_fastmem_alloc(OBJ_SIZE, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "priming alloc failed");
+	memset(obj, 0xff, OBJ_SIZE);
+	rte_fastmem_free(obj);
+
+	obj = rte_fastmem_alloc(OBJ_SIZE, 0, RTE_FASTMEM_F_ZERO);
+	TEST_ASSERT_NOT_NULL(obj, "F_ZERO alloc failed");
+
+	/* Stop at the first non-zero byte; off == OBJ_SIZE means none. */
+	for (off = 0; off < OBJ_SIZE; off++)
+		if (obj[off] != 0)
+			break;
+	TEST_ASSERT(off == OBJ_SIZE,
+		"F_ZERO returned non-zero byte at offset %u", off);
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * The pointer value of a freed object is compared on purpose below.
+ * -Wuse-after-free only exists from GCC 12 on; naming it in a
+ * diagnostic pragma on an older GCC triggers an "unknown option"
+ * warning (-Wpragmas), so the suppression is limited to GCC >= 12.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 12)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuse-after-free"
+#endif
+/*
+ * Free an object and allocate again in the same size class: the
+ * second allocation must return the address that was just freed.
+ */
+static int
+test_alloc_reuse(void)
+{
+	void *first, *second;
+
+	first = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(first, "first alloc failed");
+	rte_fastmem_free(first);
+
+	second = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(second, "second alloc failed");
+
+	/*
+	 * The slab's free list is LIFO, so the most recently freed
+	 * object is at the head of the list. A subsequent alloc in
+	 * the same class returns it. Only the pointer values are
+	 * compared; 'first' is never dereferenced after the free.
+	 */
+	TEST_ASSERT_EQUAL(first, second,
+		"free + alloc did not reuse: first=%p second=%p",
+		first, second);
+
+	rte_fastmem_free(second);
+
+	return TEST_SUCCESS;
+}
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 12)
+#pragma GCC diagnostic pop
+#endif
+
+/*
+ * Allocate COUNT objects of CLASS_SIZE bytes so that they are all
+ * live at once, then free them. The count is chosen to be more than
+ * fits in a single slab, forcing the bin to pull a second block and
+ * exercising the partial->full transition and the cross-slab
+ * allocation path.
+ *
+ * Returns TEST_SUCCESS when every allocation succeeded. On failure
+ * the objects obtained so far and the pointer array are released
+ * before the assertion fires, so a failing run leaks nothing.
+ */
+static int
+test_alloc_many_in_class(void)
+{
+	enum { CLASS_SIZE = 8, COUNT = 300000 };
+	unsigned int i, n_allocated;
+	int alloc_errno;
+	void **ptrs;
+
+	ptrs = calloc(COUNT, sizeof(*ptrs));
+	TEST_ASSERT_NOT_NULL(ptrs, "calloc for test ptrs failed");
+
+	for (i = 0; i < COUNT; i++) {
+		ptrs[i] = rte_fastmem_alloc(CLASS_SIZE, 0, 0);
+		if (ptrs[i] == NULL)
+			break;
+	}
+	n_allocated = i;
+	/* Capture before the frees below get a chance to change it. */
+	alloc_errno = rte_errno;
+
+	for (i = 0; i < n_allocated; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	free(ptrs);
+
+	/* Assert only after cleanup: the macro returns on failure. */
+	TEST_ASSERT(n_allocated == COUNT,
+		"alloc[%u] failed: rte_errno=%d",
+		n_allocated, alloc_errno);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate 64 bytes on the first available socket through
+ * rte_fastmem_alloc_socket() and free the object again.
+ */
+static int
+test_alloc_socket(void)
+{
+	const int sid = rte_socket_id_by_idx(0);
+	void *obj;
+
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	obj = rte_fastmem_alloc_socket(64, 0, 0, sid);
+	TEST_ASSERT_NOT_NULL(obj,
+		"alloc_socket(%d) failed: rte_errno=%d",
+		sid, rte_errno);
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate and free an 8-byte object, then allocate and free a
+ * 256 KiB object. The intent is that the block first assigned to
+ * the small class goes back to the free-block pool and can be
+ * reused by a different class; only success of both allocations is
+ * asserted here.
+ */
+static int
+test_alloc_block_repurposing(void)
+{
+	void *obj;
+
+	obj = rte_fastmem_alloc(8, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "small alloc failed");
+	rte_fastmem_free(obj);
+
+	obj = rte_fastmem_alloc(256 * 1024, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "large alloc failed");
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Stricter variant of test_alloc_block_repurposing: bytes_backing,
+ * as reported by rte_fastmem_stats(), must be zero before the first
+ * allocation, non-zero after it, and unchanged by a following
+ * allocation in a different size class. The free-block pool is
+ * meant to be shared by all classes, so the block released by the
+ * small class should serve the large request without any new
+ * backing memory being added.
+ */
+static int
+test_alloc_block_repurposing_no_growth(void)
+{
+	struct rte_fastmem_stats st;
+	uint64_t backing_after_small;
+	void *small_obj, *large_obj;
+	int ret;
+
+	/* Starting point: no backing memory at all. */
+	ret = rte_fastmem_stats(&st);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_stats() failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.bytes_backing, (uint64_t)0,
+		"unexpected pre-alloc bytes_backing: %" PRIu64,
+		st.bytes_backing);
+
+	small_obj = rte_fastmem_alloc(8, 0, 0);
+	TEST_ASSERT_NOT_NULL(small_obj, "small alloc failed");
+
+	ret = rte_fastmem_stats(&st);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_stats() failed: %d", ret);
+	TEST_ASSERT(st.bytes_backing > 0,
+		"bytes_backing did not grow on first alloc");
+	backing_after_small = st.bytes_backing;
+
+	/* Free and flush so the object does not linger in the cache. */
+	rte_fastmem_free(small_obj);
+	rte_fastmem_cache_flush();
+
+	large_obj = rte_fastmem_alloc(256 * 1024, 0, 0);
+	TEST_ASSERT_NOT_NULL(large_obj,
+		"large alloc failed: rte_errno=%d", rte_errno);
+
+	ret = rte_fastmem_stats(&st);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_fastmem_stats() failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.bytes_backing, backing_after_small,
+		"cross-class alloc grew backing memory from %" PRIu64
+		" to %" PRIu64,
+		backing_after_small, st.bytes_backing);
+
+	rte_fastmem_free(large_obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * rte_fastmem_free(NULL) must simply return.
+ */
+static int
+test_free_null(void)
+{
+	rte_fastmem_free(NULL);	/* expected to be a harmless no-op */
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate N objects, fill object k entirely with the byte value k,
+ * and only then read everything back. A mismatch means that writes
+ * to one object reached another one, or that allocator metadata
+ * handling disturbed object contents.
+ */
+static int
+test_alloc_content_integrity(void)
+{
+	enum { N = 256, SIZE = 128 };
+	uint8_t *objs[N];
+	unsigned int k, off;
+
+	/* Fill phase: every object gets its own index as the pattern. */
+	for (k = 0; k < N; k++) {
+		objs[k] = rte_fastmem_alloc(SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[k], "alloc[%u] failed", k);
+		memset(objs[k], (int)k, SIZE);
+	}
+
+	/* Verify phase: runs only after all objects have been written. */
+	for (k = 0; k < N; k++) {
+		for (off = 0; off < SIZE; off++) {
+			TEST_ASSERT_EQUAL(objs[k][off], (uint8_t)k,
+				"corruption at ptrs[%u][%u]: got 0x%x, want 0x%x",
+				k, off, objs[k][off], (uint8_t)k);
+		}
+	}
+
+	for (k = 0; k < N; k++)
+		rte_fastmem_free(objs[k]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A one-byte request whose alignment is twice rte_fastmem_max_size()
+ * must fail with rte_errno set to E2BIG: no size class is large
+ * enough to satisfy such an alignment.
+ */
+static int
+test_alloc_align_too_big(void)
+{
+	size_t huge_align;
+	void *obj;
+
+	rte_errno = 0;
+	huge_align = rte_fastmem_max_size() * 2;
+	obj = rte_fastmem_alloc(1, huge_align, 0);
+
+	TEST_ASSERT_NULL(obj,
+		"alloc with align>max_size returned non-NULL");
+	TEST_ASSERT_EQUAL(rte_errno, E2BIG,
+		"expected rte_errno=E2BIG, got %d", rte_errno);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * An alignment of 1 is a power of two and must be accepted.
+ */
+static int
+test_alloc_align_one(void)
+{
+	void *obj = rte_fastmem_alloc(8, 1, 0);
+
+	TEST_ASSERT_NOT_NULL(obj, "alloc(8, 1) failed: rte_errno=%d",
+		rte_errno);
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate on the first available socket and, when the memseg that
+ * holds the object can be looked up, check that the memseg belongs
+ * to that socket.
+ */
+static int
+test_alloc_socket_numa_placement(void)
+{
+	struct rte_memseg *seg;
+	int sid;
+	void *obj;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no available sockets");
+
+	obj = rte_fastmem_alloc_socket(64, 0, 0, sid);
+	TEST_ASSERT_NOT_NULL(obj,
+		"alloc_socket(%d) failed: rte_errno=%d",
+		sid, rte_errno);
+
+	/*
+	 * The lookup is allowed to fail (for instance in --no-huge
+	 * mode, where the memory may not be visible to
+	 * rte_mem_virt2memseg()); in that case the placement check is
+	 * silently skipped.
+	 */
+	seg = rte_mem_virt2memseg(obj, NULL);
+	if (seg != NULL)
+		TEST_ASSERT_EQUAL(seg->socket_id, sid,
+			"alloc on socket %d landed on socket %d",
+			sid, seg->socket_id);
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Cross-socket teardown test. An object is allocated on a socket
+ * other than the caller's and freed again; then the library is
+ * deinitialized and re-initialized. This targets the deinit path
+ * for a cache whose backing memory is on a different socket than
+ * the one it serves. Skipped unless a second socket is available.
+ */
+static int
+test_alloc_cross_socket_deinit(void)
+{
+	unsigned int idx, socket_count;
+	int here, elsewhere = -1;
+	int ret;
+	void *obj;
+
+	/* Fall back to the first socket if the caller has no usable id. */
+	here = (int)rte_socket_id();
+	if (here < 0 || (unsigned int)here >= RTE_MAX_NUMA_NODES)
+		here = rte_socket_id_by_idx(0);
+
+	socket_count = rte_socket_count();
+	if (socket_count < 2)
+		return TEST_SKIPPED;
+
+	/* Take the first valid socket that is not the local one. */
+	for (idx = 0; idx < socket_count && elsewhere < 0; idx++) {
+		const int candidate = rte_socket_id_by_idx(idx);
+
+		if (candidate >= 0 && candidate != here)
+			elsewhere = candidate;
+	}
+	if (elsewhere < 0)
+		return TEST_SKIPPED;
+
+	obj = rte_fastmem_alloc_socket(64, 0, 0, elsewhere);
+	TEST_ASSERT_NOT_NULL(obj,
+		"cross-socket alloc(socket %d) failed: rte_errno=%d",
+		elsewhere, rte_errno);
+
+	rte_fastmem_free(obj);
+
+	/* Tear down with the cross-socket cache in place, then re-init. */
+	rte_fastmem_deinit();
+
+	ret = rte_fastmem_init();
+	TEST_ASSERT_EQUAL(ret, 0,
+		"re-init after cross-socket deinit failed");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Stage 3 tests: per-lcore caches.
+ */
+
+/*
+ * rte_fastmem_cache_flush() must be callable after an alloc/free
+ * pair, must tolerate being called twice in a row, and must leave
+ * the allocator usable. Whether the post-flush allocation returns
+ * the same address as before is deliberately not checked.
+ */
+static int
+test_cache_flush(void)
+{
+	void *obj;
+
+	/* Alloc and free once; the object is expected to sit in the cache. */
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "first alloc failed");
+	rte_fastmem_free(obj);
+
+	rte_fastmem_cache_flush();
+	/* A second flush with nothing new to drain must also be fine. */
+	rte_fastmem_cache_flush();
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "post-flush alloc failed");
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Call rte_fastmem_cache_flush() with no rte_fastmem_init() in this
+ * test; the test passes as long as the call simply returns.
+ */
+static int
+test_cache_flush_without_init(void)
+{
+	rte_fastmem_cache_flush();	/* expected to be a harmless no-op */
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate and free COUNT objects of one size class, twice. COUNT
+ * is chosen above the per-class cache capacity (stated by the test
+ * author as 64 for classes <= 4 KiB), so the frees overflow the
+ * cache and take the drain slow path. The second round checks that
+ * the allocator still works afterwards.
+ */
+static int
+test_cache_exceeds_capacity(void)
+{
+	enum { COUNT = 200, SIZE = 64 };
+	void *objs[COUNT];
+	unsigned int k;
+
+	/* Round one. */
+	for (k = 0; k < COUNT; k++) {
+		objs[k] = rte_fastmem_alloc(SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[k],
+			"alloc[%u] failed: rte_errno=%d", k, rte_errno);
+	}
+	for (k = 0; k < COUNT; k++)
+		rte_fastmem_free(objs[k]);
+
+	/* Round two: the same number of objects must be obtainable again. */
+	for (k = 0; k < COUNT; k++) {
+		objs[k] = rte_fastmem_alloc(SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[k],
+			"re-alloc[%u] failed: rte_errno=%d", k, rte_errno);
+	}
+	for (k = 0; k < COUNT; k++)
+		rte_fastmem_free(objs[k]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Argument block handed to the worker threads started by the
+ * non-EAL tests; the worker reports its outcome through it.
+ */
+struct non_eal_args {
+	/* Zero-initialized by the tests; set to 1 by the worker on success. */
+	int ok;
+	/*
+	 * NOTE(review): never referenced in the visible code; presumably
+	 * padding to keep 'ok' away from neighbouring data -- confirm.
+	 */
+	char pad[64];
+};
+
+/*
+ * Thread body for test_non_eal_thread(): allocate 128 bytes, write
+ * to all of them and free the object. On success sets args->ok to 1
+ * and returns 0; returns 1, leaving args->ok untouched, when the
+ * allocation fails.
+ */
+static uint32_t
+non_eal_thread_main(void *arg)
+{
+	enum { OBJ_SIZE = 128 };
+	struct non_eal_args *result = arg;
+	uint8_t *obj = rte_fastmem_alloc(OBJ_SIZE, 0, 0);
+
+	if (obj != NULL) {
+		memset(obj, 0x7e, OBJ_SIZE);
+		rte_fastmem_free(obj);
+
+		result->ok = 1;
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Run an alloc/write/free sequence in a thread created with
+ * rte_thread_create() and check that it reported success.
+ */
+static int
+test_non_eal_thread(void)
+{
+	struct non_eal_args worker_args = { 0 };
+	rte_thread_t worker;
+	int ret;
+
+	ret = rte_thread_create(&worker, NULL, non_eal_thread_main,
+		&worker_args);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_thread_create() failed: %d", ret);
+
+	ret = rte_thread_join(worker, NULL);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_thread_join() failed: %d", ret);
+
+	/* The worker sets ok only after a complete alloc/free. */
+	TEST_ASSERT_EQUAL(worker_args.ok, 1,
+		"non-EAL thread did not complete alloc/free successfully");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate and free N objects of one size class, flush the cache,
+ * then allocate a 64 KiB object from a different class.
+ *
+ * Intent (per the test author): once all objects of a slab are free
+ * its block goes back to the free-block pool and may be reassigned
+ * to another class; the flush drains cached objects to their bins
+ * so the cache cannot pin those blocks. Only the success of the
+ * final allocation is asserted.
+ */
+static int
+test_cache_flush_returns_memory(void)
+{
+	enum { N = 200, SIZE = 64 };
+	void *objs[N];
+	void *other;
+	unsigned int k;
+
+	for (k = 0; k < N; k++) {
+		objs[k] = rte_fastmem_alloc(SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[k], "alloc[%u] failed", k);
+	}
+	for (k = 0; k < N; k++)
+		rte_fastmem_free(objs[k]);
+
+	rte_fastmem_cache_flush();
+
+	/* A different class should now be able to obtain memory. */
+	other = rte_fastmem_alloc(65536, 0, 0);
+	TEST_ASSERT_NOT_NULL(other,
+		"post-flush cross-class alloc failed");
+	rte_fastmem_free(other);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A bulk allocation of N objects must return 0 and fill the array
+ * with N non-NULL, pairwise distinct pointers.
+ */
+static int
+test_alloc_bulk_basic(void)
+{
+	enum { N = 32 };
+	void *objs[N];
+	unsigned int cur, prev;
+	int ret;
+
+	ret = rte_fastmem_alloc_bulk(objs, N, 64, 0, 0);
+	TEST_ASSERT_EQUAL(ret, 0, "alloc_bulk failed: %d", ret);
+
+	/* Compare each entry against all entries before it. */
+	for (cur = 0; cur < N; cur++) {
+		TEST_ASSERT_NOT_NULL(objs[cur], "ptrs[%u] is NULL", cur);
+		for (prev = 0; prev < cur; prev++) {
+			TEST_ASSERT(objs[cur] != objs[prev],
+				"ptrs[%u] == ptrs[%u]", cur, prev);
+		}
+	}
+
+	rte_fastmem_free_bulk(objs, N);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Every byte of every object returned by a bulk allocation with
+ * RTE_FASTMEM_F_ZERO must be zero.
+ */
+static int
+test_alloc_bulk_zero_flag(void)
+{
+	enum { N = 8, SIZE = 128 };
+	void *objs[N];
+	unsigned int k, off;
+	int ret;
+
+	ret = rte_fastmem_alloc_bulk(objs, N, SIZE, 0, RTE_FASTMEM_F_ZERO);
+	TEST_ASSERT_EQUAL(ret, 0, "alloc_bulk failed: %d", ret);
+
+	for (k = 0; k < N; k++) {
+		const uint8_t *bytes = objs[k];
+
+		for (off = 0; off < SIZE; off++) {
+			TEST_ASSERT_EQUAL(bytes[off], 0,
+				"ptrs[%u][%u] != 0", k, off);
+		}
+	}
+
+	rte_fastmem_free_bulk(objs, N);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Request more objects in one bulk call than the per-class cache
+ * holds (capacity stated by the test author as 64), then free them
+ * in bulk. Passes when the bulk allocation returns 0.
+ */
+static int
+test_alloc_bulk_exceeds_cache(void)
+{
+	enum { N = 128 };
+	void *ptrs[N];
+	int rc;
+
+	rc = rte_fastmem_alloc_bulk(ptrs, N, 64, 0, 0);
+	/* N is an int-typed enum constant; cast it to match %u. */
+	TEST_ASSERT_EQUAL(rc, 0, "alloc_bulk(%u) failed: %d",
+		(unsigned int)N, rc);
+
+	rte_fastmem_free_bulk(ptrs, N);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Bulk-allocate N objects on an explicit socket and then again with
+ * SOCKET_ID_ANY; both calls must return 0.
+ */
+static int
+test_alloc_bulk_socket(void)
+{
+	enum { N = 16 };
+	void *objs[N];
+	int sid, ret;
+
+	sid = rte_socket_id_by_idx(0);
+	TEST_ASSERT(sid >= 0, "no sockets");
+
+	/* Explicit socket. */
+	ret = rte_fastmem_alloc_bulk_socket(objs, N, 64, 0, 0, sid);
+	TEST_ASSERT_EQUAL(ret, 0, "alloc_bulk_socket failed: %d", ret);
+	rte_fastmem_free_bulk(objs, N);
+
+	/* Let the allocator choose the socket. */
+	ret = rte_fastmem_alloc_bulk_socket(objs, N, 64, 0, 0, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(ret, 0, "alloc_bulk_socket(ANY) failed: %d", ret);
+	rte_fastmem_free_bulk(objs, N);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Objects obtained one at a time must be releasable with a single
+ * rte_fastmem_free_bulk() call, after which the same number of
+ * objects can be allocated again.
+ */
+static int
+test_free_bulk(void)
+{
+	enum { N = 64 };
+	void *objs[N];
+	unsigned int k;
+
+	/* First pass: single allocs, bulk free. */
+	for (k = 0; k < N; k++) {
+		objs[k] = rte_fastmem_alloc(64, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[k], "alloc[%u] failed", k);
+	}
+	rte_fastmem_free_bulk(objs, N);
+
+	/* Second pass: the memory must be available for reuse. */
+	for (k = 0; k < N; k++) {
+		objs[k] = rte_fastmem_alloc(64, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[k], "re-alloc[%u] failed", k);
+	}
+	rte_fastmem_free_bulk(objs, N);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * rte_fastmem_classes() must report 18 size classes, both with a
+ * NULL and with a real output array. The classes must start at 8
+ * bytes, end at 1 MiB, all be powers of two and be strictly
+ * ascending.
+ */
+static int
+test_classes(void)
+{
+	size_t class_sizes[32];
+	unsigned int count, k;
+
+	/* Query the count only. */
+	count = rte_fastmem_classes(NULL);
+	TEST_ASSERT_EQUAL(count, 18u, "expected 18 classes, got %u", count);
+
+	/* Query count and sizes. */
+	count = rte_fastmem_classes(class_sizes);
+	TEST_ASSERT_EQUAL(count, 18u, "expected 18 classes, got %u", count);
+	TEST_ASSERT_EQUAL(class_sizes[0], (size_t)8, "class 0 != 8");
+	TEST_ASSERT_EQUAL(class_sizes[count - 1], (size_t)(1 << 20),
+		"last class != 1 MiB");
+
+	for (k = 0; k < count; k++) {
+		const size_t sz = class_sizes[k];
+
+		/* Non-zero with a single bit set. */
+		TEST_ASSERT(sz != 0 && (sz & (sz - 1)) == 0,
+			"class %u size %zu not power of 2", k, sz);
+		if (k > 0) {
+			TEST_ASSERT(sz > class_sizes[k - 1],
+				"classes not ascending at %u", k);
+		}
+	}
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Per-class statistics for the 64-byte class: after N allocations
+ * hits + misses must equal N and in_use must be N; after freeing
+ * them in_use must be 0. A size that is not a class size (13) must
+ * be rejected with -EINVAL.
+ */
+static int
+test_stats_class(void)
+{
+	enum { N = 10 };
+	struct rte_fastmem_class_stats st;
+	void *objs[N];
+	unsigned int k;
+	int ret;
+
+	for (k = 0; k < N; k++) {
+		objs[k] = rte_fastmem_alloc(64, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[k], "alloc[%u] failed", k);
+	}
+
+	/* Snapshot with all N objects live. */
+	ret = rte_fastmem_stats_class(64, &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_class failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.class_size, (size_t)64, "wrong class_size");
+	TEST_ASSERT(st.alloc_cache_hits + st.alloc_cache_misses == N,
+		"alloc count != N: hits=%" PRIu64 " misses=%" PRIu64,
+		st.alloc_cache_hits, st.alloc_cache_misses);
+	TEST_ASSERT_EQUAL(st.in_use, (uint64_t)N, "in_use != N");
+
+	for (k = 0; k < N; k++)
+		rte_fastmem_free(objs[k]);
+
+	/* Snapshot with everything returned. */
+	ret = rte_fastmem_stats_class(64, &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_class after free failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.in_use, (uint64_t)0, "in_use != 0 after free");
+
+	/* 13 is not a class size. */
+	ret = rte_fastmem_stats_class(13, &st);
+	TEST_ASSERT_EQUAL(ret, -EINVAL, "expected -EINVAL for bad size");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Per-lcore statistics of the calling lcore must show allocation
+ * activity after an alloc and free activity after a free. Asking
+ * for lcore id RTE_MAX_LCORE must fail with -EINVAL.
+ */
+static int
+test_stats_lcore(void)
+{
+	struct rte_fastmem_lcore_stats st;
+	void *obj;
+	int ret;
+
+	obj = rte_fastmem_alloc(128, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+
+	ret = rte_fastmem_stats_lcore(rte_lcore_id(), &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore failed: %d", ret);
+	TEST_ASSERT(st.alloc_cache_hits + st.alloc_cache_misses > 0,
+		"no alloc activity on this lcore");
+
+	rte_fastmem_free(obj);
+
+	ret = rte_fastmem_stats_lcore(rte_lcore_id(), &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore after free failed: %d", ret);
+	TEST_ASSERT(st.free_cache_hits + st.free_cache_misses > 0,
+		"no free activity on this lcore");
+
+	/* One past the largest valid lcore id. */
+	ret = rte_fastmem_stats_lcore(RTE_MAX_LCORE, &st);
+	TEST_ASSERT_EQUAL(ret, -EINVAL, "expected -EINVAL for bad lcore");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * The per-lcore, per-class statistics of the calling lcore for the
+ * 256-byte class must report class_size 256 and some allocation
+ * activity after one 256-byte allocation.
+ */
+static int
+test_stats_lcore_class(void)
+{
+	struct rte_fastmem_lcore_class_stats st;
+	void *obj;
+	int ret;
+
+	obj = rte_fastmem_alloc(256, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+
+	ret = rte_fastmem_stats_lcore_class(rte_lcore_id(), 256, &st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore_class failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.class_size, (size_t)256, "wrong class_size");
+	TEST_ASSERT(st.alloc_cache_hits + st.alloc_cache_misses > 0,
+		"no alloc activity");
+
+	rte_fastmem_free(obj);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * After rte_fastmem_stats_reset() the global alloc_total and
+ * free_total counters must both read zero, even though an alloc and
+ * a free happened just before the reset.
+ */
+static int
+test_stats_reset(void)
+{
+	struct rte_fastmem_stats st;
+	void *obj;
+	int ret;
+
+	/* Generate some counter activity to be wiped. */
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+	rte_fastmem_free(obj);
+
+	rte_fastmem_stats_reset();
+
+	ret = rte_fastmem_stats(&st);
+	TEST_ASSERT_EQUAL(ret, 0, "stats failed: %d", ret);
+	TEST_ASSERT_EQUAL(st.alloc_total, (uint64_t)0,
+		"alloc_total not zero after reset");
+	TEST_ASSERT_EQUAL(st.free_total, (uint64_t)0,
+		"free_total not zero after reset");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Statistics must not be lost when the cache is flushed. The
+ * counters are kept apart from the per-lcore cache structures
+ * (which a flush releases), so the per-class and per-lcore numbers
+ * read before and after rte_fastmem_cache_flush() must agree.
+ */
+static int
+test_stats_survive_cache_flush(void)
+{
+	enum { N = 10 };
+	struct rte_fastmem_class_stats cls_pre, cls_post;
+	struct rte_fastmem_lcore_stats lc_pre, lc_post;
+	void *objs[N];
+	unsigned int k;
+	int ret;
+
+	/* Produce N allocs and N frees in the 64-byte class. */
+	for (k = 0; k < N; k++) {
+		objs[k] = rte_fastmem_alloc(64, 0, 0);
+		TEST_ASSERT_NOT_NULL(objs[k], "alloc[%u] failed", k);
+	}
+	for (k = 0; k < N; k++)
+		rte_fastmem_free(objs[k]);
+
+	/* Snapshot before the flush. */
+	ret = rte_fastmem_stats_class(64, &cls_pre);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_class failed: %d", ret);
+	ret = rte_fastmem_stats_lcore(rte_lcore_id(), &lc_pre);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore failed: %d", ret);
+
+	TEST_ASSERT(cls_pre.alloc_cache_hits + cls_pre.alloc_cache_misses == N,
+		"expected %d allocs before flush", N);
+
+	rte_fastmem_cache_flush();
+
+	/* Snapshot after the flush. */
+	ret = rte_fastmem_stats_class(64, &cls_post);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_class after flush failed: %d", ret);
+	ret = rte_fastmem_stats_lcore(rte_lcore_id(), &lc_post);
+	TEST_ASSERT_EQUAL(ret, 0, "stats_lcore after flush failed: %d", ret);
+
+	TEST_ASSERT_EQUAL(cls_post.alloc_cache_hits, cls_pre.alloc_cache_hits,
+		"alloc_cache_hits lost across flush: %" PRIu64 " -> %" PRIu64,
+		cls_pre.alloc_cache_hits, cls_post.alloc_cache_hits);
+	TEST_ASSERT_EQUAL(cls_post.alloc_cache_misses,
+		cls_pre.alloc_cache_misses,
+		"alloc_cache_misses lost across flush: %" PRIu64 " -> %" PRIu64,
+		cls_pre.alloc_cache_misses, cls_post.alloc_cache_misses);
+	TEST_ASSERT_EQUAL(cls_post.free_cache_hits, cls_pre.free_cache_hits,
+		"free_cache_hits lost across flush: %" PRIu64 " -> %" PRIu64,
+		cls_pre.free_cache_hits, cls_post.free_cache_hits);
+	TEST_ASSERT_EQUAL(lc_post.alloc_cache_hits + lc_post.alloc_cache_misses,
+		lc_pre.alloc_cache_hits + lc_pre.alloc_cache_misses,
+		"per-lcore alloc counters lost across flush");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Thread body shared by the non-EAL statistics tests: allocate
+ * eight 64-byte objects, then free them all. A non-EAL thread's
+ * traffic cannot be attributed to an lcore, yet it must still show
+ * up in the global and per-class statistics. On success sets
+ * args->ok to 1 and returns 0; returns 1 as soon as an allocation
+ * fails.
+ */
+static uint32_t
+stats_non_eal_main(void *arg)
+{
+	struct non_eal_args *result = arg;
+	void *objs[8];
+	unsigned int k;
+
+	for (k = 0; k < RTE_DIM(objs); k++) {
+		objs[k] = rte_fastmem_alloc(64, 0, 0);
+		if (objs[k] == NULL)
+			return 1;
+	}
+
+	for (k = 0; k < RTE_DIM(objs); k++)
+		rte_fastmem_free(objs[k]);
+
+	result->ok = 1;
+
+	return 0;
+}
+
+/*
+ * The allocations and frees done by stats_non_eal_main() in a
+ * non-EAL thread must be visible in the global alloc_total and
+ * free_total counters.
+ */
+static int
+test_stats_count_non_eal(void)
+{
+	/* Must equal the number of objects stats_non_eal_main() handles. */
+	enum { N = 8 };
+	struct rte_fastmem_stats pre, post;
+	struct non_eal_args worker_args = { 0 };
+	rte_thread_t worker;
+	int ret;
+
+	rte_fastmem_stats_reset();
+
+	ret = rte_fastmem_stats(&pre);
+	TEST_ASSERT_EQUAL(ret, 0, "stats failed: %d", ret);
+
+	/* Run the worker to completion. */
+	ret = rte_thread_create(&worker, NULL, stats_non_eal_main,
+		&worker_args);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_thread_create() failed: %d", ret);
+	ret = rte_thread_join(worker, NULL);
+	TEST_ASSERT_EQUAL(ret, 0, "rte_thread_join() failed: %d", ret);
+	TEST_ASSERT_EQUAL(worker_args.ok, 1,
+		"non-EAL thread alloc/free failed");
+
+	ret = rte_fastmem_stats(&post);
+	TEST_ASSERT_EQUAL(ret, 0, "stats failed: %d", ret);
+
+	TEST_ASSERT_EQUAL(post.alloc_total - pre.alloc_total, (uint64_t)N,
+		"non-EAL allocs not counted globally: delta=%" PRIu64,
+		post.alloc_total - pre.alloc_total);
+	TEST_ASSERT_EQUAL(post.free_total - pre.free_total, (uint64_t)N,
+		"non-EAL frees not counted globally: delta=%" PRIu64,
+		post.free_total - pre.free_total);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A thread created with rte_thread_create() has no lcore id, so its
+ * traffic is expected to be reported by rte_fastmem_stats_shared()
+ * and rte_fastmem_stats_shared_class(), and not by the per-lcore
+ * statistics of the lcore running the test.
+ */
+static int
+test_stats_shared_non_eal(void)
+{
+	/* Number of objects stats_non_eal_main() allocates and frees. */
+	enum { N = 8 };
+	struct rte_fastmem_lcore_class_stats class_stats;
+	struct rte_fastmem_lcore_stats shared_stats;
+	struct rte_fastmem_lcore_stats lcore_stats;
+	struct non_eal_args args = { 0 };
+	rte_thread_t thread_id;
+	uint64_t n_allocs, n_frees;
+	int rc;
+
+	rte_fastmem_stats_reset();
+
+	rc = rte_thread_create(&thread_id, NULL, stats_non_eal_main, &args);
+	TEST_ASSERT_EQUAL(rc, 0, "rte_thread_create() failed: %d", rc);
+	rc = rte_thread_join(thread_id, NULL);
+	TEST_ASSERT_EQUAL(rc, 0, "rte_thread_join() failed: %d", rc);
+	TEST_ASSERT_EQUAL(args.ok, 1, "non-EAL thread alloc/free failed");
+
+	/* Aggregate shared-cache counters. */
+	rc = rte_fastmem_stats_shared(&shared_stats);
+	TEST_ASSERT_EQUAL(rc, 0, "stats_shared failed: %d", rc);
+	n_allocs = shared_stats.alloc_cache_hits +
+		shared_stats.alloc_cache_misses;
+	TEST_ASSERT_EQUAL(n_allocs, (uint64_t)N,
+		"shared allocs not counted: %" PRIu64, n_allocs);
+	n_frees = shared_stats.free_cache_hits +
+		shared_stats.free_cache_misses;
+	TEST_ASSERT_EQUAL(n_frees, (uint64_t)N,
+		"shared frees not counted: %" PRIu64, n_frees);
+
+	/* Per-class view; stats_non_eal_main() allocates 64-byte objects. */
+	rc = rte_fastmem_stats_shared_class(64, &class_stats);
+	TEST_ASSERT_EQUAL(rc, 0, "stats_shared_class failed: %d", rc);
+	TEST_ASSERT_EQUAL(class_stats.class_size, (size_t)64,
+		"wrong class_size");
+	n_allocs = class_stats.alloc_cache_hits +
+		class_stats.alloc_cache_misses;
+	TEST_ASSERT_EQUAL(n_allocs, (uint64_t)N,
+		"shared class allocs not counted: %" PRIu64, n_allocs);
+
+	/* None of that traffic may show up on the calling lcore. */
+	rc = rte_fastmem_stats_lcore(rte_lcore_id(), &lcore_stats);
+	TEST_ASSERT_EQUAL(rc, 0, "stats_lcore failed: %d", rc);
+	n_allocs = lcore_stats.alloc_cache_hits +
+		lcore_stats.alloc_cache_misses;
+	TEST_ASSERT_EQUAL(n_allocs, (uint64_t)0,
+		"shared traffic leaked into lcore stats");
+
+	/* Invalid arguments are rejected with -EINVAL. */
+	rc = rte_fastmem_stats_shared(NULL);
+	TEST_ASSERT_EQUAL(rc, -EINVAL, "expected -EINVAL for NULL stats");
+	rc = rte_fastmem_stats_shared_class(13, &class_stats);
+	TEST_ASSERT_EQUAL(rc, -EINVAL, "expected -EINVAL for bad size");
+
+	return TEST_SUCCESS;
+}
+
+
+/* Objects each mixed_worker() keeps allocated for its whole run. */
+#define MIXED_LONG_LIVED_COUNT 25
+/* Allocate/verify/free cycles each mixed_worker() performs. */
+#define MIXED_SHORT_LIVED_ITERS 1000
+/* Fewest worker lcores for which the mixed-lifetime test is run. */
+#define MIXED_MIN_LCORES 3
+
+/* Request sizes, used round-robin, for the long-lived objects. */
+static const size_t mixed_long_sizes[] = { 64, 256, 4096 };
+/* Request sizes, picked pseudo-randomly, for the short-lived objects. */
+static const size_t mixed_short_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024 };
+
+/* Per-lcore argument block for mixed_worker(). */
+struct mixed_worker_args {
+	uint32_t seed;	/* in: initial xorshift32() state */
+	int result;	/* out: TEST_SUCCESS or TEST_FAILED */
+};
+
+/*
+ * Advance a 32-bit xorshift generator (shift triple 13, 17, 5) by one
+ * step. The new value is stored back in *state and also returned. A
+ * state of zero maps to zero.
+ */
+static uint32_t
+xorshift32(uint32_t *state)
+{
+	uint32_t v = *state;
+
+	v = v ^ (v << 13);
+	v = v ^ (v >> 17);
+	v = v ^ (v << 5);
+
+	return *state = v;
+}
+
+/*
+ * Per-lcore body of the mixed-lifetime test.
+ *
+ * Allocates MIXED_LONG_LIVED_COUNT objects and fills each with its own
+ * byte pattern, then runs MIXED_SHORT_LIVED_ITERS allocate/fill/
+ * verify/free cycles with pseudo-randomly chosen sizes, and finally
+ * checks that the long-lived objects still hold their patterns.
+ *
+ * arg points to a struct mixed_worker_args; seed is read and result is
+ * set to TEST_SUCCESS or TEST_FAILED. Returns 0 on success, -1 on
+ * failure. Every object that was obtained is freed on all paths.
+ */
+static int
+mixed_worker(void *arg)
+{
+	struct mixed_worker_args *args = arg;
+	uint32_t seed = args->seed;
+	void *long_lived[MIXED_LONG_LIVED_COUNT];
+	size_t long_sizes[MIXED_LONG_LIVED_COUNT];
+	unsigned int n_long;
+	unsigned int i;
+	int result = TEST_FAILED;
+
+	/* Allocate long-lived objects of mixed sizes. */
+	for (n_long = 0; n_long < MIXED_LONG_LIVED_COUNT; n_long++) {
+		long_sizes[n_long] =
+			mixed_long_sizes[n_long % RTE_DIM(mixed_long_sizes)];
+		long_lived[n_long] = rte_fastmem_alloc(long_sizes[n_long], 0, 0);
+		if (long_lived[n_long] == NULL)
+			goto out;
+		memset(long_lived[n_long], (int)(n_long + 1),
+		       long_sizes[n_long]);
+	}
+
+	/* Rapidly cycle short-lived objects. */
+	for (i = 0; i < MIXED_SHORT_LIVED_ITERS; i++) {
+		size_t sz = mixed_short_sizes[xorshift32(&seed) %
+					      RTE_DIM(mixed_short_sizes)];
+		uint8_t pattern = (uint8_t)(i & 0xff);
+		int intact = 1;
+		uint8_t *p;
+
+		p = rte_fastmem_alloc(sz, 0, 0);
+		if (p == NULL)
+			goto out;
+		memset(p, pattern, sz);
+
+		/* Verify before freeing. */
+		for (size_t j = 0; j < sz; j++) {
+			if (p[j] != pattern) {
+				intact = 0;
+				break;
+			}
+		}
+
+		/* The object goes back even when the check failed. */
+		rte_fastmem_free(p);
+		if (!intact)
+			goto out;
+	}
+
+	/* Verify long-lived objects are still intact. */
+	for (i = 0; i < n_long; i++) {
+		const uint8_t *bytes = long_lived[i];
+		const uint8_t expected = (uint8_t)(i + 1);
+
+		for (size_t j = 0; j < long_sizes[i]; j++) {
+			if (bytes[j] != expected)
+				goto out;
+		}
+	}
+
+	result = TEST_SUCCESS;
+out:
+	/* n_long counts the long-lived objects actually allocated. */
+	for (i = 0; i < n_long; i++)
+		rte_fastmem_free(long_lived[i]);
+
+	args->result = result;
+	return result == TEST_SUCCESS ? 0 : -1;
+}
+
+/*
+ * Run mixed_worker() concurrently on every worker lcore, each with its
+ * own seed, then check that all workers succeeded and that the global
+ * bytes_in_use counter is back at zero. Skipped when fewer than
+ * MIXED_MIN_LCORES worker lcores are available.
+ */
+static int
+test_mixed_lifetimes_multi_lcore(void)
+{
+	struct mixed_worker_args args[RTE_MAX_LCORE];
+	struct rte_fastmem_stats stats;
+	unsigned int lcore_id;
+	unsigned int count = 0;
+	uint32_t seed = 0xdeadbeef;
+	int rc;
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id)
+		count++;
+
+	if (count < MIXED_MIN_LCORES) {
+		/* MIXED_MIN_LCORES is an int constant; cast to match %u. */
+		printf("Not enough worker lcores (%u < %u), skipping\n",
+		       count, (unsigned int)MIXED_MIN_LCORES);
+		return TEST_SKIPPED;
+	}
+
+	/*
+	 * Launch workers with distinct seeds. result is preset to
+	 * TEST_FAILED, so a worker that never ran is reported as a
+	 * failure by the check below.
+	 */
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		args[lcore_id].seed = seed;
+		args[lcore_id].result = TEST_FAILED;
+		seed += 0x12345678;
+		rte_eal_remote_launch(mixed_worker, &args[lcore_id], lcore_id);
+	}
+
+	/* args[] lives on this stack frame: wait before any return. */
+	rte_eal_mp_wait_lcore();
+
+	/* Check all workers succeeded. */
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		TEST_ASSERT_EQUAL(args[lcore_id].result, TEST_SUCCESS,
+			"worker on lcore %u failed", lcore_id);
+	}
+
+	/* Verify no memory leak. */
+	rc = rte_fastmem_stats(&stats);
+	TEST_ASSERT_EQUAL(rc, 0, "stats failed: %d", rc);
+	TEST_ASSERT_EQUAL(stats.bytes_in_use, (uint64_t)0,
+		"bytes_in_use not zero after test: %" PRIu64,
+		stats.bytes_in_use);
+
+	return TEST_SUCCESS;
+}
+
+
+/*
+ * Memory limit tests.
+ *
+ * FASTMEM_MEMZONE_SIZE is 128 MiB. We use a limit of 128 MiB
+ * (one memzone) for most tests, and large objects (256 KiB) to
+ * exhaust slabs quickly.
+ */
+
+/* Per-socket limit used by the limit tests: 128 MiB. */
+#define LIMIT_ONE_MZ ((size_t)128 << 20)
+/* Size of the objects used to exhaust the backing memory: 256 KiB. */
+#define LIMIT_OBJ_SIZE ((size_t)256 * 1024)
+
+/*
+ * Set a one-memzone limit on all sockets, read it back, and check
+ * that a reservation of exactly the limit succeeds while a request
+ * for one byte more is rejected.
+ */
+static int
+test_memory_limit_basic(void)
+{
+	size_t got;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	/*
+	 * Read the limit back from a socket that is known to exist;
+	 * socket id 0 is not guaranteed to be present.
+	 */
+	got = rte_fastmem_get_limit(rte_socket_id_by_idx(0));
+	TEST_ASSERT_EQUAL(got, LIMIT_ONE_MZ,
+		"get_memory_limit mismatch: %zu", got);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(rc, 0, "first reserve failed: %d", rc);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ + 1, SOCKET_ID_ANY);
+	TEST_ASSERT(rc < 0, "second reserve should have failed");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Allocate LIMIT_OBJ_SIZE objects under a one-memzone limit until an
+ * allocation fails, then check that the failure is reported as ENOMEM
+ * and that freeing one object lets the next allocation succeed.
+ */
+static int
+test_memory_limit_alloc_exhaustion(void)
+{
+	/* Integer constant so that ptrs[] is a plain array, not a VLA. */
+	enum { MAX_PTRS = 1024 };
+	void *ptrs[MAX_PTRS];
+	unsigned int count;
+	void *p;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	for (count = 0; count < MAX_PTRS; count++) {
+		ptrs[count] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (ptrs[count] == NULL)
+			break;
+	}
+
+	TEST_ASSERT(count > 0, "should have allocated at least one");
+	TEST_ASSERT(count < MAX_PTRS, "should have hit the limit");
+	TEST_ASSERT_EQUAL(rte_errno, ENOMEM, "expected ENOMEM, got %d", rte_errno);
+
+	/* Giving one object back must make room for exactly one more. */
+	rte_fastmem_free(ptrs[count - 1]);
+	p = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+	TEST_ASSERT_NOT_NULL(p, "alloc after free should succeed");
+	rte_fastmem_free(p);
+
+	/* ptrs[count - 1] was already released above. */
+	for (unsigned int i = 0; i < count - 1; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * With the limit set to zero on all sockets, both an explicit
+ * reservation and an allocation are expected to fail.
+ */
+static int
+test_memory_limit_zero_blocks_growth(void)
+{
+	void *obj;
+	int rc;
+
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 0);
+
+	rc = rte_fastmem_reserve(1, SOCKET_ID_ANY);
+	TEST_ASSERT(rc < 0, "reserve with limit=0 should fail");
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NULL(obj, "alloc with limit=0 should fail");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Lower the limit below what has already been reserved: allocations
+ * served from the existing backing must keep working, while further
+ * growth must be refused.
+ */
+static int
+test_memory_limit_below_current(void)
+{
+	void *obj;
+	int rc;
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(rc, 0, "reserve failed: %d", rc);
+
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 1);
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc from existing backing should work");
+	rte_fastmem_free(obj);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ * 2, SOCKET_ID_ANY);
+	TEST_ASSERT(rc < 0, "growth beyond limit should fail");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * A limit set with SOCKET_ID_ANY must be visible on every socket
+ * enumerated through rte_socket_count() / rte_socket_id_by_idx().
+ */
+static int
+test_memory_limit_socket_id_any(void)
+{
+	unsigned int idx;
+
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 42);
+
+	for (idx = 0; idx < rte_socket_count(); idx++) {
+		const int socket_id = rte_socket_id_by_idx(idx);
+		const size_t limit = rte_fastmem_get_limit(socket_id);
+
+		TEST_ASSERT_EQUAL(limit, (size_t)42,
+			"socket %d limit mismatch: %zu", socket_id, limit);
+	}
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * After a zero limit has been replaced by SIZE_MAX, reserving one
+ * memzone worth of memory must succeed again.
+ */
+static int
+test_memory_limit_unlimited(void)
+{
+	int rc;
+
+	/* Block growth first, then lift the restriction. */
+	rte_fastmem_set_limit(SOCKET_ID_ANY, 0);
+	rte_fastmem_set_limit(SOCKET_ID_ANY, SIZE_MAX);
+
+	rc = rte_fastmem_reserve(LIMIT_ONE_MZ, SOCKET_ID_ANY);
+	TEST_ASSERT_EQUAL(rc, 0, "reserve after reset failed: %d", rc);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Fill a set of small objects with known patterns, then keep
+ * allocating LIMIT_OBJ_SIZE objects under a one-memzone limit until
+ * allocation fails (or EXTRA_MAX is reached), and verify that the
+ * small objects were not disturbed.
+ */
+static int
+test_memory_limit_alloc_integrity_under_oom(void)
+{
+	/* Integer constants so that the arrays below are not VLAs. */
+	enum { N_OBJS = 128, OBJ_SIZE = 1024, EXTRA_MAX = 1024 };
+	uint8_t *ptrs[N_OBJS];
+	void *extra[EXTRA_MAX];
+	unsigned int n_extra;
+	unsigned int i;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	for (i = 0; i < N_OBJS; i++) {
+		ptrs[i] = rte_fastmem_alloc(OBJ_SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(ptrs[i], "alloc[%u] failed", i);
+		memset(ptrs[i], (int)(i & 0xff), OBJ_SIZE);
+	}
+
+	/* Exhaust remaining backing with large objects. */
+	for (n_extra = 0; n_extra < EXTRA_MAX; n_extra++) {
+		extra[n_extra] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (extra[n_extra] == NULL)
+			break;
+	}
+
+	/* Verify original objects are intact. */
+	for (i = 0; i < N_OBJS; i++) {
+		const uint8_t expected = (uint8_t)(i & 0xff);
+		for (unsigned int j = 0; j < OBJ_SIZE; j++)
+			TEST_ASSERT_EQUAL(ptrs[i][j], expected,
+				"corruption at [%u][%u]", i, j);
+	}
+
+	for (i = 0; i < N_OBJS; i++)
+		rte_fastmem_free(ptrs[i]);
+	/* extra[n_extra] is either NULL or out of range; stop before it. */
+	for (i = 0; i < n_extra; i++)
+		rte_fastmem_free(extra[i]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Drain the backing memory under a one-memzone limit, free a handful
+ * of objects, and check that a bulk request for more objects than
+ * were freed fails.
+ */
+static int
+test_memory_limit_bulk_alloc_oom(void)
+{
+	/* Integer constants so that the arrays below are not VLAs. */
+	enum { BULK_N = 64, DRAIN_MAX = 512 };
+	void *ptrs[BULK_N];
+	void *drain[DRAIN_MAX];
+	unsigned int drained;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	for (drained = 0; drained < DRAIN_MAX; drained++) {
+		drain[drained] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (drain[drained] == NULL)
+			break;
+	}
+
+	/* Free a few - enough for some but not BULK_N objects. */
+	const unsigned int freed = RTE_MIN(drained, 4u);
+	for (unsigned int i = 0; i < freed; i++)
+		rte_fastmem_free(drain[--drained]);
+
+	rc = rte_fastmem_alloc_bulk(ptrs, BULK_N, LIMIT_OBJ_SIZE, 0, 0);
+	TEST_ASSERT(rc < 0, "bulk alloc should fail");
+
+	/* drained now counts only the objects still held. */
+	for (unsigned int i = 0; i < drained; i++)
+		rte_fastmem_free(drain[i]);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Exhaust the backing memory under a one-memzone limit, free half of
+ * the objects, and check that the same number can be allocated again.
+ */
+static int
+test_memory_limit_recovery_after_free(void)
+{
+	/* Integer constant so that ptrs[] is a plain array, not a VLA. */
+	enum { MAX_PTRS = 512 };
+	void *ptrs[MAX_PTRS];
+	unsigned int count;
+	unsigned int half;
+	unsigned int i;
+	int rc;
+
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	for (count = 0; count < MAX_PTRS; count++) {
+		ptrs[count] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (ptrs[count] == NULL)
+			break;
+	}
+	TEST_ASSERT(count > 0 && count < MAX_PTRS,
+		"expected partial fill, got %u", count);
+
+	half = count / 2;
+	for (i = 0; i < half; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	/* The freed slots are refilled in place. */
+	for (i = 0; i < half; i++) {
+		ptrs[i] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		TEST_ASSERT_NOT_NULL(ptrs[i], "recovery alloc[%u] failed", i);
+	}
+
+	for (i = 0; i < count; i++)
+		rte_fastmem_free(ptrs[i]);
+
+	return TEST_SUCCESS;
+}
+
+/* Per-lcore argument block for limit_worker(). */
+struct limit_worker_args {
+	unsigned int alloc_count;	/* out: objects the worker allocated */
+	int result;			/* out: TEST_SUCCESS or TEST_FAILED */
+};
+
+/*
+ * Per-lcore body of the multi-lcore limit test.
+ *
+ * Allocates up to MAX_PTRS objects of LIMIT_OBJ_SIZE bytes, stopping
+ * at the first failed allocation, fills each with 0xab, then verifies
+ * and frees them.
+ *
+ * arg points to a struct limit_worker_args; alloc_count receives the
+ * number of objects obtained and result is set to TEST_SUCCESS or
+ * TEST_FAILED. Returns 0 on success, -1 when a pattern check failed.
+ * Every object is freed whether or not its pattern check passed.
+ */
+static int
+limit_worker(void *arg)
+{
+	/* Integer constant so that ptrs[] is a plain array, not a VLA. */
+	enum { MAX_PTRS = 128 };
+	struct limit_worker_args *args = arg;
+	void *ptrs[MAX_PTRS];
+	unsigned int n_allocated;
+	unsigned int i;
+	int result = TEST_SUCCESS;
+
+	for (n_allocated = 0; n_allocated < MAX_PTRS; n_allocated++) {
+		ptrs[n_allocated] = rte_fastmem_alloc(LIMIT_OBJ_SIZE, 0, 0);
+		if (ptrs[n_allocated] == NULL)
+			break;
+		memset(ptrs[n_allocated], 0xab, LIMIT_OBJ_SIZE);
+	}
+	args->alloc_count = n_allocated;
+
+	for (i = 0; i < n_allocated; i++) {
+		const uint8_t *bytes = ptrs[i];
+
+		for (size_t k = 0; k < LIMIT_OBJ_SIZE; k++) {
+			if (bytes[k] != 0xab) {
+				result = TEST_FAILED;
+				break;
+			}
+		}
+		/* Freed unconditionally so that a failure cannot leak. */
+		rte_fastmem_free(ptrs[i]);
+	}
+
+	args->result = result;
+	return result == TEST_SUCCESS ? 0 : -1;
+}
+
+/*
+ * Run limit_worker() on all worker lcores concurrently under a
+ * one-memzone limit, then check that every worker succeeded and that
+ * the global bytes_in_use counter is zero. Skipped with fewer than
+ * two worker lcores.
+ */
+static int
+test_memory_limit_multi_lcore_oom(void)
+{
+	struct limit_worker_args args[RTE_MAX_LCORE];
+	struct rte_fastmem_stats stats;
+	unsigned int lcore_id;
+	unsigned int worker_count = 0;
+	int rc;
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id)
+		worker_count++;
+
+	if (worker_count < 2) {
+		printf("Not enough workers (%u < 2), skipping\n", worker_count);
+		return TEST_SKIPPED;
+	}
+
+	/* Checked before any worker is launched, so returning is safe. */
+	rc = rte_fastmem_set_limit(SOCKET_ID_ANY, LIMIT_ONE_MZ);
+	TEST_ASSERT_EQUAL(rc, 0, "set_memory_limit failed: %d", rc);
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		/* Preset so that a worker that never ran counts as failed. */
+		args[lcore_id].result = TEST_FAILED;
+		rte_eal_remote_launch(limit_worker, &args[lcore_id], lcore_id);
+	}
+
+	/* args[] lives on this stack frame: wait before any return. */
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		TEST_ASSERT_EQUAL(args[lcore_id].result, TEST_SUCCESS,
+			"worker on lcore %u failed", lcore_id);
+	}
+
+	/* Do not read stats unless the call reported success. */
+	rc = rte_fastmem_stats(&stats);
+	TEST_ASSERT_EQUAL(rc, 0, "stats failed: %d", rc);
+	TEST_ASSERT_EQUAL(stats.bytes_in_use, (uint64_t)0,
+		"bytes_in_use not zero: %" PRIu64, stats.bytes_in_use);
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * Resizing an object to sizes the test expects to be served by the
+ * same size class must hand back the original pointer.
+ */
+static int
+test_realloc_same_class(void)
+{
+	void *orig, *resized, *at_boundary;
+
+	orig = rte_fastmem_alloc(32, 0, 0);
+	TEST_ASSERT_NOT_NULL(orig, "alloc failed");
+
+	/* 32 -> 33 bytes: still expected to fit the same (64 B) class. */
+	resized = rte_fastmem_realloc(orig, 33, 0);
+	TEST_ASSERT_NOT_NULL(resized, "realloc failed");
+	TEST_ASSERT_EQUAL(orig, resized,
+		"realloc returned different pointer for same class");
+
+	/* Exactly the class boundary: still the same class. */
+	at_boundary = rte_fastmem_realloc(resized, 64, 0);
+	TEST_ASSERT_NOT_NULL(at_boundary, "realloc failed");
+	TEST_ASSERT_EQUAL(resized, at_boundary,
+		"realloc returned different pointer for same class");
+
+	rte_fastmem_free(at_boundary);
+	return TEST_SUCCESS;
+}
+
+/*
+ * Growing an object from 16 to 128 bytes must preserve its first 16
+ * bytes.
+ */
+static int
+test_realloc_grow(void)
+{
+	const uint8_t fill = 0xab;
+	const uint8_t *view;
+	void *small, *large;
+	unsigned int off;
+
+	small = rte_fastmem_alloc(16, 0, 0);
+	TEST_ASSERT_NOT_NULL(small, "alloc failed");
+
+	memset(small, fill, 16);
+
+	/* Request a size beyond the current class. */
+	large = rte_fastmem_realloc(small, 128, 0);
+	TEST_ASSERT_NOT_NULL(large, "realloc grow failed");
+
+	/* The original payload must have been carried over. */
+	view = large;
+	for (off = 0; off < 16; off++)
+		TEST_ASSERT_EQUAL(view[off], fill,
+			"content corrupted at byte %u", off);
+
+	rte_fastmem_free(large);
+	return TEST_SUCCESS;
+}
+
+/*
+ * Shrinking an object from 256 to 16 bytes must preserve the first 16
+ * bytes.
+ */
+static int
+test_realloc_shrink(void)
+{
+	const uint8_t fill = 0xcd;
+	const uint8_t *view;
+	void *large, *small;
+	unsigned int off;
+
+	large = rte_fastmem_alloc(256, 0, 0);
+	TEST_ASSERT_NOT_NULL(large, "alloc failed");
+
+	memset(large, fill, 256);
+
+	/* Request a size that belongs to a smaller class. */
+	small = rte_fastmem_realloc(large, 16, 0);
+	TEST_ASSERT_NOT_NULL(small, "realloc shrink failed");
+
+	/* Only the bytes up to the new size are checked. */
+	view = small;
+	for (off = 0; off < 16; off++)
+		TEST_ASSERT_EQUAL(view[off], fill,
+			"content corrupted at byte %u", off);
+
+	rte_fastmem_free(small);
+	return TEST_SUCCESS;
+}
+
+/* realloc() of a NULL pointer is expected to act as a plain allocation. */
+static int
+test_realloc_null_ptr(void)
+{
+	void *obj;
+
+	obj = rte_fastmem_realloc(NULL, 64, 0);
+	TEST_ASSERT_NOT_NULL(obj, "realloc(NULL) failed");
+
+	rte_fastmem_free(obj);
+	return TEST_SUCCESS;
+}
+
+/* realloc() to size 0 is expected to free the object and return NULL. */
+static int
+test_realloc_zero_size(void)
+{
+	void *obj, *result;
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+
+	/* No explicit free afterwards: the realloc call releases obj. */
+	result = rte_fastmem_realloc(obj, 0, 0);
+	TEST_ASSERT_NULL(result, "realloc(size=0) should return NULL");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * realloc() to more than rte_fastmem_max_size() must fail with E2BIG
+ * and leave the original object owned by the caller.
+ */
+static int
+test_realloc_too_big(void)
+{
+	void *obj, *result;
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+
+	result = rte_fastmem_realloc(obj, rte_fastmem_max_size() + 1, 0);
+	TEST_ASSERT_NULL(result, "realloc should fail for oversized request");
+	TEST_ASSERT_EQUAL(rte_errno, E2BIG, "expected E2BIG");
+
+	/* The failed call did not consume obj, so it is freed here. */
+	rte_fastmem_free(obj);
+	return TEST_SUCCESS;
+}
+
+/*
+ * realloc() with an alignment that is not a power of two must fail
+ * with EINVAL; the original object is then freed by the test.
+ */
+static int
+test_realloc_invalid_align(void)
+{
+	void *obj, *result;
+
+	obj = rte_fastmem_alloc(64, 0, 0);
+	TEST_ASSERT_NOT_NULL(obj, "alloc failed");
+
+	result = rte_fastmem_realloc(obj, 64, 3);
+	TEST_ASSERT_NULL(result, "realloc should fail for non-power-of-2 align");
+	TEST_ASSERT_EQUAL(rte_errno, EINVAL, "expected EINVAL");
+
+	rte_fastmem_free(obj);
+	return TEST_SUCCESS;
+}
+
+/*
+ * Handle-based allocation API.
+ */
+
+/*
+ * Exercise the handle API on the calling lcore: look up a handle for
+ * 64-byte objects, then do a single and a bulk allocate/free round
+ * trip through it.
+ */
+static int
+test_halloc_basic(void)
+{
+	rte_fastmem_handle_t handle;
+	void *bulk[16];
+	void *obj;
+	unsigned int idx;
+	int rc;
+
+	rc = rte_fastmem_hlookup(64, 0, rte_socket_id_by_idx(0), &handle);
+	TEST_ASSERT_EQUAL(rc, 0, "hlookup failed: %d", rc);
+
+	/* Single-object round trip. */
+	obj = rte_fastmem_halloc(handle, RTE_FASTMEM_F_ZERO);
+	TEST_ASSERT_NOT_NULL(obj, "halloc failed: rte_errno=%d", rte_errno);
+	memset(obj, 0x5a, 64);
+	rte_fastmem_hfree(handle, obj);
+
+	/* Freeing NULL through a handle must be harmless. */
+	rte_fastmem_hfree(handle, NULL);
+
+	/* Bulk round trip. */
+	rc = rte_fastmem_halloc_bulk(handle, bulk, RTE_DIM(bulk), 0);
+	TEST_ASSERT_EQUAL(rc, 0, "halloc_bulk failed: %d", rc);
+	for (idx = 0; idx < RTE_DIM(bulk); idx++)
+		TEST_ASSERT_NOT_NULL(bulk[idx], "halloc_bulk[%u] NULL", idx);
+	rte_fastmem_hfree_bulk(handle, bulk, RTE_DIM(bulk));
+
+	return TEST_SUCCESS;
+}
+
+/* Argument block for halloc_worker(). */
+struct halloc_worker_args {
+	rte_fastmem_handle_t handle;	/* in: handle looked up by the launcher */
+	int result;			/* out: TEST_SUCCESS or TEST_FAILED */
+};
+
+/*
+ * Single and bulk allocate/free round trip through a handle that was
+ * looked up by another thread. The executing lcore has no
+ * pre-existing cache for the handle's size class, so this exercises
+ * the path where halloc/hfree must lazily create (or bypass) the
+ * per-lcore cache.
+ *
+ * arg points to a struct halloc_worker_args. result is TEST_FAILED
+ * unless every step succeeded. Returns 0 on success, -1 otherwise.
+ */
+static int
+halloc_worker(void *arg)
+{
+	struct halloc_worker_args *args = arg;
+	void *bulk[8];
+	uint8_t *obj;
+	unsigned int idx;
+	int rc;
+
+	/* Pessimistic default; overwritten only at the very end. */
+	args->result = TEST_FAILED;
+
+	obj = rte_fastmem_halloc(args->handle, 0);
+	if (obj == NULL)
+		return -1;
+	memset(obj, 0x3c, 64);
+	rte_fastmem_hfree(args->handle, obj);
+
+	rc = rte_fastmem_halloc_bulk(args->handle, bulk, RTE_DIM(bulk), 0);
+	if (rc < 0)
+		return -1;
+	for (idx = 0; idx < RTE_DIM(bulk); idx++) {
+		if (bulk[idx] == NULL)
+			return -1;
+	}
+	rte_fastmem_hfree_bulk(args->handle, bulk, RTE_DIM(bulk));
+
+	args->result = TEST_SUCCESS;
+	return 0;
+}
+
+/*
+ * Look up a handle on the calling lcore and use it from a worker
+ * lcore. Skipped when no worker lcore is available.
+ */
+static int
+test_halloc_other_lcore(void)
+{
+	const unsigned int worker_id = rte_get_next_lcore(-1, 1, 0);
+	struct halloc_worker_args args;
+	rte_fastmem_handle_t handle;
+	int rc;
+
+	if (worker_id == RTE_MAX_LCORE)
+		return TEST_SKIPPED;
+
+	/* The lookup happens here only, never on the worker. */
+	rc = rte_fastmem_hlookup(64, 0, rte_socket_id_by_idx(0), &handle);
+	TEST_ASSERT_EQUAL(rc, 0, "hlookup failed: %d", rc);
+
+	args.result = TEST_FAILED;
+	args.handle = handle;
+
+	rte_eal_remote_launch(halloc_worker, &args, worker_id);
+	rc = rte_eal_wait_lcore(worker_id);
+	TEST_ASSERT_EQUAL(rc, 0, "worker returned %d", rc);
+	TEST_ASSERT_EQUAL(args.result, TEST_SUCCESS,
+		"halloc/hfree failed on a lcore that did not call hlookup");
+
+	return TEST_SUCCESS;
+}
+
+/*
+ * rte_thread_create() entry point wrapping halloc_worker(): maps its
+ * 0 / non-zero result to a thread return value of 0 / 1.
+ */
+static uint32_t
+halloc_non_eal_main(void *arg)
+{
+	struct halloc_worker_args *args = arg;
+
+	if (halloc_worker(args) == 0)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Look up a handle on the calling lcore and use it from a thread
+ * created with rte_thread_create().
+ */
+static int
+test_halloc_non_eal_thread(void)
+{
+	struct halloc_worker_args args;
+	rte_fastmem_handle_t handle;
+	rte_thread_t tid;
+	int rc;
+
+	rc = rte_fastmem_hlookup(64, 0, rte_socket_id_by_idx(0), &handle);
+	TEST_ASSERT_EQUAL(rc, 0, "hlookup failed: %d", rc);
+
+	args.result = TEST_FAILED;
+	args.handle = handle;
+
+	rc = rte_thread_create(&tid, NULL, halloc_non_eal_main, &args);
+	TEST_ASSERT_EQUAL(rc, 0, "rte_thread_create() failed: %d", rc);
+	rc = rte_thread_join(tid, NULL);
+	TEST_ASSERT_EQUAL(rc, 0, "rte_thread_join() failed: %d", rc);
+
+	/* The verdict is taken from args.result, written by the thread. */
+	TEST_ASSERT_EQUAL(args.result, TEST_SUCCESS,
+		"halloc/hfree failed on a non-EAL thread");
+
+	return TEST_SUCCESS;
+}
+
+/* Per-test setup: initialise fastmem and pass its result through. */
+static int
+fastmem_setup(void)
+{
+	const int rc = rte_fastmem_init();
+
+	return rc;
+}
+
+/* Per-test teardown: shut fastmem down again. */
+static void
+fastmem_teardown(void)
+{
+	rte_fastmem_deinit();
+
+	return;
+}
+
+/*
+ * Functional test suite for fastmem.
+ *
+ * There is no suite-level setup or teardown. The cases registered
+ * with TEST_CASE() run without fastmem_setup()/fastmem_teardown();
+ * every case registered with TEST_CASE_ST() is bracketed by
+ * fastmem_setup() and fastmem_teardown(), i.e. by rte_fastmem_init()
+ * and rte_fastmem_deinit().
+ */
+static struct unit_test_suite fastmem_testsuite = {
+	.suite_name = "fastmem tests",
+	.setup = NULL,
+	.teardown = NULL,
+	.unit_test_cases = {
+		/* Cases without per-test setup/teardown. */
+		TEST_CASE(test_init_deinit),
+		TEST_CASE(test_init_is_not_idempotent),
+		TEST_CASE(test_deinit_without_init),
+		TEST_CASE(test_max_size),
+		TEST_CASE(test_reserve_without_init),
+		TEST_CASE(test_cache_flush_without_init),
+		TEST_CASE(test_classes),
+		/* Cases run between fastmem_setup() and fastmem_teardown(). */
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_small),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_multiple_memzones),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_cumulative),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_invalid_socket),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_reserve_any_socket),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_too_big),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_invalid_align),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_free_small),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_free_various_sizes),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_alignment),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_zero_flag),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_reuse),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_many_in_class),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_socket),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_block_repurposing),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_block_repurposing_no_growth),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_free_null),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_content_integrity),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_align_too_big),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_align_one),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_socket_numa_placement),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_cross_socket_deinit),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_cache_flush),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_cache_exceeds_capacity),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_non_eal_thread),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_cache_flush_returns_memory),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_bulk_basic),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_bulk_zero_flag),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_bulk_exceeds_cache),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_alloc_bulk_socket),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_free_bulk),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_class),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_lcore),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_lcore_class),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_reset),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_survive_cache_flush),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_count_non_eal),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_stats_shared_non_eal),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_mixed_lifetimes_multi_lcore),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_basic),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_alloc_exhaustion),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_zero_blocks_growth),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_below_current),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_socket_id_any),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_unlimited),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_alloc_integrity_under_oom),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_bulk_alloc_oom),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_recovery_after_free),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_memory_limit_multi_lcore_oom),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_realloc_same_class),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_realloc_grow),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_realloc_shrink),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_realloc_null_ptr),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_realloc_zero_size),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_realloc_too_big),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_realloc_invalid_align),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_halloc_basic),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_halloc_other_lcore),
+		TEST_CASE_ST(fastmem_setup, fastmem_teardown,
+			test_halloc_non_eal_thread),
+		TEST_CASES_END()
+	}
+};
+
+/* Entry point of the fastmem functional tests: runs the whole suite. */
+static int
+test_fastmem(void)
+{
+	const int rc = unit_test_suite_runner(&fastmem_testsuite);
+
+	return rc;
+}
+
+REGISTER_FAST_TEST(fastmem_autotest, NOHUGE_SKIP, ASAN_OK, test_fastmem);
diff --git a/app/test/test_fastmem_perf.c b/app/test/test_fastmem_perf.c
new file mode 100644
index 0000000000..73c0a4c6ce
--- /dev/null
+++ b/app/test/test_fastmem_perf.c
@@ -0,0 +1,1040 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Ericsson AB
+ */
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_malloc.h>
+#include <rte_mempool.h>
+#include <rte_stdatomic.h>
+
+#include <rte_fastmem.h>
+
+#include "test.h"
+
+/* Benchmark output is written with printf(). */
+#define TEST_LOG(...) printf(__VA_ARGS__)
+
+/* Object sizes, in bytes; presumably one benchmark run per size. */
+static const size_t SIZES[] = { 8, 64, 256, 1024, 4096 };
+#define N_SIZES RTE_DIM(SIZES)
+
+/* Number of ops for warmup and measurement. */
+#define WARMUP_OPS 20000u
+#define MEASURE_OPS 2000000u
+
+/* Buffer for scenarios that allocate N then free N. */
+#define BATCH_N 256
+
+/*
+ * Allocator vtable: a thin adapter exposing alloc / free /
+ * per-allocator setup/teardown. Each scenario calls these
+ * indirectly so the same timing loop serves all allocators.
+ */
+struct allocator {
+	/* Allocator name. */
+	const char *name;
+	/* Prepare for objects of the given size; negative means failure. */
+	int (*setup)(size_t size, unsigned int n_max);
+	/* Undo setup(). */
+	void (*teardown)(void);
+	/* Allocate one object of the size given to setup(); NULL on failure. */
+	void *(*alloc)(void);
+	/* Release one object obtained from alloc(). */
+	void (*free_obj)(void *ptr);
+	/* Allocate n objects into ptrs[]; negative means failure. */
+	int (*alloc_bulk)(void **ptrs, unsigned int n);
+	/* Release the n objects in ptrs[]. */
+	void (*free_bulk)(void **ptrs, unsigned int n);
+};
+
+/* Fastmem adapter -------------------------------------------------- */
+
+/* Request size used by the fastmem adapter; written by fastmem_setup(). */
+static size_t fastmem_size;
+
+/* Remember the object size; n_max is not needed by this adapter. */
+static int
+fastmem_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	fastmem_size = size;
+
+	return 0;
+}
+
+/* Flush the fastmem cache once a scenario is done. */
+static void
+fastmem_teardown(void)
+{
+	rte_fastmem_cache_flush();
+
+	return;
+}
+
+/* Allocate one object of fastmem_size bytes with default align/flags. */
+static void * __rte_noinline
+fastmem_alloc(void)
+{
+	void *obj = rte_fastmem_alloc(fastmem_size, 0, 0);
+
+	return obj;
+}
+
+/* Return one object to fastmem. */
+static void __rte_noinline
+fastmem_free(void *ptr)
+{
+	rte_fastmem_free(ptr);
+
+	return;
+}
+
+/* Mempool adapter -------------------------------------------------- */
+
+/* Pool used by the mempool adapter; NULL outside setup/teardown. */
+static struct rte_mempool *mempool_pool;
+
+/*
+ * Create a pool of size-byte elements named after the element size.
+ * Returns 0 on success and -1 when the pool cannot be created.
+ */
+static int
+mempool_setup(size_t size, unsigned int n_max)
+{
+	/* The per-lcore cache is set to RTE_MEMPOOL_CACHE_MAX_SIZE. */
+	const unsigned int cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
+	/*
+	 * Room for the full batch burst plus twice the per-lcore
+	 * cache capacity.
+	 */
+	const unsigned int n_elts = n_max + cache_size * 2;
+	char name[RTE_MEMPOOL_NAMESIZE];
+
+	snprintf(name, sizeof(name), "fmperf_mp_%zu", size);
+	mempool_pool = rte_mempool_create(name, n_elts, size, cache_size,
+			0, NULL, NULL, NULL, NULL, SOCKET_ID_ANY, 0);
+	if (mempool_pool != NULL)
+		return 0;
+
+	TEST_LOG("mempool_create(%zu) failed\n", size);
+	return -1;
+}
+
+/* Destroy the pool and clear the stale pointer. */
+static void
+mempool_teardown(void)
+{
+	struct rte_mempool *pool = mempool_pool;
+
+	rte_mempool_free(pool);
+	mempool_pool = NULL;
+}
+
+/* Get one element from the pool; NULL when the get fails. */
+static void * __rte_noinline
+mempool_alloc_one(void)
+{
+	void *obj = NULL;
+
+	return rte_mempool_get(mempool_pool, &obj) < 0 ? NULL : obj;
+}
+
+/* Put one element back into the pool. */
+static void __rte_noinline
+mempool_free_one(void *ptr)
+{
+	rte_mempool_put(mempool_pool, ptr);
+
+	return;
+}
+
+/* rte_malloc adapter ----------------------------------------------- */
+
+/* Request size used by the rte_malloc adapter; written by malloc_setup(). */
+static size_t malloc_size;
+
+/* Remember the object size; n_max is not needed by this adapter. */
+static int
+malloc_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	malloc_size = size;
+
+	return 0;
+}
+
+/* Nothing to release for rte_malloc. */
+static void
+malloc_teardown(void)
+{
+	return;
+}
+
+/* Allocate malloc_size bytes with no type tag and default alignment. */
+static void * __rte_noinline
+malloc_alloc(void)
+{
+	void *obj = rte_malloc(NULL, malloc_size, 0);
+
+	return obj;
+}
+
+/* Release one object obtained from malloc_alloc(). */
+static void __rte_noinline
+malloc_free(void *ptr)
+{
+	rte_free(ptr);
+
+	return;
+}
+
+/* libc (glibc) malloc adapter -------------------------------------- */
+
+/* Request size used by the libc adapter; written by libc_setup(). */
+static size_t libc_size;
+
+/*
+ * Store the request size rounded up to a whole number of cache
+ * lines. aligned_alloc() requires the size to be a multiple of the
+ * alignment, and cache-line alignment matches the other allocators'
+ * default alignment guarantees, which keeps the comparison honest.
+ */
+static int
+libc_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	const size_t rounded = RTE_ALIGN_CEIL(size, RTE_CACHE_LINE_SIZE);
+
+	libc_size = rounded;
+	return 0;
+}
+
+/* Nothing to release for the libc allocator. */
+static void
+libc_teardown(void)
+{
+	return;
+}
+
+/* Allocate libc_size bytes aligned to a cache line. */
+static void * __rte_noinline
+libc_alloc(void)
+{
+	void *obj = aligned_alloc(RTE_CACHE_LINE_SIZE, libc_size);
+
+	return obj;
+}
+
+/* Release one object obtained from libc_alloc(). */
+static void __rte_noinline
+libc_free(void *ptr)
+{
+	free(ptr);
+
+	return;
+}
+
+/* Bulk adapters ---------------------------------------------------- */
+
+/* Bulk-allocate n objects of fastmem_size bytes into ptrs[]. */
+static int __rte_noinline
+fastmem_alloc_bulk(void **ptrs, unsigned int n)
+{
+	const int rc = rte_fastmem_alloc_bulk(ptrs, n, fastmem_size, 0, 0);
+
+	return rc;
+}
+
+/* Bulk-free the n objects in ptrs[]. */
+static void __rte_noinline
+fastmem_free_bulk(void **ptrs, unsigned int n)
+{
+	rte_fastmem_free_bulk(ptrs, n);
+
+	return;
+}
+
+/* Fastmem handle adapter ------------------------------------------- */
+
+/* Handle used by the fastmem_h adapter; written by fastmem_h_setup(). */
+static rte_fastmem_handle_t fastmem_handle;
+
+/*
+ * Look up a handle for size-byte objects on the caller's socket and
+ * pass the lookup result through.
+ */
+static int
+fastmem_h_setup(size_t size, unsigned int n_max __rte_unused)
+{
+	const int socket_id = rte_socket_id();
+
+	return rte_fastmem_hlookup(size, 0, socket_id, &fastmem_handle);
+}
+
+/* Flush the fastmem cache once a scenario is done. */
+static void
+fastmem_h_teardown(void)
+{
+	rte_fastmem_cache_flush();
+
+	return;
+}
+
+/* Allocate one object through the handle with no flags. */
+static void * __rte_noinline
+fastmem_h_alloc(void)
+{
+	void *obj = rte_fastmem_halloc(fastmem_handle, 0);
+
+	return obj;
+}
+
+/* Release one object through the handle. */
+static void __rte_noinline
+fastmem_h_free(void *ptr)
+{
+	rte_fastmem_hfree(fastmem_handle, ptr);
+
+	return;
+}
+
+/* Bulk-allocate n objects through the handle into ptrs[]. */
+static int __rte_noinline
+fastmem_h_alloc_bulk(void **ptrs, unsigned int n)
+{
+	const int rc = rte_fastmem_halloc_bulk(fastmem_handle, ptrs, n, 0);
+
+	return rc;
+}
+
+/* Bulk-free the n objects in ptrs[] through the handle. */
+static void __rte_noinline
+fastmem_h_free_bulk(void **ptrs, unsigned int n)
+{
+	rte_fastmem_hfree_bulk(fastmem_handle, ptrs, n);
+
+	return;
+}
+
+/* Mempool bulk adapter --------------------------------------------- */
+
+/* Get n elements from the pool into ptrs[]. */
+static int __rte_noinline
+mempool_alloc_bulk(void **ptrs, unsigned int n)
+{
+	const int rc = rte_mempool_get_bulk(mempool_pool, ptrs, n);
+
+	return rc;
+}
+
+/* Put the n elements in ptrs[] back into the pool. */
+static void __rte_noinline
+mempool_free_bulk(void **ptrs, unsigned int n)
+{
+	rte_mempool_put_bulk(mempool_pool, ptrs, n);
+
+	return;
+}
+
+/*
+ * Emulate a bulk allocation with n calls to alloc_fn.
+ *
+ * Returns 0 when all n slots were filled. On the first failed
+ * allocation it returns -1 immediately; the failing slot then holds
+ * NULL, the slots before it hold live objects, and the slots after it
+ * are left untouched.
+ */
+static int __rte_noinline
+generic_alloc_bulk(void **ptrs, unsigned int n, void *(*alloc_fn)(void))
+{
+	void **slot = ptrs;
+	void **const end = ptrs + n;
+
+	while (slot != end) {
+		*slot = alloc_fn();
+		if (*slot == NULL)
+			return -1;
+		slot++;
+	}
+
+	return 0;
+}
+
+/*
+ * Bulk allocation for the rte_malloc adapter: all n objects or none.
+ *
+ * Returns 0 on success. On failure the objects allocated before the
+ * failing one are released again and -1 is returned.
+ */
+static int __rte_noinline
+malloc_alloc_bulk(void **ptrs, unsigned int n)
+{
+	unsigned int i;
+
+	if (generic_alloc_bulk(ptrs, n, malloc_alloc) == 0)
+		return 0;
+
+	/*
+	 * generic_alloc_bulk() stops at the first failure and leaves
+	 * NULL in that slot, so everything before the first NULL is a
+	 * live object that would otherwise leak.
+	 */
+	for (i = 0; i < n && ptrs[i] != NULL; i++)
+		malloc_free(ptrs[i]);
+
+	return -1;
+}
+
+/* Release the n objects in ptrs[], one malloc_free() call each. */
+static void __rte_noinline
+malloc_free_bulk(void **ptrs, unsigned int n)
+{
+	void **slot = ptrs;
+	void **const end = ptrs + n;
+
+	while (slot != end)
+		malloc_free(*slot++);
+}
+
+/*
+ * Bulk-allocation adapter of the "libc" table entry: fills
+ * ptrs[0..n-1] through generic_alloc_bulk() with libc_alloc().
+ *
+ * All-or-nothing: generic_alloc_bulk() stops at the first failed
+ * slot, leaving NULL there and live objects in every slot before it.
+ * Those objects are handed back to libc_free() here, because the
+ * callers treat a negative return as "nothing was allocated" and
+ * would otherwise leak them.
+ *
+ * Returns 0 on success, a negative value on failure.
+ */
+static int __rte_noinline
+libc_alloc_bulk(void **ptrs, unsigned int n)
+{
+	unsigned int i;
+	int rc;
+
+	rc = generic_alloc_bulk(ptrs, n, libc_alloc);
+	if (rc == 0)
+		return 0;
+
+	/* Roll back the allocated prefix; it ends at the NULL slot. */
+	for (i = 0; i < n && ptrs[i] != NULL; i++)
+		libc_free(ptrs[i]);
+
+	return rc;
+}
+
+/*
+ * Bulk-free adapter of the "libc" table entry: releases
+ * ptrs[0..n-1] one at a time, in ascending order, via libc_free().
+ */
+static void __rte_noinline
+libc_free_bulk(void **ptrs, unsigned int n)
+{
+	void **const end = ptrs + n;
+	void **cur;
+
+	for (cur = ptrs; cur != end; cur++)
+		libc_free(*cur);
+}
+
+/* Adapter table ---------------------------------------------------- */
+
+static const struct allocator allocators[] = { /* one row per allocator under test; positional, so column order must match struct allocator */
+	{ "fastmem",    fastmem_setup,   fastmem_teardown,   fastmem_alloc,     fastmem_free,     fastmem_alloc_bulk,   fastmem_free_bulk },
+	{ "fastmem_h",  fastmem_h_setup, fastmem_h_teardown, fastmem_h_alloc,   fastmem_h_free,   fastmem_h_alloc_bulk, fastmem_h_free_bulk },
+	{ "mempool",    mempool_setup,   mempool_teardown,   mempool_alloc_one, mempool_free_one, mempool_alloc_bulk,   mempool_free_bulk },
+	{ "rte_malloc", malloc_setup,    malloc_teardown,    malloc_alloc,      malloc_free,      malloc_alloc_bulk,    malloc_free_bulk },
+	{ "libc",       libc_setup,      libc_teardown,      libc_alloc,        libc_free,        libc_alloc_bulk,      libc_free_bulk },
+};
+#define N_ALLOCATORS RTE_DIM(allocators) /* number of rows in allocators[] */
+
+/*
+ * Scenario 1: tight alloc+free loop over a single object.
+ *
+ * After alloc->setup(size, 1), one object is allocated and freed
+ * immediately, WARMUP_OPS times untimed and then MEASURE_OPS times
+ * between two rte_rdtsc_precise() reads. Because the object is
+ * released right away, this is the allocator's best case.
+ *
+ * Returns the mean TSC cycles per (alloc + free) pair, or -1.0 if
+ * setup or any allocation fails. teardown() runs on every path that
+ * got past setup.
+ */
+static double
+run_tight(const struct allocator *alloc, size_t size)
+{
+	double cycles_per_op = -1.0;
+	uint64_t start, elapsed;
+	unsigned int op;
+	void *obj;
+
+	if (alloc->setup(size, 1) < 0)
+		return cycles_per_op;
+
+	/* Untimed warmup. */
+	for (op = 0; op < WARMUP_OPS; op++) {
+		obj = alloc->alloc();
+		if (obj == NULL)
+			goto out;
+		alloc->free_obj(obj);
+	}
+
+	/* Timed section. */
+	start = rte_rdtsc_precise();
+	for (op = 0; op < MEASURE_OPS; op++) {
+		obj = alloc->alloc();
+		if (obj == NULL)
+			goto out;
+		alloc->free_obj(obj);
+	}
+	elapsed = rte_rdtsc_precise() - start;
+
+	cycles_per_op = (double)elapsed / MEASURE_OPS;
+out:
+	alloc->teardown();
+	return cycles_per_op;
+}
+
+/*
+ * Scenario 2: allocate BATCH_N objects, then free them in the order
+ * they were allocated (FIFO). Exercises cache refill and drain paths
+ * when the batch exceeds cache capacity.
+ *
+ * The allocation phase and the free phase of every batch are timed
+ * separately. On success *cycles_alloc and *cycles_free receive the
+ * mean TSC cycles per object for each phase; if setup or any
+ * allocation fails, both are left at -1.0.
+ *
+ * On an allocation failure the objects already obtained for the
+ * current batch are released before teardown, so nothing is leaked.
+ */
+static void
+run_batch(const struct allocator *alloc, size_t size,
+		double *cycles_alloc, double *cycles_free)
+{
+	void *ptrs[BATCH_N];
+	uint64_t tsc_alloc = 0, tsc_free = 0;
+	unsigned int iter, i;
+	unsigned int iters;
+
+	*cycles_alloc = -1.0;
+	*cycles_free = -1.0;
+
+	if (alloc->setup(size, BATCH_N) < 0)
+		return;
+
+	/* Pick iteration count so total ops ~= MEASURE_OPS. */
+	iters = MEASURE_OPS / BATCH_N;
+
+	/* Warmup. */
+	for (iter = 0; iter < WARMUP_OPS / BATCH_N; iter++) {
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		for (i = 0; i < BATCH_N; i++)
+			alloc->free_obj(ptrs[i]);
+	}
+
+	for (iter = 0; iter < iters; iter++) {
+		uint64_t t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		tsc_alloc += rte_rdtsc_precise() - t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = 0; i < BATCH_N; i++)
+			alloc->free_obj(ptrs[i]);
+		tsc_free += rte_rdtsc_precise() - t0;
+	}
+
+	alloc->teardown();
+
+	*cycles_alloc = (double)tsc_alloc / (iters * BATCH_N);
+	*cycles_free = (double)tsc_free / (iters * BATCH_N);
+	return;
+err:
+	/*
+	 * Every jump here comes from an allocation loop, so i is the
+	 * index of the slot that failed and ptrs[0..i-1] are live.
+	 */
+	while (i > 0)
+		alloc->free_obj(ptrs[--i]);
+	alloc->teardown();
+}
+
+/*
+ * Scenario 3: allocate BATCH_N objects, then free them in reverse
+ * allocation order (LIFO).
+ *
+ * The allocation phase and the free phase of every batch are timed
+ * separately. On success *cycles_alloc and *cycles_free receive the
+ * mean TSC cycles per object for each phase; if setup or any
+ * allocation fails, both are left at -1.0.
+ *
+ * On an allocation failure the objects already obtained for the
+ * current batch are released before teardown, so nothing is leaked.
+ */
+static void
+run_batch_reverse(const struct allocator *alloc, size_t size,
+		double *cycles_alloc, double *cycles_free)
+{
+	void *ptrs[BATCH_N];
+	uint64_t tsc_alloc = 0, tsc_free = 0;
+	unsigned int iter, i;
+	unsigned int iters;
+
+	*cycles_alloc = -1.0;
+	*cycles_free = -1.0;
+
+	if (alloc->setup(size, BATCH_N) < 0)
+		return;
+
+	/* Pick iteration count so total ops ~= MEASURE_OPS. */
+	iters = MEASURE_OPS / BATCH_N;
+
+	/* Warmup. */
+	for (iter = 0; iter < WARMUP_OPS / BATCH_N; iter++) {
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		for (i = BATCH_N; i > 0; i--)
+			alloc->free_obj(ptrs[i - 1]);
+	}
+
+	for (iter = 0; iter < iters; iter++) {
+		uint64_t t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = 0; i < BATCH_N; i++) {
+			ptrs[i] = alloc->alloc();
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		tsc_alloc += rte_rdtsc_precise() - t0;
+
+		t0 = rte_rdtsc_precise();
+		for (i = BATCH_N; i > 0; i--)
+			alloc->free_obj(ptrs[i - 1]);
+		tsc_free += rte_rdtsc_precise() - t0;
+	}
+
+	alloc->teardown();
+
+	*cycles_alloc = (double)tsc_alloc / (iters * BATCH_N);
+	*cycles_free = (double)tsc_free / (iters * BATCH_N);
+	return;
+err:
+	/*
+	 * Every jump here comes from an allocation loop, so i is the
+	 * index of the slot that failed and ptrs[0..i-1] are live.
+	 */
+	while (i > 0)
+		alloc->free_obj(ptrs[--i]);
+	alloc->teardown();
+}
+
+/*
+ * Scenario 4: multi-lcore alloc/work/free with a dummy-work
+ * baseline. Each worker runs a tight alloc → touch → free loop
+ * on its own lcore. A second run with the same dummy work but
+ * no allocator traffic establishes a baseline; the per-op
+ * allocator cost is reported as (alloc_run - baseline_run).
+ *
+ * Fixed size class and a fixed amount of dummy work per op —
+ * this scenario sweeps lcore count rather than size.
+ */
+#define MULTI_SIZE 256u        /* object size handed to alloc->setup() for the multi-lcore runs */
+#define MULTI_WORK_BYTES 64u   /* bytes of each buffer that touch_buffer() works on */
+#define MULTI_WORK_PASSES 8u   /* RMW passes over the work region. */
+#define MULTI_OPS 200000u      /* measured iterations per worker */
+#define MULTI_WARMUP 2000u     /* untimed warmup iterations per worker */
+#define MAX_MULTI_LCORES 32u   /* cap on the worker count; sizes the per-run lcore and scratch arrays */
+
+/*
+ * Per-worker volatile sink. Each worker XORs its accumulator into
+ * the slot indexed by its own lcore id, which keeps the result of
+ * touch_buffer() observable so the work cannot be optimized away.
+ * The volatile access happens once per run, after the timed loop,
+ * not on the hot path. The struct is cache-line aligned so that
+ * neighboring workers' slots do not share a cache line.
+ */
+struct worker_sink {
+	volatile uint64_t value; /* running XOR of the accumulators published by this lcore */
+} __rte_cache_aligned;
+
+static struct worker_sink worker_sinks[RTE_MAX_LCORE]; /* indexed by rte_lcore_id() */
+
+/*
+ * Dummy per-object workload for the multi-lcore scenarios.
+ *
+ * Treats the first 'bytes' of 'buf' as an array of 64-bit words,
+ * seeds every word with a pattern derived from its index, and then
+ * runs MULTI_WORK_PASSES read-modify-write rounds over the words.
+ * Every round consumes what the previous round stored, so the cost
+ * grows linearly with MULTI_WORK_PASSES. The XOR of all values
+ * written is returned; callers feed it into a volatile sink so the
+ * function's work stays observable.
+ *
+ * The function is __rte_noinline so the baseline run (pre-allocated
+ * scratch buffer) and the allocator run execute the very same code,
+ * which is what makes subtracting the two measurements meaningful.
+ *
+ * The work is deliberately non-trivial: it keeps the per-iteration
+ * cost high relative to an allocator's critical section, so that
+ * lock-based allocators are measured mostly on their own work
+ * rather than on queueing behind other lcores.
+ */
+static uint64_t __rte_noinline
+touch_buffer(void *buf, size_t bytes)
+{
+	uint64_t *words = buf;
+	const size_t n_words = bytes / sizeof(uint64_t);
+	uint64_t digest = 0;
+	unsigned int round;
+	size_t w;
+
+	/* Seed: word w starts out as w times a fixed odd constant. */
+	for (w = 0; w < n_words; w++)
+		words[w] = w * 0x9E3779B97F4A7C15ULL;
+
+	/* Dependent rounds: multiply, add the round number, fold, store. */
+	for (round = 0; round < MULTI_WORK_PASSES; round++) {
+		for (w = 0; w < n_words; w++) {
+			uint64_t mixed;
+
+			mixed = words[w] * 0xC2B2AE3D27D4EB4FULL + round;
+			mixed ^= mixed >> 33;
+			words[w] = mixed;
+			digest ^= mixed;
+		}
+	}
+
+	return digest;
+}
+
+struct worker_args { /* per-worker input/output block handed to worker_run() / worker_run_bulk() */
+	const struct allocator *alloc; /* allocator under test */
+	void *scratch;            /* baseline only; NULL => alloc path */
+	unsigned int iters;       /* measured iterations */
+	unsigned int warmup;      /* untimed iterations run before the measurement */
+	unsigned int bulk_n;      /* 0 = single-object, >0 = bulk */
+	RTE_ATOMIC(bool) start_flag; /* barrier at worker entry */
+	uint64_t cycles;          /* out */
+	unsigned int ops;         /* out */
+	int err;                  /* out */
+};
+
+static int
+worker_run(void *arg) /* single-object worker; arg points to this worker's struct worker_args */
+{
+	struct worker_args *wa = arg;
+	unsigned int lcore = rte_lcore_id();
+	uint64_t acc = 0;
+	uint64_t t0;
+	unsigned int i;
+
+	wa->err = 0;
+	wa->ops = 0;
+	wa->cycles = 0;
+
+	/* Spin until main sets start_flag, so all workers start together. */
+	while (!rte_atomic_load_explicit(&wa->start_flag,
+			rte_memory_order_acquire))
+		rte_pause();
+
+	/* Warmup: same loop body as the measured loop, but untimed. */
+	for (i = 0; i < wa->warmup; i++) {
+		void *p;
+
+		if (wa->scratch != NULL)
+			p = wa->scratch; /* baseline: reuse the pre-allocated buffer */
+		else {
+			p = wa->alloc->alloc();
+			if (p == NULL) {
+				wa->err = -1;
+				return -1; /* ops stays 0, which the aggregator treats as failure */
+			}
+		}
+		acc ^= touch_buffer(p, MULTI_WORK_BYTES);
+		if (wa->scratch == NULL)
+			wa->alloc->free_obj(p);
+	}
+
+	/* Measured loop: wa->iters rounds of (alloc,) touch (, free). */
+	t0 = rte_rdtsc_precise();
+	for (i = 0; i < wa->iters; i++) {
+		void *p;
+
+		if (wa->scratch != NULL)
+			p = wa->scratch;
+		else {
+			p = wa->alloc->alloc();
+			if (p == NULL) {
+				wa->err = -1;
+				break; /* still record cycles/ops for the rounds that completed */
+			}
+		}
+		acc ^= touch_buffer(p, MULTI_WORK_BYTES);
+		if (wa->scratch == NULL)
+			wa->alloc->free_obj(p);
+	}
+	wa->cycles = rte_rdtsc_precise() - t0;
+	wa->ops = i; /* number of completed rounds */
+
+	/* Publish accumulator to defeat dead-code elimination. */
+	worker_sinks[lcore].value ^= acc;
+
+	return 0;
+}
+
+/*
+ * Bulk worker: lcore entry point used when worker_args.bulk_n > 0.
+ *
+ * Each iteration obtains bulk_n objects with alloc->alloc_bulk(),
+ * runs touch_buffer() on every one of them and returns them with
+ * alloc->free_bulk(). wa->warmup iterations run untimed, followed by
+ * wa->iters timed ones.
+ *
+ * Outputs: wa->cycles is the TSC time of the timed loop, wa->ops the
+ * number of objects processed in it (completed iterations times
+ * bulk_n), and wa->err is 0 or -1. A bulk_n larger than the local
+ * BATCH_N-sized pointer array is rejected up front rather than
+ * being allowed to overrun the stack.
+ *
+ * Returns 0, or -1 if the request is invalid or the warmup fails.
+ */
+static int
+worker_run_bulk(void *arg)
+{
+	struct worker_args *wa = arg;
+	unsigned int lcore = rte_lcore_id();
+	void *ptrs[BATCH_N];
+	uint64_t acc = 0;
+	uint64_t t0;
+	unsigned int i, j;
+	unsigned int bulk_n = wa->bulk_n;
+
+	wa->err = 0;
+	wa->ops = 0;
+	wa->cycles = 0;
+
+	/* ptrs[] holds at most BATCH_N objects per bulk request. */
+	if (bulk_n > RTE_DIM(ptrs)) {
+		wa->err = -1;
+		return -1;
+	}
+
+	/* Wait for start flag (spin-barrier set by main). */
+	while (!rte_atomic_load_explicit(&wa->start_flag,
+			rte_memory_order_acquire))
+		rte_pause();
+
+	/* Warmup. */
+	for (i = 0; i < wa->warmup; i++) {
+		if (wa->alloc->alloc_bulk(ptrs, bulk_n) < 0) {
+			wa->err = -1;
+			return -1;
+		}
+		for (j = 0; j < bulk_n; j++)
+			acc ^= touch_buffer(ptrs[j], MULTI_WORK_BYTES);
+		wa->alloc->free_bulk(ptrs, bulk_n);
+	}
+
+	/* Measured loop. */
+	t0 = rte_rdtsc_precise();
+	for (i = 0; i < wa->iters; i++) {
+		if (wa->alloc->alloc_bulk(ptrs, bulk_n) < 0) {
+			wa->err = -1;
+			break;
+		}
+		for (j = 0; j < bulk_n; j++)
+			acc ^= touch_buffer(ptrs[j], MULTI_WORK_BYTES);
+		wa->alloc->free_bulk(ptrs, bulk_n);
+	}
+	wa->cycles = rte_rdtsc_precise() - t0;
+	wa->ops = i * bulk_n;
+
+	/* Publish accumulator to defeat dead-code elimination. */
+	worker_sinks[lcore].value ^= acc;
+
+	return 0;
+}
+
+/*
+ * Launch workers on the first 'n_workers' worker lcores, run
+ * either the baseline (scratches != NULL, one buffer per worker) or
+ * the alloc path (scratches == NULL), and return the mean per-op
+ * cycle cost averaged across the participating workers. bulk_n > 0
+ * selects worker_run_bulk(), otherwise worker_run() is used.
+ *
+ * Returns -1.0 if n_workers is zero or exceeds MAX_MULTI_LCORES, if
+ * fewer worker lcores are available, if a worker cannot be launched,
+ * or if any worker reports an error or completes no operation.
+ */
+static double
+run_multi_workers(const struct allocator *alloc, unsigned int n_workers,
+		void *const *scratches, unsigned int bulk_n)
+{
+	struct worker_args wargs[RTE_MAX_LCORE];
+	unsigned int worker_lcores[MAX_MULTI_LCORES];
+	lcore_function_t *fn = bulk_n > 0 ? worker_run_bulk : worker_run;
+	double sum_cycles_per_op = 0.0;
+	unsigned int n_launched = 0;
+	unsigned int n = 0;
+	unsigned int lcore_id;
+	unsigned int i;
+
+	/* Guards worker_lcores[] and the final division. */
+	if (n_workers == 0 || n_workers > RTE_DIM(worker_lcores))
+		return -1.0;
+
+	/* Collect the first n_workers worker lcores. */
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		if (n >= n_workers)
+			break;
+		worker_lcores[n++] = lcore_id;
+	}
+	if (n < n_workers)
+		return -1.0;
+
+	/* Prepare per-worker args. */
+	for (i = 0; i < n_workers; i++) {
+		struct worker_args *wa = &wargs[worker_lcores[i]];
+
+		wa->alloc = alloc;
+		wa->scratch = scratches != NULL ? scratches[i] : NULL;
+		wa->iters = MULTI_OPS;
+		wa->warmup = MULTI_WARMUP;
+		wa->bulk_n = bulk_n;
+		/*
+		 * Give the out-fields defined values: a worker that
+		 * never runs must not leave them uninitialized.
+		 */
+		wa->cycles = 0;
+		wa->ops = 0;
+		wa->err = 0;
+		rte_atomic_store_explicit(&wa->start_flag, false,
+				rte_memory_order_relaxed);
+	}
+
+	/* Launch workers. They spin on start_flag until released. */
+	for (i = 0; i < n_workers; i++) {
+		if (rte_eal_remote_launch(fn, &wargs[worker_lcores[i]],
+				worker_lcores[i]) != 0)
+			break;
+		n_launched++;
+	}
+
+	/*
+	 * Release whatever was launched roughly simultaneously. This is
+	 * done even after a failed launch: the workers already started
+	 * are spinning on start_flag and must be let go and collected.
+	 */
+	for (i = 0; i < n_launched; i++)
+		rte_atomic_store_explicit(
+			&wargs[worker_lcores[i]].start_flag, true,
+			rte_memory_order_release);
+
+	/* Wait for completion. */
+	for (i = 0; i < n_launched; i++)
+		rte_eal_wait_lcore(worker_lcores[i]);
+
+	if (n_launched < n_workers)
+		return -1.0;
+
+	/* Aggregate: mean cycles per op across workers. */
+	for (i = 0; i < n_workers; i++) {
+		const struct worker_args *wa = &wargs[worker_lcores[i]];
+
+		if (wa->err != 0 || wa->ops == 0)
+			return -1.0;
+		sum_cycles_per_op += (double)wa->cycles / (double)wa->ops;
+	}
+
+	return sum_cycles_per_op / n_workers;
+}
+
+/*
+ * One sub-run of the multi-lcore scenarios for a given allocator and
+ * worker count.
+ *
+ * First a baseline is taken: one scratch object per worker is
+ * allocated up front and the workers only run the dummy work on it
+ * (always the single-object worker, bulk_n = 0). The scratch objects
+ * are then released and the run is repeated with the workers
+ * allocating and freeing on every iteration, using 'bulk_n'.
+ *
+ * *baseline and *alloc_path receive the mean cycles per op of the
+ * two runs. Both stay at -1.0 if setup fails or a scratch object
+ * cannot be allocated; either may also be -1.0 if its run failed.
+ */
+static void
+run_multi_lcore(const struct allocator *alloc, unsigned int n_workers,
+		unsigned int bulk_n, double *baseline, double *alloc_path)
+{
+	void *scratches[MAX_MULTI_LCORES] = {0};
+	unsigned int n_scratch;
+	unsigned int idx;
+	bool have_all;
+
+	*baseline = -1.0;
+	*alloc_path = -1.0;
+
+	if (alloc->setup(MULTI_SIZE, n_workers * 64) < 0)
+		return;
+
+	/* One pre-allocated scratch buffer per worker for the baseline. */
+	for (n_scratch = 0; n_scratch < n_workers; n_scratch++) {
+		scratches[n_scratch] = alloc->alloc();
+		if (scratches[n_scratch] == NULL)
+			break;
+	}
+	have_all = (n_scratch == n_workers);
+
+	if (have_all)
+		*baseline = run_multi_workers(alloc, n_workers, scratches, 0);
+
+	/* Scratch buffers go back in allocation order on every path. */
+	for (idx = 0; idx < n_scratch; idx++)
+		alloc->free_obj(scratches[idx]);
+
+	/* Alloc path: workers alloc+free each iteration. */
+	if (have_all)
+		*alloc_path = run_multi_workers(alloc, n_workers, NULL,
+				bulk_n);
+
+	alloc->teardown();
+}
+
+/* Reporting -------------------------------------------------------- */
+
+/*
+ * Print the banner of a per-size table: the title line followed by
+ * a column header row with one column per entry of SIZES[].
+ */
+static void
+print_header(const char *title)
+{
+	size_t col = 0;
+
+	TEST_LOG("\n=== %s ===\n", title);
+	TEST_LOG("%-12s", "allocator");
+	while (col < N_SIZES) {
+		TEST_LOG(" %10zu B", SIZES[col]);
+		col++;
+	}
+	TEST_LOG("\n");
+}
+
+/*
+ * Print one row of a per-size table: the allocator name followed by
+ * N_SIZES cycle counts. A negative entry marks a failed run and is
+ * shown as "--".
+ */
+static void
+print_row(const char *name, const double *values)
+{
+	size_t col;
+
+	TEST_LOG("%-12s", name);
+	for (col = 0; col < N_SIZES; col++) {
+		const double cycles = values[col];
+
+		if (!(cycles < 0)) {
+			TEST_LOG(" %12.1f", cycles);
+		} else {
+			TEST_LOG(" %12s", "--");
+		}
+	}
+	TEST_LOG("\n");
+}
+
+/*
+ * Print the banner of a per-lcore-count table: the title line
+ * followed by a column header row with one column per entry of
+ * lcore_counts[0..n_counts-1] ("1 lcore", "2 lcores", ...).
+ */
+static void
+print_multi_header(const char *title, const unsigned int *lcore_counts,
+		unsigned int n_counts)
+{
+	unsigned int col;
+
+	TEST_LOG("\n=== %s ===\n", title);
+	TEST_LOG("%-12s", "allocator");
+	for (col = 0; col < n_counts; col++) {
+		const unsigned int count = lcore_counts[col];
+		/* Singular gets a space so the columns stay aligned. */
+		const int suffix = (count == 1) ? ' ' : 's';
+
+		TEST_LOG(" %8u lcore%c", count, suffix);
+	}
+	TEST_LOG("\n");
+}
+
+/*
+ * Print one row of a per-lcore-count table: the allocator name
+ * followed by n_counts values. A negative entry is shown as "--".
+ */
+static void
+print_multi_row(const char *name, const double *values, unsigned int n_counts)
+{
+	unsigned int col;
+
+	TEST_LOG("%-12s", name);
+	for (col = 0; col < n_counts; col++) {
+		const double cycles = values[col];
+
+		if (!(cycles < 0)) {
+			TEST_LOG(" %14.1f", cycles);
+		} else {
+			TEST_LOG(" %14s", "--");
+		}
+	}
+	TEST_LOG("\n");
+}
+
+/* Driver ----------------------------------------------------------- */
+
+/* Upper bound on the number of lcore-count columns in scenarios 4 and 5. */
+enum { MULTI_MAX_COLUMNS = 8 };
+
+/* Signature shared by run_batch() and run_batch_reverse(). */
+typedef void (*batch_fn_t)(const struct allocator *alloc, size_t size,
+		double *cycles_alloc, double *cycles_free);
+
+/*
+ * Run one batch scenario for every allocator and size and print two
+ * tables: cycles per alloc, then cycles per free. Each benchmark is
+ * run once and both tables are taken from that same run; the alloc
+ * rows are printed as the runs complete, the free rows afterwards.
+ */
+static void
+report_batch_scenario(const char *alloc_title, const char *free_title,
+		batch_fn_t run)
+{
+	double free_vals[N_ALLOCATORS][N_SIZES];
+	size_t a;
+	size_t i;
+
+	print_header(alloc_title);
+	for (a = 0; a < N_ALLOCATORS; a++) {
+		double alloc_vals[N_SIZES];
+
+		for (i = 0; i < N_SIZES; i++)
+			run(&allocators[a], SIZES[i],
+				&alloc_vals[i], &free_vals[a][i]);
+		print_row(allocators[a].name, alloc_vals);
+	}
+
+	print_header(free_title);
+	for (a = 0; a < N_ALLOCATORS; a++)
+		print_row(allocators[a].name, free_vals[a]);
+}
+
+/*
+ * Fill counts[] with the worker counts to sweep: 1, 2, 4, ... up to
+ * the number of available worker lcores (capped at MAX_MULTI_LCORES),
+ * plus that maximum itself as a final column when it is not a power
+ * of two. Returns the number of entries written, 0 if there is no
+ * worker lcore.
+ */
+static unsigned int
+build_lcore_sweep(unsigned int *counts, unsigned int capacity)
+{
+	unsigned int max_workers = rte_lcore_count() - 1;
+	unsigned int n = 0;
+	unsigned int w;
+
+	if (max_workers > MAX_MULTI_LCORES)
+		max_workers = MAX_MULTI_LCORES;
+
+	for (w = 1; w <= max_workers && n < capacity; w *= 2)
+		counts[n++] = w;
+	if (n > 0 && n < capacity && counts[n - 1] != max_workers)
+		counts[n++] = max_workers;
+
+	return n;
+}
+
+/*
+ * Run one multi-lcore scenario (bulk_n == 0: single-object, otherwise
+ * bulk) for every allocator and worker count, then print the baseline
+ * of the first allocator's first run and a table of allocator
+ * overhead, i.e. alloc-path cycles minus baseline cycles per op.
+ */
+static void
+report_multi_scenario(unsigned int scenario, const char *label,
+		const char *title, unsigned int bulk_n)
+{
+	unsigned int lcore_counts[MULTI_MAX_COLUMNS];
+	double delta_vals[N_ALLOCATORS][MULTI_MAX_COLUMNS];
+	double first_baseline = -1.0;
+	unsigned int n_counts;
+	unsigned int c;
+	size_t a;
+
+	n_counts = build_lcore_sweep(lcore_counts, RTE_DIM(lcore_counts));
+	if (n_counts == 0) {
+		TEST_LOG("\nScenario %u (%s) skipped: no worker lcores available.\n",
+			scenario, label);
+		return;
+	}
+
+	if (bulk_n > 0) {
+		TEST_LOG("\nScenario %u parameters: size=%u B, bulk=%u\n",
+			scenario, MULTI_SIZE, bulk_n);
+	} else {
+		TEST_LOG("\nScenario %u parameters: size=%u B\n",
+			scenario, MULTI_SIZE);
+	}
+
+	for (a = 0; a < N_ALLOCATORS; a++) {
+		for (c = 0; c < n_counts; c++) {
+			double base;
+			double with_alloc;
+
+			run_multi_lcore(&allocators[a], lcore_counts[c],
+					bulk_n, &base, &with_alloc);
+			if (a == 0 && c == 0)
+				first_baseline = base;
+
+			/* A failed run on either side voids the delta. */
+			if (base < 0 || with_alloc < 0)
+				delta_vals[a][c] = -1.0;
+			else
+				delta_vals[a][c] = with_alloc - base;
+		}
+	}
+
+	TEST_LOG("Baseline (domain logic only): %.1f cycles/op\n",
+			first_baseline);
+
+	print_multi_header(title, lcore_counts, n_counts);
+	for (a = 0; a < N_ALLOCATORS; a++)
+		print_multi_row(allocators[a].name, delta_vals[a], n_counts);
+}
+
+/*
+ * Perf test entry point: initializes fastmem, reserves 128 MiB, runs
+ * scenarios 1-5 for every entry of allocators[] and prints the
+ * result tables. Returns 0, or -1 if fastmem cannot be initialized
+ * or the reservation fails; individual benchmark failures only show
+ * up as "--" cells.
+ */
+static int
+test_fastmem_perf(void)
+{
+	size_t i;
+	size_t a;
+	int rc;
+
+	rc = rte_fastmem_init();
+	if (rc < 0) {
+		TEST_LOG("rte_fastmem_init() failed: %d\n", rc);
+		return -1;
+	}
+
+	rc = rte_fastmem_reserve(128 * 1024 * 1024, SOCKET_ID_ANY);
+	if (rc < 0) {
+		TEST_LOG("rte_fastmem_reserve() failed: %d\n", rc);
+		rte_fastmem_deinit();
+		return -1;
+	}
+
+	TEST_LOG("\nfastmem performance — single-lcore, fixed-size\n");
+	TEST_LOG("All numbers are TSC cycles.\n");
+
+	/* Scenario 1: tight alloc+free. */
+	print_header("Scenario 1: Single-object hot path — cycles per (alloc + free)");
+	for (a = 0; a < N_ALLOCATORS; a++) {
+		double vals[N_SIZES];
+
+		for (i = 0; i < N_SIZES; i++)
+			vals[i] = run_tight(&allocators[a], SIZES[i]);
+		print_row(allocators[a].name, vals);
+	}
+
+	/* Scenario 2: batched, FIFO free. */
+	report_batch_scenario(
+		"Scenario 2: Batch alloc, FIFO free — cycles per alloc",
+		"Scenario 2: Batch alloc, FIFO free — cycles per free",
+		run_batch);
+
+	/* Scenario 3: batched, reverse free. */
+	report_batch_scenario(
+		"Scenario 3: Batch alloc, LIFO free — cycles per alloc",
+		"Scenario 3: Batch alloc, LIFO free — cycles per free",
+		run_batch_reverse);
+
+	/* Scenario 4: multi-lcore alloc/work/free with baseline. */
+	report_multi_scenario(4, "Multi-lcore contention",
+		"Scenario 4: Multi-lcore contention — allocator overhead (cycles/op)",
+		0);
+
+	/* Scenario 5: multi-lcore bulk alloc/work/free. */
+	report_multi_scenario(5, "Multi-lcore bulk contention",
+		"Scenario 5: Multi-lcore bulk contention — allocator overhead (cycles/op)",
+		8);
+
+	TEST_LOG("\n");
+	rte_fastmem_deinit();
+	return 0;
+}
+
+REGISTER_PERF_TEST(fastmem_perf_autotest, test_fastmem_perf); /* presumably exposes test_fastmem_perf() as the "fastmem_perf_autotest" command */
diff --git a/app/test/test_fastmem_profile.c b/app/test/test_fastmem_profile.c
new file mode 100644
index 0000000000..9a5dc94018
--- /dev/null
+++ b/app/test/test_fastmem_profile.c
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Ericsson AB
+ */
+
+/*
+ * A minimal fastmem workload intended for use with perf record /
+ * perf report. Runs a tight alloc/free loop for a fixed duration
+ * so that sampling profilers can attribute cycles to individual
+ * functions and instructions within the fastmem hot path.
+ *
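+ * Usage (use fastmem_profile_cache_miss_autotest instead to profile
+ * the cache refill/drain workload):
+ *   perf record -g -- dpdk-test --no-huge --no-pci -m 8192 \
+ *       -l 0 <<< fastmem_profile_cache_hit_autotest
+ *   perf report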
+ */
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_lcore.h>
+#include <rte_memory.h>
+
+#include <rte_fastmem.h>
+
+#include "test.h"
+
+/* Duration of each sub-test in TSC cycles: 3 seconds' worth at the TSC rate reported by rte_get_tsc_hz(), which is called at every use. */
+#define PROFILE_DURATION_CYCLES (3ULL * rte_get_tsc_hz())
+
+/* Size argument passed to rte_fastmem_alloc() by both profiling workloads. */
+#define PROFILE_SIZE 256u
+
+/*
+ * Sub-test 1: tight alloc+free of a single PROFILE_SIZE object,
+ * repeated until PROFILE_DURATION_CYCLES TSC cycles have elapsed.
+ * Intended to stay within the per-lcore cache once warmed up (no
+ * bin interaction).
+ *
+ * Prints the number of completed alloc+free pairs and returns 0, or
+ * returns -1 as soon as an allocation fails.
+ */
+static int
+profile_cache_hit(void)
+{
+	const uint64_t deadline = rte_rdtsc() + PROFILE_DURATION_CYCLES;
+	uint64_t ops;
+
+	for (ops = 0; rte_rdtsc() < deadline; ops++) {
+		void *obj = rte_fastmem_alloc(PROFILE_SIZE, 0, 0);
+
+		if (obj == NULL)
+			return -1;
+		rte_fastmem_free(obj);
+	}
+
+	printf("  cache_hit: %" PRIu64 " ops\n", ops);
+	return 0;
+}
+
+/*
+ * Sub-test 2: alloc N then free N, where N (PROFILE_BATCH) is meant
+ * to exceed the cache capacity. This forces repeated cache refills
+ * and drains, exercising the bin lock and slab free-list traversal.
+ * Batches are repeated until PROFILE_DURATION_CYCLES TSC cycles have
+ * elapsed.
+ *
+ * Prints the number of objects cycled and returns 0. If an
+ * allocation fails, the objects already obtained for the current
+ * batch are freed and -1 is returned.
+ */
+#define PROFILE_BATCH 256u
+
+static int
+profile_cache_miss(void)
+{
+	void *ptrs[PROFILE_BATCH];
+	uint64_t deadline;
+	uint64_t ops = 0;
+	unsigned int i;
+
+	deadline = rte_rdtsc() + PROFILE_DURATION_CYCLES;
+
+	while (rte_rdtsc() < deadline) {
+		for (i = 0; i < PROFILE_BATCH; i++) {
+			ptrs[i] = rte_fastmem_alloc(PROFILE_SIZE, 0, 0);
+			if (ptrs[i] == NULL)
+				goto err;
+		}
+		for (i = 0; i < PROFILE_BATCH; i++)
+			rte_fastmem_free(ptrs[i]);
+		ops += PROFILE_BATCH;
+	}
+
+	printf("  cache_miss: %" PRIu64 " ops\n", ops);
+	return 0;
+err:
+	/* i is the slot that failed; ptrs[0..i-1] are still live. */
+	while (i > 0)
+		rte_fastmem_free(ptrs[--i]);
+	return -1;
+}
+
+/*
+ * Test entry point for the cache-hit profiling workload: brings
+ * fastmem up, reserves 128 MiB, runs profile_cache_hit() and tears
+ * fastmem down again. Returns 0 on success, -1 if initialization,
+ * the reservation or the workload fails.
+ */
+static int
+test_fastmem_profile_cache_hit(void)
+{
+	int result = -1;
+	int rc;
+
+	rc = rte_fastmem_init();
+	if (rc < 0) {
+		/* Nothing was brought up, so there is nothing to undo. */
+		printf("rte_fastmem_init() failed: %d\n", rc);
+		return -1;
+	}
+
+	rc = rte_fastmem_reserve(128 * 1024 * 1024, SOCKET_ID_ANY);
+	if (rc < 0) {
+		printf("rte_fastmem_reserve() failed: %d\n", rc);
+		goto out;
+	}
+
+	printf("fastmem profile: cache-hit workload (size=%u, ~%u s)\n",
+		PROFILE_SIZE, 3);
+
+	if (!(profile_cache_hit() < 0))
+		result = 0;
+out:
+	rte_fastmem_deinit();
+	return result;
+}
+
+/*
+ * Test entry point for the cache-miss profiling workload: brings
+ * fastmem up, reserves 128 MiB, runs profile_cache_miss() and tears
+ * fastmem down again. Returns 0 on success, -1 if initialization,
+ * the reservation or the workload fails.
+ */
+static int
+test_fastmem_profile_cache_miss(void)
+{
+	int result = -1;
+	int rc;
+
+	rc = rte_fastmem_init();
+	if (rc < 0) {
+		/* Nothing was brought up, so there is nothing to undo. */
+		printf("rte_fastmem_init() failed: %d\n", rc);
+		return -1;
+	}
+
+	rc = rte_fastmem_reserve(128 * 1024 * 1024, SOCKET_ID_ANY);
+	if (rc < 0) {
+		printf("rte_fastmem_reserve() failed: %d\n", rc);
+		goto out;
+	}
+
+	printf("fastmem profile: cache-miss workload (size=%u, ~%u s)\n",
+		PROFILE_SIZE, 3);
+
+	if (!(profile_cache_miss() < 0))
+		result = 0;
+out:
+	rte_fastmem_deinit();
+	return result;
+}
+
+REGISTER_PERF_TEST(fastmem_profile_cache_hit_autotest, /* presumably the command name typed at the dpdk-test prompt */
+		test_fastmem_profile_cache_hit);
+REGISTER_PERF_TEST(fastmem_profile_cache_miss_autotest, /* same, for the refill/drain workload */
+		test_fastmem_profile_cache_miss);
-- 
2.43.0
Previous message (by thread): [RFC v4 2/3] lib: add fastmem library
Next message (by thread): [RFC v3 2/3] lib: add fastmem library
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
More information about the dev mailing list