[dpdk-dev] [RFC v2 16/23] eal: enable dynamic memory allocation/free on malloc/free

Anatoly Burakov anatoly.burakov at intel.com
Tue Dec 19 12:14:43 CET 2017


This set of changes enables rte_malloc to allocate and free memory
as needed. It works as follows: malloc first checks whether there is
enough memory already allocated to satisfy the user's request. If
there isn't, we try to allocate more memory. The reverse happens on
free: we free an element, check its size (after merging any adjacent
free elements), and see whether it is at least a hugepage in size and
whether its start and end span one or more full hugepages. If so, we
remove that area from the malloc heap (adjusting element lengths where
appropriate) and deallocate the pages.
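
As an illustration of the free path, here is a minimal sketch (not the
patch code itself; the helper name can_release_pages is hypothetical)
of the check that decides whether any pages can be handed back to the
system:

#include <stdbool.h>
#include <stddef.h>
#include <rte_common.h> /* RTE_PTR_ADD, RTE_PTR_ALIGN_*, RTE_PTR_DIFF */

/* Illustrative helper, not part of this patch: given a merged free
 * element [elem_start, elem_start + elem_len) and the hugepage size of
 * its memseg list, report the page-aligned span (if any) that could be
 * returned to the system.
 */
static bool
can_release_pages(void *elem_start, size_t elem_len, size_t page_sz,
		void **out_start, size_t *out_len)
{
	void *end = RTE_PTR_ADD(elem_start, elem_len);
	void *aligned_start = RTE_PTR_ALIGN_CEIL(elem_start, page_sz);
	void *aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz);
	size_t aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);

	/* need at least one whole hugepage fully inside the free element */
	if (aligned_len < page_sz)
		return false;

	*out_start = aligned_start;
	*out_len = aligned_len;
	return true;
}

The real logic is in malloc_heap_free() in the diff below.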

For legacy mode, dynamic alloc/free is disabled.

It is worth noting that memseg lists are sorted by page size, and
that we try our best to satisfy the user's request. That is, if the
user requests an element from 2MB-page memory, we first check whether
we can satisfy that request from existing memory; if not, we try to
allocate more 2MB pages. If that fails and the user also specified the
"size is hint" flag (RTE_MEMZONE_SIZE_HINT_ONLY), we then check other
page sizes and try to allocate from those. If that fails too, then,
depending on flags, we may try allocating from other sockets. In other
words, we try our best to give the user what they asked for, but going
to other sockets is a last resort - first we try to allocate more
memory on the same socket.
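
From the caller's point of view, this policy is exercised through the
existing memzone flags. A usage sketch (not part of this patch; the
zone name and sizes are arbitrary):

#include <rte_memzone.h>
#include <rte_memory.h> /* SOCKET_ID_ANY */

/* Ask for 4MB preferably backed by 2MB pages. RTE_MEMZONE_SIZE_HINT_ONLY
 * lets the allocator fall back to other page sizes, and SOCKET_ID_ANY
 * lets it fall back to other sockets as a last resort.
 */
static const struct rte_memzone *
reserve_example(void)
{
	return rte_memzone_reserve("example_mz", 4 * 1024 * 1024,
			SOCKET_ID_ANY,
			RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY);
}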

Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
---
 lib/librte_eal/common/eal_common_memzone.c |  19 +-
 lib/librte_eal/common/malloc_elem.c        |  96 +++++++++-
 lib/librte_eal/common/malloc_elem.h        |   8 +-
 lib/librte_eal/common/malloc_heap.c        | 280 +++++++++++++++++++++++++++--
 lib/librte_eal/common/malloc_heap.h        |   4 +-
 lib/librte_eal/common/rte_malloc.c         |  24 +--
 6 files changed, 373 insertions(+), 58 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_memzone.c b/lib/librte_eal/common/eal_common_memzone.c
index f558ac2..c571145 100644
--- a/lib/librte_eal/common/eal_common_memzone.c
+++ b/lib/librte_eal/common/eal_common_memzone.c
@@ -132,7 +132,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 	struct rte_memzone *mz;
 	struct rte_mem_config *mcfg;
 	size_t requested_len;
-	int socket, i;
+	int socket;
 
 	/* get pointer to global configuration */
 	mcfg = rte_eal_get_configuration()->mem_config;
@@ -216,21 +216,8 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
 		socket = socket_id;
 
 	/* allocate memory on heap */
-	void *mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[socket], NULL,
-			requested_len, flags, align, bound);
-
-	if ((mz_addr == NULL) && (socket_id == SOCKET_ID_ANY)) {
-		/* try other heaps */
-		for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-			if (socket == i)
-				continue;
-
-			mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[i],
-					NULL, requested_len, flags, align, bound);
-			if (mz_addr != NULL)
-				break;
-		}
-	}
+	void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket, flags,
+			align, bound);
 
 	if (mz_addr == NULL) {
 		rte_errno = ENOMEM;
diff --git a/lib/librte_eal/common/malloc_elem.c b/lib/librte_eal/common/malloc_elem.c
index ab09b94..48ac604 100644
--- a/lib/librte_eal/common/malloc_elem.c
+++ b/lib/librte_eal/common/malloc_elem.c
@@ -269,8 +269,8 @@ malloc_elem_free_list_insert(struct malloc_elem *elem)
 /*
  * Remove the specified element from its heap's free list.
  */
-static void
-elem_free_list_remove(struct malloc_elem *elem)
+void
+malloc_elem_free_list_remove(struct malloc_elem *elem)
 {
 	LIST_REMOVE(elem, free_list);
 }
@@ -290,7 +290,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
 	const size_t trailer_size = elem->size - old_elem_size - size -
 		MALLOC_ELEM_OVERHEAD;
 
-	elem_free_list_remove(elem);
+	malloc_elem_free_list_remove(elem);
 
 	if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
 		/* split it, too much free space after elem */
@@ -363,7 +363,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem) {
 		erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN);
 
 		/* remove from free list, join to this one */
-		elem_free_list_remove(elem->next);
+		malloc_elem_free_list_remove(elem->next);
 		join_elem(elem, elem->next);
 
 		/* erase header and trailer */
@@ -383,7 +383,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem) {
 		erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN);
 
 		/* remove from free list, join to this one */
-		elem_free_list_remove(elem->prev);
+		malloc_elem_free_list_remove(elem->prev);
 
 		new_elem = elem->prev;
 		join_elem(new_elem, elem);
@@ -402,7 +402,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem) {
  * blocks either immediately before or immediately after newly freed block
  * are also free, the blocks are merged together.
  */
-int
+struct malloc_elem *
 malloc_elem_free(struct malloc_elem *elem)
 {
 	void *ptr;
@@ -420,7 +420,87 @@ malloc_elem_free(struct malloc_elem *elem)
 
 	memset(ptr, 0, data_len);
 
-	return 0;
+	return elem;
+}
+
+/* assume all checks were already done */
+void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len) {
+	size_t len_before, len_after;
+	struct malloc_elem *prev, *next;
+	void *end, *elem_end;
+
+	end = RTE_PTR_ADD(start, len);
+	elem_end = RTE_PTR_ADD(elem, elem->size);
+	len_before = RTE_PTR_DIFF(start, elem);
+	len_after = RTE_PTR_DIFF(elem_end, end);
+
+	prev = elem->prev;
+	next = elem->next;
+
+	if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+		/* split after */
+		struct malloc_elem *split_after = end;
+
+		split_elem(elem, split_after);
+
+		next = split_after;
+
+		malloc_elem_free_list_insert(split_after);
+	} else if (len_after >= MALLOC_ELEM_HEADER_LEN) {
+		struct malloc_elem *pad_elem = end;
+
+		/* shrink current element */
+		elem->size -= len_after;
+		memset(pad_elem, 0, sizeof(*pad_elem));
+
+		/* copy next element's data to our pad */
+		memcpy(pad_elem, next, sizeof(*pad_elem));
+
+		/* pad next element */
+		next->state = ELEM_PAD;
+		next->pad = len_after;
+
+		/* next element is busy, would've been merged otherwise */
+		pad_elem->pad = len_after;
+		pad_elem->size += len_after;
+	} else if (len_after > 0) {
+		rte_panic("Unaligned element, heap is probably corrupt\n");
+	}
+
+	if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+		/* split before */
+		struct malloc_elem *split_before = start;
+
+		split_elem(elem, split_before);
+
+		prev = elem;
+		elem = split_before;
+
+		malloc_elem_free_list_insert(prev);
+	} else if (len_before > 0) {
+		/*
+		 * unlike with elements after current, here we don't need to
+		 * pad elements, but rather just increase the size of previous
+		 * element, copy the old header and set up the trailer.
+		 */
+		void *trailer = RTE_PTR_ADD(prev,
+				prev->size - MALLOC_ELEM_TRAILER_LEN);
+		struct malloc_elem *new_elem = start;
+
+		memcpy(new_elem, elem, sizeof(*elem));
+		new_elem->size -= len_before;
+
+		prev->size += len_before;
+		set_trailer(prev);
+
+		elem = new_elem;
+
+		/* erase old trailer */
+		memset(trailer, 0, MALLOC_ELEM_TRAILER_LEN);
+	}
+
+	remove_elem(elem);
 }
 
 /*
@@ -446,7 +526,7 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
 	/* we now know the element fits, so remove from free list,
 	 * join the two
 	 */
-	elem_free_list_remove(elem->next);
+	malloc_elem_free_list_remove(elem->next);
 	join_elem(elem, elem->next);
 
 	if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) {
diff --git a/lib/librte_eal/common/malloc_elem.h b/lib/librte_eal/common/malloc_elem.h
index 330bddc..b47c55e 100644
--- a/lib/librte_eal/common/malloc_elem.h
+++ b/lib/librte_eal/common/malloc_elem.h
@@ -164,7 +164,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
  * blocks either immediately before or immediately after newly freed block
  * are also free, the blocks are merged together.
  */
-int
+struct malloc_elem *
 malloc_elem_free(struct malloc_elem *elem);
 
 struct malloc_elem *
@@ -177,6 +177,12 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem);
 int
 malloc_elem_resize(struct malloc_elem *elem, size_t size);
 
+void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len);
+
+void
+malloc_elem_free_list_remove(struct malloc_elem *elem);
+
 /*
  * Given an element size, compute its freelist index.
  */
diff --git a/lib/librte_eal/common/malloc_heap.c b/lib/librte_eal/common/malloc_heap.c
index 5fa21fe..0d61704 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -49,8 +49,10 @@
 #include <rte_spinlock.h>
 #include <rte_memcpy.h>
 #include <rte_atomic.h>
+#include <rte_fbarray.h>
 
 #include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
 #include "malloc_elem.h"
 #include "malloc_heap.h"
 
@@ -151,46 +153,304 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
  * scan fails. Once the new memseg is added, it re-scans and should return
  * the new element after releasing the lock.
  */
-void *
-malloc_heap_alloc(struct malloc_heap *heap,
-		const char *type __attribute__((unused)), size_t size, unsigned flags,
-		size_t align, size_t bound)
+static void *
+heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
+		unsigned flags, size_t align, size_t bound)
 {
 	struct malloc_elem *elem;
 
 	size = RTE_CACHE_LINE_ROUNDUP(size);
 	align = RTE_CACHE_LINE_ROUNDUP(align);
 
-	rte_spinlock_lock(&heap->lock);
-
 	elem = find_suitable_element(heap, size, flags, align, bound);
 	if (elem != NULL) {
 		elem = malloc_elem_alloc(elem, size, align, bound);
+
 		/* increase heap's count of allocated elements */
 		heap->alloc_count++;
 	}
-	rte_spinlock_unlock(&heap->lock);
 
 	return elem == NULL ? NULL : (void *)(&elem[1]);
 }
 
+static void *
+try_expand_heap(struct malloc_heap *heap, struct rte_memseg_list *msl,
+		const char *type, size_t size, int socket, unsigned flags,
+		size_t align, size_t bound) {
+	struct malloc_elem *elem;
+	struct rte_memseg **ms;
+	size_t map_len;
+	void *map_addr;
+	int i, n_pages, allocd_pages;
+	void *ret;
+
+	align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
+	map_len = RTE_ALIGN_CEIL(align + size + MALLOC_ELEM_TRAILER_LEN,
+			msl->hugepage_sz);
+
+	n_pages = map_len / msl->hugepage_sz;
+
+	/* we can't know in advance how many pages we'll need, so malloc */
+	ms = malloc(sizeof(*ms) * n_pages);
+
+	allocd_pages = eal_memalloc_alloc_page_bulk(ms, n_pages,
+			msl->hugepage_sz, socket, true);
+
+	/* make sure we've allocated our pages... */
+	if (allocd_pages != n_pages)
+		goto free_ms;
+
+	map_addr = ms[0]->addr;
+
+	/* add newly minted memsegs to malloc heap */
+	elem = malloc_heap_add_memory(heap, msl, map_addr, map_len);
+
+	RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+		msl->socket_id, map_len >> 20ULL);
+
+	/* try once more, as now we have allocated new memory */
+	ret = heap_alloc(heap, type, size, flags,
+			align == 0 ? 1 : align, bound);
+
+	if (ret == NULL)
+		goto free_elem;
+
+	free(ms);
+	return ret;
+
+free_elem:
+	malloc_elem_free_list_remove(elem);
+	malloc_elem_hide_region(elem, map_addr, map_len);
+	heap->total_size -= map_len;
+
+	RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate, so shrinking heap on socket %d by %zdMB\n",
+		__func__, socket, map_len >> 20ULL);
+
+	for (i = 0; i < n_pages; i++) {
+		eal_memalloc_free_page(ms[i]);
+	}
+free_ms:
+	free(ms);
+	return NULL;
+}
+
+static int
+compare_pagesz(const void *a, const void *b) {
+	const struct rte_memseg_list *msla = a;
+	const struct rte_memseg_list *mslb = b;
+
+	if (msla->hugepage_sz < mslb->hugepage_sz)
+		return 1;
+	if (msla->hugepage_sz > mslb->hugepage_sz)
+		return -1;
+	return 0;
+}
+
+/* this will try lower page sizes first */
+static void *
+heap_alloc_on_socket(const char *type, size_t size, int socket,
+		unsigned flags, size_t align, size_t bound) {
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+	struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
+	struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
+	int i, n_other_msls = 0, n_requested_msls = 0;
+	bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0;
+	unsigned size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+	void *ret;
+
+	rte_spinlock_lock(&(heap->lock));
+
+	/* for legacy mode, try once and with all flags */
+	if (internal_config.legacy_mem) {
+		ret = heap_alloc(heap, type, size, flags,
+				align == 0 ? 1 : align, bound);
+		goto alloc_unlock;
+	}
+
+	/*
+	 * we do not pass the size hint here, because even if allocation fails,
+	 * we may still be able to allocate memory from appropriate page sizes,
+	 * we just need to request more memory first.
+	 */
+	ret = heap_alloc(heap, type, size, size_flags, align == 0 ? 1 : align,
+			bound);
+	if (ret != NULL)
+		goto alloc_unlock;
+
+	memset(requested_msls, 0, sizeof(requested_msls));
+	memset(other_msls, 0, sizeof(other_msls));
+
+	/*
+	 * go through memseg list and take note of all the page sizes available,
+	 * and if any of them were specifically requested by the user.
+	 */
+	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+		struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+		if (msl->socket_id != socket)
+			continue;
+
+		if (msl->base_va == NULL)
+			continue;
+
+		/* if pages of specific size were requested */
+		if (size_flags != 0 && check_hugepage_sz(size_flags,
+				msl->hugepage_sz)) {
+			requested_msls[n_requested_msls++] = msl;
+		} else if (size_flags == 0 || size_hint) {
+			other_msls[n_other_msls++] = msl;
+		}
+	}
+
+	/* sort the lists, smallest first */
+	qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]),
+			compare_pagesz);
+	qsort(other_msls, n_other_msls, sizeof(other_msls[0]),
+			compare_pagesz);
+
+	for (i = 0; i < n_requested_msls; i++) {
+		struct rte_memseg_list *msl = requested_msls[i];
+
+		/*
+		 * do not pass the size hint here, as user expects other page
+		 * sizes first, before resorting to best effort allocation.
+		 */
+		ret = try_expand_heap(heap, msl, type, size, socket, size_flags,
+				align, bound);
+		if (ret != NULL)
+			goto alloc_unlock;
+	}
+	if (n_other_msls == 0)
+		goto alloc_unlock;
+
+	/* now, try reserving with size hint */
+	ret = heap_alloc(heap, type, size, flags, align == 0 ? 1 : align,
+			bound);
+	if (ret != NULL)
+		goto alloc_unlock;
+
+	/*
+	 * we still couldn't reserve memory, so try expanding heap with other
+	 * page sizes, if there are any
+	 */
+	for (i = 0; i < n_other_msls; i++) {
+		struct rte_memseg_list *msl = other_msls[i];
+
+		ret = try_expand_heap(heap, msl, type, size, socket, flags,
+				align, bound);
+		if (ret != NULL)
+			goto alloc_unlock;
+	}
+alloc_unlock:
+	rte_spinlock_unlock(&(heap->lock));
+	return ret;
+}
+
+void *
+malloc_heap_alloc(const char *type, size_t size, int socket_arg, unsigned flags,
+		size_t align, size_t bound) {
+	int socket, i;
+	void *ret;
+
+	/* return NULL if size is 0 or alignment is not power-of-2 */
+	if (size == 0 || (align && !rte_is_power_of_2(align)))
+		return NULL;
+
+	if (!rte_eal_has_hugepages())
+		socket_arg = SOCKET_ID_ANY;
+
+	if (socket_arg == SOCKET_ID_ANY)
+		socket = malloc_get_numa_socket();
+	else
+		socket = socket_arg;
+
+	/* Check socket parameter */
+	if (socket >= RTE_MAX_NUMA_NODES)
+		return NULL;
+
+	/* TODO: add warning for alignments bigger than page size if not VFIO */
+
+	ret = heap_alloc_on_socket(type, size, socket, flags, align, bound);
+	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
+		return ret;
+
+	/* try other heaps */
+	for (i = 0; i < (int) rte_num_sockets(); i++) {
+		if (i == socket)
+			continue;
+		ret = heap_alloc_on_socket(type, size, i, flags,
+				align, bound);
+		if (ret != NULL)
+			return ret;
+	}
+	return NULL;
+}
+
 int
 malloc_heap_free(struct malloc_elem *elem) {
 	struct malloc_heap *heap;
-	int ret;
+	void *start, *aligned_start, *end, *aligned_end;
+	size_t len, aligned_len;
+	const struct rte_memseg_list *msl;
+	int n_pages, page_idx, max_page_idx, ret;
 
 	if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
 		return -1;
 
 	/* elem may be merged with previous element, so keep heap address */
 	heap = elem->heap;
+	msl = elem->msl;
 
 	rte_spinlock_lock(&(heap->lock));
 
-	ret = malloc_elem_free(elem);
+	elem = malloc_elem_free(elem);
 
-	rte_spinlock_unlock(&(heap->lock));
+	/* anything after this is a bonus */
+	ret = 0;
+
+	/* ...of which we can't avail if we are in legacy mode */
+	if (internal_config.legacy_mem)
+		goto free_unlock;
+
+	/* check if we can free any memory back to the system */
+	if (elem->size < msl->hugepage_sz)
+		goto free_unlock;
+
+	/* probably, but let's make sure, as we may not be using up full page */
+	start = elem;
+	len = elem->size;
+	aligned_start = RTE_PTR_ALIGN_CEIL(start, msl->hugepage_sz);
+	end = RTE_PTR_ADD(elem, len);
+	aligned_end = RTE_PTR_ALIGN_FLOOR(end, msl->hugepage_sz);
 
+	aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
+
+	/* can't free anything */
+	if (aligned_len < msl->hugepage_sz)
+		goto free_unlock;
+
+	malloc_elem_free_list_remove(elem);
+
+	malloc_elem_hide_region(elem, aligned_start, aligned_len);
+
+	/* we don't really care if we fail to deallocate memory */
+	n_pages = aligned_len / msl->hugepage_sz;
+	page_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / msl->hugepage_sz;
+	max_page_idx = page_idx + n_pages;
+
+	for (; page_idx < max_page_idx; page_idx++) {
+		struct rte_memseg *ms;
+
+		ms = rte_fbarray_get(&msl->memseg_arr, page_idx);
+		eal_memalloc_free_page(ms);
+		heap->total_size -= msl->hugepage_sz;
+	}
+
+	RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
+		msl->socket_id, aligned_len >> 20ULL);
+free_unlock:
+	rte_spinlock_unlock(&(heap->lock));
 	return ret;
 }
 
diff --git a/lib/librte_eal/common/malloc_heap.h b/lib/librte_eal/common/malloc_heap.h
index df04dd8..3fcd14f 100644
--- a/lib/librte_eal/common/malloc_heap.h
+++ b/lib/librte_eal/common/malloc_heap.h
@@ -53,8 +53,8 @@ malloc_get_numa_socket(void)
 }
 
 void *
-malloc_heap_alloc(struct malloc_heap *heap,	const char *type, size_t size,
-		unsigned flags, size_t align, size_t bound);
+malloc_heap_alloc(const char *type, size_t size, int socket, unsigned flags,
+		size_t align, size_t bound);
 
 int
 malloc_heap_free(struct malloc_elem *elem);
diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c
index 92cd7d8..dc3199a 100644
--- a/lib/librte_eal/common/rte_malloc.c
+++ b/lib/librte_eal/common/rte_malloc.c
@@ -68,9 +68,7 @@ void rte_free(void *addr)
 void *
 rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
 {
-	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-	int socket, i;
-	void *ret;
+	int socket;
 
 	/* return NULL if size is 0 or alignment is not power-of-2 */
 	if (size == 0 || (align && !rte_is_power_of_2(align)))
@@ -88,24 +86,8 @@ rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
 	if (socket >= RTE_MAX_NUMA_NODES)
 		return NULL;
 
-	ret = malloc_heap_alloc(&mcfg->malloc_heaps[socket], type,
-				size, 0, align == 0 ? 1 : align, 0);
-	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
-		return ret;
-
-	/* try other heaps */
-	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-		/* we already tried this one */
-		if (i == socket)
-			continue;
-
-		ret = malloc_heap_alloc(&mcfg->malloc_heaps[i], type,
-					size, 0, align == 0 ? 1 : align, 0);
-		if (ret != NULL)
-			return ret;
-	}
-
-	return NULL;
+	return malloc_heap_alloc(type, size, socket_arg, 0,
+			align == 0 ? 1 : align, 0);
 }
 
 /*
-- 
2.7.4


