[dpdk-dev] [PATCH 26/41] eal: prepare memseg lists for multiprocess sync
    Anatoly Burakov 
    anatoly.burakov at intel.com
       
    Sat Mar  3 14:46:14 CET 2018
    
    
  
In preparation for implementing multiprocess support, we are adding
a version number and write locks to memseg lists.
There are two ways of implementing multiprocess support for memory
hotplug: either all information about mapped memory is shared
between processes, and secondary processes simply attempt to
map/unmap memory based on requests from the primary, or secondary
processes store their own maps and only check if they are in sync
with the primary process' maps.
This implementation will opt for the latter option: primary process
shared mappings will be authoritative, and each secondary process
will use its own internal view of mapped memory, and will attempt
to synchronize on these mappings using versioning.
Under this model, only primary process will decide which pages get
mapped, and secondary processes will only copy primary's page
maps and get notified of the changes via IPC mechanism (coming
in later commits).
To avoid race conditions, memseg lists will also have write locks -
that is, it will be possible for several secondary processes to
initialize concurrently, but it will not be possible for several
processes to request memory allocation unless all other allocations
were complete (on a single socket - it is OK to allocate/free memory
on different sockets concurrently).
In principle, it is possible for multiple processes to request
allocation/deallocation on multiple sockets, but we will only allow
one such request to be active at any one time.
Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
---
 lib/librte_eal/bsdapp/eal/eal_memalloc.c          |   7 +
 lib/librte_eal/common/eal_memalloc.h              |   4 +
 lib/librte_eal/common/include/rte_eal_memconfig.h |   2 +
 lib/librte_eal/linuxapp/eal/eal_memalloc.c        | 288 +++++++++++++++++++++-
 4 files changed, 295 insertions(+), 6 deletions(-)
diff --git a/lib/librte_eal/bsdapp/eal/eal_memalloc.c b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
index be8340b..255aedc 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memalloc.c
@@ -24,3 +24,10 @@ eal_memalloc_alloc_page(uint64_t __rte_unused size, int __rte_unused socket)
 	RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
 	return NULL;
 }
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+	RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+	return -1; /* unconditional failure on this platform */
+}
diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
index 08ba70e..beac296 100644
--- a/lib/librte_eal/common/eal_memalloc.h
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -24,4 +24,8 @@ bool
 eal_memalloc_is_contig(struct rte_memseg_list *msl, void *start,
 		size_t len);
 
+/* sync local memory map to primary process; 0 on success, -1 on error */
+int
+eal_memalloc_sync_with_primary(void);
+
 #endif // EAL_MEMALLOC_H
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index b6bdb21..d653d57 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -32,6 +32,8 @@ struct rte_memseg_list {
 	};
 	int socket_id; /**< Socket ID for all memsegs in this list. */
 	uint64_t hugepage_sz; /**< page size for all memsegs in this list. */
+	rte_rwlock_t mplock; /**< read-write lock for multiprocess sync. */
+	uint32_t version; /**< version number for multiprocess sync. */
 	struct rte_fbarray memseg_arr;
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index c03e7bc..227d703 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -65,6 +65,9 @@ static struct msl_entry_list msl_entry_list =
 		TAILQ_HEAD_INITIALIZER(msl_entry_list);
 static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;
 
+/** local copy of a memory map, used to synchronize memory hotplug in MP */
+static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
+
 static sigjmp_buf huge_jmpenv;
 
 static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
@@ -619,11 +622,14 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
 			continue;
 		msl = cur_msl;
 
+		/* lock memseg list */
+		rte_rwlock_write_lock(&msl->mplock);
+
 		/* try finding space in memseg list */
 		cur_idx = rte_fbarray_find_next_n_free(&msl->memseg_arr, 0, n);
 
 		if (cur_idx < 0)
-			continue;
+			goto next_list;
 
 		end_idx = cur_idx + n;
 		start_idx = cur_idx;
@@ -637,7 +643,6 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
 
 			if (alloc_page(cur, addr, size, socket, hi, msl_idx,
 					cur_idx)) {
-
 				RTE_LOG(DEBUG, EAL, "attempted to allocate %i pages, but only %i were allocated\n",
 					n, i);
 
@@ -648,7 +653,7 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
 				 */
 				if (!exact) {
 					ret = i;
-					goto restore_numa;
+					goto success;
 				}
 				RTE_LOG(DEBUG, EAL, "exact amount of pages was requested, so returning %i allocated pages\n",
 					i);
@@ -680,10 +685,13 @@ eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
 		}
 		/* we allocated all pages */
 		ret = n;
+success:
+		msl->version++;
+		rte_rwlock_write_unlock(&msl->mplock);
 
 		break;
 next_list:
-		/* dummy semi-colon to make label work */;
+		rte_rwlock_write_unlock(&msl->mplock);
 	}
 	/* we didn't break */
 	if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
@@ -716,7 +724,7 @@ eal_memalloc_free_page(struct rte_memseg *ms)
 	struct rte_memseg_list *msl = NULL;
 	unsigned int msl_idx, seg_idx;
 	struct hugepage_info *hi = NULL;
-	int i;
+	int ret, i;
 
 	/* dynamic free not supported in legacy mode */
 	if (internal_config.legacy_mem)
@@ -753,6 +761,274 @@ eal_memalloc_free_page(struct rte_memseg *ms)
 		RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
 		return -1;
 	}
+	rte_rwlock_write_lock(&msl->mplock);
+
 	rte_fbarray_set_free(&msl->memseg_arr, seg_idx);
-	return free_page(ms, hi, msl_idx, seg_idx);
+
+	/* increment version number */
+	msl->version++;
+
+	ret = free_page(ms, hi, msl_idx, seg_idx);
+
+	rte_rwlock_write_unlock(&msl->mplock);
+
+	return ret;
+}
+
+static int
+sync_chunk(struct rte_memseg_list *primary_msl,
+		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+		unsigned int msl_idx, bool used, int start, int end)
+{
+	struct rte_fbarray *l_arr, *p_arr;
+	int i, ret, chunk_len, diff_len;
+
+	l_arr = &local_msl->memseg_arr;
+	p_arr = &primary_msl->memseg_arr;
+
+	/* sync local pages in [start, end) with the primary: allocate them
+	 * if 'used' is set, free them otherwise. only the first contiguous
+	 * run is handled per call. returns how many pages the caller may
+	 * advance by, or -1 on failure.
+	 * changes are aggregated into bigger chunks to avoid spamming the
+	 * user with per-page callbacks; deallocation callbacks should fire
+	 * *before* memory goes away, so the application can wrap up its use.
+	 */
+
+	chunk_len = end - start;
+
+	/* find how many contiguous pages we can map/unmap for this chunk */
+	diff_len = used ?
+			rte_fbarray_find_contig_free(l_arr, start) :
+			rte_fbarray_find_contig_used(l_arr, start);
+
+	/* has to be at least one page */
+	if (diff_len < 1)
+		return -1;
+
+	diff_len = RTE_MIN(chunk_len, diff_len);
+
+	for (i = 0; i < diff_len; i++) {
+		struct rte_memseg *p_ms, *l_ms;
+		int seg_idx = start + i;
+
+		l_ms = rte_fbarray_get(l_arr, seg_idx);
+		p_ms = rte_fbarray_get(p_arr, seg_idx);
+
+		if (l_ms == NULL || p_ms == NULL)
+			return -1;
+
+		if (used) {
+			ret = alloc_page(l_ms, p_ms->addr,
+					p_ms->hugepage_sz,
+					p_ms->socket_id, hi,
+					msl_idx, seg_idx);
+			if (ret < 0)
+				return -1;
+			rte_fbarray_set_used(l_arr, seg_idx);
+		} else {
+			ret = free_page(l_ms, hi, msl_idx, seg_idx);
+			if (ret < 0)
+				return -1;
+			rte_fbarray_set_free(l_arr, seg_idx);
+		}
+	}
+
+	/* calculate how much we can advance until next chunk */
+	diff_len = used ?
+			rte_fbarray_find_contig_used(l_arr, start) :
+			rte_fbarray_find_contig_free(l_arr, start);
+	ret = RTE_MIN(chunk_len, diff_len);
+
+	return ret;
+}
+
+static int
+sync_status(struct rte_memseg_list *primary_msl,
+		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+		unsigned int msl_idx, bool used)
+{
+	struct rte_fbarray *l_arr, *p_arr;
+	int p_idx, l_chunk_len, p_chunk_len, ret;
+	int start, end;
+
+	/* this is a little bit tricky, but the basic idea is - walk both lists
+	 * and spot any places where there are discrepancies. walking both lists
+	 * and noting discrepancies in a single go is a hard problem, so we do
+	 * it in two passes - first we spot any places where allocated segments
+	 * mismatch (i.e. ensure that everything that's allocated in the primary
+	 * is also allocated in the secondary), and then we do it by looking at
+	 * free segments instead.
+	 *
+	 * we also need to aggregate changes into chunks, as we have to call
+	 * callbacks per allocation, not per page.
+	 */
+	l_arr = &local_msl->memseg_arr;
+	p_arr = &primary_msl->memseg_arr;
+
+	if (used)
+		p_idx = rte_fbarray_find_next_used(p_arr, 0);
+	else
+		p_idx = rte_fbarray_find_next_free(p_arr, 0);
+
+	while (p_idx >= 0) {
+		int next_chunk_search_idx;
+
+		if (used) {
+			p_chunk_len = rte_fbarray_find_contig_used(p_arr,
+					p_idx);
+			l_chunk_len = rte_fbarray_find_contig_used(l_arr,
+					p_idx);
+		} else {
+			p_chunk_len = rte_fbarray_find_contig_free(p_arr,
+					p_idx);
+			l_chunk_len = rte_fbarray_find_contig_free(l_arr,
+					p_idx);
+		}
+		/* best case - no differences (or local chunk is longer, which
+		 * the pass over the opposite state fixes), look for next chunk
+		 */
+		if (l_chunk_len >= p_chunk_len) {
+			next_chunk_search_idx = p_idx + p_chunk_len;
+			goto next_chunk;
+		}
+
+		/* if both chunks start at the same point, skip parts we know
+		 * are identical, and sync the rest. each call to sync_chunk
+		 * will only sync contiguous segments, so we need to call this
+		 * until we are sure there are no more differences in this
+		 * chunk.
+		 */
+		start = p_idx + l_chunk_len;
+		end = p_idx + p_chunk_len;
+		do {
+			ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
+					used, start, end);
+			start += ret;
+		} while (start < end && ret >= 0);
+		/* if ret is negative, something went wrong */
+		if (ret < 0)
+			return -1;
+
+		next_chunk_search_idx = p_idx + p_chunk_len;
+next_chunk:
+		/* skip to end of this chunk */
+		if (used) {
+			p_idx = rte_fbarray_find_next_used(p_arr,
+					next_chunk_search_idx);
+		} else {
+			p_idx = rte_fbarray_find_next_free(p_arr,
+					next_chunk_search_idx);
+		}
+	}
+	return 0;
+}
+
+static int
+sync_existing(struct rte_memseg_list *primary_msl,
+		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+		unsigned int msl_idx)
+{
+	int ret;
+
+	/* ensure all allocated space is the same in both lists */
+	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
+	if (ret < 0)
+		return -1;
+
+	/* ensure all unallocated space is the same in both lists */
+	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
+	if (ret < 0)
+		return -1;
+
+	/* both passes succeeded, so adopt the primary's version number */
+	local_msl->version = primary_msl->version;
+
+	return 0;
+}
+
+/* make local memseg lists match the primary's. returns 0, or -1 on error */
+int
+eal_memalloc_sync_with_primary(void)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg_list *primary_msl, *local_msl;
+	unsigned int msl_idx;
+	int i;
+
+	/* nothing to be done in primary */
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return 0;
+
+	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+		struct hugepage_info *hi = NULL; /* looked up anew per list */
+		bool new_msl = false;
+		bool fail = false;
+
+		primary_msl = &mcfg->memsegs[msl_idx];
+		local_msl = &local_memsegs[msl_idx];
+
+		if (primary_msl->base_va == 0)
+			continue;
+
+		/* this is a valid memseg list, so read-lock it */
+		rte_rwlock_read_lock(&primary_msl->mplock);
+
+		/* write-lock local memseg list */
+		rte_rwlock_write_lock(&local_msl->mplock);
+
+		/* check if secondary has this memseg list set up */
+		if (local_msl->base_va == 0) {
+			char name[PATH_MAX];
+			int ret;
+			new_msl = true;
+
+			/* create distinct fbarrays for each secondary */
+			snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
+				primary_msl->memseg_arr.name, getpid());
+
+			ret = rte_fbarray_init(&local_msl->memseg_arr, name,
+				primary_msl->memseg_arr.len,
+				primary_msl->memseg_arr.elt_sz);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
+				fail = true;
+				goto endloop;
+			}
+
+			local_msl->base_va = primary_msl->base_va;
+		}
+
+		for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info);
+					i++) {
+			uint64_t cur_sz =
+				internal_config.hugepage_info[i].hugepage_sz;
+			uint64_t msl_sz = primary_msl->hugepage_sz;
+			if (msl_sz == cur_sz) {
+				hi = &internal_config.hugepage_info[i];
+				break;
+			}
+		}
+		if (!hi) {
+			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+			fail = true;
+			goto endloop;
+		}
+
+		/* if versions don't match or if we have just allocated a new
+		 * memseg list, synchronize everything
+		 */
+		if ((new_msl || local_msl->version != primary_msl->version) &&
+				sync_existing(primary_msl, local_msl, hi,
+				msl_idx)) {
+			fail = true;
+			goto endloop;
+		}
+endloop:
+		rte_rwlock_write_unlock(&local_msl->mplock);
+		rte_rwlock_read_unlock(&primary_msl->mplock);
+		if (fail)
+			return -1;
+	}
+	return 0;
 }
-- 
2.7.4
    
    
More information about the dev
mailing list