[dpdk-dev] [PATCH v3 2/2] mem: revert to using flock() and add per-segment lockfiles

Anatoly Burakov anatoly.burakov at intel.com
Wed Apr 25 12:36:57 CEST 2018


The original implementation used flock() locks, but was later
switched to using fcntl() locks for page locking, because
fcntl() locks allow locking parts of a file, which is useful
for single-file segments mode, where locking the entire file
isn't as useful because we still need to grow and shrink it.

However, according to fcntl()'s Ubuntu manpage [1], semantics of
fcntl() locks have a giant oversight:

  This interface follows the completely stupid semantics of System
  V and IEEE Std 1003.1-1988 (“POSIX.1”) that require that all
  locks associated with a file for a given process are removed
  when any file descriptor for that file is closed by that process.
  This semantic means that applications must be aware of any files
  that a subroutine library may access.

Basically, closing *any* fd with an fcntl() lock (which we do because
we don't want to leak fd's) will drop the lock completely.

So, in this commit, we will be reverting back to using flock() locks
everywhere. However, that still leaves the problem of locking parts
of a memseg list file in single file segments mode, and we will be
solving it with creating separate lock files per each page, and
tracking those with flock().

We will also be removing all of this tailq business and replacing it
with a simple array - saving a few bytes is not worth the extra
hassle of dealing with pointers and potential memory allocation
failures. Also, remove the tailq lock since it is not needed - these
fd lists are per-process, and within a given process, it is always
only one thread handling access to hugetlbfs.

So, first one to allocate a segment will create a lockfile, and put
a shared lock on it. When we're shrinking the page file, we will be
trying to take out a write lock on that lockfile, which would fail if
any other process is holding onto the lockfile as well. This way, we
can know if we can shrink the segment file. Also, if no other locks
are found in the lock list for a given memseg list, the memseg list
fd is automatically closed.

One other thing to note is, according to flock() Ubuntu manpage [2],
upgrading the lock from shared to exclusive is implemented by dropping
and reacquiring the lock, which is not atomic and thus would have
created race conditions. So, on attempting to perform operations in
hugetlbfs, we will take out a writelock on hugetlbfs directory, so
that only one process could perform hugetlbfs operations concurrently.

[1] http://manpages.ubuntu.com/manpages/artful/en/man2/fcntl.2freebsd.html
[2] http://manpages.ubuntu.com/manpages/bionic/en/man2/flock.2.html

Fixes: 66cc45e293ed ("mem: replace memseg with memseg lists")
Fixes: 582bed1e1d1d ("mem: support mapping hugepages at runtime")
Fixes: a5ff05d60fc5 ("mem: support unmapping pages at runtime")
Fixes: 2a04139f66b4 ("eal: add single file segments option")
Cc: anatoly.burakov at intel.com

Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
Acked-by: Bruce Richardson <bruce.richardson at intel.com>
---

Notes:
    v3:
    - Make lock naming consistent with hugetlbfs file naming
    
    v2:
    - Make lockfiles hidden
    - renamed functions as per Bruce's comments
    
    We could've used OFD locks [1] instead of flock(), but they are
    only supported on kernel 3.15+ [2], so we're forced to use this
    flock()-based contraption to support older kernels.
    
    [1] https://www.gnu.org/software/libc/manual/html_node/Open-File-Description-Locks.html
    [2] http://man7.org/linux/man-pages/man2/fcntl.2.html

 lib/librte_eal/common/eal_filesystem.h          |  17 +
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c |  28 +-
 lib/librte_eal/linuxapp/eal/eal_memalloc.c      | 539 +++++++++++++++---------
 lib/librte_eal/linuxapp/eal/eal_memory.c        |  22 +-
 4 files changed, 368 insertions(+), 238 deletions(-)

diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index ad059ef..0a82f89 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -115,6 +115,23 @@ eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id
 	return buffer;
 }
 
+/** String format for hugepage map lock files. */
+#define HUGEFILE_LOCK_FMT "%s/.%smap_%d.lock"
+
+static inline const char *
+eal_get_hugefile_lock_path(char *buffer, size_t buflen, int f_id)
+{
+	const char *directory = default_config_dir;
+	const char *home_dir = getenv("HOME");
+
+	if (getuid() != 0 && home_dir != NULL)
+		directory = home_dir;
+	snprintf(buffer, buflen - 1, HUGEFILE_LOCK_FMT, directory,
+			internal_config.hugefile_prefix, f_id);
+	buffer[buflen - 1] = '\0';
+	return buffer;
+}
+
 /** define the default filename prefix for the %s values above */
 #define HUGEFILE_PREFIX_DEFAULT "rte"
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index db5aabd..7eca711 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -237,18 +237,6 @@ get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len)
 }
 
 /*
- * uses fstat to report the size of a file on disk
- */
-static off_t
-get_file_size(int fd)
-{
-	struct stat st;
-	if (fstat(fd, &st) < 0)
-		return 0;
-	return st.st_size;
-}
-
-/*
  * Clear the hugepage directory of whatever hugepage files
  * there are. Checks if the file is locked (i.e.
  * if it's in use by another DPDK process).
@@ -278,8 +266,6 @@ clear_hugedir(const char * hugedir)
 	}
 
 	while(dirent != NULL){
-		struct flock lck = {0};
-
 		/* skip files that don't match the hugepage pattern */
 		if (fnmatch(filter, dirent->d_name, 0) > 0) {
 			dirent = readdir(dir);
@@ -296,19 +282,11 @@ clear_hugedir(const char * hugedir)
 		}
 
 		/* non-blocking lock */
-		lck.l_type = F_RDLCK;
-		lck.l_whence = SEEK_SET;
-		lck.l_start = 0;
-		lck.l_len = get_file_size(fd);
+		lck_result = flock(fd, LOCK_EX | LOCK_NB);
 
-		lck_result = fcntl(fd, F_SETLK, &lck);
-
-		/* if lock succeeds, unlock and remove the file */
-		if (lck_result != -1) {
-			lck.l_type = F_UNLCK;
-			fcntl(fd, F_SETLK, &lck);
+		/* if lock succeeds, remove the file */
+		if (lck_result != -1)
 			unlinkat(dir_fd, dirent->d_name, 0);
-		}
 		close (fd);
 		dirent = readdir(dir);
 	}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 162306a..39e00c6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -46,24 +46,25 @@
  */
 static int fallocate_supported = -1; /* unknown */
 
-/*
- * If each page is in a separate file, we can close fd's since we need each fd
- * only once. However, in single file segments mode, we can get away with using
- * a single fd for entire segments, but we need to store them somewhere. Each
- * fd is different within each process, so we'll store them in a local tailq.
+/* for single-file segments, we need some kind of mechanism to keep track of
+ * which hugepages can be freed back to the system, and which cannot. we cannot
+ * use flock() because they don't allow locking parts of a file, and we cannot
+ * use fcntl() due to issues with their semantics, so we will have to rely on a
+ * bunch of lockfiles for each page.
+ *
+ * we cannot know how many pages a system will have in advance, but we do know
+ * that they come in lists, and we know lengths of these lists. so, simply store
+ * a malloc'd array of fd's indexed by list and segment index.
+ *
+ * they will be initialized at startup, and filled as we allocate/deallocate
+ * segments. also, use this to track memseg list proper fd.
  */
-struct msl_entry {
-	TAILQ_ENTRY(msl_entry) next;
-	unsigned int msl_idx;
-	int fd;
-};
-
-/** Double linked list of memseg list fd's. */
-TAILQ_HEAD(msl_entry_list, msl_entry);
-
-static struct msl_entry_list msl_entry_list =
-		TAILQ_HEAD_INITIALIZER(msl_entry_list);
-static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;
+static struct {
+	int *fds; /**< dynamically allocated array of segment lock fd's */
+	int memseg_list_fd; /**< memseg list fd */
+	int len; /**< total length of the array */
+	int count; /**< entries used in an array */
+} lock_fds[RTE_MAX_MEMSEG_LISTS];
 
 /** local copy of a memory map, used to synchronize memory hotplug in MP */
 static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
@@ -158,35 +159,6 @@ resotre_numa(int *oldpolicy, struct bitmask *oldmask)
 }
 #endif
 
-static struct msl_entry *
-get_msl_entry_by_idx(unsigned int list_idx)
-{
-	struct msl_entry *te;
-
-	rte_spinlock_lock(&tailq_lock);
-
-	TAILQ_FOREACH(te, &msl_entry_list, next) {
-		if (te->msl_idx == list_idx)
-			break;
-	}
-	if (te == NULL) {
-		/* doesn't exist, so create it and set fd to -1 */
-
-		te = malloc(sizeof(*te));
-		if (te == NULL) {
-			RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
-				__func__);
-			goto unlock;
-		}
-		te->msl_idx = list_idx;
-		te->fd = -1;
-		TAILQ_INSERT_TAIL(&msl_entry_list, te, next);
-	}
-unlock:
-	rte_spinlock_unlock(&tailq_lock);
-	return te;
-}
-
 /*
  * uses fstat to report the size of a file on disk
  */
@@ -199,19 +171,6 @@ get_file_size(int fd)
 	return st.st_size;
 }
 
-/*
- * uses fstat to check if file size on disk is zero (regular fstat won't show
- * true file size due to how fallocate works)
- */
-static bool
-is_zero_length(int fd)
-{
-	struct stat st;
-	if (fstat(fd, &st) < 0)
-		return false;
-	return st.st_blocks == 0;
-}
-
 /* we cannot use rte_memseg_list_walk() here because we will be holding a
  * write lock whenever we enter every function in this file, however copying
  * the same iteration code everywhere is not ideal as well. so, use a lockless
@@ -238,6 +197,102 @@ memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
 	return 0;
 }
 
+/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
+static int lock(int fd, int type)
+{
+	int ret;
+
+	/* flock may be interrupted */
+	do {
+		ret = flock(fd, type | LOCK_NB);
+	} while (ret && errno == EINTR);
+
+	if (ret && errno == EWOULDBLOCK) {
+		/* couldn't lock */
+		return 0;
+	} else if (ret) {
+		RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+	/* lock was successful */
+	return 1;
+}
+
+static int get_segment_lock_fd(int list_idx, int seg_idx)
+{
+	char path[PATH_MAX] = {0};
+	int fd;
+
+	if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+		return -1;
+	if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+		return -1;
+
+	fd = lock_fds[list_idx].fds[seg_idx];
+	/* does this lock already exist? */
+	if (fd >= 0)
+		return fd;
+
+	eal_get_hugefile_lock_path(path, sizeof(path),
+			list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+
+	fd = open(path, O_CREAT | O_RDWR, 0660);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n",
+			__func__, path, strerror(errno));
+		return -1;
+	}
+	/* take out a read lock */
+	if (lock(fd, LOCK_SH) != 1) {
+		RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n",
+			__func__, path, strerror(errno));
+		close(fd);
+		return -1;
+	}
+	/* store it for future reference */
+	lock_fds[list_idx].fds[seg_idx] = fd;
+	lock_fds[list_idx].count++;
+	return fd;
+}
+
+static int unlock_segment(int list_idx, int seg_idx)
+{
+	int fd, ret;
+
+	if (list_idx < 0 || list_idx >= (int)RTE_DIM(lock_fds))
+		return -1;
+	if (seg_idx < 0 || seg_idx >= lock_fds[list_idx].len)
+		return -1;
+
+	fd = lock_fds[list_idx].fds[seg_idx];
+
+	/* upgrade lock to exclusive to see if we can remove the lockfile */
+	ret = lock(fd, LOCK_EX);
+	if (ret == 1) {
+		/* we've succeeded in taking exclusive lock, this lockfile may
+		 * be removed.
+		 */
+		char path[PATH_MAX] = {0};
+		eal_get_hugefile_lock_path(path, sizeof(path),
+				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+		if (unlink(path)) {
+			RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n",
+					__func__, path, strerror(errno));
+		}
+	}
+	/* we don't want to leak the fd, so even if we fail to lock, close fd
+	 * and remove it from list anyway.
+	 */
+	close(fd);
+	lock_fds[list_idx].fds[seg_idx] = -1;
+	lock_fds[list_idx].count--;
+
+	if (ret < 0)
+		return -1;
+	return 0;
+}
+
 static int
 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
 		unsigned int list_idx, unsigned int seg_idx)
@@ -245,31 +300,29 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
 	int fd;
 
 	if (internal_config.single_file_segments) {
-		/*
-		 * try to find a tailq entry, for this memseg list, or create
-		 * one if it doesn't exist.
-		 */
-		struct msl_entry *te = get_msl_entry_by_idx(list_idx);
-		if (te == NULL) {
-			RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
-				__func__);
-			return -1;
-		} else if (te->fd < 0) {
-			/* create a hugepage file */
-			eal_get_hugefile_path(path, buflen, hi->hugedir,
-					list_idx);
+		/* create a hugepage file path */
+		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
+
+		fd = lock_fds[list_idx].memseg_list_fd;
+
+		if (fd < 0) {
 			fd = open(path, O_CREAT | O_RDWR, 0600);
 			if (fd < 0) {
-				RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+				RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
 					__func__, strerror(errno));
 				return -1;
 			}
-			te->fd = fd;
-		} else {
-			fd = te->fd;
+			/* take out a read lock and keep it indefinitely */
+			if (lock(fd, LOCK_SH) < 0) {
+				RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+					__func__, strerror(errno));
+				close(fd);
+				return -1;
+			}
+			lock_fds[list_idx].memseg_list_fd = fd;
 		}
 	} else {
-		/* one file per page, just create it */
+		/* create a hugepage file path */
 		eal_get_hugefile_path(path, buflen, hi->hugedir,
 				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
 		fd = open(path, O_CREAT | O_RDWR, 0600);
@@ -278,48 +331,27 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
 					strerror(errno));
 			return -1;
 		}
+		/* take out a read lock */
+		if (lock(fd, LOCK_SH) < 0) {
+			RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+				__func__, strerror(errno));
+			close(fd);
+			return -1;
+		}
 	}
 	return fd;
 }
 
-/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
-static int lock(int fd, uint64_t offset, uint64_t len, int type)
-{
-	struct flock lck;
-	int ret;
-
-	memset(&lck, 0, sizeof(lck));
-
-	lck.l_type = type;
-	lck.l_whence = SEEK_SET;
-	lck.l_start = offset;
-	lck.l_len = len;
-
-	ret = fcntl(fd, F_SETLK, &lck);
-
-	if (ret && (errno == EAGAIN || errno == EACCES)) {
-		/* locked by another process, not an error */
-		return 0;
-	} else if (ret) {
-		RTE_LOG(ERR, EAL, "%s(): error calling fcntl(): %s\n",
-			__func__, strerror(errno));
-		/* we've encountered an unexpected error */
-		return -1;
-	}
-	return 1;
-}
-
 static int
-resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
-		bool grow)
+resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
+		uint64_t fa_offset, uint64_t page_sz, bool grow)
 {
 	bool again = false;
 	do {
 		if (fallocate_supported == 0) {
 			/* we cannot deallocate memory if fallocate() is not
-			 * supported, but locks are still needed to prevent
-			 * primary process' initialization from clearing out
-			 * huge pages used by this process.
+			 * supported, and hugepage file is already locked at
+			 * creation, so no further synchronization needed.
 			 */
 
 			if (!grow) {
@@ -337,13 +369,10 @@ resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
 					__func__, strerror(errno));
 				return -1;
 			}
-			/* not being able to take out a read lock is an error */
-			if (lock(fd, fa_offset, page_sz, F_RDLCK) != 1)
-				return -1;
 		} else {
 			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
 					FALLOC_FL_KEEP_SIZE;
-			int ret;
+			int ret, lock_fd;
 
 			/* if fallocate() is supported, we need to take out a
 			 * read lock on allocate (to prevent other processes
@@ -351,20 +380,69 @@ resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
 			 * lock on deallocate (to ensure nobody else is using
 			 * this page).
 			 *
-			 * we can't use flock() for this, as we actually need to
-			 * lock part of the file, not the entire file.
+			 * read locks on page itself are already taken out at
+			 * file creation, in get_seg_fd().
+			 *
+			 * we cannot rely on simple use of flock() call, because
+			 * we need to be able to lock a section of the file,
+			 * and we cannot use fcntl() locks, because of numerous
+			 * problems with their semantics, so we will use
+			 * deterministically named lock files for each section
+			 * of the file.
+			 *
+			 * if we're shrinking the file, we want to upgrade our
+			 * lock from shared to exclusive.
+			 *
+			 * lock_fd is an fd for a lockfile, not for the segment
+			 * list.
 			 */
+			lock_fd = get_segment_lock_fd(list_idx, seg_idx);
 
 			if (!grow) {
-				ret = lock(fd, fa_offset, page_sz, F_WRLCK);
+				/* we are using this lockfile to determine
+				 * whether this particular page is locked, as we
+				 * are in single file segments mode and thus
+				 * cannot use regular flock() to get this info.
+				 *
+				 * we want to try and take out an exclusive lock
+				 * on the lock file to determine if we're the
+				 * last ones using this page, and if not, we
+				 * won't be shrinking it, and will instead exit
+				 * prematurely.
+				 */
+				ret = lock(lock_fd, LOCK_EX);
+
+				/* drop the lock on the lockfile, so that even
+				 * if we couldn't shrink the file ourselves, we
+				 * are signalling to other processes that we're
+				 * no longer using this page.
+				 */
+				if (unlock_segment(list_idx, seg_idx))
+					RTE_LOG(ERR, EAL, "Could not unlock segment\n");
+
+				/* additionally, if this was the last lock on
+				 * this segment list, we can safely close the
+				 * page file fd, so that one of the processes
+				 * could then delete the file after shrinking.
+				 */
+				if (ret < 1 && lock_fds[list_idx].count == 0) {
+					close(fd);
+					lock_fds[list_idx].memseg_list_fd = -1;
+				}
 
-				if (ret < 0)
+				if (ret < 0) {
+					RTE_LOG(ERR, EAL, "Could not lock segment\n");
 					return -1;
-				else if (ret == 0)
-					/* failed to lock, not an error */
+				}
+				if (ret == 0)
+					/* failed to lock, not an error. */
 					return 0;
 			}
-			if (fallocate(fd, flags, fa_offset, page_sz) < 0) {
+
+			/* grow or shrink the file */
+			ret = fallocate(fd, flags, fa_offset, page_sz);
+
+			if (ret < 0) {
 				if (fallocate_supported == -1 &&
 						errno == ENOTSUP) {
 					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
@@ -380,16 +458,18 @@ resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
 			} else {
 				fallocate_supported = 1;
 
-				if (grow) {
-					/* if can't read lock, it's an error */
-					if (lock(fd, fa_offset, page_sz,
-							F_RDLCK) != 1)
-						return -1;
-				} else {
-					/* if can't unlock, it's an error */
-					if (lock(fd, fa_offset, page_sz,
-							F_UNLCK) != 1)
-						return -1;
+				/* we've grew/shrunk the file, and we hold an
+				 * exclusive lock now. check if there are no
+				 * more segments active in this segment list,
+				 * and remove the file if there aren't.
+				 */
+				if (lock_fds[list_idx].count == 0) {
+					if (unlink(path))
+						RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
+							__func__, path,
+							strerror(errno));
+					close(fd);
+					lock_fds[list_idx].memseg_list_fd = -1;
 				}
 			}
 		}
@@ -411,15 +491,19 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	int fd;
 	size_t alloc_sz;
 
+	/* takes out a read lock on segment or segment list */
 	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
-	if (fd < 0)
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
 		return -1;
+	}
 
 	alloc_sz = hi->hugepage_sz;
 	if (internal_config.single_file_segments) {
 		map_offset = seg_idx * alloc_sz;
-		ret = resize_hugefile(fd, map_offset, alloc_sz, true);
-		if (ret < 1)
+		ret = resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
+				alloc_sz, true);
+		if (ret < 0)
 			goto resized;
 	} else {
 		map_offset = 0;
@@ -428,28 +512,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 				__func__, strerror(errno));
 			goto resized;
 		}
-		/* we've allocated a page - take out a read lock. we're using
-		 * fcntl() locks rather than flock() here because doing that
-		 * gives us one huge advantage - fcntl() locks are per-process,
-		 * not per-file descriptor, which means that we don't have to
-		 * keep the original fd's around to keep a lock on the file.
-		 *
-		 * this is useful, because when it comes to unmapping pages, we
-		 * will have to take out a write lock (to figure out if another
-		 * process still has this page mapped), and to do itwith flock()
-		 * we'll have to use original fd, as lock is associated with
-		 * that particular fd. with fcntl(), this is not necessary - we
-		 * can open a new fd and use fcntl() on that.
-		 */
-		ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
-
-		/* this should not fail */
-		if (ret != 1) {
-			RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
-				__func__,
-				strerror(errno));
-			goto resized;
-		}
 	}
 
 	/*
@@ -518,19 +580,14 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	munmap(addr, alloc_sz);
 resized:
 	if (internal_config.single_file_segments) {
-		resize_hugefile(fd, map_offset, alloc_sz, false);
-		if (is_zero_length(fd)) {
-			struct msl_entry *te = get_msl_entry_by_idx(list_idx);
-			if (te != NULL && te->fd >= 0) {
-				close(te->fd);
-				te->fd = -1;
-			}
-			/* ignore errors, can't make it any worse */
-			unlink(path);
-		}
+		resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
+				alloc_sz, false);
+		/* ignore failure, can't make it any worse */
 	} else {
+		/* only remove file if we can take out a write lock */
+		if (lock(fd, LOCK_EX) == 1)
+			unlink(path);
 		close(fd);
-		unlink(path);
 	}
 	return -1;
 }
@@ -546,6 +603,14 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 	/* erase page data */
 	memset(ms->addr, 0, ms->len);
 
+	/* if we are not in single file segments mode, we're going to unmap the
+	 * segment and thus drop the lock on original fd, so take out another
+	 * shared lock before we do that.
+	 */
+	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+	if (fd < 0)
+		return -1;
+
 	if (mmap(ms->addr, ms->len, PROT_READ,
 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
 				MAP_FAILED) {
@@ -553,41 +618,23 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 		return -1;
 	}
 
-	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
-	if (fd < 0)
-		return -1;
-
 	if (internal_config.single_file_segments) {
 		map_offset = seg_idx * ms->len;
-		if (resize_hugefile(fd, map_offset, ms->len, false))
+		if (resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
+				ms->len, false))
 			return -1;
-		/* if file is zero-length, we've already shrunk it, so it's
-		 * safe to remove.
-		 */
-		if (is_zero_length(fd)) {
-			struct msl_entry *te = get_msl_entry_by_idx(list_idx);
-			if (te != NULL && te->fd >= 0) {
-				close(te->fd);
-				te->fd = -1;
-			}
-			unlink(path);
-		}
 		ret = 0;
 	} else {
 		/* if we're able to take out a write lock, we're the last one
 		 * holding onto this page.
 		 */
-
-		ret = lock(fd, 0, ms->len, F_WRLCK);
+		ret = lock(fd, LOCK_EX);
 		if (ret >= 0) {
 			/* no one else is using this page */
 			if (ret == 1)
 				unlink(path);
-			ret = lock(fd, 0, ms->len, F_UNLCK);
-			if (ret != 1)
-				RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
-					__func__, path);
 		}
+		/* closing fd will drop the lock */
 		close(fd);
 	}
 
@@ -612,7 +659,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	struct alloc_walk_param *wa = arg;
 	struct rte_memseg_list *cur_msl;
 	size_t page_sz;
-	int cur_idx, start_idx, j;
+	int cur_idx, start_idx, j, dir_fd;
 	unsigned int msl_idx, need, i;
 
 	if (msl->page_sz != wa->page_sz)
@@ -633,6 +680,25 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 		return 0;
 	start_idx = cur_idx;
 
+	/* do not allow any page allocations during the time we're allocating,
+	 * because file creation and locking operations are not atomic,
+	 * and we might be the first or the last ones to use a particular page,
+	 * so we need to ensure atomicity of every operation.
+	 */
+	dir_fd = open(wa->hi->hugedir, O_RDONLY);
+	if (dir_fd < 0) {
+		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
+			wa->hi->hugedir, strerror(errno));
+		return -1;
+	}
+	/* blocking writelock */
+	if (flock(dir_fd, LOCK_EX)) {
+		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
+			wa->hi->hugedir, strerror(errno));
+		close(dir_fd);
+		return -1;
+	}
+
 	for (i = 0; i < need; i++, cur_idx++) {
 		struct rte_memseg *cur;
 		void *map_addr;
@@ -668,6 +734,8 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 			/* clear the list */
 			if (wa->ms)
 				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
+
+			close(dir_fd);
 			return -1;
 		}
 		if (wa->ms)
@@ -679,6 +747,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	wa->segs_allocated = i;
 	if (i > 0)
 		cur_msl->version++;
+	close(dir_fd);
 	return 1;
 }
 
@@ -693,7 +762,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	struct rte_memseg_list *found_msl;
 	struct free_walk_param *wa = arg;
 	uintptr_t start_addr, end_addr;
-	int msl_idx, seg_idx;
+	int msl_idx, seg_idx, ret, dir_fd;
 
 	start_addr = (uintptr_t) msl->base_va;
 	end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
@@ -708,11 +777,34 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	/* msl is const */
 	found_msl = &mcfg->memsegs[msl_idx];
 
+	/* do not allow any page allocations during the time we're freeing,
+	 * because file creation and locking operations are not atomic,
+	 * and we might be the first or the last ones to use a particular page,
+	 * so we need to ensure atomicity of every operation.
+	 */
+	dir_fd = open(wa->hi->hugedir, O_RDONLY);
+	if (dir_fd < 0) {
+		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
+			wa->hi->hugedir, strerror(errno));
+		return -1;
+	}
+	/* blocking writelock */
+	if (flock(dir_fd, LOCK_EX)) {
+		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
+			wa->hi->hugedir, strerror(errno));
+		close(dir_fd);
+		return -1;
+	}
+
 	found_msl->version++;
 
 	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
 
-	if (free_seg(wa->ms, wa->hi, msl_idx, seg_idx))
+	ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
+
+	close(dir_fd);
+
+	if (ret < 0)
 		return -1;
 
 	return 1;
@@ -1034,22 +1126,46 @@ sync_existing(struct rte_memseg_list *primary_msl,
 		struct rte_memseg_list *local_msl, struct hugepage_info *hi,
 		unsigned int msl_idx)
 {
-	int ret;
+	int ret, dir_fd;
+
+	/* do not allow any page allocations during the time we're allocating,
+	 * because file creation and locking operations are not atomic,
+	 * and we might be the first or the last ones to use a particular page,
+	 * so we need to ensure atomicity of every operation.
+	 */
+	dir_fd = open(hi->hugedir, O_RDONLY);
+	if (dir_fd < 0) {
+		RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
+			hi->hugedir, strerror(errno));
+		return -1;
+	}
+	/* blocking writelock */
+	if (flock(dir_fd, LOCK_EX)) {
+		RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
+			hi->hugedir, strerror(errno));
+		close(dir_fd);
+		return -1;
+	}
 
 	/* ensure all allocated space is the same in both lists */
 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
 	if (ret < 0)
-		return -1;
+		goto fail;
 
 	/* ensure all unallocated space is the same in both lists */
 	ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
 	if (ret < 0)
-		return -1;
+		goto fail;
 
 	/* update version number */
 	local_msl->version = primary_msl->version;
 
+	close(dir_fd);
+
 	return 0;
+fail:
+	close(dir_fd);
+	return -1;
 }
 
 static int
@@ -1128,11 +1244,46 @@ secondary_msl_create_walk(const struct rte_memseg_list *msl,
 	return 0;
 }
 
+static int
+secondary_lock_list_create_walk(const struct rte_memseg_list *msl,
+		void *arg __rte_unused)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	unsigned int i, len;
+	int msl_idx;
+	int *data;
+
+	msl_idx = msl - mcfg->memsegs;
+	len = msl->memseg_arr.len;
+
+	/* ensure we have space to store lock fd per each possible segment */
+	data = malloc(sizeof(int) * len);
+	if (data == NULL) {
+		RTE_LOG(ERR, EAL, "Unable to allocate space for lock descriptors\n");
+		return -1;
+	}
+	/* set all fd's as invalid */
+	for (i = 0; i < len; i++)
+		data[i] = -1;
+
+	lock_fds[msl_idx].fds = data;
+	lock_fds[msl_idx].len = len;
+	lock_fds[msl_idx].count = 0;
+	lock_fds[msl_idx].memseg_list_fd = -1;
+
+	return 0;
+}
+
 int
 eal_memalloc_init(void)
 {
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
 		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
 			return -1;
+
+	/* initialize all of the lock fd lists */
+	if (internal_config.single_file_segments)
+		if (rte_memseg_list_walk(secondary_lock_list_create_walk, NULL))
+			return -1;
 	return 0;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index fadc1de..ad3add8 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -259,7 +259,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 	int fd;
 	unsigned i;
 	void *virtaddr;
-	struct flock lck = {0};
 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
 	int node_id = -1;
 	int essential_prev = 0;
@@ -378,13 +377,8 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
 		}
 		*(int *)virtaddr = 0;
 
-
 		/* set shared lock on the file. */
-		lck.l_type = F_RDLCK;
-		lck.l_whence = SEEK_SET;
-		lck.l_start = 0;
-		lck.l_len = hugepage_sz;
-		if (fcntl(fd, F_SETLK, &lck) == -1) {
+		if (flock(fd, LOCK_SH) < 0) {
 			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
@@ -708,7 +702,6 @@ remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
 #endif
 		struct hugepage_file *hfile = &hugepages[cur_page];
 		struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
-		struct flock lck;
 		void *addr;
 		int fd;
 
@@ -719,11 +712,7 @@ remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
 			return -1;
 		}
 		/* set shared lock on the file. */
-		lck.l_type = F_RDLCK;
-		lck.l_whence = SEEK_SET;
-		lck.l_start = 0;
-		lck.l_len = page_sz;
-		if (fcntl(fd, F_SETLK, &lck) == -1) {
+		if (flock(fd, LOCK_SH) < 0) {
 			RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n",
 					hfile->filepath, strerror(errno));
 			close(fd);
@@ -1732,7 +1721,6 @@ eal_legacy_hugepage_attach(void)
 		struct hugepage_file *hf = &hp[i];
 		size_t map_sz = hf->size;
 		void *map_addr = hf->final_va;
-		struct flock lck;
 
 		/* if size is zero, no more pages left */
 		if (map_sz == 0)
@@ -1754,11 +1742,7 @@ eal_legacy_hugepage_attach(void)
 		}
 
 		/* set shared lock on the file. */
-		lck.l_type = F_RDLCK;
-		lck.l_whence = SEEK_SET;
-		lck.l_start = 0;
-		lck.l_len = map_sz;
-		if (fcntl(fd, F_SETLK, &lck) == -1) {
+		if (flock(fd, LOCK_SH) < 0) {
 			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
 				__func__, strerror(errno));
 			close(fd);
-- 
2.7.4


More information about the dev mailing list