[dpdk-dev] [RFC PATCH 6/9] eal: introduce memory management wrappers

Dmitry Kozlyuk dmitry.kozliuk at gmail.com
Mon Mar 30 06:10:23 CEST 2020


System meory management is implemented differently for POSIX and
Windows. Introduce wrapper functions for operations used across DPDK:

* rte_mem_map()
  Create memory mapping for a regular file or a page file (swap).
  This supports mapping to a reserved memory region even on Windows.

* rte_mem_unmap()
  Remove mapping created with rte_mem_map().

* rte_get_page_size()
  Obtain default system page size.

* rte_mem_lock()
  Make arbitrary-sized memory region non-swappable.

Wrappers follow POSIX semantics limited to DPDK tasks, but their
signatures deliberately differ from POSIX ones to be more safe and
expressive.

Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk at gmail.com>
---
 config/meson.build                         |  10 +-
 lib/librte_eal/common/eal_private.h        |  71 +++-
 lib/librte_eal/common/include/rte_memory.h |  63 +++
 lib/librte_eal/freebsd/eal/eal_memory.c    | 117 ++++++
 lib/librte_eal/linux/eal/eal_memory.c      | 117 ++++++
 lib/librte_eal/rte_eal_exports.def         |   4 +
 lib/librte_eal/rte_eal_version.map         |   4 +
 lib/librte_eal/windows/eal/eal.c           |   6 +
 lib/librte_eal/windows/eal/eal_memory.c    | 433 +++++++++++++++++++++
 lib/librte_eal/windows/eal/eal_windows.h   |  51 +++
 lib/librte_eal/windows/eal/meson.build     |   1 +
 11 files changed, 871 insertions(+), 6 deletions(-)
 create mode 100644 lib/librte_eal/windows/eal/eal_memory.c

diff --git a/config/meson.build b/config/meson.build
index 73cf69814..295425742 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -256,14 +256,20 @@ if is_freebsd
 endif
 
 if is_windows
-	# Minimum supported API is Windows 7.
-	add_project_arguments('-D_WIN32_WINNT=0x0601', language: 'c')
+	# VirtualAlloc2() is available since Windows 10 / Server 2016.
+	add_project_arguments('-D_WIN32_WINNT=0x0A00', language: 'c')
 
 	# Use MinGW-w64 stdio, because DPDK assumes ANSI-compliant formatting.
 	if cc.get_id() == 'gcc'
 		add_project_arguments('-D__USE_MINGW_ANSI_STDIO', language: 'c')
 	endif
 
+	# Contrary to docs, VirtualAlloc2() is exported by mincore.lib
+	# in Windows SDK, while MinGW exports it by advapi32.a.
+	if is_ms_linker
+		add_project_link_arguments('-lmincore', language: 'c')
+	endif
+
 	add_project_link_arguments('-ladvapi32', language: 'c')
 endif
 
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 0130571e8..2e5e4312a 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -11,6 +11,7 @@
 
 #include <rte_dev.h>
 #include <rte_lcore.h>
+#include <rte_memory.h>
 
 /**
  * Structure storing internal configuration (per-lcore)
@@ -202,6 +203,16 @@ int rte_eal_alarm_init(void);
  */
 int rte_eal_check_module(const char *module_name);
 
+/**
+ * Memory reservation flags.
+ */
+enum rte_mem_reserve_flags {
+	/**< Reserve hugepages. */
+	RTE_RESERVE_HUGEPAGES = 1 << 0,
+	/**< Fail if requested address is not available. */
+	RTE_RESERVE_EXACT_ADDRESS = 1 << 1
+};
+
 /**
  * Get virtual area of specified size from the OS.
  *
@@ -215,8 +226,8 @@ int rte_eal_check_module(const char *module_name);
  *   Page size on which to align requested virtual area.
  * @param flags
  *   EAL_VIRTUAL_AREA_* flags.
- * @param mmap_flags
- *   Extra flags passed directly to mmap().
+ * @param reserve_flags
+ *   Extra flags passed directly to eal_mem_reserve().
  *
  * @return
  *   Virtual area address if successful.
@@ -232,8 +243,8 @@ int rte_eal_check_module(const char *module_name);
 #define EAL_VIRTUAL_AREA_UNMAP (1 << 2)
 /**< immediately unmap reserved virtual area. */
 void *
-eal_get_virtual_area(void *requested_addr, size_t *size,
-		size_t page_sz, int flags, int mmap_flags);
+eal_get_virtual_area(void *requested_addr, size_t *size, size_t page_sz,
+	int flags, enum rte_mem_reserve_flags reserve_flags);
 
 /**
  * Get cpu core_id.
@@ -488,4 +499,56 @@ int eal_file_lock(int fd, enum rte_flock_op op, enum rte_flock_mode mode);
  */
 int eal_file_truncate(int fd, ssize_t size);
 
+/**
+ * Reserve a region of virtual memory.
+ *
+ * Use eal_mem_free() to free reserved memory.
+ *
+ * @param requested_addr
+ *  A desired reservation address. The system may not respect it.
+ *  NULL means the address will be chosen by the system.
+ * @param size
+ *  Reservation size. Must be a multiple of system page size.
+ * @param flags
+ *  Reservation options.
+ * @returns
+ *  Starting address of the reserved area on success, NULL on failure.
+ *  Callers must not access this memory until remapping it.
+ */
+void *eal_mem_reserve(void *requested_addr, size_t size,
+	enum rte_mem_reserve_flags flags);
+
+/**
+ * Allocate a contiguous chunk of virtual memory.
+ *
+ * Use eal_mem_free() to free allocated memory.
+ *
+ * @param size
+ *  Number of bytes to allocate.
+ * @param page_size
+ *  If non-zero, means memory must be allocated in hugepages
+ *  of the specified size. The @code size @endcode parameter
+ *  must then be a multiple of the largest hugepage size requested.
+ * @return
+ *  Address of allocated memory or NULL on failure (rte_errno is set).
+ */
+void *eal_mem_alloc(size_t size, enum rte_page_sizes page_size);
+
+/**
+ * Free memory obtained by eal_mem_reserve() or eal_mem_alloc().
+ *
+ * If @code virt @endcode and @code size @endcode describe a part of the
+ * reserved region, only this part of the region is freed (accurately
+ * up to the system page size). If @code virt @endcode points to allocated
+ * memory, @code size @endcode must match the one specified on allocation.
+ * The behavior is undefined if the memory pointed by @code virt @endcode
+ * is obtained from another source than listed above.
+ *
+ * @param virt
+ *  A virtual address in a region previously reserved.
+ * @param size
+ *  Number of bytes to unreserve.
+ */
+void eal_mem_free(void *virt, size_t size);
+
 #endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 3d8d0bd69..1742fde9a 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -85,6 +85,69 @@ struct rte_memseg_list {
 	struct rte_fbarray memseg_arr;
 };
 
+/**
+ * Memory protection flags.
+ */
+enum rte_mem_prot {
+	RTE_PROT_READ = 1 << 0,   /**< Read access. */
+	RTE_PROT_WRITE = 1 << 1,   /**< Write access. */
+	RTE_PROT_EXECUTE = 1 << 2 /**< Code execution. */
+};
+
+/**
+ * Memory mapping additional flags.
+ *
+ * In Linux and FreeBSD, each flag is semantically equivalent
+ * to OS-specific mmap(3) flag with the same or similar name.
+ * In Windows, POSIX and MAP_ANONYMOUS semantics are followed.
+ */
+enum rte_map_flags {
+	/** Changes of mapped memory are visible to other processes. */
+	RTE_MAP_SHARED = 1 << 0,
+	/** Mapping is not backed by a regular file. */
+	RTE_MAP_ANONYMOUS = 1 << 1,
+	/** Copy-on-write mapping, changes are invisible to other processes. */
+	RTE_MAP_PRIVATE = 1 << 2,
+	/** Fail if requested address cannot be taken. */
+	RTE_MAP_FIXED = 1 << 3
+};
+
+/**
+ * OS-independent implementation of POSIX mmap(3)
+ * with MAP_ANONYMOUS Linux/FreeBSD extension.
+ */
+__rte_experimental
+void *rte_mem_map(void *requested_addr, size_t size, enum rte_mem_prot prot,
+	enum rte_map_flags flags, int fd, size_t offset);
+
+/**
+ * OS-independent implementation of POSIX munmap(3).
+ */
+__rte_experimental
+int rte_mem_unmap(void *virt, size_t size);
+
+/**
+ * Get system page size. This function never failes.
+ *
+ * @return
+ *   Positive page size in bytes.
+ */
+__rte_experimental
+int rte_get_page_size(void);
+
+/**
+ * Lock region in physical memory and prevent it from swapping.
+ *
+ * @param virt
+ *   The virtual address.
+ * @param size
+ *   Size of the region.
+ * @return
+ *   0 on success, negative on error.
+ */
+__rte_experimental
+int rte_mem_lock(const void *virt, size_t size);
+
 /**
  * Lock page in physical memory and prevent from swapping.
  *
diff --git a/lib/librte_eal/freebsd/eal/eal_memory.c b/lib/librte_eal/freebsd/eal/eal_memory.c
index a97d8f0f0..bcceba636 100644
--- a/lib/librte_eal/freebsd/eal/eal_memory.c
+++ b/lib/librte_eal/freebsd/eal/eal_memory.c
@@ -534,3 +534,120 @@ rte_eal_memseg_init(void)
 			memseg_primary_init() :
 			memseg_secondary_init();
 }
+
+void *
+eal_mem_reserve(void *requested_addr, size_t size,
+	enum rte_mem_reserve_flags flags)
+{
+	int sys_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+	if (flags & RTE_RESERVE_HUGEPAGES)
+		sys_flags |= MAP_HUGETLB;
+	if (flags & RTE_RESERVE_EXACT_ADDRESS)
+		sys_flags |= MAP_FIXED;
+
+	return mem_map(requested_addr, size, PROT_READ, sys_flags, -1, 0);
+}
+
+void *
+eal_mem_alloc(size_t size, enum rte_page_sizes page_size)
+{
+	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+
+	if (page_size != 0) {
+		/* as per mmap() manpage, all page sizes are log2 of page size
+		 * shifted by MAP_HUGE_SHIFT
+		 */
+		int page_flag = rte_log2_u64(page_size) << MAP_HUGE_SHIFT;
+		flags |= MAP_HUGETLB | page_flag;
+	}
+
+	return mem_map(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+}
+
+void
+eal_mem_free(void *virt, size_t size)
+{
+	mem_unmap(virt, size);
+}
+
+static int
+mem_rte_to_sys_prot(enum rte_mem_prot prot)
+{
+	int sys_prot = 0;
+
+	if (prot & RTE_PROT_READ)
+		sys_prot |= PROT_READ;
+	if (prot & RTE_PROT_WRITE)
+		sys_prot |= PROT_WRITE;
+	if (prot & RTE_PROT_EXECUTE)
+		sys_prot |= PROT_EXEC;
+
+	return sys_prot;
+}
+
+static void *
+mem_map(void *requested_addr, size_t size, int prot, int flags,
+	int fd, size_t offset)
+{
+	void *virt = mmap(requested_addr, size, prot, flags, fd, offset);
+	if (virt == MAP_FAILED) {
+		RTE_LOG(ERR, EAL,
+			"Cannot mmap(%p, 0x%zx, 0x%x, 0x%x, %d, 0x%zx): %s\n",
+			requested_addr, size, prot, flags, fd, offset,
+			strerror(errno));
+		rte_errno = errno;
+	}
+	return virt;
+}
+
+static int
+mem_unmap(void *virt, size_t size)
+{
+	int ret = munmap(virt, size);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Cannot munmap(%p, 0x%zx): %s\n",
+			virt, size, strerror(errno));
+		rte_errno = errno;
+	}
+	return ret;
+}
+
+void *
+rte_mem_map(void *requested_addr, size_t size, enum rte_mem_prot prot,
+	enum rte_map_flags flags, int fd, size_t offset)
+{
+	int sys_prot = 0;
+	int sys_flags = 0;
+
+	sys_prot = mem_rte_to_sys_prot(prot);
+
+	if (flags & RTE_MAP_SHARED)
+		sys_flags |= MAP_SHARED;
+	if (flags & RTE_MAP_ANONYMOUS)
+		sys_flags |= MAP_ANONYMOUS;
+	if (flags & RTE_MAP_PRIVATE)
+		sys_flags |= MAP_PRIVATE;
+	if (flags & RTE_MAP_FIXED)
+		sys_flags |= MAP_FIXED;
+
+	return mem_map(requested_addr, size, sys_prot, sys_flags, fd, offset);
+}
+
+int
+rte_mem_unmap(void *virt, size_t size)
+{
+	return mem_unmap(virt, size);
+}
+
+int
+rte_get_page_size(void)
+{
+	return getpagesize();
+}
+
+int
+rte_mem_lock(const void *virt, size_t size)
+{
+	return mlock(virt, size);
+}
diff --git a/lib/librte_eal/linux/eal/eal_memory.c b/lib/librte_eal/linux/eal/eal_memory.c
index 7a9c97ff8..72205580a 100644
--- a/lib/librte_eal/linux/eal/eal_memory.c
+++ b/lib/librte_eal/linux/eal/eal_memory.c
@@ -2479,3 +2479,120 @@ rte_eal_memseg_init(void)
 #endif
 			memseg_secondary_init();
 }
+
+static void *
+mem_map(void *requested_addr, size_t size, int prot, int flags,
+	int fd, size_t offset)
+{
+	void *virt = mmap(requested_addr, size, prot, flags, fd, offset);
+	if (virt == MAP_FAILED) {
+		RTE_LOG(ERR, EAL,
+			"Cannot mmap(%p, 0x%zx, 0x%x, 0x%x, %d, 0x%zx): %s\n",
+			requested_addr, size, prot, flags, fd, offset,
+			strerror(errno));
+		rte_errno = errno;
+	}
+	return virt;
+}
+
+static int
+mem_unmap(void *virt, size_t size)
+{
+	int ret = munmap(virt, size);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Cannot munmap(%p, 0x%zx): %s\n",
+			virt, size, strerror(errno));
+		rte_errno = errno;
+	}
+	return ret;
+}
+
+void *
+eal_mem_reserve(void *requested_addr, size_t size,
+	enum rte_mem_reserve_flags flags)
+{
+	int sys_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+	if (flags & RTE_RESERVE_HUGEPAGES)
+		sys_flags |= MAP_HUGETLB;
+	if (flags & RTE_RESERVE_EXACT_ADDRESS)
+		sys_flags |= MAP_FIXED;
+
+	return mem_map(requested_addr, size, PROT_READ, sys_flags, -1, 0);
+}
+
+void *
+eal_mem_alloc(size_t size, enum rte_page_sizes page_size)
+{
+	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+
+	if (page_size != 0) {
+		/* as per mmap() manpage, all page sizes are log2 of page size
+		 * shifted by MAP_HUGE_SHIFT
+		 */
+		int page_flag = rte_log2_u64(page_size) << MAP_HUGE_SHIFT;
+		flags |= MAP_HUGETLB | page_flag;
+	}
+
+	return mem_map(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+}
+
+void
+eal_mem_free(void *virt, size_t size)
+{
+	mem_unmap(virt, size);
+}
+
+static int
+mem_rte_to_sys_prot(enum rte_mem_prot prot)
+{
+	int sys_prot = 0;
+
+	if (prot & RTE_PROT_READ)
+		sys_prot |= PROT_READ;
+	if (prot & RTE_PROT_WRITE)
+		sys_prot |= PROT_WRITE;
+	if (prot & RTE_PROT_EXECUTE)
+		sys_prot |= PROT_EXEC;
+
+	return sys_prot;
+}
+
+void *
+rte_mem_map(void *requested_addr, size_t size, enum rte_mem_prot prot,
+	enum rte_map_flags flags, int fd, size_t offset)
+{
+	int sys_prot = 0;
+	int sys_flags = 0;
+
+	sys_prot = mem_rte_to_sys_prot(prot);
+
+	if (flags & RTE_MAP_SHARED)
+		sys_flags |= MAP_SHARED;
+	if (flags & RTE_MAP_ANONYMOUS)
+		sys_flags |= MAP_ANONYMOUS;
+	if (flags & RTE_MAP_PRIVATE)
+		sys_flags |= MAP_PRIVATE;
+	if (flags & RTE_MAP_FIXED)
+		sys_flags |= MAP_FIXED;
+
+	return mem_map(requested_addr, size, sys_prot, sys_flags, fd, offset);
+}
+
+int
+rte_mem_unmap(void *virt, size_t size)
+{
+	return mem_unmap(virt, size);
+}
+
+int
+rte_get_page_size(void)
+{
+	return getpagesize();
+}
+
+int
+rte_mem_lock(const void *virt, size_t size)
+{
+	return mlock(virt, size);
+}
diff --git a/lib/librte_eal/rte_eal_exports.def b/lib/librte_eal/rte_eal_exports.def
index 12a6c79d6..bacf9a107 100644
--- a/lib/librte_eal/rte_eal_exports.def
+++ b/lib/librte_eal/rte_eal_exports.def
@@ -5,5 +5,9 @@ EXPORTS
 	rte_eal_mp_remote_launch
 	rte_eal_mp_wait_lcore
 	rte_eal_remote_launch
+	rte_get_page_size
 	rte_log
+	rte_mem_lock
+	rte_mem_map
+	rte_mem_unmap
 	rte_vlog
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f9ede5b41..07128898f 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -337,5 +337,9 @@ EXPERIMENTAL {
 	rte_thread_is_intr;
 
 	# added in 20.05
+	rte_get_page_size;
 	rte_log_can_log;
+	rte_mem_lock;
+	rte_mem_map;
+	rte_mem_unmap;
 };
diff --git a/lib/librte_eal/windows/eal/eal.c b/lib/librte_eal/windows/eal/eal.c
index 4932185ec..98afd8a68 100644
--- a/lib/librte_eal/windows/eal/eal.c
+++ b/lib/librte_eal/windows/eal/eal.c
@@ -339,6 +339,12 @@ rte_eal_init(int argc, char **argv)
 			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
 	}
 
+	if (eal_mem_win32api_init() < 0) {
+		rte_eal_init_alert("Cannot access Win32 memory management");
+		rte_errno = ENOTSUP;
+		return -1;
+	}
+
 	eal_thread_init_master(rte_config.master_lcore);
 
 	RTE_LCORE_FOREACH_SLAVE(i) {
diff --git a/lib/librte_eal/windows/eal/eal_memory.c b/lib/librte_eal/windows/eal/eal_memory.c
new file mode 100644
index 000000000..f8b312d7c
--- /dev/null
+++ b/lib/librte_eal/windows/eal/eal_memory.c
@@ -0,0 +1,433 @@
+#include <io.h>
+
+#include <rte_errno.h>
+#include <rte_memory.h>
+
+#include "eal_private.h"
+#include "eal_windows.h"
+
+/* MinGW-w64 headers lack VirtualAlloc2() in some distributions.
+ * Provide a copy of definitions and code to load it dynamically.
+ * Note: definitions are copied verbatim from Microsoft documentation
+ * and don't follow DPDK code style.
+ */
+#ifndef MEM_PRESERVE_PLACEHOLDER
+
+/* https://docs.microsoft.com/en-us/windows/win32/api/winnt/ne-winnt-mem_extended_parameter_type */
+typedef enum MEM_EXTENDED_PARAMETER_TYPE {
+	MemExtendedParameterInvalidType,
+	MemExtendedParameterAddressRequirements,
+	MemExtendedParameterNumaNode,
+	MemExtendedParameterPartitionHandle,
+	MemExtendedParameterMax,
+	MemExtendedParameterUserPhysicalHandle,
+	MemExtendedParameterAttributeFlags
+} *PMEM_EXTENDED_PARAMETER_TYPE;
+
+#define MEM_EXTENDED_PARAMETER_TYPE_BITS 4
+
+/* https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-mem_extended_parameter */
+typedef struct MEM_EXTENDED_PARAMETER {
+	struct {
+		DWORD64 Type : MEM_EXTENDED_PARAMETER_TYPE_BITS;
+		DWORD64 Reserved : 64 - MEM_EXTENDED_PARAMETER_TYPE_BITS;
+	} DUMMYSTRUCTNAME;
+	union {
+		DWORD64 ULong64;
+		PVOID   Pointer;
+		SIZE_T  Size;
+		HANDLE  Handle;
+		DWORD   ULong;
+	} DUMMYUNIONNAME;
+} MEM_EXTENDED_PARAMETER, *PMEM_EXTENDED_PARAMETER;
+
+/* https://docs.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-virtualalloc2 */
+typedef PVOID (*VirtualAlloc2_type)(
+	HANDLE                 Process,
+	PVOID                  BaseAddress,
+	SIZE_T                 Size,
+	ULONG                  AllocationType,
+	ULONG                  PageProtection,
+	MEM_EXTENDED_PARAMETER *ExtendedParameters,
+	ULONG                  ParameterCount
+);
+
+/* VirtualAlloc2() flags. */
+#define MEM_COALESCE_PLACEHOLDERS 0x00000001
+#define MEM_PRESERVE_PLACEHOLDER  0x00000002
+#define MEM_REPLACE_PLACEHOLDER   0x00004000
+#define MEM_RESERVE_PLACEHOLDER   0x00040000
+
+/* Named exactly as the function, so that user code does not depend
+ * on it being found at compile time or dynamically.
+ */
+static VirtualAlloc2_type VirtualAlloc2;
+
+int
+eal_mem_win32api_init(void)
+{
+	static const char library_name[] = "kernelbase.dll";
+	static const char function[] = "VirtualAlloc2";
+
+	OSVERSIONINFO info;
+	HMODULE library = NULL;
+	int ret = 0;
+
+	/* Already done. */
+	if (VirtualAlloc2 != NULL)
+		return 0;
+
+	/* IsWindows10OrGreater() may also be unavailable. */
+	memset(&info, 0, sizeof(info));
+	info.dwOSVersionInfoSize = sizeof(info);
+	GetVersionEx(&info);
+
+	/* Checking for Windows 10+ will also detect Windows Server 2016+.
+	 * Do not abort, because Windows may report false version depending
+	 * on executable manifest, compatibility mode, etc.
+	 */
+	if (info.dwMajorVersion < 10)
+		RTE_LOG(DEBUG, EAL, "Windows 10+ or Windows Server 2016+ "
+			"is required for advanced memory features\n");
+
+	library = LoadLibraryA(library_name);
+	if (library == NULL) {
+		RTE_LOG_WIN32_ERR("LoadLibraryA(\"%s\")", library_name);
+		return -1;
+	}
+
+	VirtualAlloc2 = (VirtualAlloc2_type)(
+		(void *)GetProcAddress(library, function));
+	if (VirtualAlloc2 == NULL) {
+		RTE_LOG_WIN32_ERR("GetProcAddress(\"%s\", \"%s\")\n",
+			library_name, function);
+		ret = -1;
+	}
+
+	FreeLibrary(library);
+
+	return ret;
+}
+
+#else
+
+/* Stub in case VirtualAlloc2() is provided by the compiler. */
+int
+eal_mem_win32api_init(void)
+{
+	return 0;
+}
+
+#endif /* no VirtualAlloc2() */
+
+/* Approximate error mapping from VirtualAlloc2() to POSIX mmap(3). */
+static int
+win32_alloc_error_to_errno(DWORD code)
+{
+	switch (code) {
+	case ERROR_SUCCESS:
+		return 0;
+
+	case ERROR_INVALID_ADDRESS:
+		/* A valid requested address is not available. */
+	case ERROR_COMMITMENT_LIMIT:
+		/* May occcur when committing regular memory. */
+	case ERROR_NO_SYSTEM_RESOURCES:
+		/* Occurs when the system runs out of hugepages. */
+		return ENOMEM;
+
+	case ERROR_INVALID_PARAMETER:
+	default:
+		return EINVAL;
+	}
+}
+
+void *
+eal_mem_reserve(void *requested_addr, size_t size,
+	enum rte_mem_reserve_flags flags)
+{
+	void *virt;
+
+	/* Windows requires hugepages to be committed. */
+	if (flags & RTE_RESERVE_HUGEPAGES) {
+		RTE_LOG(ERR, EAL, "Hugepage reservation is not supported\n");
+		rte_errno = ENOTSUP;
+		return NULL;
+	}
+
+	virt = VirtualAlloc2(GetCurrentProcess(), requested_addr, size,
+		MEM_RESERVE | MEM_RESERVE_PLACEHOLDER, PAGE_NOACCESS,
+		NULL, 0);
+	if (virt == NULL) {
+		RTE_LOG_WIN32_ERR("VirtualAlloc2()");
+		rte_errno = win32_alloc_error_to_errno(GetLastError());
+	}
+
+	if ((flags & RTE_RESERVE_EXACT_ADDRESS) && (virt != requested_addr)) {
+		if (!VirtualFree(virt, 0, MEM_RELEASE))
+			RTE_LOG_WIN32_ERR("VirtualFree()");
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	return virt;
+}
+
+void *
+eal_mem_alloc(size_t size, enum rte_page_sizes page_size)
+{
+	if (page_size != 0)
+		return eal_mem_alloc_socket(size, SOCKET_ID_ANY);
+
+	return VirtualAlloc(
+		NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+}
+
+void *
+eal_mem_alloc_socket(size_t size, int socket_id)
+{
+	DWORD flags = MEM_RESERVE | MEM_COMMIT;
+	void *addr;
+
+	flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+	addr = VirtualAllocExNuma(GetCurrentProcess(), NULL, size, flags,
+		PAGE_READWRITE, eal_socket_numa_node(socket_id));
+	if (addr == NULL)
+		rte_errno = ENOMEM;
+	return addr;
+}
+
+void*
+eal_mem_commit(void *requested_addr, size_t size, int socket_id)
+{
+	MEM_EXTENDED_PARAMETER param;
+	DWORD param_count = 0;
+	DWORD flags;
+	void *addr;
+
+	if (requested_addr != NULL) {
+		MEMORY_BASIC_INFORMATION info;
+		if (VirtualQuery(requested_addr, &info, sizeof(info)) == 0) {
+			RTE_LOG_WIN32_ERR("VirtualQuery()");
+			return NULL;
+		}
+
+		/* Split reserved region if only a part is committed. */
+		flags = MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER;
+		if ((info.RegionSize > size) &&
+			!VirtualFree(requested_addr, size, flags)) {
+			RTE_LOG_WIN32_ERR("VirtualFree(%p, %zu, "
+				"<split placeholder>)", requested_addr, size);
+			return NULL;
+		}
+	}
+
+	if (socket_id != SOCKET_ID_ANY) {
+		param_count = 1;
+		memset(&param, 0, sizeof(param));
+		param.Type = MemExtendedParameterNumaNode;
+		param.ULong = eal_socket_numa_node(socket_id);
+	}
+
+	flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+	if (requested_addr != NULL)
+		flags |= MEM_REPLACE_PLACEHOLDER;
+
+	addr = VirtualAlloc2(GetCurrentProcess(), requested_addr, size,
+		flags, PAGE_READWRITE, &param, param_count);
+	if (addr == NULL) {
+		int err = GetLastError();
+		RTE_LOG_WIN32_ERR("VirtualAlloc2(%p, %zu, "
+			"<replace placeholder>)", addr, size);
+		rte_errno = win32_alloc_error_to_errno(err);
+		return NULL;
+	}
+
+	return addr;
+}
+
+int
+eal_mem_decommit(void *addr, size_t size)
+{
+	if (!VirtualFree(addr, size, MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER)) {
+		RTE_LOG_WIN32_ERR("VirtualFree(%p, %zu, ...)", addr, size);
+		return -1;
+	}
+	return 0;
+}
+
+/**
+ * Free a reserved memory region in full or in part.
+ *
+ * @param addr
+ *  Starting address of the area to free.
+ * @param size
+ *  Number of bytes to free. Must be a multiple of page size.
+ * @param reserved
+ *  Fail if the region is not in reserved state.
+ * @return
+ *  * 0 on successful deallocation;
+ *  * 1 if region mut be in reserved state but it is not;
+ *  * (-1) on system API failures.
+ */
+static int
+mem_free(void *addr, size_t size, bool reserved)
+{
+	MEMORY_BASIC_INFORMATION info;
+	if (VirtualQuery(addr, &info, sizeof(info)) == 0) {
+		RTE_LOG_WIN32_ERR("VirtualQuery()");
+		return -1;
+	}
+
+	if (reserved && (info.State != MEM_RESERVE))
+		return 1;
+
+	/* Free complete region. */
+	if ((addr == info.AllocationBase) && (size == info.RegionSize)) {
+		if (!VirtualFree(addr, 0, MEM_RELEASE)) {
+			RTE_LOG_WIN32_ERR("VirtualFree(%p, 0, MEM_RELEASE)",
+				addr);
+		}
+		return 0;
+	}
+
+	/* Split the part to be freed and the remaining reservation. */
+	if (!VirtualFree(addr, size, MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER)) {
+		RTE_LOG_WIN32_ERR("VirtualFree(%p, %zu, "
+			"MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER)", addr, size);
+		return -1;
+	}
+
+	/* Actually free reservation part. */
+	if (!VirtualFree(addr, 0, MEM_RELEASE)) {
+		RTE_LOG_WIN32_ERR("VirtualFree(%p, 0, MEM_RELEASE)", addr);
+		return -1;
+	}
+
+	return 0;
+}
+
+void
+eal_mem_free(void *virt, size_t size)
+{
+	mem_free(virt, size, false);
+}
+
+void *
+rte_mem_map(void *requested_addr, size_t size, enum rte_mem_prot prot,
+	enum rte_map_flags flags, int fd, size_t offset)
+{
+	HANDLE file_handle = INVALID_HANDLE_VALUE;
+	HANDLE mapping_handle = INVALID_HANDLE_VALUE;
+	DWORD sys_prot = 0;
+	DWORD sys_access = 0;
+	DWORD size_high = (DWORD)(size >> 32);
+	DWORD size_low = (DWORD)size;
+	DWORD offset_high = (DWORD)(offset >> 32);
+	DWORD offset_low = (DWORD)offset;
+	LPVOID virt = NULL;
+
+	if (prot & RTE_PROT_EXECUTE) {
+		if (prot & RTE_PROT_READ) {
+			sys_prot = PAGE_EXECUTE_READ;
+			sys_access = FILE_MAP_READ | FILE_MAP_EXECUTE;
+		}
+		if (prot & RTE_PROT_WRITE) {
+			sys_prot = PAGE_EXECUTE_READWRITE;
+			sys_access = FILE_MAP_WRITE | FILE_MAP_EXECUTE;
+		}
+	} else {
+		if (prot & RTE_PROT_READ) {
+			sys_prot = PAGE_READONLY;
+			sys_access = FILE_MAP_READ;
+		}
+		if (prot & RTE_PROT_WRITE) {
+			sys_prot = PAGE_READWRITE;
+			sys_access = FILE_MAP_WRITE;
+		}
+	}
+
+	if (flags & RTE_MAP_PRIVATE)
+		sys_access |= FILE_MAP_COPY;
+
+	if ((flags & RTE_MAP_ANONYMOUS) == 0)
+		file_handle = (HANDLE)_get_osfhandle(fd);
+
+	mapping_handle = CreateFileMapping(
+		file_handle, NULL, sys_prot, size_high, size_low, NULL);
+	if (mapping_handle == INVALID_HANDLE_VALUE) {
+		RTE_LOG_WIN32_ERR("CreateFileMapping()");
+		return NULL;
+	}
+
+	/* TODO: there is a race for the requested_addr between mem_free()
+	 * and MapViewOfFileEx(). MapViewOfFile3() that can replace a reserved
+	 * region with a mapping in a single operation, but it does not support
+	 * private mappings.
+	 */
+	if (requested_addr != NULL) {
+		int ret = mem_free(requested_addr, size, true);
+		if (ret) {
+			if (ret > 0) {
+				RTE_LOG(ERR, EAL, "Cannot map memory "
+					"to a region not reserved\n");
+				rte_errno = EADDRNOTAVAIL;
+			}
+			return NULL;
+		}
+	}
+
+	virt = MapViewOfFileEx(mapping_handle, sys_access,
+		offset_high, offset_low, size, requested_addr);
+	if (!virt) {
+		RTE_LOG_WIN32_ERR("MapViewOfFileEx()");
+		return NULL;
+	}
+
+	if ((flags & RTE_MAP_FIXED) && (virt != requested_addr)) {
+		BOOL ret = UnmapViewOfFile(virt);
+		virt = NULL;
+		if (!ret)
+			RTE_LOG_WIN32_ERR("UnmapViewOfFile()");
+	}
+
+	if (!CloseHandle(mapping_handle))
+		RTE_LOG_WIN32_ERR("CloseHandle()");
+
+	return virt;
+}
+
+int
+rte_mem_unmap(void *virt, size_t size)
+{
+	RTE_SET_USED(size);
+
+	if (!UnmapViewOfFile(virt)) {
+		rte_errno = GetLastError();
+		RTE_LOG_WIN32_ERR("UnmapViewOfFile()");
+		return -1;
+	}
+	return 0;
+}
+
+int
+rte_get_page_size(void)
+{
+	SYSTEM_INFO info;
+	GetSystemInfo(&info);
+	return info.dwPageSize;
+}
+
+int
+rte_mem_lock(const void *virt, size_t size)
+{
+	/* VirtualLock() takes `void*`, work around compiler warning. */
+	void *addr = (void *)((uintptr_t)virt);
+
+	if (!VirtualLock(addr, size)) {
+		RTE_LOG_WIN32_ERR("VirtualLock()");
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/lib/librte_eal/windows/eal/eal_windows.h b/lib/librte_eal/windows/eal/eal_windows.h
index 390d2fd66..002d7e4a5 100644
--- a/lib/librte_eal/windows/eal/eal_windows.h
+++ b/lib/librte_eal/windows/eal/eal_windows.h
@@ -36,4 +36,55 @@ int eal_thread_create(pthread_t *thread);
  */
 unsigned int eal_socket_numa_node(unsigned int socket_id);
 
+/**
+ * Locate Win32 memory management routines in system libraries.
+ *
+ * @return 0 on success, (-1) on failure.
+ */
+int eal_mem_win32api_init(void);
+
+/**
+ * Allocate new memory in hugepages on the specified NUMA node.
+ *
+ * @param size
+ *  Number of bytes to allocate. Must be a multiple of huge page size.
+ * @param socket_id
+ *  Socket ID.
+ * @return
+ *  Address of the memory allocated on success or NULL on failure.
+ */
+void *eal_mem_alloc_socket(size_t size, int socket_id);
+
+/**
+ * Commit memory previously reserved with @ref eal_mem_reserve()
+ * or decommitted from hugepages by @ref eal_mem_decommit().
+ *
+ * @param requested_addr
+ *  Address within a reserved region. Must not be NULL.
+ * @param size
+ *  Number of bytes to commit. Must be a multiple of page size.
+ * @param socket_id
+ *  Socket ID to allocate on. Can be SOCKET_ID_ANY.
+ * @return
+ *  On success, address of the committed memory, that is, requested_addr.
+ *  On failure, NULL and @code rte_errno @endcode is set.
+ */
+void *eal_mem_commit(void *requested_addr, size_t size, int socket_id);
+
+/**
+ * Put allocated or committed memory back into reserved state.
+ *
+ * @param addr
+ *  Address of the region to decommit.
+ * @param size
+ *  Number of bytes to decommit.
+ *
+ * The @code addr @endcode and @code param @endcode must match
+ * location and size of previously allocated or committed region.
+ *
+ * @return
+ *  0 on success, (-1) on failure.
+ */
+int eal_mem_decommit(void *addr, size_t size);
+
 #endif /* _EAL_WINDOWS_H_ */
diff --git a/lib/librte_eal/windows/eal/meson.build b/lib/librte_eal/windows/eal/meson.build
index 8b407c9ae..7b0f03f0a 100644
--- a/lib/librte_eal/windows/eal/meson.build
+++ b/lib/librte_eal/windows/eal/meson.build
@@ -24,6 +24,7 @@ env_sources = files('eal.c',
 	'eal_debug.c',
 	'eal_hugepages.c',
 	'eal_lcore.c',
+	'eal_memory.c',
 	'eal_thread.c',
 	'getopt.c',
 )
-- 
2.25.1



More information about the dev mailing list