[dpdk-dev] [PATCH v2] eal: make hugetlb initialization more robust

Jianfeng Tan jianfeng.tan at intel.com
Tue Mar 8 02:42:39 CET 2016


This patch adds an option, --huge-trybest, to use a recover mechanism to
the case that there are not so many hugepages (declared in sysfs), which
can be used. It relys on a mem access to fault-in hugepages, and if fails
with SIGBUS, recover to previously saved stack environment with
siglongjmp().

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
---
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/common/eal_common_options.c |   4 ++
 lib/librte_eal/common/eal_internal_cfg.h   |   1 +
 lib/librte_eal/common/eal_options.h        |   2 +
 lib/librte_eal/linuxapp/eal/eal.c          |   1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 104 ++++++++++++++++++++++++++---
 5 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..8ff6a2e 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ceac435..3e23877 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..e4e1f3b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling setjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int wrap_setjmp(void)
+{
+	return setjmp(jmpenv);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (wrap_setjmp()) {
+				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
 			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1187,6 +1270,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1459,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4



More information about the dev mailing list