[dpdk-dev] [PATCH v3] Implement memcmp using Intel SIMD intrinsics.

Ravi Kerur rkerur at gmail.com
Mon May 18 22:01:43 CEST 2015


This patch implements rte_memcmp() using Intel AVX/SSE intrinsics and
uses librte_hash as the first candidate to exercise it.

Tested with GCC (4.8.2) and Clang (3.4-1); with both compilers,
rte_memcmp shows better performance than memcmp on an Intel(R) Core(TM)
i7-4790 CPU @ 3.60GHz running Ubuntu 14.04 x86_64.
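
For reference, a minimal sketch of how a caller might use the new
rte_memcmp() API (illustrative only; the buffers and length below are
hypothetical and not part of this patch):

    #include <rte_memcmp.h>

    uint8_t key_a[64], key_b[64];
    /* ... fill key_a and key_b ... */

    int ret = rte_memcmp(key_a, key_b, sizeof(key_a));
    if (ret == 0) {
            /* buffers are equal */
    } else if (ret < 0) {
            /* key_a compares less than key_b */
    } else {
            /* key_a compares greater than key_b */
    }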

Changes in v3:
Implement complete memcmp functionality.
Implement functional and performance tests and add them to the
"make test" infrastructure.

Changes in v2:
Modified code to support only up to 64 bytes, as that is the maximum
key length used by the hash library for comparison.

Changes in v1:
Initial changes to support memcmp for up to 128 bytes.

Signed-off-by: Ravi Kerur <rkerur at gmail.com>
---
 app/test/Makefile                                  |   5 +-
 app/test/autotest_data.py                          |  19 +
 app/test/test_hash_perf.c                          |  36 +-
 app/test/test_memcmp.c                             | 229 ++++++
 app/test/test_memcmp_perf.c                        | 339 ++++++++
 .../common/include/arch/ppc_64/rte_memcmp.h        |  62 ++
 .../common/include/arch/x86/rte_memcmp.h           | 900 +++++++++++++++++++++
 lib/librte_eal/common/include/generic/rte_memcmp.h | 175 ++++
 lib/librte_hash/rte_hash.c                         |  59 +-
 9 files changed, 1789 insertions(+), 35 deletions(-)
 create mode 100644 app/test/test_memcmp.c
 create mode 100644 app/test/test_memcmp_perf.c
 create mode 100644 lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h
 create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memcmp.h
 create mode 100644 lib/librte_eal/common/include/generic/rte_memcmp.h

diff --git a/app/test/Makefile b/app/test/Makefile
index 4aca77c..957e4f1 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -81,6 +81,9 @@ SRCS-y += test_logs.c
 SRCS-y += test_memcpy.c
 SRCS-y += test_memcpy_perf.c
 
+SRCS-y += test_memcmp.c
+SRCS-y += test_memcmp_perf.c
+
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash.c
 SRCS-$(CONFIG_RTE_LIBRTE_HASH) += test_hash_perf.c
 
@@ -150,7 +153,7 @@ CFLAGS_test_kni.o += -Wno-deprecated-declarations
 endif
 CFLAGS += -D_GNU_SOURCE
 
-# Disable VTA for memcpy test
+# Disable VTA for memcpy tests
 ifeq ($(CC), gcc)
 ifeq ($(shell test $(GCC_VERSION) -ge 44 && echo 1), 1)
 CFLAGS_test_memcpy.o += -fno-var-tracking-assignments
diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 618a946..e07f087 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -187,6 +187,12 @@ parallel_test_group_list = [
 		 "Report" :	None,
 		},
 		{
+		 "Name" :	"Memcmp autotest",
+		 "Command" : 	"memcmp_autotest",
+		 "Func" :	default_autotest,
+		 "Report" :	None,
+		},
+		{
 		 "Name" :	"Memzone autotest",
 		 "Command" : 	"memzone_autotest",
 		 "Func" :	default_autotest,
@@ -399,6 +405,19 @@ non_parallel_test_group_list = [
 	]
 },
 {
+	"Prefix":	"memcmp_perf",
+	"Memory" :	all_sockets(512),
+	"Tests" :
+	[
+		{
+		 "Name" :	"Memcmp performance autotest",
+		 "Command" : 	"memcmp_perf_autotest",
+		 "Func" :	default_autotest,
+		 "Report" :	None,
+		},
+	]
+},
+{
 	"Prefix":	"hash_perf",
 	"Memory" :	all_sockets(512),
 	"Tests" :	
diff --git a/app/test/test_hash_perf.c b/app/test/test_hash_perf.c
index 6eabb21..6887629 100644
--- a/app/test/test_hash_perf.c
+++ b/app/test/test_hash_perf.c
@@ -440,7 +440,7 @@ run_single_tbl_perf_test(const struct rte_hash *h, hash_operation func,
 		uint32_t *invalid_pos_count)
 {
 	uint64_t begin, end, ticks = 0;
-	uint8_t *key = NULL;
+	uint8_t * volatile key = NULL;
 	uint32_t *bucket_occupancies = NULL;
 	uint32_t num_buckets, i, j;
 	int32_t pos;
@@ -547,30 +547,30 @@ run_tbl_perf_test(struct tbl_perf_test_params *params)
 	case ADD_UPDATE:
 		num_iterations = params->num_iterations;
 		params->num_iterations = params->entries;
-		run_single_tbl_perf_test(handle, rte_hash_add_key, params,
-				&avg_occupancy, &invalid_pos);
-		params->num_iterations = num_iterations;
 		ticks = run_single_tbl_perf_test(handle, rte_hash_add_key,
 				params, &avg_occupancy, &invalid_pos);
+		params->num_iterations = num_iterations;
+		ticks += run_single_tbl_perf_test(handle, rte_hash_add_key,
+				params, &avg_occupancy, &invalid_pos);
 		break;
 	case DELETE:
 		num_iterations = params->num_iterations;
 		params->num_iterations = params->entries;
-		run_single_tbl_perf_test(handle, rte_hash_add_key, params,
-				&avg_occupancy, &invalid_pos);
+		ticks = run_single_tbl_perf_test(handle, rte_hash_add_key,
+				params, &avg_occupancy, &invalid_pos);
 
 		params->num_iterations = num_iterations;
-		ticks = run_single_tbl_perf_test(handle, rte_hash_del_key,
+		ticks += run_single_tbl_perf_test(handle, rte_hash_del_key,
 				params, &avg_occupancy, &invalid_pos);
 		break;
 	case LOOKUP:
 		num_iterations = params->num_iterations;
 		params->num_iterations = params->entries;
-		run_single_tbl_perf_test(handle, rte_hash_add_key, params,
-				&avg_occupancy, &invalid_pos);
+		ticks = run_single_tbl_perf_test(handle, rte_hash_add_key,
+				params, &avg_occupancy, &invalid_pos);
 
 		params->num_iterations = num_iterations;
-		ticks = run_single_tbl_perf_test(handle, rte_hash_lookup,
+		ticks += run_single_tbl_perf_test(handle, rte_hash_lookup,
 				params, &avg_occupancy, &invalid_pos);
 		break;
 	default: return -1;
@@ -623,10 +623,15 @@ static int run_all_tbl_perf_tests(void)
 static void run_hash_func_test(rte_hash_function f, uint32_t init_val,
 		uint32_t key_len)
 {
-	static uint8_t key[RTE_HASH_KEY_LENGTH_MAX];
+	static uint8_t * volatile key;
 	uint64_t ticks = 0, start, end;
 	unsigned i, j;
 
+	key = rte_zmalloc("func hash key",
+			  key_len * sizeof(uint8_t), 16);
+	if (key == NULL)
+		return;
+
 	for (i = 0; i < HASHTEST_ITERATIONS; i++) {
 
 		for (j = 0; j < key_len; j++)
@@ -638,8 +643,11 @@ static void run_hash_func_test(rte_hash_function f, uint32_t init_val,
 		ticks += end - start;
 	}
 
-	printf("%-12s, %-18u, %-13u, %.02f\n", get_hash_name(f), (unsigned) key_len,
-			(unsigned) init_val, (double)ticks / HASHTEST_ITERATIONS);
+	rte_free(key);
+
+	printf("%-12s, %-18u, %-13u, %.02f\n",
+		get_hash_name(f), (unsigned) key_len, (unsigned) init_val,
+		(double)ticks / HASHTEST_ITERATIONS);
 }
 
 /*
@@ -687,7 +695,7 @@ fbk_hash_perf_test(void)
 		.socket_id = rte_socket_id(),
 	};
 	struct rte_fbk_hash_table *handle = NULL;
-	uint32_t *keys = NULL;
+	uint32_t * volatile keys = NULL;
 	unsigned indexes[TEST_SIZE];
 	uint64_t lookup_time = 0;
 	unsigned added = 0;
diff --git a/app/test/test_memcmp.c b/app/test/test_memcmp.c
new file mode 100644
index 0000000..7d9c85f
--- /dev/null
+++ b/app/test/test_memcmp.c
@@ -0,0 +1,229 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <sys/queue.h>
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_cycles.h>
+#include <rte_random.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_memcmp.h>
+
+#include "test.h"
+
+/*******************************************************************************
+ * Memcmp functional test configuration section.
+ *
+ * The array below controls which buffer lengths are tested. Each length
+ * is exercised for equal, less-than and greater-than comparisons.
+ */
+static size_t memcmp_sizes[] = {
+	1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255,
+	256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518, 1522, 1600,
+	2048, 3072, 4096, 5120, 6144, 7168, 8192, 16384
+};
+
+/******************************************************************************/
+
+#define RTE_MEMCMP_LENGTH_MAX 16384
+
+/*
+ * Test a memcmp equal function.
+ */
+static int run_memcmp_eq_func_test(uint32_t len)
+{
+	uint32_t i, rc = 0;
+	uint8_t * volatile key = NULL;
+
+	key = rte_zmalloc("memcmp key", len * sizeof(uint8_t), 16);
+	if (key == NULL)
+		return -1;
+
+	for (i = 0; i < len; i++)
+		key[i] = (uint8_t) rte_rand();
+
+	rc = rte_memcmp(key, key, len);
+	rte_free(key);
+
+	return rc;
+}
+
+/*
+ * Test memcmp equal functions.
+ */
+static int run_memcmp_eq_func_tests(void)
+{
+	unsigned i;
+
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		if (run_memcmp_eq_func_test(memcmp_sizes[i])) {
+			printf("Comparing equal %zd bytes failed\n", memcmp_sizes[i]);
+			return 1;
+		}
+	}
+	printf("RTE memcmp for equality successful\n");
+	return 0;
+}
+
+/*
+ * Test a memcmp less than function.
+ */
+static int run_memcmp_lt_func_test(uint32_t len)
+{
+	uint32_t i, rc;
+	uint8_t * volatile key_1 = NULL;
+	uint8_t * volatile key_2 = NULL;
+
+	key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+	if (key_1 == NULL)
+		return -1;
+
+	key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+	if (key_2 == NULL)
+		return -1;
+
+	for (i = 0; i < len; i++)
+		key_1[i] = i;
+
+	for (i = 0; i < len; i++)
+		key_2[i] = 2;
+
+	rc = rte_memcmp(key_1, key_2, len);
+	rte_free(key_1);
+	rte_free(key_2);
+
+	return rc;
+}
+
+/*
+ * Test memcmp less than functions.
+ */
+static int run_memcmp_lt_func_tests(void)
+{
+	unsigned i;
+
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		if (!(run_memcmp_lt_func_test(memcmp_sizes[i]) < 0)) {
+			printf("Comparing less than for %zd bytes failed\n", memcmp_sizes[i]);
+			return 1;
+		}
+	}
+	printf("RTE memcmp for less than successful\n");
+	return 0;
+}
+
+/*
+ * Test a memcmp greater than function.
+ */
+static int run_memcmp_gt_func_test(uint32_t len)
+{
+	uint32_t i, rc;
+	uint8_t * volatile key_1 = NULL;
+	uint8_t * volatile key_2 = NULL;
+
+	key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+	if (key_1 == NULL)
+		return -1;
+
+	key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+	if (key_2 == NULL)
+		return -1;
+
+	for (i = 0; i < len; i++)
+		key_1[i] = 2;
+
+	for (i = 0; i < len; i++)
+		key_2[i] = i;
+
+	rc = rte_memcmp(key_1, key_2, len);
+	rte_free(key_1);
+	rte_free(key_2);
+
+	return rc;
+}
+
+/*
+ * Test memcmp greater than functions.
+ */
+static int run_memcmp_gt_func_tests(void)
+{
+	unsigned i;
+
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		if (!(run_memcmp_gt_func_test(memcmp_sizes[i]) > 0)) {
+			printf("Comparing greater than for %zd bytes failed\n", memcmp_sizes[i]);
+			return 1;
+		}
+	}
+	printf("RTE memcmp for greater than successful\n");
+	return 0;
+}
+
+/*
+ * Do all functional tests.
+ */
+static int
+test_memcmp(void)
+{
+	if (run_memcmp_eq_func_tests())
+		return -1;
+
+	if (run_memcmp_gt_func_tests())
+		return -1;
+
+	if (run_memcmp_lt_func_tests())
+		return -1;
+
+	return 0;
+}
+
+static struct test_command memcmp_cmd = {
+	.command = "memcmp_autotest",
+	.callback = test_memcmp,
+};
+REGISTER_TEST_COMMAND(memcmp_cmd);
diff --git a/app/test/test_memcmp_perf.c b/app/test/test_memcmp_perf.c
new file mode 100644
index 0000000..8b7a0c4
--- /dev/null
+++ b/app/test/test_memcmp_perf.c
@@ -0,0 +1,339 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <sys/queue.h>
+#include <sys/times.h>
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_cycles.h>
+#include <rte_random.h>
+#include <rte_memory.h>
+#include <rte_memcmp.h>
+
+#include "test.h"
+
+/*******************************************************************************
+ * Memcmp function performance test configuration section. Each performance test
+ * will be performed MEMCMP_ITERATIONS times.
+ *
+ * The two arrays below control which buffer lengths are tested: one for
+ * the equal case and one for the less-than/greater-than cases.
+ */
+#define MEMCMP_ITERATIONS (500 * 500 * 500)
+
+static size_t memcmp_sizes[] = {
+	2, 5, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
+	129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
+	449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1522, 1536, 1600,
+	2048, 2560, 3072, 3584, 4096, 4608, 5632, 6144, 6656, 7168, 7680, 8192,
+	16384
+};
+
+static size_t memcmp_lt_gt_sizes[] = {
+	16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192};
+
+/******************************************************************************/
+
+static int
+run_single_memcmp_eq_perf_test(uint32_t len, int func_type, uint64_t iterations)
+{
+	double begin = 0, end = 0;
+	uint64_t i, j, rc = 0;
+	uint8_t * volatile key = NULL;
+
+	key = rte_zmalloc("memcmp key", len * sizeof(uint8_t), 16);
+	if (key == NULL)
+		return -1;
+
+	/* Prepare inputs for the current iteration */
+	for (j = 0; j < len; j++)
+		key[j] = j / 64;
+
+	begin = rte_rdtsc();
+
+	/* Perform operation, and measure time it takes */
+	for (i = 0; i < iterations; i++) {
+
+		if (func_type == 1)
+			rc += rte_memcmp(key, key, len);
+		else
+			rc += memcmp(key, key, len);
+	}
+
+	end = rte_rdtsc() - begin;
+
+	printf(" *** %10i, %10.4f ***\n", len, (double)(end/iterations));
+
+	rte_free(key);
+
+	return rc;
+}
+
+/*
+ * Run all memcmp equal performance tests.
+ */
+static int run_all_memcmp_eq_perf_tests(void)
+{
+	unsigned i;
+
+	printf(" *** RTE memcmp equal performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_eq_perf_test(memcmp_sizes[i], 1,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+
+	printf(" *** memcmp equal performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_sizes) / sizeof(memcmp_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_eq_perf_test(memcmp_sizes[i], 2,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+	return 0;
+}
+
+static int
+run_single_memcmp_lt_perf_test(uint32_t len, int func_type,
+					uint64_t iterations)
+{
+	double begin = 0, end = 0;
+	uint64_t i, j;
+	uint8_t * volatile key_1 = NULL;
+	uint8_t * volatile key_2 = NULL;
+
+	key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+	if (key_1 == NULL)
+		return -1;
+
+	key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+	if (key_2 == NULL) {
+		rte_free(key_1);
+		return -1;
+	}
+
+	/* Prepare inputs for the current iteration */
+	for (j = 0; j < len; j++)
+		key_1[j] = 1;
+
+	for (j = 0; j < len; j++)
+		key_2[j] = 1;
+
+	key_2[len / 2] = 2;
+
+	begin = rte_rdtsc();
+
+	/* Perform operation, and measure time it takes */
+	for (i = 0; i < iterations; i++) {
+
+		if (func_type == 1) {
+			if (!(rte_memcmp(key_1, key_2, len) < 0))
+				return -1;
+		} else {
+			if (!(memcmp(key_1, key_2, len) < 0))
+				return -1;
+		}
+	}
+
+	end = rte_rdtsc() - begin;
+
+	printf(" *** %10i, %10.4f ***\n", len, (double)(end/iterations));
+
+	rte_free(key_1);
+	rte_free(key_2);
+
+	return 0;
+}
+
+/*
+ * Run all memcmp less than performance tests.
+ */
+static int run_all_memcmp_lt_perf_tests(void)
+{
+	unsigned i;
+
+	printf(" *** RTE memcmp less than performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_lt_perf_test(memcmp_lt_gt_sizes[i], 1,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+
+	printf(" *** memcmp less than performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_lt_perf_test(memcmp_lt_gt_sizes[i], 2,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+	return 0;
+}
+
+static int
+run_single_memcmp_gt_perf_test(uint32_t len, int func_type,
+					uint64_t iterations)
+{
+	double begin = 0, end = 0;
+	uint64_t i, j;
+	uint8_t * volatile key_1 = NULL;
+	uint8_t * volatile key_2 = NULL;
+
+	key_1 = rte_zmalloc("memcmp key_1", len * sizeof(uint8_t), 16);
+	if (key_1 == NULL)
+		return -1;
+
+	key_2 = rte_zmalloc("memcmp key_2", len * sizeof(uint8_t), 16);
+	if (key_2 == NULL) {
+		rte_free(key_1);
+		return -1;
+	}
+
+	/* Prepare inputs for the current iteration */
+	for (j = 0; j < len; j++)
+		key_1[j] = 1;
+	key_1[len / 2] = 2;
+
+	for (j = 0; j < len; j++)
+		key_2[j] = 1;
+
+	begin = rte_rdtsc();
+
+	/* Perform operation, and measure time it takes */
+	for (i = 0; i < iterations; i++) {
+
+		if (func_type == 1) {
+			if (!(rte_memcmp(key_1, key_2, len) > 0))
+				return -1;
+		} else {
+			if (!(memcmp(key_1, key_2, len) > 0))
+				return -1;
+		}
+	}
+
+	end = rte_rdtsc() - begin;
+
+	printf(" *** %10i, %10.4f ***\n", len, (double)(end/iterations));
+
+	rte_free(key_1);
+	rte_free(key_2);
+
+	return 0;
+}
+
+/*
+ * Run all memcmp greater than performance tests.
+ */
+static int run_all_memcmp_gt_perf_tests(void)
+{
+	unsigned i;
+
+	printf(" *** RTE memcmp greater than performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_gt_perf_test(memcmp_lt_gt_sizes[i], 1,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+
+	printf(" *** memcmp greater than performance test results ***\n");
+	printf(" *** Length (bytes), Ticks/Op. ***\n");
+
+	/* Loop through every combination of test parameters */
+	for (i = 0;
+	     i < sizeof(memcmp_lt_gt_sizes) / sizeof(memcmp_lt_gt_sizes[0]);
+	     i++) {
+		/* Perform test */
+		if (run_single_memcmp_gt_perf_test(memcmp_lt_gt_sizes[i], 2,
+						MEMCMP_ITERATIONS) != 0)
+			return -1;
+	}
+	return 0;
+}
+
+/*
+ * Do all performance tests.
+ */
+static int
+test_memcmp_perf(void)
+{
+	if (run_all_memcmp_eq_perf_tests() != 0)
+		return -1;
+
+	if (run_all_memcmp_lt_perf_tests() != 0)
+		return -1;
+
+	if (run_all_memcmp_gt_perf_tests() != 0)
+		return -1;
+
+	return 0;
+}
+
+static struct test_command memcmp_perf_cmd = {
+	.command = "memcmp_perf_autotest",
+	.callback = test_memcmp_perf,
+};
+REGISTER_TEST_COMMAND(memcmp_perf_cmd);
diff --git a/lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h b/lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h
new file mode 100644
index 0000000..6e54f3b
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/ppc_64/rte_memcmp.h
@@ -0,0 +1,62 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) IBM Corporation 2015.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of IBM Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _RTE_MEMCMP_PPC_64_H_
+#define _RTE_MEMCMP_PPC_64_H_
+
+#include <stdint.h>
+#include <string.h>
+/* To include altivec.h, GCC version must be >= 4.8 */
+#include <altivec.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcmp.h"
+
+#define rte_memcmp(dst, src, n)              \
+	({ (__builtin_constant_p(n)) ?       \
+	memcmp((dst), (src), (n)) :          \
+	rte_memcmp_func((dst), (src), (n)); })
+
+static inline int
+rte_memcmp_func(void *dst, const void *src, size_t n)
+{
+	return memcmp(dst, src, n);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCMP_PPC_64_H_ */
diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcmp.h b/lib/librte_eal/common/include/arch/x86/rte_memcmp.h
new file mode 100644
index 0000000..085dfb2
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcmp.h
@@ -0,0 +1,900 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMCMP_X86_64_H_
+#define _RTE_MEMCMP_X86_64_H_
+
+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2 implementation of memcmp().
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include <rte_vect.h>
+#include <rte_branch_prediction.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_memcmp(const void *src_1, const void *src_2,
+		size_t n) __attribute__((always_inline));
+
+/**
+ * Find the first different bit for comparison.
+ */
+static inline int
+rte_cmpffd(uint32_t x, uint32_t y)
+{
+	int i;
+	uint32_t pos = x ^ y;
+	for (i = 0; i < 32; i++)
+		if (pos & (1U << i))
+			return i;
+	return -1;
+}
+
+/**
+ * Find the first different byte for comparison.
+ */
+static inline int
+rte_cmpffdb (const uint8_t *x, const uint8_t *y, size_t n)
+{
+	size_t i;
+	for (i = 0; i < n; i++)
+		if (x[i] != y[i])
+			return x[i] - y[i];
+	return 0;
+}
+
+/**
+ * Compare 16 bytes between two locations.
+ * locations should not overlap.
+ */
+static inline int
+rte_cmp16(const void *src_1, const void *src_2)
+{
+	__m128i xmm0, xmm1, xmm2;
+
+	xmm0 = _mm_lddqu_si128((const __m128i *)src_1);
+	xmm1 = _mm_lddqu_si128((const __m128i *)src_2);
+	xmm2 = _mm_xor_si128(xmm0, xmm1);
+
+	if (unlikely(!_mm_testz_si128(xmm2, xmm2))) {
+
+		uint64_t mm11 = _mm_extract_epi64(xmm0, 0);
+		uint64_t mm12 = _mm_extract_epi64(xmm0, 1);
+
+		uint64_t mm21 = _mm_extract_epi64(xmm1, 0);
+		uint64_t mm22 = _mm_extract_epi64(xmm1, 1);
+
+		if (mm11 == mm21)
+			return rte_cmpffdb((const uint8_t *)&mm12,
+					(const uint8_t *)&mm22, 8);
+		else
+			return rte_cmpffdb((const uint8_t *)&mm11,
+					(const uint8_t *)&mm21, 8);
+	}
+
+	return 0;
+}
+
+/**
+ * Compare 0 to 15 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_memcmp_regular(const uint8_t *src_1u, const uint8_t *src_2u, size_t n)
+{
+	int ret = 1;
+
+	/**
+	 * Compare less than 16 bytes
+	 */
+	if (n & 0x08) {
+		ret = (*(const uint64_t *)src_1u ==
+				*(const uint64_t *)src_2u);
+
+		if ((ret != 1))
+			goto exit_8;
+
+		n -= 0x8;
+		src_1u += 0x8;
+		src_2u += 0x8;
+	}
+
+	if (n & 0x04) {
+		ret = (*(const uint32_t *)src_1u ==
+				*(const uint32_t *)src_2u);
+
+		if ((ret != 1))
+			goto exit_4;
+
+		n -= 0x4;
+		src_1u += 0x4;
+		src_2u += 0x4;
+	}
+
+	if (n & 0x02) {
+		ret = (*(const uint16_t *)src_1u ==
+				*(const uint16_t *)src_2u);
+
+		if ((ret != 1))
+			goto exit_2;
+
+		n -= 0x2;
+		src_1u += 0x2;
+		src_2u += 0x2;
+	}
+
+	if (n & 0x01) {
+		ret = (*(const uint8_t *)src_1u ==
+				*(const uint8_t *)src_2u);
+
+		if ((ret != 1))
+			goto exit_1;
+
+		n -= 0x1;
+		src_1u += 0x1;
+		src_2u += 0x1;
+	}
+
+	return !ret;
+
+exit_8:
+	return rte_cmpffdb(src_1u, src_2u, 8);
+exit_4:
+	return rte_cmpffdb(src_1u, src_2u, 4);
+exit_2:
+	return rte_cmpffdb(src_1u, src_2u, 2);
+exit_1:
+	return rte_cmpffdb(src_1u, src_2u, 1);
+}
+
+/**
+ * AVX2 implementation below
+ */
+#ifdef RTE_MACHINE_CPUFLAG_AVX2
+
+/**
+ * Compare 32 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp32(const void *src_1, const void *src_2)
+{
+	const __m128i* src1 = (const __m128i*)src_1;
+	const __m128i* src2 = (const __m128i*)src_2;
+	const uint8_t *s1, *s2;
+
+	__m128i mm11 = _mm_lddqu_si128(src1);
+	__m128i mm12 = _mm_lddqu_si128(src1 + 1);
+	__m128i mm21 = _mm_lddqu_si128(src2);
+	__m128i mm22 = _mm_lddqu_si128(src2 + 1);
+
+	__m128i mm1 = _mm_xor_si128(mm11, mm21);
+	__m128i mm2 = _mm_xor_si128(mm12, mm22);
+	__m128i mm = _mm_or_si128(mm1, mm2);
+
+	if (unlikely(!_mm_testz_si128(mm, mm))) {
+
+		/*
+		 * Find out which of the two 16-byte blocks
+		 * are different.
+		 */
+		if (_mm_testz_si128(mm1, mm1)) {
+			mm11 = mm12;
+			mm21 = mm22;
+			mm1 = mm2;
+			s1 = (const uint8_t *)(src1 + 1);
+			s2 = (const uint8_t *)(src2 + 1);
+		} else {
+			s1 = (const uint8_t *)src1;
+			s2 = (const uint8_t *)src2;
+		}
+
+		/* Produce the comparison result */
+		__m128i mm_cmp = _mm_cmpgt_epi8(mm11, mm21);
+		__m128i mm_rcmp = _mm_cmpgt_epi8(mm21, mm11);
+		mm_cmp = _mm_xor_si128(mm1, mm_cmp);
+		mm_rcmp = _mm_xor_si128(mm1, mm_rcmp);
+
+		uint32_t cmp = _mm_movemask_epi8(mm_cmp);
+		uint32_t rcmp = _mm_movemask_epi8(mm_rcmp);
+
+		int cmp_b = rte_cmpffd(cmp, rcmp);
+
+		int ret = (cmp_b == -1) ? 0 : (s1[cmp_b] - s2[cmp_b]);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * Compare 48 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp48(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp32((const uint8_t *)src_1 + 0 * 32,
+			(const uint8_t *)src_2 + 0 * 32);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 32,
+			(const uint8_t *)src_2 + 1 * 32);
+	return ret;
+}
+
+/**
+ * Compare 64 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp64 (const void* src_1, const void* src_2)
+{
+	const __m256i* src1 = (const __m256i*)src_1;
+	const __m256i* src2 = (const __m256i*)src_2;
+	const uint8_t *s1, *s2;
+
+	__m256i mm11 = _mm256_lddqu_si256(src1);
+	__m256i mm12 = _mm256_lddqu_si256(src1 + 1);
+	__m256i mm21 = _mm256_lddqu_si256(src2);
+	__m256i mm22 = _mm256_lddqu_si256(src2 + 1);
+
+	__m256i mm1 = _mm256_xor_si256(mm11, mm21);
+	__m256i mm2 = _mm256_xor_si256(mm12, mm22);
+	__m256i mm = _mm256_or_si256(mm1, mm2);
+
+	if (unlikely(!_mm256_testz_si256(mm, mm))) {
+		/*
+		 * Find out which of the two 32-byte blocks
+		 * are different.
+		 */
+		if (_mm256_testz_si256(mm1, mm1)) {
+			mm11 = mm12;
+			mm21 = mm22;
+			mm1 = mm2;
+			s1 = (const uint8_t *)(src1 + 1);
+			s2 = (const uint8_t *)(src2 + 1);
+		} else {
+			s1 = (const uint8_t *)src1;
+			s2 = (const uint8_t *)src2;
+		}
+
+		/* Produce the comparison result */
+		__m256i mm_cmp = _mm256_cmpgt_epi8(mm11, mm21);
+		__m256i mm_rcmp = _mm256_cmpgt_epi8(mm21, mm11);
+		mm_cmp = _mm256_xor_si256(mm1, mm_cmp);
+		mm_rcmp = _mm256_xor_si256(mm1, mm_rcmp);
+
+		uint32_t cmp = _mm256_movemask_epi8(mm_cmp);
+		uint32_t rcmp = _mm256_movemask_epi8(mm_rcmp);
+
+		int cmp_b = rte_cmpffd(cmp, rcmp);
+
+		int ret = (cmp_b == -1) ? 0 : (s1[cmp_b] - s2[cmp_b]);
+		return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * Compare 128 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp128(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
+			(const uint8_t *)src_2 + 0 * 64);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp64((const uint8_t *)src_1 + 1 * 64,
+			(const uint8_t *)src_2 + 1 * 64);
+}
+
+/**
+ * Compare 256 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp256(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp64((const uint8_t *)src_1 + 0 * 64,
+			(const uint8_t *)src_2 + 0 * 64);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp64((const uint8_t *)src_1 + 1 * 64,
+			(const uint8_t *)src_2 + 1 * 64);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp64((const uint8_t *)src_1 + 2 * 64,
+			(const uint8_t *)src_2 + 2 * 64);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp64((const uint8_t *)src_1 + 3 * 64,
+			(const uint8_t *)src_2 + 3 * 64);
+}
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
+{
+	const uint8_t *src_1 = (const uint8_t *)_src_1;
+	const uint8_t *src_2 = (const uint8_t *)_src_2;
+	int ret = 0;
+
+	if (n < 16)
+		return rte_memcmp_regular(src_1, src_2, n);
+
+	if (n <= 32) {
+		ret = rte_cmp16(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 48) {
+		ret = rte_cmp32(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 64) {
+		ret = rte_cmp32(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 32, src_2 + 32);
+
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 96) {
+		ret = rte_cmp64(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 64, src_2 + 64);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 128) {
+		ret = rte_cmp64(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp32(src_1 + 64, src_2 + 64);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 96, src_2 + 96);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+CMP_BLOCK_LESS_THAN_512:
+	if (n <= 512) {
+		if (n >= 256) {
+			ret = rte_cmp256(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			src_1 = src_1 + 256;
+			src_2 = src_2 + 256;
+			n -= 256;
+		}
+		if (n >= 128) {
+			ret = rte_cmp128(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			src_1 = src_1 + 128;
+			src_2 = src_2 + 128;
+			n -= 128;
+		}
+		if (n >= 64) {
+			n -= 64;
+			ret = rte_cmp64(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			src_1 = src_1 + 64;
+			src_2 = src_2 + 64;
+		}
+		if (n > 32) {
+			ret = rte_cmp32(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			ret = rte_cmp32(src_1 - 32 + n, src_2 - 32 + n);
+			return ret;
+		}
+		if (n > 0)
+			ret = rte_cmp32(src_1 - 32 + n, src_2 - 32 + n);
+
+		return ret;
+	}
+
+	while (n > 512) {
+		ret = rte_cmp256(src_1 + 0 * 256, src_2 + 0 * 256);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp256(src_1 + 1 * 256, src_2 + 1 * 256);
+		if (unlikely(ret != 0))
+			return ret;
+
+		src_1 = src_1 + 512;
+		src_2 = src_2 + 512;
+		n -= 512;
+	}
+	goto CMP_BLOCK_LESS_THAN_512;
+}
+
+#else /* RTE_MACHINE_CPUFLAG_AVX2 */
+
+/**
+ * Compare 32 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp32(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+}
+
+/**
+ * Compare 48 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp48(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 2 * 16,
+			(const uint8_t *)src_2 + 2 * 16);
+}
+
+/**
+ * Compare 64 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp64(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 2 * 16,
+			(const uint8_t *)src_2 + 2 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 3 * 16,
+			(const uint8_t *)src_2 + 3 * 16);
+}
+
+/**
+ * Compare 128 bytes or its multiple between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp128(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 2 * 16,
+			(const uint8_t *)src_2 + 2 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 3 * 16,
+			(const uint8_t *)src_2 + 3 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 4 * 16,
+			(const uint8_t *)src_2 + 4 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 5 * 16,
+			(const uint8_t *)src_2 + 5 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 6 * 16,
+			(const uint8_t *)src_2 + 6 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 7 * 16,
+			(const uint8_t *)src_2 + 7 * 16);
+}
+
+/**
+ * Compare 256 bytes between two locations.
+ * Locations should not overlap.
+ */
+static inline int
+rte_cmp256(const void *src_1, const void *src_2)
+{
+	int ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 0 * 16,
+			(const uint8_t *)src_2 + 0 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 1 * 16,
+			(const uint8_t *)src_2 + 1 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 2 * 16,
+			(const uint8_t *)src_2 + 2 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 3 * 16,
+			(const uint8_t *)src_2 + 3 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 4 * 16,
+			(const uint8_t *)src_2 + 4 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 5 * 16,
+			(const uint8_t *)src_2 + 5 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 6 * 16,
+			(const uint8_t *)src_2 + 6 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 7 * 16,
+			(const uint8_t *)src_2 + 7 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 8 * 16,
+			(const uint8_t *)src_2 + 8 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 9 * 16,
+			(const uint8_t *)src_2 + 9 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 10 * 16,
+			(const uint8_t *)src_2 + 10 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 11 * 16,
+			(const uint8_t *)src_2 + 11 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 12 * 16,
+			(const uint8_t *)src_2 + 12 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 13 * 16,
+			(const uint8_t *)src_2 + 13 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	ret = rte_cmp16((const uint8_t *)src_1 + 14 * 16,
+			(const uint8_t *)src_2 + 14 * 16);
+
+	if (unlikely(ret != 0))
+		return ret;
+
+	return rte_cmp16((const uint8_t *)src_1 + 15 * 16,
+			(const uint8_t *)src_2 + 15 * 16);
+}
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_memcmp(const void *_src_1, const void *_src_2, size_t n)
+{
+	const uint8_t *src_1 = (const uint8_t *)_src_1;
+	const uint8_t *src_2 = (const uint8_t *)_src_2;
+	int ret = 0;
+
+	if (n < 16)
+		return rte_memcmp_regular(src_1, src_2, n);
+
+	if (n <= 32) {
+		ret = rte_cmp16(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 48) {
+		ret = rte_cmp32(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 64) {
+		ret = rte_cmp32(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 32, src_2 + 32);
+
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 96) {
+		ret = rte_cmp64(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		ret = rte_cmp16(src_1 + 64, src_2 + 64);
+		if (unlikely(ret != 0))
+			return ret;
+
+		return rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+	}
+
+	if (n <= 128)
+		goto CMP_BLOCK_LESS_THAN_128;
+
+	if (n <= 512) {
+		if (n >= 256) {
+			ret = rte_cmp256(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+
+			src_1 = src_1 + 256;
+			src_2 = src_2 + 256;
+			n -= 256;
+		}
+
+CMP_BLOCK_LESS_THAN_256:
+		if (n >= 128) {
+			ret = rte_cmp128(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+
+			src_1 = src_1 + 128;
+			src_2 = src_2 + 128;
+			n -= 128;
+		}
+
+CMP_BLOCK_LESS_THAN_128:
+		if (n >= 64) {
+			ret = rte_cmp64(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+
+			src_1 = src_1 + 64;
+			src_2 = src_2 + 64;
+			n -= 64;
+		}
+
+		if (n >= 32) {
+			ret = rte_cmp32(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			src_1 = src_1 + 32;
+			src_2 = src_2 + 32;
+			n -= 32;
+		}
+		if (n > 16) {
+			ret = rte_cmp16(src_1, src_2);
+			if (unlikely(ret != 0))
+				return ret;
+			ret = rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+			return ret;
+		}
+		if (n > 0)
+			ret = rte_cmp16(src_1 - 16 + n, src_2 - 16 + n);
+
+		return ret;
+	}
+
+	for (; n >= 256; n -= 256) {
+		ret = rte_cmp256(src_1, src_2);
+		if (unlikely(ret != 0))
+			return ret;
+
+		src_1 = src_1 + 256;
+		src_2 = src_2 + 256;
+	}
+
+	goto CMP_BLOCK_LESS_THAN_256;
+}
+
+#endif /* RTE_MACHINE_CPUFLAG_AVX2 */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCMP_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_memcmp.h b/lib/librte_eal/common/include/generic/rte_memcmp.h
new file mode 100644
index 0000000..5e68036
--- /dev/null
+++ b/lib/librte_eal/common/include/generic/rte_memcmp.h
@@ -0,0 +1,175 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMCMP_H_
+#define _RTE_MEMCMP_H_
+
+/**
+ * @file
+ *
+ * Functions for vectorised implementation of memcmp().
+ */
+
+/**
+ * Find the first different bit for comparison.
+ */
+static inline int
+rte_cmpffd (uint32_t x, uint32_t y);
+
+/**
+ * Find the first different byte for comparison.
+ */
+static inline int
+rte_cmpffdb (const uint8_t *x, const uint8_t *y, size_t n);
+
+/**
+ * Compare 16 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp16(const void *src_1, const void *src_2);
+
+/**
+ * Compare 32 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp32(const void *src_1, const void *src_2);
+
+/**
+ * Compare 64 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp64(const void *src_1, const void *src_2);
+
+/**
+ * Compare 48 bytes between two locations using optimised
+ * instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp48(const void *src_1, const void *src_2);
+
+/**
+ * Compare 128 bytes between two locations using
+ * optimised instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp128(const void *src_1, const void *src_2);
+
+/**
+ * Compare 256 bytes or greater between two locations using
+ * optimised instructions. The locations should not overlap.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static inline int
+rte_cmp256(const void *src_1, const void *src_2);
+
+#ifdef __DOXYGEN__
+
+/**
+ * Compare bytes between two locations. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so its address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param src_1
+ *   Pointer to the first source of the data.
+ * @param src_2
+ *   Pointer to the second source of the data.
+ * @param n
+ *   Number of bytes to compare.
+ * @return
+ *   zero if src_1 equal src_2
+ *   -ve if src_1 less than src_2
+ *   +ve if src_1 greater than src_2
+ */
+static int
+rte_memcmp(const void *src_1, const void *src_2, size_t n);
+
+#endif /* __DOXYGEN__ */
+
+/*
+ * memcmp() function used by rte_memcmp macro
+ */
+static inline int
+rte_memcmp_func(void *dst, const void *src, size_t n) __attribute__((always_inline));
+
+#endif /* _RTE_MEMCMP_H_ */
diff --git a/lib/librte_hash/rte_hash.c b/lib/librte_hash/rte_hash.c
index 9245716..075da62 100644
--- a/lib/librte_hash/rte_hash.c
+++ b/lib/librte_hash/rte_hash.c
@@ -42,6 +42,7 @@
 #include <rte_memory.h>         /* for definition of RTE_CACHE_LINE_SIZE */
 #include <rte_log.h>
 #include <rte_memcpy.h>
+#include <rte_memcmp.h>
 #include <rte_prefetch.h>
 #include <rte_branch_prediction.h>
 #include <rte_memzone.h>
@@ -299,6 +300,7 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h,
 	uint8_t *key_bucket;
 	uint32_t bucket_index, i;
 	int32_t pos;
+	const void * volatile key_1 = key;
 
 	/* Get the hash signature and bucket index */
 	sig |= h->sig_msb;
@@ -308,10 +310,13 @@ __rte_hash_add_key_with_hash(const struct rte_hash *h,
 
 	/* Check if key is already present in the hash */
 	for (i = 0; i < h->bucket_entries; i++) {
-		if ((sig == sig_bucket[i]) &&
-		    likely(memcmp(key, get_key_from_bucket(h, key_bucket, i),
-				  h->key_len) == 0)) {
-			return bucket_index * h->bucket_entries + i;
+		if (sig == sig_bucket[i]) {
+
+			const void * volatile key_2 =
+				get_key_from_bucket(h, key_bucket, i);
+
+			if (likely(rte_memcmp(key_1, key_2, h->key_len) == 0))
+				return bucket_index * h->bucket_entries + i;
 		}
 	}
 
@@ -350,6 +355,8 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h,
 	uint8_t *key_bucket;
 	uint32_t bucket_index, i;
 
+	const void * volatile key_1 = key;
+
 	/* Get the hash signature and bucket index */
 	sig = sig | h->sig_msb;
 	bucket_index = sig & h->bucket_bitmask;
@@ -358,11 +365,14 @@ __rte_hash_del_key_with_hash(const struct rte_hash *h,
 
 	/* Check if key is already present in the hash */
 	for (i = 0; i < h->bucket_entries; i++) {
-		if ((sig == sig_bucket[i]) &&
-		    likely(memcmp(key, get_key_from_bucket(h, key_bucket, i),
-				  h->key_len) == 0)) {
-			sig_bucket[i] = NULL_SIGNATURE;
-			return bucket_index * h->bucket_entries + i;
+		if (sig == sig_bucket[i]) {
+			const void * volatile key_2 =
+				get_key_from_bucket(h, key_bucket, i);
+
+			if (likely(rte_memcmp(key_1, key_2, h->key_len) == 0)) {
+				sig_bucket[i] = NULL_SIGNATURE;
+				return bucket_index * h->bucket_entries + i;
+			}
 		}
 	}
 
@@ -392,6 +402,8 @@ __rte_hash_lookup_with_hash(const struct rte_hash *h,
 	uint8_t *key_bucket;
 	uint32_t bucket_index, i;
 
+	const void * volatile key_1 = key;
+
 	/* Get the hash signature and bucket index */
 	sig |= h->sig_msb;
 	bucket_index = sig & h->bucket_bitmask;
@@ -400,10 +412,13 @@ __rte_hash_lookup_with_hash(const struct rte_hash *h,
 
 	/* Check if key is already present in the hash */
 	for (i = 0; i < h->bucket_entries; i++) {
-		if ((sig == sig_bucket[i]) &&
-		    likely(memcmp(key, get_key_from_bucket(h, key_bucket, i),
-				  h->key_len) == 0)) {
-			return bucket_index * h->bucket_entries + i;
+		if (sig == sig_bucket[i]) {
+
+			const void * volatile key_2 =
+				get_key_from_bucket(h, key_bucket, i);
+
+			if (likely(rte_memcmp(key_1, key_2, h->key_len) == 0))
+				return bucket_index * h->bucket_entries + i;
 		}
 	}
 
@@ -456,13 +471,17 @@ rte_hash_lookup_bulk(const struct rte_hash *h, const void **keys,
 		positions[i] = -ENOENT;
 
 		for (j = 0; j < h->bucket_entries; j++) {
-			if ((sigs[i] == sig_bucket[j]) &&
-			    likely(memcmp(keys[i],
-					  get_key_from_bucket(h, key_bucket, j),
-					  h->key_len) == 0)) {
-				positions[i] = bucket_index *
-					h->bucket_entries + j;
-				break;
+			if (sigs[i] == sig_bucket[j]) {
+
+				const void * volatile key_1 = keys[i];
+				const void * volatile key_2 =
+					get_key_from_bucket(h, key_bucket, j);
+				if (likely(rte_memcmp(key_1, key_2,
+							h->key_len) == 0)) {
+					positions[i] = bucket_index *
+							h->bucket_entries + j;
+					break;
+				}
 			}
 		}
 	}
-- 
1.9.1


