[dpdk-dev] [PATCH 1/4] eal/common: introduce rte_memset on IA platform

Zhiyong Yang zhiyong.yang at intel.com
Mon Dec 5 09:26:24 CET 2016


Calling the glibc function memset from DPDK code causes a performance drop
in some cases. See the discussion about memset at
http://dpdk.org/ml/archives/dev/2016-October/048628.html
A more efficient function is needed to fix this.
One important advantage of rte_memset is that it gives us clear control
over which instruction flow is used. This patch supports the instruction
sets SSE & AVX (128 bits), AVX2 (256 bits) and AVX512 (512 bits).
rte_memset makes full use of vectorization and inline functions to improve
performance on IA. In addition, cache line and memory alignment are fully
taken into consideration.

Signed-off-by: Zhiyong Yang <zhiyong.yang at intel.com>
---
 .../common/include/arch/x86/rte_memset.h           | 376 +++++++++++++++++++++
 lib/librte_eal/common/include/generic/rte_memset.h |  51 +++
 2 files changed, 427 insertions(+)
 create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memset.h
 create mode 100644 lib/librte_eal/common/include/generic/rte_memset.h

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memset.h b/lib/librte_eal/common/include/arch/x86/rte_memset.h
new file mode 100644
index 0000000..3b2d3a3
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/x86/rte_memset.h
@@ -0,0 +1,376 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMSET_X86_64_H_
+#define _RTE_MEMSET_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file
+ *
+ * Functions for vectorised implementation of memset().
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+
+/**
+ * Fill the first n bytes of the memory area at dst with the byte value
+ * of a.
+ *
+ * Declared ahead of the per-instruction-set definitions further down so
+ * that whichever definition is compiled carries the always_inline
+ * attribute.
+ *
+ * @param dst
+ *   Start of the memory area to fill.
+ * @param a
+ *   Fill value; it is converted to a single byte before being stored.
+ * @param n
+ *   Number of bytes to fill.
+ * @return
+ *   The dst pointer that was passed in.
+ */
+static inline __attribute__((always_inline)) void *
+rte_memset(void *dst, int a, size_t n);
+
+/**
+ * Fill fewer than 16 bytes at dst with the byte value of a.
+ *
+ * Each of the low four bits of n selects one store of 1, 2, 4 or 8 bytes;
+ * together they cover every length from 0 to 15. Bits above the low four
+ * are ignored, so callers must pass n < 16.
+ *
+ * Stores go through memcpy() so that dst may have any alignment and no
+ * object is accessed through an incompatible pointer type.
+ *
+ * @param dst
+ *   Start of the memory area to fill.
+ * @param a
+ *   Fill value; only its conversion to uint8_t is used.
+ * @param n
+ *   Number of bytes to fill, expected to be below 16.
+ */
+static inline void
+rte_memset_less16(void *dst, int a, size_t n)
+{
+	uint8_t *p = (uint8_t *)dst;
+	const uint8_t byte = (uint8_t)a;
+	/*
+	 * Replicate the byte into all eight lanes with unsigned 64-bit
+	 * arithmetic; shifting a promoted int left by 16 would overflow
+	 * for byte values with the top bit set.
+	 */
+	const uint64_t pattern = UINT64_C(0x0101010101010101) * byte;
+
+	if (n & 0x01) {
+		*p = byte;
+		p += 1;
+	}
+	if (n & 0x02) {
+		/* Every byte of pattern is equal, so any prefix works. */
+		memcpy(p, &pattern, 2);
+		p += 2;
+	}
+	if (n & 0x04) {
+		memcpy(p, &pattern, 4);
+		p += 4;
+	}
+	if (n & 0x08)
+		memcpy(p, &pattern, 8);
+}
+
+/**
+ * Write 16 copies of the byte a starting at dst.
+ *
+ * A single unaligned 128-bit store is used, so dst needs no particular
+ * alignment.
+ */
+static inline void
+rte_memset16(uint8_t *dst, int8_t a)
+{
+	const __m128i fill = _mm_set1_epi8(a);
+
+	_mm_storeu_si128((__m128i *)dst, fill);
+}
+
+/**
+ * Fill n bytes at dst with the byte value of a, for 17 <= n <= 32.
+ *
+ * Two 16-byte stores are issued: one at the start of the area and one
+ * ending exactly at its last byte. They overlap whenever n < 32.
+ *
+ * @param dst
+ *   Start of the memory area to fill.
+ * @param a
+ *   Fill value, converted to a byte by rte_memset16().
+ * @param n
+ *   Number of bytes to fill; must be at least 16 so that the tail store
+ *   does not start before dst.
+ */
+static inline void
+rte_memset_17to32(void *dst, int a, size_t n)
+{
+	uint8_t *head = (uint8_t *)dst;
+
+	rte_memset16(head, a);
+	/*
+	 * Compute the offset first: "dst - 16 + n" would form a pointer
+	 * before the start of the buffer as an intermediate value.
+	 */
+	rte_memset16(head + (n - 16), a);
+}
+
+#ifdef RTE_MACHINE_CPUFLAG_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+/**
+ * Write 32 copies of the byte a starting at dst.
+ *
+ * A single unaligned 256-bit store is used, so dst needs no particular
+ * alignment.
+ */
+static inline void
+rte_memset32(uint8_t *dst, int8_t a)
+{
+	const __m256i fill = _mm256_set1_epi8(a);
+
+	_mm256_storeu_si256((__m256i *)dst, fill);
+}
+
+/**
+ * Write 64 copies of the byte a starting at dst.
+ *
+ * A single unaligned 512-bit store is used, so dst needs no particular
+ * alignment.
+ */
+static inline void
+rte_memset64(uint8_t *dst, int8_t a)
+{
+	const __m512i fill = _mm512_set1_epi8(a);
+
+	_mm512_storeu_si512((void *)dst, fill);
+}
+
+/**
+ * Fill as many whole 128-byte blocks as fit in n bytes, starting at dst.
+ *
+ * Any remainder below 128 bytes is left untouched for the caller.
+ * Aligned 512-bit stores are used, so dst must be 64-byte aligned.
+ */
+static inline void
+rte_memset128blocks(uint8_t *dst, int8_t a, size_t n)
+{
+	const __m512i fill = _mm512_set1_epi8(a);
+
+	for (; n >= 128; n -= 128, dst += 128) {
+		_mm512_store_si512((void *)dst, fill);
+		_mm512_store_si512((void *)(dst + 64), fill);
+	}
+}
+
+/**
+ * AVX512 variant: fill n bytes at dst with the byte value of a.
+ *
+ * The strategy depends on n:
+ *  - up to 64 bytes: dedicated small-size helpers using overlapping stores;
+ *  - 256 bytes and more: one unaligned 64-byte store to reach a 64-byte
+ *    boundary, then aligned 128-byte blocks;
+ *  - whatever remains (below 256 bytes): unaligned 64-byte stores, the
+ *    last of which ends exactly at the final byte and may overlap bytes
+ *    that were already written.
+ *
+ * @param dst
+ *   Start of the memory area to fill.
+ * @param a
+ *   Fill value, converted to a byte by the store helpers.
+ * @param n
+ *   Number of bytes to fill.
+ * @return
+ *   The dst pointer that was passed in.
+ */
+static inline void *
+rte_memset(void *dst, int a, size_t n)
+{
+	uint8_t *p = (uint8_t *)dst;
+	size_t misalign;
+	size_t head;
+
+	if (n < 16) {
+		rte_memset_less16(dst, a, n);
+		return dst;
+	}
+	if (n == 16) {
+		rte_memset16(p, a);
+		return dst;
+	}
+	if (n <= 32) {
+		rte_memset_17to32(dst, a, n);
+		return dst;
+	}
+	if (n <= 64) {
+		/*
+		 * Offsets are computed before being added to p so that no
+		 * intermediate pointer lies before the start of the buffer.
+		 */
+		rte_memset32(p, a);
+		rte_memset32(p + (n - 32), a);
+		return dst;
+	}
+	if (n >= 256) {
+		misalign = (uintptr_t)p & 0x3F;
+		if (misalign > 0) {
+			/* Cover the unaligned head, then step to the boundary. */
+			head = 64 - misalign;
+			rte_memset64(p, a);
+			p += head;
+			n -= head;
+		}
+		rte_memset128blocks(p, a, n);
+		/* Skip the bytes the block loop has just written. */
+		p += n & ~(size_t)127;
+		n &= 127;
+	}
+	if (n > 128) {
+		rte_memset64(p, a);
+		rte_memset64(p + 64, a);
+		p += 128;
+		n -= 128;
+	}
+	if (n > 64) {
+		rte_memset64(p, a);
+		rte_memset64(p + (n - 64), a);
+		return dst;
+	}
+	/*
+	 * 1..64 bytes left; at least 128 bytes precede p here, so the store
+	 * may reach back into the area that is already filled.
+	 */
+	if (n > 0)
+		rte_memset64(p + n - 64, a);
+	return dst;
+}
+
+#elif defined RTE_MACHINE_CPUFLAG_AVX2
+
+/**
+ *  AVX2 implementation below
+ */
+
+/**
+ * Write 32 copies of the byte a starting at dst.
+ *
+ * A single unaligned 256-bit store is used, so dst needs no particular
+ * alignment.
+ */
+static inline void
+rte_memset32(uint8_t *dst, int8_t a)
+{
+	const __m256i fill = _mm256_set1_epi8(a);
+
+	_mm256_storeu_si256((__m256i *)dst, fill);
+}
+
+/**
+ * Fill n bytes at dst with the byte value of a, for 33 <= n <= 64.
+ *
+ * Two 32-byte stores are issued: one at the start of the area and one
+ * ending exactly at its last byte. They overlap whenever n < 64.
+ *
+ * @param dst
+ *   Start of the memory area to fill.
+ * @param a
+ *   Fill value, converted to a byte by rte_memset32().
+ * @param n
+ *   Number of bytes to fill; must be at least 32 so that the tail store
+ *   does not start before dst.
+ */
+static inline void
+rte_memset_33to64(void *dst, int a, size_t n)
+{
+	uint8_t *head = (uint8_t *)dst;
+
+	rte_memset32(head, a);
+	/*
+	 * Compute the offset first: "dst - 32 + n" would form a pointer
+	 * before the start of the buffer as an intermediate value.
+	 */
+	rte_memset32(head + (n - 32), a);
+}
+
+/**
+ * Fill as many whole 64-byte blocks as fit in n bytes, starting at dst.
+ *
+ * Any remainder below 64 bytes is left untouched for the caller.
+ * Aligned 256-bit stores are used, so dst must be 32-byte aligned.
+ */
+static inline void
+rte_memset64blocks(uint8_t *dst, int8_t a, size_t n)
+{
+	const __m256i fill = _mm256_set1_epi8(a);
+
+	for (; n >= 64; n -= 64, dst += 64) {
+		__m256i *out = (__m256i *)dst;
+
+		_mm256_store_si256(out, fill);
+		_mm256_store_si256(out + 1, fill);
+	}
+}
+
+/**
+ * AVX2 variant: fill n bytes at dst with the byte value of a.
+ *
+ * Sizes up to 64 bytes are handed to the small-size helpers. Larger
+ * areas get one unaligned 32-byte store to reach a 32-byte boundary,
+ * then aligned 64-byte blocks, then a tail of at most 63 bytes written
+ * with unaligned stores that may overlap bytes already filled.
+ *
+ * @param dst
+ *   Start of the memory area to fill.
+ * @param a
+ *   Fill value, converted to a byte by the store helpers.
+ * @param n
+ *   Number of bytes to fill.
+ * @return
+ *   The dst pointer that was passed in.
+ */
+static inline void *
+rte_memset(void *dst, int a, size_t n)
+{
+	uint8_t *cur = (uint8_t *)dst;
+	size_t head;
+
+	if (n < 16) {
+		rte_memset_less16(dst, a, n);
+		return dst;
+	}
+	if (n == 16) {
+		rte_memset16(cur, a);
+		return dst;
+	}
+	if (n <= 32) {
+		rte_memset_17to32(dst, a, n);
+		return dst;
+	}
+	if (n <= 64) {
+		rte_memset_33to64(dst, a, n);
+		return dst;
+	}
+
+	/* From here on n > 64. */
+	head = (uintptr_t)cur & 0x1F;
+	if (head != 0) {
+		/* Cover the unaligned head, then step to the boundary. */
+		head = 32 - head;
+		rte_memset32(cur, a);
+		cur += head;
+		n -= head;
+	}
+
+	rte_memset64blocks(cur, a, n);
+	/* Skip the bytes the block loop has just written. */
+	cur += n - (n & 63);
+	n &= 63;
+
+	if (n > 32)
+		rte_memset_33to64(cur, a, n);
+	else if (n > 0)
+		rte_memset32(cur - 32 + n, a);
+	return dst;
+}
+
+#else /* RTE_MACHINE_CPUFLAG */
+
+/**
+ * SSE && AVX implementation below
+ */
+
+/**
+ * Write 32 copies of the byte a starting at dst.
+ *
+ * Two back-to-back unaligned 128-bit stores are used, so dst needs no
+ * particular alignment.
+ */
+static inline void
+rte_memset32(uint8_t *dst, int8_t a)
+{
+	const __m128i fill = _mm_set1_epi8(a);
+	__m128i *out = (__m128i *)dst;
+
+	_mm_storeu_si128(out, fill);
+	_mm_storeu_si128(out + 1, fill);
+}
+
+/**
+ * Fill as many whole 16-byte blocks as fit in n bytes, starting at dst.
+ *
+ * Any remainder below 16 bytes is left untouched for the caller.
+ * Aligned 128-bit stores are used, so dst must be 16-byte aligned.
+ */
+static inline void
+rte_memset16blocks(uint8_t *dst, int8_t a, size_t n)
+{
+	const __m128i fill = _mm_set1_epi8(a);
+
+	for (; n >= 16; n -= 16, dst += 16)
+		_mm_store_si128((__m128i *)dst, fill);
+}
+
+/**
+ * Fill as many whole 64-byte blocks as fit in n bytes, starting at dst.
+ *
+ * Any remainder below 64 bytes is left untouched for the caller.
+ * Aligned 128-bit stores are used, so dst must be 16-byte aligned.
+ */
+static inline void
+rte_memset64blocks(uint8_t *dst, int8_t a, size_t n)
+{
+	const __m128i fill = _mm_set1_epi8(a);
+
+	for (; n >= 64; n -= 64, dst += 64) {
+		__m128i *out = (__m128i *)dst;
+
+		_mm_store_si128(out, fill);
+		_mm_store_si128(out + 1, fill);
+		_mm_store_si128(out + 2, fill);
+		_mm_store_si128(out + 3, fill);
+	}
+}
+
+/**
+ * SSE/AVX variant: fill n bytes at dst with the byte value of a.
+ *
+ * Sizes up to 64 bytes are written with a few unaligned 16-byte stores,
+ * the last of which ends exactly at the final byte. Larger areas get one
+ * unaligned 16-byte store to reach a 16-byte boundary, then aligned
+ * 64-byte blocks, then aligned 16-byte blocks, then one unaligned store
+ * for a remaining tail of 1..15 bytes, which overlaps bytes already
+ * filled.
+ *
+ * @param dst
+ *   Start of the memory area to fill.
+ * @param a
+ *   Fill value, converted to a byte by the store helpers.
+ * @param n
+ *   Number of bytes to fill.
+ * @return
+ *   The dst pointer that was passed in.
+ */
+static inline void *
+rte_memset(void *dst, int a, size_t n)
+{
+	uint8_t *p = (uint8_t *)dst;
+	size_t misalign;
+	size_t head;
+
+	if (n < 16) {
+		rte_memset_less16(dst, a, n);
+		return dst;
+	}
+	if (n == 16) {
+		rte_memset16(p, a);
+		return dst;
+	}
+	if (n <= 32) {
+		rte_memset_17to32(dst, a, n);
+		return dst;
+	}
+	if (n <= 48) {
+		/*
+		 * Offsets are computed before being added to p so that no
+		 * intermediate pointer lies before the start of the buffer.
+		 */
+		rte_memset32(p, a);
+		rte_memset16(p + (n - 16), a);
+		return dst;
+	}
+	if (n <= 64) {
+		rte_memset32(p, a);
+		rte_memset16(p + 32, a);
+		rte_memset16(p + (n - 16), a);
+		return dst;
+	}
+
+	/* From here on n > 64. */
+	misalign = (uintptr_t)p & 0xF;
+	if (misalign > 0) {
+		/* Cover the unaligned head, then step to the boundary. */
+		head = 16 - misalign;
+		rte_memset16(p, a);
+		p += head;
+		n -= head;
+	}
+
+	rte_memset64blocks(p, a, n);
+	/* Skip the bytes the 64-byte block loop has just written. */
+	p += n & ~(size_t)63;
+	n &= 63;
+
+	rte_memset16blocks(p, a, n);
+	/* Skip the bytes the 16-byte block loop has just written. */
+	p += n & ~(size_t)15;
+	n &= 15;
+
+	/* More than 16 bytes precede p here, so reaching back is safe. */
+	if (n > 0)
+		rte_memset16(p + n - 16, a);
+	return dst;
+}
+
+#endif /* RTE_MACHINE_CPUFLAG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMSET_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_memset.h b/lib/librte_eal/common/include/generic/rte_memset.h
new file mode 100644
index 0000000..416a638
--- /dev/null
+++ b/lib/librte_eal/common/include/generic/rte_memset.h
@@ -0,0 +1,51 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMSET_H_
+#define _RTE_MEMSET_H_
+
+/**
+ * @file
+ *
+ * Functions for vectorised implementation of memset().
+ */
+#ifndef _RTE_MEMSET_X86_64_H_
+
+/*
+ * The x86 implementation header has not been included: fall back to the
+ * C library memset(). <string.h> is pulled in here so that the macro
+ * expands to a declared function and this header is self-contained.
+ */
+#include <string.h>
+
+#define rte_memset memset
+
+#else
+
+/*
+ * The x86 implementation header has already been included and provides
+ * the definition; only the prototype is repeated here.
+ */
+static void *
+rte_memset(void *dst, int a, size_t n);
+
+#endif
+#endif /* _RTE_MEMSET_H_ */
-- 
2.7.4



More information about the dev mailing list