[dpdk-dev] [PATCH 1/4] eal/common: introduce rte_memset on IA platform
Zhiyong Yang
zhiyong.yang at intel.com
Mon Dec 5 09:26:24 CET 2016
Performance drops have been observed in some cases when DPDK code calls
the glibc memset; see the discussion about memset at
http://dpdk.org/ml/archives/dev/2016-October/048628.html
A more efficient function is needed to fix this.
One important property of rte_memset is that it gives clear control over
which instruction flow is used. This patch supports the SSE & AVX
(128-bit), AVX2 (256-bit) and AVX512 (512-bit) instruction sets.
rte_memset makes full use of vectorization and function inlining to
improve performance on IA. In addition, cache-line and memory alignment
are fully taken into account.
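As a minimal usage sketch (hypothetical caller code, not part of this
patch), rte_memset is a drop-in replacement for memset and likewise
returns the destination pointer:

    #include <rte_memset.h>

    static void
    clear_buffer(void)
    {
        char buf[256];

        /* fills buf with zero bytes, like memset(buf, 0, sizeof(buf)) */
        rte_memset(buf, 0, sizeof(buf));
    }

On targets without an architecture-specific implementation, the generic
header simply maps rte_memset to the libc memset, so callers need no
conditional code.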
Signed-off-by: Zhiyong Yang <zhiyong.yang at intel.com>
---
.../common/include/arch/x86/rte_memset.h | 376 +++++++++++++++++++++
lib/librte_eal/common/include/generic/rte_memset.h | 51 +++
2 files changed, 427 insertions(+)
create mode 100644 lib/librte_eal/common/include/arch/x86/rte_memset.h
create mode 100644 lib/librte_eal/common/include/generic/rte_memset.h
diff --git a/lib/librte_eal/common/include/arch/x86/rte_memset.h b/lib/librte_eal/common/include/arch/x86/rte_memset.h
new file mode 100644
index 0000000..3b2d3a3
--- /dev/null
+++ b/lib/librte_eal/common/include/arch/x86/rte_memset.h
@@ -0,0 +1,376 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMSET_X86_64_H_
+#define _RTE_MEMSET_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file
+ *
+ * Functions for a vectorised implementation of memset().
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+
+static inline void *
+rte_memset(void *dst, int a, size_t n) __attribute__((always_inline));
+
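+/*
+ * Handle sizes below 16 bytes: decompose n into its 1-, 2-, 4- and
+ * 8-byte components and write each component once with a scalar store.
+ */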
+static inline void
+rte_memset_less16(void *dst, int a, size_t n)
+{
+ uintptr_t dstu = (uintptr_t)dst;
+
+ if (n & 0x01) {
+ *(uint8_t *)dstu = (uint8_t)a;
+ dstu = (uintptr_t)((uint8_t *)dstu + 1);
+ }
+ if (n & 0x02) {
+ *(uint16_t *)dstu = (uint8_t)a | (((uint8_t)a) << 8);
+ dstu = (uintptr_t)((uint16_t *)dstu + 1);
+ }
+ if (n & 0x04) {
+ uint16_t b = ((uint8_t)a | (((uint8_t)a) << 8));
+
+ *(uint32_t *)dstu = (uint32_t)(b | (b << 16));
+ dstu = (uintptr_t)((uint32_t *)dstu + 1);
+ }
+ if (n & 0x08) {
+ uint16_t b = ((uint8_t)a | (((uint8_t)a) << 8));
+ uint32_t c = b | (b << 16);
+
+ *(uint32_t *)dstu = c;
+ *((uint32_t *)dstu + 1) = c;
+ dstu = (uintptr_t)((uint32_t *)dstu + 2);
+ }
+}
+
+static inline void
+rte_memset16(uint8_t *dst, int8_t a)
+{
+ __m128i xmm0;
+
+ xmm0 = _mm_set1_epi8(a);
+ _mm_storeu_si128((__m128i *)dst, xmm0);
+}
+
+static inline void
+rte_memset_17to32(void *dst, int a, size_t n)
+{
+ rte_memset16((uint8_t *)dst, a);
+ rte_memset16((uint8_t *)dst - 16 + n, a);
+}
+
+#ifdef RTE_MACHINE_CPUFLAG_AVX512F
+
+/**
+ * AVX512 implementation below
+ */
+
+static inline void
+rte_memset32(uint8_t *dst, int8_t a)
+{
+ __m256i ymm0;
+
+ ymm0 = _mm256_set1_epi8(a);
+ _mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+static inline void
+rte_memset64(uint8_t *dst, int8_t a)
+{
+ __m512i zmm0;
+
+ zmm0 = _mm512_set1_epi8(a);
+ _mm512_storeu_si512((void *)dst, zmm0);
+}
+
+static inline void
+rte_memset128blocks(uint8_t *dst, int8_t a, size_t n)
+{
+ __m512i zmm0;
+
+ zmm0 = _mm512_set1_epi8(a);
+ while (n >= 128) {
+ n -= 128;
+ _mm512_store_si512((void *)(dst + 0 * 64), zmm0);
+ _mm512_store_si512((void *)(dst + 1 * 64), zmm0);
+ dst = dst + 128;
+ }
+}
+
+static inline void *
+rte_memset(void *dst, int a, size_t n)
+{
+ void *ret = dst;
+ size_t dstofss;
+ size_t bits;
+
+ if (n < 16) {
+ rte_memset_less16(dst, a, n);
+ return ret;
+ } else if (n == 16) {
+ rte_memset16((uint8_t *)dst, a);
+ return ret;
+ }
+ if (n <= 32) {
+ rte_memset_17to32(dst, a, n);
+ return ret;
+ }
+ if (n <= 64) {
+ rte_memset32((uint8_t *)dst, a);
+ rte_memset32((uint8_t *)dst - 32 + n, a);
+ return ret;
+ }
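+ /*
+ * For large sizes, first bring dst up to a 64-byte boundary with one
+ * unaligned 64-byte store, then fill 128-byte blocks using aligned
+ * stores, and handle the remaining tail with the branches below.
+ */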
+ if (n >= 256) {
+ dstofss = ((uintptr_t)dst & 0x3F);
+ if (dstofss > 0) {
+ dstofss = 64 - dstofss;
+ n -= dstofss;
+ rte_memset64((uint8_t *)dst, a);
+ dst = (uint8_t *)dst + dstofss;
+ }
+ rte_memset128blocks((uint8_t *)dst, a, n);
+ bits = n;
+ n = n & 127;
+ bits -= n;
+ dst = (uint8_t *)dst + bits;
+ }
+ if (n > 128) {
+ n -= 128;
+ rte_memset64((uint8_t *)dst, a);
+ rte_memset64((uint8_t *)dst + 64, a);
+ dst = (uint8_t *)dst + 128;
+ }
+ if (n > 64) {
+ rte_memset64((uint8_t *)dst, a);
+ rte_memset64((uint8_t *)dst - 64 + n, a);
+ return ret;
+ }
+ if (n > 0)
+ rte_memset64((uint8_t *)dst - 64 + n, a);
+ return ret;
+}
+
+#elif defined RTE_MACHINE_CPUFLAG_AVX2
+
+/**
+ * AVX2 implementation below
+ */
+
+static inline void
+rte_memset32(uint8_t *dst, int8_t a)
+{
+ __m256i ymm0;
+
+ ymm0 = _mm256_set1_epi8(a);
+ _mm256_storeu_si256((__m256i *)dst, ymm0);
+}
+
+static inline void
+rte_memset_33to64(void *dst, int a, size_t n)
+{
+ rte_memset32((uint8_t *)dst, a);
+ rte_memset32((uint8_t *)dst - 32 + n, a);
+}
+
+static inline void
+rte_memset64blocks(uint8_t *dst, int8_t a, size_t n)
+{
+ __m256i ymm0;
+
+ ymm0 = _mm256_set1_epi8(a);
+ while (n >= 64) {
+ n -= 64;
+ _mm256_store_si256((__m256i *)(dst + 0 * 32), ymm0);
+ _mm256_store_si256((__m256i *)(dst + 1 * 32), ymm0);
+ dst += 64;
+ }
+}
+
+static inline void *
+rte_memset(void *dst, int a, size_t n)
+{
+ void *ret = dst;
+ size_t dstofss;
+ size_t bits;
+
+ if (n < 16) {
+ rte_memset_less16(dst, a, n);
+ return ret;
+ } else if (n == 16) {
+ rte_memset16((uint8_t *)dst, a);
+ return ret;
+ }
+ if (n <= 32) {
+ rte_memset_17to32(dst, a, n);
+ return ret;
+ }
+ if (n <= 64) {
+ rte_memset_33to64(dst, a, n);
+ return ret;
+ }
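+ /*
+ * Align dst to a 32-byte boundary with one unaligned 32-byte store,
+ * fill 64-byte blocks using aligned stores, then finish the tail
+ * with overlapping unaligned stores.
+ */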
+ if (n > 64) {
+ dstofss = (uintptr_t)dst & 0x1F;
+ if (dstofss > 0) {
+ dstofss = 32 - dstofss;
+ n -= dstofss;
+ rte_memset32((uint8_t *)dst, a);
+ dst = (uint8_t *)dst + dstofss;
+ }
+ rte_memset64blocks((uint8_t *)dst, a, n);
+ bits = n;
+ n = n & 63;
+ bits -= n;
+ dst = (uint8_t *)dst + bits;
+ }
+ if (n > 32) {
+ rte_memset_33to64(dst, a, n);
+ return ret;
+ }
+ if (n > 0)
+ rte_memset32((uint8_t *)dst - 32 + n, a);
+ return ret;
+}
+
+#else /* RTE_MACHINE_CPUFLAG */
+
+/**
+ * SSE & AVX implementation below
+ */
+
+static inline void
+rte_memset32(uint8_t *dst, int8_t a)
+{
+ __m128i xmm0 = _mm_set1_epi8(a);
+
+ _mm_storeu_si128((__m128i *)dst, xmm0);
+ _mm_storeu_si128((__m128i *)(dst + 16), xmm0);
+}
+
+static inline void
+rte_memset16blocks(uint8_t *dst, int8_t a, size_t n)
+{
+ __m128i xmm0 = _mm_set1_epi8(a);
+
+ while (n >= 16) {
+ n -= 16;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), xmm0);
+ dst += 16;
+ }
+}
+
+static inline void
+rte_memset64blocks(uint8_t *dst, int8_t a, size_t n)
+{
+ __m128i xmm0 = _mm_set1_epi8(a);
+
+ while (n >= 64) {
+ n -= 64;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), xmm0);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), xmm0);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), xmm0);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), xmm0);
+ dst += 64;
+ }
+}
+
+static inline void *
+rte_memset(void *dst, int a, size_t n)
+{
+ void *ret = dst;
+ size_t dstofss;
+ size_t bits;
+
+ if (n < 16) {
+ rte_memset_less16(dst, a, n);
+ return ret;
+ } else if (n == 16) {
+ rte_memset16((uint8_t *)dst, a);
+ return ret;
+ }
+ if (n <= 32) {
+ rte_memset_17to32(dst, a, n);
+ return ret;
+ }
+ if (n <= 48) {
+ rte_memset32((uint8_t *)dst, a);
+ rte_memset16((uint8_t *)dst - 16 + n, a);
+ return ret;
+ }
+ if (n <= 64) {
+ rte_memset32((uint8_t *)dst, a);
+ rte_memset16((uint8_t *)dst + 32, a);
+ rte_memset16((uint8_t *)dst - 16 + n, a);
+ return ret;
+ }
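+ /*
+ * Align dst to a 16-byte boundary with one unaligned 16-byte store,
+ * fill 64-byte and then 16-byte blocks using aligned stores, and
+ * finish the tail with one overlapping unaligned 16-byte store.
+ */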
+ if (n > 64) {
+ dstofss = (uintptr_t)dst & 0xF;
+ if (dstofss > 0) {
+ dstofss = 16 - dstofss;
+ n -= dstofss;
+ rte_memset16((uint8_t *)dst, a);
+ dst = (uint8_t *)dst + dstofss;
+ }
+ rte_memset64blocks((uint8_t *)dst, a, n);
+ bits = n;
+ n &= 63;
+ bits -= n;
+ dst = (uint8_t *)dst + bits;
+ rte_memset16blocks((uint8_t *)dst, a, n);
+ bits = n;
+ n &= 0xf;
+ bits -= n;
+ dst = (uint8_t *)dst + bits;
+ if (n > 0) {
+ rte_memset16((uint8_t *)dst - 16 + n, a);
+ return ret;
+ }
+ }
+ return ret;
+}
+
+#endif /* RTE_MACHINE_CPUFLAG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMSET_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_memset.h b/lib/librte_eal/common/include/generic/rte_memset.h
new file mode 100644
index 0000000..416a638
--- /dev/null
+++ b/lib/librte_eal/common/include/generic/rte_memset.h
@@ -0,0 +1,51 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMSET_H_
+#define _RTE_MEMSET_H_
+
+#include <stddef.h>
+#include <string.h>
+
+/**
+ * @file
+ *
+ * Functions for a vectorised implementation of memset().
+ */
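+/*
+ * Fall back to the libc memset when no architecture-specific
+ * implementation has been pulled in.
+ */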
+#ifndef _RTE_MEMSET_X86_64_H_
+
+#define rte_memset memset
+
+#else
+
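+/**
+ * Fill the first n bytes of the memory area pointed to by dst with the
+ * constant byte a.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param a
+ *   Value to be set (converted to an unsigned char).
+ * @param n
+ *   Number of bytes to be set.
+ * @return
+ *   Pointer to the destination data (dst).
+ */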
+static void *
+rte_memset(void *dst, int a, size_t n);
+
+#endif
+#endif /* _RTE_MEMSET_H_ */
--
2.7.4