[dpdk-dev] [PATCH 6/7] Split memcpy operation to architecture specific

Chao Zhu bjzhuc at cn.ibm.com
Fri Sep 26 11:33:37 CEST 2014


This patch splits the vector-instruction-based memory copy routines out of
DPDK's common code and moves them into architecture-specific arch directories,
so that DPDK can be ported more easily to other processor architectures.

Signed-off-by: Chao Zhu <bjzhuc at cn.ibm.com>
---
 lib/librte_eal/common/Makefile                     |    2 +-
 .../common/include/i686/arch/rte_memcpy_arch.h     |  199 ++++++++++++++++++++
 lib/librte_eal/common/include/rte_memcpy.h         |   95 +---------
 .../common/include/x86_64/arch/rte_memcpy_arch.h   |  199 ++++++++++++++++++++
 4 files changed, 406 insertions(+), 89 deletions(-)
 create mode 100644 lib/librte_eal/common/include/i686/arch/rte_memcpy_arch.h
 create mode 100644 lib/librte_eal/common/include/x86_64/arch/rte_memcpy_arch.h

diff --git a/lib/librte_eal/common/Makefile b/lib/librte_eal/common/Makefile
index 249ea2f..4add1c1 100644
--- a/lib/librte_eal/common/Makefile
+++ b/lib/librte_eal/common/Makefile
@@ -46,7 +46,7 @@ ifeq ($(CONFIG_RTE_INSECURE_FUNCTION_WARNING),y)
 INC += rte_warnings.h
 endif
 
-ARCH_INC := rte_atomic.h rte_atomic_arch.h rte_byteorder_arch.h rte_cycles_arch.h rte_prefetch_arch.h rte_spinlock_arch.h
+ARCH_INC := rte_atomic.h rte_atomic_arch.h rte_byteorder_arch.h rte_cycles_arch.h rte_prefetch_arch.h rte_spinlock_arch.h rte_memcpy_arch.h 
 
 SYMLINK-$(CONFIG_RTE_LIBRTE_EAL)-include := $(addprefix include/,$(INC))
 SYMLINK-$(CONFIG_RTE_LIBRTE_EAL)-include/arch := \
diff --git a/lib/librte_eal/common/include/i686/arch/rte_memcpy_arch.h b/lib/librte_eal/common/include/i686/arch/rte_memcpy_arch.h
new file mode 100644
index 0000000..44f7760
--- /dev/null
+++ b/lib/librte_eal/common/include/i686/arch/rte_memcpy_arch.h
@@ -0,0 +1,199 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMCPY_ARCH_H_
+#define _RTE_MEMCPY_ARCH_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <emmintrin.h>
+
+#ifdef __INTEL_COMPILER
+#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
+#endif
+
+/**
+ * Copy 16 bytes from src to dst with one unaligned 16-byte load/store pair
+ * (movdqu). The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a; /* scratch only: holds the data between load and store */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		: [reg_a] "=x" (reg_a) /* "x": any SSE register */
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+/**
+ * Copy 32 bytes from src to dst as two unaligned 16-byte moves (movdqu).
+ * Both loads precede the first store. The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a, reg_b; /* scratch only: hold the data between loads and stores */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu 16(%[src]), %[reg_b]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		"movdqu %[reg_b], 16(%[dst])\n\t"
+		: [reg_a] "=x" (reg_a), /* "x": any SSE register */
+		  [reg_b] "=x" (reg_b)
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+/**
+ * Copy 48 bytes from src to dst as three unaligned 16-byte moves (movdqu).
+ * All loads precede the first store. The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov48(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a, reg_b, reg_c; /* scratch only: hold the data between loads and stores */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu 16(%[src]), %[reg_b]\n\t"
+		"movdqu 32(%[src]), %[reg_c]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		"movdqu %[reg_b], 16(%[dst])\n\t"
+		"movdqu %[reg_c], 32(%[dst])\n\t"
+		: [reg_a] "=x" (reg_a), /* "x": any SSE register */
+		  [reg_b] "=x" (reg_b),
+		  [reg_c] "=x" (reg_c)
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+/**
+ * Copy 64 bytes from src to dst as four unaligned 16-byte moves (movdqu).
+ * All loads precede the first store. The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov64(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a, reg_b, reg_c, reg_d; /* scratch only: hold the data between loads and stores */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu 16(%[src]), %[reg_b]\n\t"
+		"movdqu 32(%[src]), %[reg_c]\n\t"
+		"movdqu 48(%[src]), %[reg_d]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		"movdqu %[reg_b], 16(%[dst])\n\t"
+		"movdqu %[reg_c], 32(%[dst])\n\t"
+		"movdqu %[reg_d], 48(%[dst])\n\t"
+		: [reg_a] "=x" (reg_a), /* "x": any SSE register */
+		  [reg_b] "=x" (reg_b),
+		  [reg_c] "=x" (reg_c),
+		  [reg_d] "=x" (reg_d)
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+/**
+ * Copy 128 bytes from src to dst as eight unaligned 16-byte moves (movdqu).
+ * All loads precede the first store. The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov128(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a, reg_b, reg_c, reg_d, reg_e, reg_f, reg_g, reg_h; /* scratch only */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu 16(%[src]), %[reg_b]\n\t"
+		"movdqu 32(%[src]), %[reg_c]\n\t"
+		"movdqu 48(%[src]), %[reg_d]\n\t"
+		"movdqu 64(%[src]), %[reg_e]\n\t"
+		"movdqu 80(%[src]), %[reg_f]\n\t"
+		"movdqu 96(%[src]), %[reg_g]\n\t"
+		"movdqu 112(%[src]), %[reg_h]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		"movdqu %[reg_b], 16(%[dst])\n\t"
+		"movdqu %[reg_c], 32(%[dst])\n\t"
+		"movdqu %[reg_d], 48(%[dst])\n\t"
+		"movdqu %[reg_e], 64(%[dst])\n\t"
+		"movdqu %[reg_f], 80(%[dst])\n\t"
+		"movdqu %[reg_g], 96(%[dst])\n\t"
+		"movdqu %[reg_h], 112(%[dst])\n\t"
+		: [reg_a] "=x" (reg_a), /* "x": any SSE register */
+		  [reg_b] "=x" (reg_b),
+		  [reg_c] "=x" (reg_c),
+		  [reg_d] "=x" (reg_d),
+		  [reg_e] "=x" (reg_e),
+		  [reg_f] "=x" (reg_f),
+		  [reg_g] "=x" (reg_g),
+		  [reg_h] "=x" (reg_h)
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+#endif /* _RTE_MEMCPY_ARCH_H_ */
\ No newline at end of file
diff --git a/lib/librte_eal/common/include/rte_memcpy.h b/lib/librte_eal/common/include/rte_memcpy.h
index 131b196..11a099e 100644
--- a/lib/librte_eal/common/include/rte_memcpy.h
+++ b/lib/librte_eal/common/include/rte_memcpy.h
@@ -37,12 +37,10 @@
 /**
  * @file
  *
- * Functions for SSE implementation of memcpy().
+ * Functions for vector instruction implementation of memcpy().
  */
 
-#include <stdint.h>
-#include <string.h>
-#include <emmintrin.h>
+#include "arch/rte_memcpy_arch.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -64,15 +62,7 @@ extern "C" {
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		: [reg_a] "=x" (reg_a)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	rte_arch_mov16(dst, src);
 }
 
 /**
@@ -87,18 +77,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a, reg_b;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	rte_arch_mov32(dst, src);
 }
 
 /**
@@ -113,21 +92,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a, reg_b, reg_c;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu 32(%[src]), %[reg_c]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		"movdqu %[reg_c], 32(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b),
-		  [reg_c] "=x" (reg_c)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	rte_arch_mov48(dst, src);
 }
 
 /**
@@ -142,24 +107,7 @@ rte_mov48(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a, reg_b, reg_c, reg_d;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu 32(%[src]), %[reg_c]\n\t"
-		"movdqu 48(%[src]), %[reg_d]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		"movdqu %[reg_c], 32(%[dst])\n\t"
-		"movdqu %[reg_d], 48(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b),
-		  [reg_c] "=x" (reg_c),
-		  [reg_d] "=x" (reg_d)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	rte_arch_mov64(dst, src);
 }
 
 /**
@@ -174,36 +122,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a, reg_b, reg_c, reg_d, reg_e, reg_f, reg_g, reg_h;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu 32(%[src]), %[reg_c]\n\t"
-		"movdqu 48(%[src]), %[reg_d]\n\t"
-		"movdqu 64(%[src]), %[reg_e]\n\t"
-		"movdqu 80(%[src]), %[reg_f]\n\t"
-		"movdqu 96(%[src]), %[reg_g]\n\t"
-		"movdqu 112(%[src]), %[reg_h]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		"movdqu %[reg_c], 32(%[dst])\n\t"
-		"movdqu %[reg_d], 48(%[dst])\n\t"
-		"movdqu %[reg_e], 64(%[dst])\n\t"
-		"movdqu %[reg_f], 80(%[dst])\n\t"
-		"movdqu %[reg_g], 96(%[dst])\n\t"
-		"movdqu %[reg_h], 112(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b),
-		  [reg_c] "=x" (reg_c),
-		  [reg_d] "=x" (reg_d),
-		  [reg_e] "=x" (reg_e),
-		  [reg_f] "=x" (reg_f),
-		  [reg_g] "=x" (reg_g),
-		  [reg_h] "=x" (reg_h)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	rte_arch_mov128(dst, src);
 }
 
 #ifdef __INTEL_COMPILER
diff --git a/lib/librte_eal/common/include/x86_64/arch/rte_memcpy_arch.h b/lib/librte_eal/common/include/x86_64/arch/rte_memcpy_arch.h
new file mode 100644
index 0000000..44f7760
--- /dev/null
+++ b/lib/librte_eal/common/include/x86_64/arch/rte_memcpy_arch.h
@@ -0,0 +1,199 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_MEMCPY_ARCH_H_
+#define _RTE_MEMCPY_ARCH_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <emmintrin.h>
+
+#ifdef __INTEL_COMPILER
+#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
+#endif
+
+/**
+ * Copy 16 bytes from src to dst with one unaligned 16-byte load/store pair
+ * (movdqu). The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a; /* scratch only: holds the data between load and store */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		: [reg_a] "=x" (reg_a) /* "x": any SSE register */
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+/**
+ * Copy 32 bytes from src to dst as two unaligned 16-byte moves (movdqu).
+ * Both loads precede the first store. The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a, reg_b; /* scratch only: hold the data between loads and stores */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu 16(%[src]), %[reg_b]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		"movdqu %[reg_b], 16(%[dst])\n\t"
+		: [reg_a] "=x" (reg_a), /* "x": any SSE register */
+		  [reg_b] "=x" (reg_b)
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+/**
+ * Copy 48 bytes from src to dst as three unaligned 16-byte moves (movdqu).
+ * All loads precede the first store. The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov48(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a, reg_b, reg_c; /* scratch only: hold the data between loads and stores */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu 16(%[src]), %[reg_b]\n\t"
+		"movdqu 32(%[src]), %[reg_c]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		"movdqu %[reg_b], 16(%[dst])\n\t"
+		"movdqu %[reg_c], 32(%[dst])\n\t"
+		: [reg_a] "=x" (reg_a), /* "x": any SSE register */
+		  [reg_b] "=x" (reg_b),
+		  [reg_c] "=x" (reg_c)
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+/**
+ * Copy 64 bytes from src to dst as four unaligned 16-byte moves (movdqu).
+ * All loads precede the first store. The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov64(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a, reg_b, reg_c, reg_d; /* scratch only: hold the data between loads and stores */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu 16(%[src]), %[reg_b]\n\t"
+		"movdqu 32(%[src]), %[reg_c]\n\t"
+		"movdqu 48(%[src]), %[reg_d]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		"movdqu %[reg_b], 16(%[dst])\n\t"
+		"movdqu %[reg_c], 32(%[dst])\n\t"
+		"movdqu %[reg_d], 48(%[dst])\n\t"
+		: [reg_a] "=x" (reg_a), /* "x": any SSE register */
+		  [reg_b] "=x" (reg_b),
+		  [reg_c] "=x" (reg_c),
+		  [reg_d] "=x" (reg_d)
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+/**
+ * Copy 128 bytes from src to dst as eight unaligned 16-byte moves (movdqu).
+ * All loads precede the first store. The locations should not overlap.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ */
+static inline void
+rte_arch_mov128(uint8_t *dst, const uint8_t *src)
+{
+	__m128i reg_a, reg_b, reg_c, reg_d, reg_e, reg_f, reg_g, reg_h; /* scratch only */
+	asm volatile (
+		"movdqu (%[src]), %[reg_a]\n\t"
+		"movdqu 16(%[src]), %[reg_b]\n\t"
+		"movdqu 32(%[src]), %[reg_c]\n\t"
+		"movdqu 48(%[src]), %[reg_d]\n\t"
+		"movdqu 64(%[src]), %[reg_e]\n\t"
+		"movdqu 80(%[src]), %[reg_f]\n\t"
+		"movdqu 96(%[src]), %[reg_g]\n\t"
+		"movdqu 112(%[src]), %[reg_h]\n\t"
+		"movdqu %[reg_a], (%[dst])\n\t"
+		"movdqu %[reg_b], 16(%[dst])\n\t"
+		"movdqu %[reg_c], 32(%[dst])\n\t"
+		"movdqu %[reg_d], 48(%[dst])\n\t"
+		"movdqu %[reg_e], 64(%[dst])\n\t"
+		"movdqu %[reg_f], 80(%[dst])\n\t"
+		"movdqu %[reg_g], 96(%[dst])\n\t"
+		"movdqu %[reg_h], 112(%[dst])\n\t"
+		: [reg_a] "=x" (reg_a), /* "x": any SSE register */
+		  [reg_b] "=x" (reg_b),
+		  [reg_c] "=x" (reg_c),
+		  [reg_d] "=x" (reg_d),
+		  [reg_e] "=x" (reg_e),
+		  [reg_f] "=x" (reg_f),
+		  [reg_g] "=x" (reg_g),
+		  [reg_h] "=x" (reg_h)
+		: [src] "r" (src),
+		  [dst] "r"(dst)
+		: "memory" /* src/dst memory is accessed but not listed as operands */
+	);
+}
+
+#endif /* _RTE_MEMCPY_ARCH_H_ */
\ No newline at end of file
-- 
1.7.1



More information about the dev mailing list