[dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code

Neil Horman nhorman at tuxdriver.com
Mon Aug 4 17:35:58 CEST 2014


The ACL library makes extensive use of some SSE4.2 instructions, which means the
default build can't compile this library.  Work around the problem by testing
the __SSE4_1__ definition in the acl_vect.h file and defining the macros there
as intrinsics or C-level equivalents.  Note this is a minimal patch, adjusting
only the definitions that are currently used in the ACL library.

Only compile tested so far, but I wanted to post it for early review so that
others could aid in unit testing.

Signed-off-by: Neil Horman <nhorman at tuxdriver.com>
CC: Thomas Monjalon <thomas.monjalon at 6wind.com>
CC: "Konstantin Ananyev" <konstantin.ananyev at intel.com>
CC: Bruce Richardson <bruce.richardson at intel.com>
---
 lib/librte_acl/acl_bld.c  |   3 +-
 lib/librte_acl/acl_vect.h | 102 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 100 insertions(+), 5 deletions(-)

diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
index 873447b..de974a4 100644
--- a/lib/librte_acl/acl_bld.c
+++ b/lib/librte_acl/acl_bld.c
@@ -31,7 +31,6 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <nmmintrin.h>
 #include <rte_acl.h>
 #include "tb_mem.h"
 #include "acl.h"
@@ -1481,7 +1480,7 @@ acl_calc_wildness(struct rte_acl_build_rule *head,
 			switch (rule->config->defs[n].type) {
 			case RTE_ACL_FIELD_TYPE_BITMASK:
 				wild = (size -
-					_mm_popcnt_u32(fld->mask_range.u8)) /
+					__builtin_popcountl(fld->mask_range.u8)) /
 					size;
 				break;
 
diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h
index d813600..e5f391b 100644
--- a/lib/librte_acl/acl_vect.h
+++ b/lib/librte_acl/acl_vect.h
@@ -34,6 +34,10 @@
 #ifndef _RTE_ACL_VECT_H_
 #define _RTE_ACL_VECT_H_
 
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+
 /**
  * @file
  *
@@ -44,12 +48,12 @@
 extern "C" {
 #endif
 
+
 #define	MM_ADD16(a, b)		_mm_add_epi16(a, b)
 #define	MM_ADD32(a, b)		_mm_add_epi32(a, b)
 #define	MM_ALIGNR8(a, b, c)	_mm_alignr_epi8(a, b, c)
 #define	MM_AND(a, b)		_mm_and_si128(a, b)
 #define MM_ANDNOT(a, b)		_mm_andnot_si128(a, b)
-#define MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
 #define MM_CMPEQ16(a, b)	_mm_cmpeq_epi16(a, b)
 #define MM_CMPEQ32(a, b)	_mm_cmpeq_epi32(a, b)
 #define	MM_CMPEQ8(a, b)		_mm_cmpeq_epi8(a, b)
@@ -59,7 +63,6 @@ extern "C" {
 #define	MM_CVT32(a)		_mm_cvtsi128_si32(a)
 #define MM_CVTU32(a)		_mm_cvtsi32_si128(a)
 #define	MM_INSERT16(a, c, b)	_mm_insert_epi16(a, c, b)
-#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
 #define	MM_LOAD(a)		_mm_load_si128(a)
 #define	MM_LOADH_PI(a, b)	_mm_loadh_pi(a, b)
 #define	MM_LOADU(a)		_mm_loadu_si128(a)
@@ -82,7 +85,6 @@ extern "C" {
 #define	MM_SRL32(a, b)		_mm_srli_epi32(a, b)
 #define	MM_STORE(a, b)		_mm_store_si128(a, b)
 #define	MM_STOREU(a, b)		_mm_storeu_si128(a, b)
-#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
 #define	MM_XOR(a, b)		_mm_xor_si128(a, b)
 
 #define	MM_SET16(a, b, c, d, e, f, g, h)	\
@@ -93,6 +95,100 @@ extern "C" {
 	_mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7,	\
 		c8, c9, cA, cB, cC, cD, cE, cF)
 
+
+#ifndef __SSE4_1__
+/*
+ * Emulate PBLENDVB (_mm_blendv_epi8): for each of the 16 bytes, select
+ * the byte from src when the MOST significant bit (0x80) of the
+ * corresponding mask byte is set, otherwise keep the byte from dst.
+ * The selection is done on the spilled byte arrays, since xmm_t cannot
+ * be portably subscripted.
+ */
+static inline xmm_t pblendvb(xmm_t dst, xmm_t src, xmm_t mask)
+{
+	unsigned char tmpd[16], tmps[16], tmpm[16];
+	int i;
+
+	MM_STOREU((xmm_t *)&tmpd, dst);
+	MM_STOREU((xmm_t *)&tmps, src);
+	MM_STOREU((xmm_t *)&tmpm, mask);
+
+	for (i = 0; i < 16; i++)
+		if (tmpm[i] & 0x80)
+			tmpd[i] = tmps[i];
+
+	dst = MM_LOADU((xmm_t *)&tmpd);
+
+	return dst;
+}
+
+#define MM_BLENDV8(a, b, c)	pblendvb(a, b, c)
+
+
+/*
+ * Emulate PTEST/_mm_testz_si128: returns the ZF result, i.e. 1 when
+ * (a AND b) is all zeroes, 0 when any bit overlaps.  Note the return
+ * value is the OPPOSITE of "bits overlap".
+ */
+static inline int ptestz(xmm_t a, xmm_t b)
+{
+	unsigned long long tmpa[2], tmpb[2];
+
+	MM_STOREU((xmm_t *)&tmpa, a);
+	MM_STOREU((xmm_t *)&tmpb, b);
+
+	if ((tmpa[0] & tmpb[0]) || (tmpa[1] & tmpb[1]))
+		return 0;
+
+	return 1;
+}
+
+#define	MM_TESTZ(a, b)		ptestz(a, b)
+
+/*
+ * Emulate PINSRD/_mm_insert_epi32: the third operand is a 32-bit
+ * *element index* (0-3) selecting which dword lane of dst receives
+ * val — it is not a bit offset, so no cross-lane shifting or masking
+ * is needed.  The index is masked to 2 bits, matching the immediate
+ * truncation performed by the real instruction.
+ */
+static inline xmm_t pinsrd(xmm_t dst, int32_t val, int idx)
+{
+	int32_t tmpa[4];
+
+	MM_STOREU((xmm_t *)&tmpa, dst);
+	tmpa[idx & 0x3] = val;
+	dst = MM_LOADU((xmm_t *)&tmpa);
+	return dst;
+}
+
+#define	MM_INSERT32(a, c, b)	pinsrd(a, c, b)
+
+#else
+#define	MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
+#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
+#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
+#endif
+
 #ifdef RTE_ARCH_X86_64
 
 #define	MM_CVT64(a)		_mm_cvtsi128_si64(a)
-- 
1.8.3.1



More information about the dev mailing list