[PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup

Yoan Picchi yoan.picchi at foss.arm.com
Tue Mar 19 14:09:42 CET 2024


On 3/19/24 10:41, Konstantin Ananyev wrote:
> 
> Hi,
> 
>> The current hitmask includes padding due to an Intel SIMD
>> implementation detail. This patch allows non-Intel SIMD
>> implementations to benefit from a dense hitmask.
>> In addition, the new dense hitmask interweaves the primary
>> and secondary matches, which allows better cache usage and
>> enables future improvements for the SIMD implementations.
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi at arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang at arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown at arm.com>
>> ---
>>   .mailmap                                  |   2 +
>>   lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
>>   lib/hash/arch/common/compare_signatures.h |  38 +++++
>>   lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
>>   lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
>>   5 files changed, 255 insertions(+), 91 deletions(-)
>>   create mode 100644 lib/hash/arch/arm/compare_signatures.h
>>   create mode 100644 lib/hash/arch/common/compare_signatures.h
>>   create mode 100644 lib/hash/arch/x86/compare_signatures.h
>>
>> diff --git a/.mailmap b/.mailmap
>> index 66ebc20666..00b50414d3 100644
>> --- a/.mailmap
>> +++ b/.mailmap
>> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula at intel.com>
>>   Harini Ramakrishnan <harini.ramakrishnan at microsoft.com>
>>   Hariprasad Govindharajan <hariprasad.govindharajan at intel.com>
>>   Harish Patil <harish.patil at cavium.com> <harish.patil at qlogic.com>
>> +Harjot Singh <harjot.singh at arm.com>
>>   Harman Kalra <hkalra at marvell.com>
>>   Harneet Singh <harneet.singh at intel.com>
>>   Harold Huang <baymaxhuang at gmail.com>
>> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang at intel.com>
>>   Yi Yang <yangyi01 at inspur.com> <yi.y.yang at intel.com>
>>   Yi Zhang <zhang.yi75 at zte.com.cn>
>>   Yoann Desmouceaux <ydesmouc at cisco.com>
>> +Yoan Picchi <yoan.picchi at arm.com>
>>   Yogesh Jangra <yogesh.jangra at intel.com>
>>   Yogev Chaimovich <yogev at cgstowernetworks.com>
>>   Yongjie Gu <yongjiex.gu at intel.com>
>> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
>> new file mode 100644
>> index 0000000000..1af6ba8190
>> --- /dev/null
>> +++ b/lib/hash/arch/arm/compare_signatures.h
>> @@ -0,0 +1,61 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * Arm's version uses a densely packed hitmask buffer:
>> + * Every bit is in use.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> +	/* For match mask every bits indicates the match */
>> +	switch (sig_cmp_fn) {
>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>> +	case RTE_HASH_COMPARE_NEON: {
>> +		uint16x8_t vmat, vsig, x;
>> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
>> +		uint16_t low, high;
>> +
>> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
>> +		/* Compare all signatures in the primary bucket */
>> +		vmat = vceqq_u16(vsig,
>> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
>> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +		low = (uint16_t)(vaddvq_u16(x));
>> +		/* Compare all signatures in the secondary bucket */
>> +		vmat = vceqq_u16(vsig,
>> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
>> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +		high = (uint16_t)(vaddvq_u16(x));
>> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
>> +
>> +		}
>> +		break;
>> +#endif
>> +	default:
>> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +			*hitmask_buffer |=
>> +				((sig == prim_bucket_sigs[i]) << i);
>> +			*hitmask_buffer |=
>> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> +		}
>> +	}
>> +}
>> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
>> new file mode 100644
>> index 0000000000..dcf9444032
>> --- /dev/null
>> +++ b/lib/hash/arch/common/compare_signatures.h
>> @@ -0,0 +1,38 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * The generic version could use either a dense or sparsely packed hitmask buffer,
>> + * but the dense one is slightly faster.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +	(void) sig_cmp_fn;
>> +
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> +	/* For match mask every bits indicates the match */
>> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +		*hitmask_buffer |=
>> +			((sig == prim_bucket_sigs[i]) << i);
>> +		*hitmask_buffer |=
>> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> +	}
>> +
>> +}
> 
> Thanks for refactoring the compare_signatures_...() code, it looks much cleaner that way.
> One question I have: does it mean that now for x86 we always use 'sparse', while for all
> other platforms, ARM and non-ARM alike, we switch to 'dense'?

Yes it does. x86 supports only the sparse method (the legacy one). Arm 
and the generic code can support both dense and sparse. I made them use 
the dense method because it was slightly faster in my tests (no need to 
add padding and shifts, among other benefits). A quick sketch of the two 
layouts follows below.
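
A minimal standalone sketch of the two layouts (not the library code
itself; it assumes 8 entries per bucket, as in the patch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define BUCKET_ENTRIES 8

/* Dense: bit i = primary match for entry i, bit i+8 = secondary match. */
static uint16_t
dense_hitmask(const uint16_t *prim, const uint16_t *sec, uint16_t sig)
{
	uint16_t mask = 0;

	for (unsigned int i = 0; i < BUCKET_ENTRIES; i++) {
		mask |= (uint16_t)(sig == prim[i]) << i;
		mask |= (uint16_t)(sig == sec[i]) << (i + BUCKET_ENTRIES);
	}
	return mask;
}

/* Sparse: one 32-bit word per bucket, matches on even bits, odd bits are padding. */
static uint32_t
sparse_hitmask(const uint16_t *sigs, uint16_t sig)
{
	uint32_t mask = 0;

	for (unsigned int i = 0; i < BUCKET_ENTRIES; i++)
		mask |= (uint32_t)(sig == sigs[i]) << (i << 1);
	return mask;
}

int
main(void)
{
	uint16_t prim[BUCKET_ENTRIES] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint16_t sec[BUCKET_ENTRIES]  = { 8, 7, 6, 5, 4, 3, 2, 1 };

	/* Signature 3 matches primary entry 2 and secondary entry 5. */
	printf("dense  = 0x%04" PRIx16 "\n",
	       dense_hitmask(prim, sec, 3));               /* 0x2004 */
	printf("sparse = 0x%04" PRIx32 " / 0x%04" PRIx32 "\n",
	       sparse_hitmask(prim, 3),
	       sparse_hitmask(sec, 3));                    /* 0x0010 / 0x0400 */
	return 0;
}

So one 16-bit word carries both buckets in the dense case, while the
sparse case needs a 32-bit word per bucket with every odd bit wasted.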

> 
>> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
>> new file mode 100644
>> index 0000000000..7eec499e1f
>> --- /dev/null
>> +++ b/lib/hash/arch/x86/compare_signatures.h
>> @@ -0,0 +1,53 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * x86's version uses a sparsely packed hitmask buffer:
>> + * Every other bit is padding.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 0
>> +
>> +static inline void
>> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
>> +			const struct rte_hash_bucket *prim_bkt,
>> +			const struct rte_hash_bucket *sec_bkt,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +	/* For match mask the first bit of every two bits indicates the match */
>> +	switch (sig_cmp_fn) {
>> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>> +	case RTE_HASH_COMPARE_SSE:
>> +		/* Compare all signatures in the bucket */
>> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> +				_mm_load_si128(
>> +					(__m128i const *)prim_bkt->sig_current),
>> +				_mm_set1_epi16(sig)));
>> +		/* Extract the even-index bits only */
>> +		*prim_hash_matches &= 0x5555;
>> +		/* Compare all signatures in the bucket */
>> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> +				_mm_load_si128(
>> +					(__m128i const *)sec_bkt->sig_current),
>> +				_mm_set1_epi16(sig)));
>> +		/* Extract the even-index bits only */
>> +		*sec_hash_matches &= 0x5555;
>> +		break;
>> +#endif /* defined(__SSE2__) */
>> +	default:
>> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +			*prim_hash_matches |=
>> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
>> +			*sec_hash_matches |=
>> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
>> +		}
>> +	}
>> +}
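
To illustrate the "padding and shifts" point above: with the sparse
layout the caller has to halve every bit position to recover the entry
index, whereas with the dense layout bit i maps straight to entry i
(bits 8 and up map to the secondary bucket). A small standalone sketch,
not the actual bulk-lookup code:

#include <stdint.h>
#include <stdio.h>

#define BUCKET_ENTRIES 8

/* Walk a dense 16-bit hitmask: bits 0..7 primary, bits 8..15 secondary. */
static void
walk_dense(uint16_t mask)
{
	while (mask != 0) {
		unsigned int bit = (unsigned int)__builtin_ctz(mask);

		if (bit < BUCKET_ENTRIES)
			printf("primary entry %u matches\n", bit);
		else
			printf("secondary entry %u matches\n",
			       bit - BUCKET_ENTRIES);
		mask &= mask - 1; /* clear the lowest set bit */
	}
}

/* Walk a sparse hitmask: matches sit on even bits, so halve the position. */
static void
walk_sparse(uint32_t mask)
{
	while (mask != 0) {
		unsigned int bit = (unsigned int)__builtin_ctz(mask);

		printf("entry %u matches\n", bit >> 1);
		mask &= mask - 1;
	}
}

int
main(void)
{
	walk_dense(0x2004);  /* primary entry 2, secondary entry 5 */
	walk_sparse(0x0010); /* entry 2 */
	return 0;
}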


