[PATCH v7 1/4] hash: pack the hitmask for hash in bulk lookup
Yoan Picchi
yoan.picchi at foss.arm.com
Tue Mar 19 14:09:42 CET 2024
On 3/19/24 10:41, Konstantin Ananyev wrote:
>
> Hi,
>
>> Current hitmask includes padding due to Intel's SIMD
>> implementation detail. This patch allows non Intel SIMD
>> implementations to benefit from a dense hitmask.
>> In addition, the new dense hitmask interweave the primary
>> and secondary matches which allow a better cache usage and
>> enable future improvements for the SIMD implementations
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi at arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang at arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown at arm.com>
>> ---
>> .mailmap | 2 +
>> lib/hash/arch/arm/compare_signatures.h | 61 +++++++
>> lib/hash/arch/common/compare_signatures.h | 38 +++++
>> lib/hash/arch/x86/compare_signatures.h | 53 ++++++
>> lib/hash/rte_cuckoo_hash.c | 192 ++++++++++++----------
>> 5 files changed, 255 insertions(+), 91 deletions(-)
>> create mode 100644 lib/hash/arch/arm/compare_signatures.h
>> create mode 100644 lib/hash/arch/common/compare_signatures.h
>> create mode 100644 lib/hash/arch/x86/compare_signatures.h
>>
>> diff --git a/.mailmap b/.mailmap
>> index 66ebc20666..00b50414d3 100644
>> --- a/.mailmap
>> +++ b/.mailmap
>> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula at intel.com>
>> Harini Ramakrishnan <harini.ramakrishnan at microsoft.com>
>> Hariprasad Govindharajan <hariprasad.govindharajan at intel.com>
>> Harish Patil <harish.patil at cavium.com> <harish.patil at qlogic.com>
>> +Harjot Singh <harjot.singh at arm.com>
>> Harman Kalra <hkalra at marvell.com>
>> Harneet Singh <harneet.singh at intel.com>
>> Harold Huang <baymaxhuang at gmail.com>
>> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang at intel.com>
>> Yi Yang <yangyi01 at inspur.com> <yi.y.yang at intel.com>
>> Yi Zhang <zhang.yi75 at zte.com.cn>
>> Yoann Desmouceaux <ydesmouc at cisco.com>
>> +Yoan Picchi <yoan.picchi at arm.com>
>> Yogesh Jangra <yogesh.jangra at intel.com>
>> Yogev Chaimovich <yogev at cgstowernetworks.com>
>> Yongjie Gu <yongjiex.gu at intel.com>
>> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
>> new file mode 100644
>> index 0000000000..1af6ba8190
>> --- /dev/null
>> +++ b/lib/hash/arch/arm/compare_signatures.h
>> @@ -0,0 +1,61 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * Arm's version uses a densely packed hitmask buffer:
>> + * Every bit is in use.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> + const uint16_t *prim_bucket_sigs,
>> + const uint16_t *sec_bucket_sigs,
>> + uint16_t sig,
>> + enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +
>> + static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> + "The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> + /* For match mask every bits indicates the match */
>> + switch (sig_cmp_fn) {
>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>> + case RTE_HASH_COMPARE_NEON: {
>> + uint16x8_t vmat, vsig, x;
>> + int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
>> + uint16_t low, high;
>> +
>> + vsig = vld1q_dup_u16((uint16_t const *)&sig);
>> + /* Compare all signatures in the primary bucket */
>> + vmat = vceqq_u16(vsig,
>> + vld1q_u16((uint16_t const *)prim_bucket_sigs));
>> + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> + low = (uint16_t)(vaddvq_u16(x));
>> + /* Compare all signatures in the secondary bucket */
>> + vmat = vceqq_u16(vsig,
>> + vld1q_u16((uint16_t const *)sec_bucket_sigs));
>> + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> + high = (uint16_t)(vaddvq_u16(x));
>> + *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
>> +
>> + }
>> + break;
>> +#endif
>> + default:
>> + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> + *hitmask_buffer |=
>> + ((sig == prim_bucket_sigs[i]) << i);
>> + *hitmask_buffer |=
>> + ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> + }
>> + }
>> +}
>> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
>> new file mode 100644
>> index 0000000000..dcf9444032
>> --- /dev/null
>> +++ b/lib/hash/arch/common/compare_signatures.h
>> @@ -0,0 +1,38 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * The generic version could use either a dense or sparsely packed hitmask buffer,
>> + * but the dense one is slightly faster.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> + const uint16_t *prim_bucket_sigs,
>> + const uint16_t *sec_bucket_sigs,
>> + uint16_t sig,
>> + enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> + (void) sig_cmp_fn;
>> +
>> + static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> + "The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> + /* For match mask every bits indicates the match */
>> + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> + *hitmask_buffer |=
>> + ((sig == prim_bucket_sigs[i]) << i);
>> + *hitmask_buffer |=
>> + ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> + }
>> +
>> +}
>
> Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way.
> One question I have - does it mean that now for x86 we always use 'sparse' while for all other
> ARM and non-ARM platforms we switch to 'dense'?
Yes it does. x86 support only the sparse method (the legacy one). Arm
and generic code could support both dense and sparse. The reason I made
them use the dense method is because it was slightly faster in my tests.
(no need to add padding and shifts amongst other benefit.)
>
>> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
>> new file mode 100644
>> index 0000000000..7eec499e1f
>> --- /dev/null
>> +++ b/lib/hash/arch/x86/compare_signatures.h
>> @@ -0,0 +1,53 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * x86's version uses a sparsely packed hitmask buffer:
>> + * Every other bit is padding.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 0
>> +
>> +static inline void
>> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
>> + const struct rte_hash_bucket *prim_bkt,
>> + const struct rte_hash_bucket *sec_bkt,
>> + uint16_t sig,
>> + enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> + /* For match mask the first bit of every two bits indicates the match */
>> + switch (sig_cmp_fn) {
>> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>> + case RTE_HASH_COMPARE_SSE:
>> + /* Compare all signatures in the bucket */
>> + *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> + _mm_load_si128(
>> + (__m128i const *)prim_bkt->sig_current),
>> + _mm_set1_epi16(sig)));
>> + /* Extract the even-index bits only */
>> + *prim_hash_matches &= 0x5555;
>> + /* Compare all signatures in the bucket */
>> + *sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> + _mm_load_si128(
>> + (__m128i const *)sec_bkt->sig_current),
>> + _mm_set1_epi16(sig)));
>> + /* Extract the even-index bits only */
>> + *sec_hash_matches &= 0x5555;
>> + break;
>> +#endif /* defined(__SSE2__) */
>> + default:
>> + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> + *prim_hash_matches |=
>> + ((sig == prim_bkt->sig_current[i]) << (i << 1));
>> + *sec_hash_matches |=
>> + ((sig == sec_bkt->sig_current[i]) << (i << 1));
>> + }
>> + }
>> +}
More information about the dev
mailing list