[PATCH] net/crc: reduce usage of static arrays in net_crc_sse.c
Shreesh Adiga
16567adigashreesh at gmail.com
Sat Oct 11 13:29:34 CEST 2025
Replace the clearing of the lower 32 bits of the XMM register with a
blend against a zero register.
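
For illustration only, a minimal sketch of the two equivalent ways of
clearing bits 31:0 (assumes SSE4.1; the helper names are mine, not part
of the patch). imm8 = 0x3 takes the two low 16-bit words from the zero
register and the remaining six from v; _mm_setzero_si128() compiles to a
register XOR, so no 16-byte constant has to be loaded from memory and
the static mask array can go:

#include <immintrin.h>

/* Old form: AND with mask2 = {0x00000000, 0xffffffff, 0xffffffff, 0xffffffff} */
static __m128i clear_low32_and(__m128i v)
{
	return _mm_and_si128(v, _mm_set_epi32(-1, -1, -1, 0));
}

/* New form: blend 16-bit words 0 and 1 (imm8 = 0x3) from a zeroed register */
static __m128i clear_low32_blend(__m128i v)
{
	return _mm_blend_epi16(v, _mm_setzero_si128(), 0x3);
}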
Remove the clearing of the upper 64 bits of tmp1, as it is redundant.
After its upper 64 bits were cleared, tmp1 was XORed into tmp2 before
bits 95:64 of tmp2 were returned. That XOR leaves bits 95:64 of the
result unchanged, because those bits of tmp1 are already 0, so it can be
dropped. Once the XOR is removed, the only remaining use of tmp1 is the
carry-less multiply, which reads just its lower 64 bits, so the clearing
of the upper 64 bits of tmp1 becomes redundant as well and can be
removed.
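
A minimal sketch of that argument (helper names are mine; assumes SSE4.1
and PCLMUL, with tmp0, tmp1 and precomp exactly as they stand after
"tmp1 = _mm_xor_si128(tmp1, tmp0)"):

#include <stdint.h>
#include <immintrin.h>

/* Original tail: AND clears tmp1[127:64], then tmp2 ^= tmp1, tmp2 ^= tmp0. */
static uint32_t reduce_tail_old(__m128i tmp0, __m128i tmp1, __m128i precomp)
{
	__m128i tmp2;

	tmp1 = _mm_and_si128(tmp1, _mm_set_epi32(0, 0, -1, -1)); /* mask1 */
	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10); /* reads tmp1[63:0] only */
	tmp2 = _mm_xor_si128(tmp2, tmp1); /* tmp1[95:64] == 0: bits 95:64 unchanged */
	tmp2 = _mm_xor_si128(tmp2, tmp0);
	return _mm_extract_epi32(tmp2, 2); /* only bits 95:64 are returned */
}

/* Patched tail: the XOR with tmp1 cannot change bits 95:64, and without it
 * the AND has no remaining effect, since the carry-less multiply with
 * imm 0x10 uses only the lower 64 bits of tmp1. */
static uint32_t reduce_tail_new(__m128i tmp0, __m128i tmp1, __m128i precomp)
{
	__m128i tmp2;

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp0);
	return _mm_extract_epi32(tmp2, 2);
}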
Clang is able to optimize away the AND with the memory operand for the
above sequence; GCC, however, still emits the AND with a memory operand,
so it is eliminated explicitly here.
Additionally, replace the 48-byte crc_xmm_shift_tab with the 32-byte
contents of shf_table, which provides the same functionality.
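
To sketch why the single 32-byte table covers both users (illustrative
code, not from the patch; assumes SSSE3): the tail handling keeps
loading at offset data_len & 15, and the table bytes are identical to
the old shf_table, so nothing changes there. The left-shift helper in
this file (xmm_shift_left(), which loads at offset 16 - num) also keeps
working for shift counts below 16, which is all this file needs: the
first num control bytes it sees are 0x81..0x8f, whose set high bit makes
PSHUFB zero those lanes just as the old 0xff padding did, and the
remaining bytes 0x00, 0x01, ... select the low bytes of the register.

#include <immintrin.h>

static const unsigned char shift_tab[32] = {
	0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};

/* Shift reg left by num bytes (1 <= num <= 15): the 16-byte window starting
 * at offset 16 - num holds num bytes with the high bit set (those lanes
 * become zero) followed by 0x00, 0x01, ... (lanes copied from the low end
 * of reg). */
static __m128i shift_left_bytes(__m128i reg, unsigned int num)
{
	const __m128i ctrl =
		_mm_loadu_si128((const __m128i *)(shift_tab + 16 - num));

	return _mm_shuffle_epi8(reg, ctrl);
}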
Signed-off-by: Shreesh Adiga <16567adigashreesh at gmail.com>
---
Changes since v1:
Reversed the operands in the blend operation for readability.
Removed the tmp1 operations that do not affect the result, thereby
avoiding the clearing of the upper 64 bits of tmp1.
lib/net/net_crc_sse.c | 30 ++++++------------------------
1 file changed, 6 insertions(+), 24 deletions(-)
diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c
index 112dc94ac1..e590aeb5ac 100644
--- a/lib/net/net_crc_sse.c
+++ b/lib/net/net_crc_sse.c
@@ -96,35 +96,24 @@ crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
- static const alignas(16) uint32_t mask1[4] = {
- 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
- };
-
- static const alignas(16) uint32_t mask2[4] = {
- 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
- };
__m128i tmp0, tmp1, tmp2;
- tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
+ tmp0 = _mm_blend_epi16(data64, _mm_setzero_si128(), 0x3);
tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
tmp1 = _mm_xor_si128(tmp1, tmp0);
- tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
- tmp2 = _mm_xor_si128(tmp2, tmp1);
tmp2 = _mm_xor_si128(tmp2, tmp0);
return _mm_extract_epi32(tmp2, 2);
}
-static const alignas(16) uint8_t crc_xmm_shift_tab[48] = {
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+static const alignas(16) uint8_t crc_xmm_shift_tab[32] = {
+ 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};
/**
@@ -216,19 +205,12 @@ crc32_eth_calc_pclmulqdq(
0x80808080, 0x80808080, 0x80808080, 0x80808080
};
- const alignas(16) uint8_t shf_table[32] = {
- 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
- 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
- };
-
__m128i last16, a, b;
last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
temp = _mm_loadu_si128((const __m128i *)
- &shf_table[data_len & 15]);
+ &crc_xmm_shift_tab[data_len & 15]);
a = _mm_shuffle_epi8(fold, temp);
temp = _mm_xor_si128(temp,
--
2.49.1