[PATCH v11] net: optimize raw checksum computation

Morten Brørup mb at smartsharesystems.com
Fri Jan 9 19:28:05 CET 2026


>  static inline uint32_t
>  __rte_raw_cksum(const void *buf, size_t len, uint32_t sum)
>  {
> -	const void *end;
> -
> -	for (end = RTE_PTR_ADD(buf, RTE_ALIGN_FLOOR(len,
> sizeof(uint16_t)));
> -	     buf != end; buf = RTE_PTR_ADD(buf, sizeof(uint16_t))) {
> -		uint16_t v;
> -
> -		memcpy(&v, buf, sizeof(uint16_t));
> -		sum += v;
> -	}
> +	/* Process uint16 chunks to preserve overflow/carry math.
> GCC/Clang vectorize the loop. */
> +	const unaligned_uint16_t *buf16 = (const unaligned_uint16_t
> *)buf;
> +	const unaligned_uint16_t *end = buf16 + (len / sizeof(uint16_t));
> +	for (; buf16 != end; buf16++)
> +		sum += *buf16;

Here are some more thoughts about loop unrolling...
In another mail [1], you are discussing manual loop unrolling for rte_ipv4/ipv6_phdr_cksum().
Perhaps the compiler already unrolls the loop for those.
Check the assembly output for the existing code calling __rte_raw_cksum().
If the compiler doesn't unroll the loop in __rte_raw_cksum() for those two functions, maybe you can help it by modifying __rte_raw_cksum(); try replacing the end pointer with an integer counter, whose limit will be a compile-time constant when called by rte_ipv4/ipv6_phdr_cksum().

[1]: https://inbox.dpdk.org/dev/CAFn2buA5NzmzA0+t1_5auigvQTyT7Ne6RMVaPVU=sdC03nd2Lg@mail.gmail.com/

PS: I do the following when optimizing inline functions: I add non-inline wrapper functions that call the inline functions, and then use "objdump -S" to look at the generated code. For example:

/*
 * Out-of-line wrapper around the inline __rte_raw_cksum().
 * The length is a run-time parameter here, so the code generated for the
 * general case can be inspected with "objdump -S".
 */
uint32_t
review__rte_raw_cksum(const void *buf, size_t len, uint32_t sum)
{
	return __rte_raw_cksum(buf, len, sum);
}

/*
 * Out-of-line wrapper calling the inline __rte_raw_cksum() with a fixed
 * length of 20 bytes, so the code generated for that compile-time constant
 * length can be inspected with "objdump -S".
 */
uint32_t
review__rte_raw_cksum_len20(const void *buf, uint32_t sum)
{
	/* Length passed to __rte_raw_cksum(); a true constant expression. */
	enum { REVIEW_CKSUM_LEN = 20 };

	return __rte_raw_cksum(buf, REVIEW_CKSUM_LEN, sum);
}

/*
 * Out-of-line wrapper calling the inline __rte_raw_cksum() with a fixed
 * length of 8 bytes, so the code generated for that compile-time constant
 * length can be inspected with "objdump -S".
 */
uint32_t
review__rte_raw_cksum_len8(const void *buf, uint32_t sum)
{
	/* Length passed to __rte_raw_cksum(); a true constant expression. */
	enum { REVIEW_CKSUM_LEN = 8 };

	return __rte_raw_cksum(buf, REVIEW_CKSUM_LEN, sum);
}

> 
>  	/* if length is odd, keeping it byte order independent */
> -	if (unlikely(len % 2)) {
> +	if (len & 1) {
>  		uint16_t left = 0;
> -
>  		memcpy(&left, end, 1);
>  		sum += left;
>  	}
> diff --git a/lib/net/rte_ip4.h b/lib/net/rte_ip4.h
> index 822a660cfb..63852717c9 100644
> --- a/lib/net/rte_ip4.h
> +++ b/lib/net/rte_ip4.h
> @@ -223,21 +223,17 @@ rte_ipv4_phdr_cksum(const struct rte_ipv4_hdr
> *ipv4_hdr, uint64_t ol_flags)
>  		uint8_t  zero;     /* zero. */
>  		uint8_t  proto;    /* L4 protocol type. */
>  		uint16_t len;      /* L4 length. */
> -	} psd_hdr;
> -
> -	uint32_t l3_len;
> -
> -	psd_hdr.src_addr = ipv4_hdr->src_addr;
> -	psd_hdr.dst_addr = ipv4_hdr->dst_addr;
> -	psd_hdr.zero = 0;
> -	psd_hdr.proto = ipv4_hdr->next_proto_id;
> -	if (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)) {
> -		psd_hdr.len = 0;
> -	} else {
> -		l3_len = rte_be_to_cpu_16(ipv4_hdr->total_length);
> -		psd_hdr.len = rte_cpu_to_be_16((uint16_t)(l3_len -
> -			rte_ipv4_hdr_len(ipv4_hdr)));
> -	}
> +	} psd_hdr = {
> +		.src_addr = ipv4_hdr->src_addr,
> +		.dst_addr = ipv4_hdr->dst_addr,
> +		.proto = ipv4_hdr->next_proto_id,
> +		.len = (ol_flags & (RTE_MBUF_F_TX_TCP_SEG |
> RTE_MBUF_F_TX_UDP_SEG))
> +			? (uint16_t)0
> +			:
> rte_cpu_to_be_16((uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length) -
> +					rte_ipv4_hdr_len(ipv4_hdr)))
> +	};
> +	RTE_SUPPRESS_UNINITIALIZED_WARNING(psd_hdr);
> +
>  	return rte_raw_cksum(&psd_hdr, sizeof(psd_hdr));
>  }
> 
> diff --git a/lib/net/rte_ip6.h b/lib/net/rte_ip6.h
> index d1abf1f5d5..8a7e5e4b8a 100644
> --- a/lib/net/rte_ip6.h
> +++ b/lib/net/rte_ip6.h
> @@ -560,19 +560,18 @@ rte_ipv6_phdr_cksum(const struct rte_ipv6_hdr
> *ipv6_hdr, uint64_t ol_flags)
>  static inline uint16_t
>  rte_ipv6_phdr_cksum(const struct rte_ipv6_hdr *ipv6_hdr, uint64_t
> ol_flags)
>  {
> -	uint32_t sum;
>  	struct {
>  		rte_be32_t len;   /* L4 length. */
>  		rte_be32_t proto; /* L4 protocol - top 3 bytes must be zero
> */
> -	} psd_hdr;
> -
> -	psd_hdr.proto = (uint32_t)(ipv6_hdr->proto << 24);
> -	if (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG))
> -		psd_hdr.len = 0;
> -	else
> -		psd_hdr.len = ipv6_hdr->payload_len;
> +	} psd_hdr = {
> +		.len = (ol_flags & (RTE_MBUF_F_TX_TCP_SEG |
> RTE_MBUF_F_TX_UDP_SEG))
> +			? (rte_be32_t)0
> +			: ipv6_hdr->payload_len,
> +		.proto = (uint32_t)(ipv6_hdr->proto << 24)
> +	};
> +	RTE_SUPPRESS_UNINITIALIZED_WARNING(psd_hdr);
> 
> -	sum = __rte_raw_cksum(&ipv6_hdr->src_addr,
> +	uint32_t sum = __rte_raw_cksum(&ipv6_hdr->src_addr,
>  		sizeof(ipv6_hdr->src_addr) + sizeof(ipv6_hdr->dst_addr),
>  		0);
>  	sum = __rte_raw_cksum(&psd_hdr, sizeof(psd_hdr), sum);
> --
> 2.39.5 (Apple Git-154)



More information about the dev mailing list