[dpdk-dev] [PATCH v2 10/13] mbuf: generic support for TCP segmentation offload

Ananyev, Konstantin konstantin.ananyev at intel.com
Tue Nov 18 00:33:30 CET 2014



> -----Original Message-----
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Olivier Matz
> Sent: Friday, November 14, 2014 5:03 PM
> To: dev at dpdk.org
> Cc: jigsaw at gmail.com
> Subject: [dpdk-dev] [PATCH v2 10/13] mbuf: generic support for TCP segmentation offload
> 
> Some of the NICs supported by DPDK can accelerate TCP traffic by
> using segmentation offload. The application prepares a packet with a
> valid TCP header and a size of up to 64K, and delegates the
> segmentation to the NIC.
> 
> Implement the generic part of TCP segmentation offload in rte_mbuf. It
> introduces 2 new fields in rte_mbuf: l4_len (length of L4 header in bytes)
> and tso_segsz (MSS of packets).
> 
> To delegate the TCP segmentation to the hardware, the user has to:
> 
> - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag implies
>   PKT_TX_TCP_CKSUM)
> - set PKT_TX_IP_CKSUM if it's IPv4, and set the IP checksum to 0 in
>   the packet
> - fill the mbuf offload information: l2_len, l3_len, l4_len, tso_segsz
> - calculate the pseudo header checksum without taking ip_len into account,
>   and set it in the TCP header, for instance by using
>   rte_ipv4_phdr_cksum(ip_hdr, ol_flags)
> 
> The API is inspired by ixgbe hardware (the next commit adds the
> support for ixgbe), but it seems generic enough to be used for other
> hw/drivers in the future.
> 
> This commit also reworks the way l2_len and l3_len are used in igb
> and ixgbe drivers as the l2_l3_len is not available anymore in mbuf.
> 
> Signed-off-by: Mirek Walukiewicz <miroslaw.walukiewicz at intel.com>
> Signed-off-by: Olivier Matz <olivier.matz at 6wind.com>

Acked-by: Konstantin Ananyev <konstantin.ananyev at intel.com>

> ---
>  app/test-pmd/testpmd.c            |  2 +-
>  examples/ipv4_multicast/main.c    |  2 +-
>  lib/librte_mbuf/rte_mbuf.c        |  1 +
>  lib/librte_mbuf/rte_mbuf.h        | 44 +++++++++++++++++++++++----------------
>  lib/librte_net/rte_ip.h           | 39 +++++++++++++++++++++++++++-------
>  lib/librte_pmd_e1000/igb_rxtx.c   | 11 +++++++++-
>  lib/librte_pmd_ixgbe/ixgbe_rxtx.c | 11 +++++++++-
>  7 files changed, 81 insertions(+), 29 deletions(-)
> 
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
> index 12adafa..632a993 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -408,7 +408,7 @@ testpmd_mbuf_ctor(struct rte_mempool *mp,
>  	mb->ol_flags     = 0;
>  	mb->data_off     = RTE_PKTMBUF_HEADROOM;
>  	mb->nb_segs      = 1;
> -	mb->l2_l3_len       = 0;
> +	mb->tx_offload   = 0;
>  	mb->vlan_tci     = 0;
>  	mb->hash.rss     = 0;
>  }
> diff --git a/examples/ipv4_multicast/main.c b/examples/ipv4_multicast/main.c
> index 590d11a..80c5140 100644
> --- a/examples/ipv4_multicast/main.c
> +++ b/examples/ipv4_multicast/main.c
> @@ -302,7 +302,7 @@ mcast_out_pkt(struct rte_mbuf *pkt, int use_clone)
>  	/* copy metadata from source packet*/
>  	hdr->port = pkt->port;
>  	hdr->vlan_tci = pkt->vlan_tci;
> -	hdr->l2_l3_len = pkt->l2_l3_len;
> +	hdr->tx_offload = pkt->tx_offload;
>  	hdr->hash = pkt->hash;
> 
>  	hdr->ol_flags = pkt->ol_flags;
> diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> index 5cd9137..75295c8 100644
> --- a/lib/librte_mbuf/rte_mbuf.c
> +++ b/lib/librte_mbuf/rte_mbuf.c
> @@ -238,6 +238,7 @@ const char *rte_get_tx_ol_flag_name(uint64_t mask)
>  	case PKT_TX_UDP_CKSUM: return "PKT_TX_UDP_CKSUM";
>  	case PKT_TX_IEEE1588_TMST: return "PKT_TX_IEEE1588_TMST";
>  	case PKT_TX_VXLAN_CKSUM: return "PKT_TX_VXLAN_CKSUM";
> +	case PKT_TX_TCP_SEG: return "PKT_TX_TCP_SEG";
>  	default: return NULL;
>  	}
>  }
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 3c8e825..9f44d08 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -127,6 +127,20 @@ extern "C" {
> 
>  #define PKT_TX_VXLAN_CKSUM   (1ULL << 50) /**< TX checksum of VXLAN computed by NIC */
> 
> +/**
> + * TCP segmentation offload. To enable this offload feature for a
> + * packet to be transmitted on hardware supporting TSO:
> + *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag implies
> + *    PKT_TX_TCP_CKSUM)
> + *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and write the IP checksum
> + *    to 0 in the packet
> + *  - fill the mbuf offload information: l2_len, l3_len, l4_len, tso_segsz
> + *  - calculate the pseudo header checksum without taking ip_len into account,
> + *    and set it in the TCP header. Refer to rte_ipv4_phdr_cksum() and
> + *    rte_ipv6_phdr_cksum() that can be used as helpers.
> + */
> +#define PKT_TX_TCP_SEG       (1ULL << 49)
> +
>  /* Use final bit of flags to indicate a control mbuf */
>  #define CTRL_MBUF_FLAG       (1ULL << 63) /**< Mbuf contains control data */
> 
> @@ -228,22 +242,18 @@ struct rte_mbuf {
> 
>  	/* fields to support TX offloads */
>  	union {
> -		uint16_t l2_l3_len; /**< combined l2/l3 lengths as single var */
> +		uint64_t tx_offload;       /**< combined for easy fetch */
>  		struct {
> -			uint16_t l3_len:9;      /**< L3 (IP) Header Length. */
> -			uint16_t l2_len:7;      /**< L2 (MAC) Header Length. */
> -		};
> -	};
> +			uint64_t l2_len:7; /**< L2 (MAC) Header Length. */
> +			uint64_t l3_len:9; /**< L3 (IP) Header Length. */
> +			uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
> +			uint64_t tso_segsz:16; /**< TCP TSO segment size */
> 
> -	/* fields for TX offloading of tunnels */
> -	union {
> -		uint16_t inner_l2_l3_len;
> -		/**< combined inner l2/l3 lengths as single var */
> -		struct {
> -			uint16_t inner_l3_len:9;
> -			/**< inner L3 (IP) Header Length. */
> -			uint16_t inner_l2_len:7;
> -			/**< inner L2 (MAC) Header Length. */
> +			/* fields for TX offloading of tunnels */
> +			uint64_t inner_l3_len:9; /**< inner L3 (IP) Hdr Length. */
> +			uint64_t inner_l2_len:7; /**< inner L2 (MAC) Hdr Length. */
> +
> +			/* uint64_t unused:8; */
>  		};
>  	};
>  } __rte_cache_aligned;
> @@ -595,8 +605,7 @@ static inline void rte_pktmbuf_reset(struct rte_mbuf *m)
>  {
>  	m->next = NULL;
>  	m->pkt_len = 0;
> -	m->l2_l3_len = 0;
> -	m->inner_l2_l3_len = 0;
> +	m->tx_offload = 0;
>  	m->vlan_tci = 0;
>  	m->nb_segs = 1;
>  	m->port = 0xff;
> @@ -665,8 +674,7 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md)
>  	mi->data_len = md->data_len;
>  	mi->port = md->port;
>  	mi->vlan_tci = md->vlan_tci;
> -	mi->l2_l3_len = md->l2_l3_len;
> -	mi->inner_l2_l3_len = md->inner_l2_l3_len;
> +	mi->tx_offload = md->tx_offload;
>  	mi->hash = md->hash;
> 
>  	mi->next = NULL;
> diff --git a/lib/librte_net/rte_ip.h b/lib/librte_net/rte_ip.h
> index 9cfca7f..1fafa73 100644
> --- a/lib/librte_net/rte_ip.h
> +++ b/lib/librte_net/rte_ip.h
> @@ -80,6 +80,7 @@
> 
>  #include <rte_memcpy.h>
>  #include <rte_byteorder.h>
> +#include <rte_mbuf.h>
> 
>  #ifdef __cplusplus
>  extern "C" {
> @@ -308,13 +309,21 @@ rte_ipv4_cksum(const struct ipv4_hdr *ipv4_hdr)
>   *
>   * The checksum field must be set to 0 by the caller.
>   *
> + * Depending on the ol_flags, the pseudo-header checksum expected by the
> + * drivers is not the same. For instance, when TSO is enabled, the IP
> + * payload length must not be included in the pseudo-header checksum.
> + *
> + * When ol_flags is 0, it computes the standard pseudo-header checksum.
> + *
>   * @param ipv4_hdr
>   *   The pointer to the contiguous IPv4 header.
> + * @param ol_flags
> + *   The ol_flags of the associated mbuf.
>   * @return
>   *   The non-complemented checksum to set in the L4 header.
>   */
>  static inline uint16_t
> -rte_ipv4_phdr_cksum(const struct ipv4_hdr *ipv4_hdr)
> +rte_ipv4_phdr_cksum(const struct ipv4_hdr *ipv4_hdr, uint64_t ol_flags)
>  {
>  	struct ipv4_psd_header {
>  		uint32_t src_addr; /* IP address of source host. */
> @@ -328,9 +337,13 @@ rte_ipv4_phdr_cksum(const struct ipv4_hdr *ipv4_hdr)
>  	psd_hdr.dst_addr = ipv4_hdr->dst_addr;
>  	psd_hdr.zero = 0;
>  	psd_hdr.proto = ipv4_hdr->next_proto_id;
> -	psd_hdr.len = rte_cpu_to_be_16(
> -		(uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length)
> -			- sizeof(struct ipv4_hdr)));
> +	if (ol_flags & PKT_TX_TCP_SEG) {
> +		psd_hdr.len = 0;
> +	} else {
> +		psd_hdr.len = rte_cpu_to_be_16(
> +			(uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length)
> +				- sizeof(struct ipv4_hdr)));
> +	}
>  	return rte_raw_cksum((const char *)&psd_hdr, sizeof(psd_hdr));
>  }
> 
> @@ -357,7 +370,7 @@ rte_ipv4_udptcp_cksum(const struct ipv4_hdr *ipv4_hdr, const void *l4_hdr)
>  		sizeof(struct ipv4_hdr);
> 
>  	cksum = rte_raw_cksum(l4_hdr, l4_len);
> -	cksum += rte_ipv4_phdr_cksum(ipv4_hdr);
> +	cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0);
> 
>  	cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
>  	cksum = (~cksum) & 0xffff;
> @@ -382,13 +395,21 @@ struct ipv6_hdr {
>  /**
>   * Process the pseudo-header checksum of an IPv6 header.
>   *
> + * Depending on the ol_flags, the pseudo-header checksum expected by the
> + * drivers is not the same. For instance, when TSO is enabled, the IPv6
> + * payload length must not be included in the pseudo-header checksum.
> + *
> + * When ol_flags is 0, it computes the standard pseudo-header checksum.
> + *
>   * @param ipv6_hdr
>   *   The pointer to the contiguous IPv6 header.
> + * @param ol_flags
> + *   The ol_flags of the associated mbuf.
>   * @return
>   *   The non-complemented checksum to set in the L4 header.
>   */
>  static inline uint16_t
> -rte_ipv6_phdr_cksum(const struct ipv6_hdr *ipv6_hdr)
> +rte_ipv6_phdr_cksum(const struct ipv6_hdr *ipv6_hdr, uint64_t ol_flags)
>  {
>  	struct ipv6_psd_header {
>  		uint8_t src_addr[16]; /* IP address of source host. */
> @@ -400,7 +421,11 @@ rte_ipv6_phdr_cksum(const struct ipv6_hdr *ipv6_hdr)
>  	rte_memcpy(&psd_hdr.src_addr, ipv6_hdr->src_addr,
>  		sizeof(ipv6_hdr->src_addr) + sizeof(ipv6_hdr->dst_addr));
>  	psd_hdr.proto = (ipv6_hdr->proto << 24);
> -	psd_hdr.len = ipv6_hdr->payload_len;
> +	if (ol_flags & PKT_TX_TCP_SEG) {
> +		psd_hdr.len = 0;
> +	} else {
> +		psd_hdr.len = ipv6_hdr->payload_len;
> +	}
> 
>  	return rte_raw_cksum((const char *)&psd_hdr, sizeof(psd_hdr));
>  }
> diff --git a/lib/librte_pmd_e1000/igb_rxtx.c b/lib/librte_pmd_e1000/igb_rxtx.c
> index 433c616..848d5d1 100644
> --- a/lib/librte_pmd_e1000/igb_rxtx.c
> +++ b/lib/librte_pmd_e1000/igb_rxtx.c
> @@ -367,6 +367,13 @@ eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  	struct rte_mbuf     *tx_pkt;
>  	struct rte_mbuf     *m_seg;
>  	union igb_vlan_macip vlan_macip_lens;
> +	union {
> +		uint16_t u16;
> +		struct {
> +			uint16_t l3_len:9;
> +			uint16_t l2_len:7;
> +		};
> +	} l2_l3_len;
>  	uint64_t buf_dma_addr;
>  	uint32_t olinfo_status;
>  	uint32_t cmd_type_len;
> @@ -404,8 +411,10 @@ eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  		tx_last = (uint16_t) (tx_id + tx_pkt->nb_segs - 1);
> 
>  		ol_flags = tx_pkt->ol_flags;
> +		l2_l3_len.l2_len = tx_pkt->l2_len;
> +		l2_l3_len.l3_len = tx_pkt->l3_len;
>  		vlan_macip_lens.f.vlan_tci = tx_pkt->vlan_tci;
> -		vlan_macip_lens.f.l2_l3_len = tx_pkt->l2_l3_len;
> +		vlan_macip_lens.f.l2_l3_len = l2_l3_len.u16;
>  		tx_ol_req = ol_flags & IGB_TX_OFFLOAD_MASK;
> 
>  		/* If a Context Descriptor need be built . */
> diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> index ca35db2..2df3385 100644
> --- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> +++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
> @@ -546,6 +546,13 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  	struct rte_mbuf     *tx_pkt;
>  	struct rte_mbuf     *m_seg;
>  	union ixgbe_vlan_macip vlan_macip_lens;
> +	union {
> +		uint16_t u16;
> +		struct {
> +			uint16_t l3_len:9;
> +			uint16_t l2_len:7;
> +		};
> +	} l2_l3_len;
>  	uint64_t buf_dma_addr;
>  	uint32_t olinfo_status;
>  	uint32_t cmd_type_len;
> @@ -588,8 +595,10 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  		/* If hardware offload required */
>  		tx_ol_req = ol_flags & IXGBE_TX_OFFLOAD_MASK;
>  		if (tx_ol_req) {
> +			l2_l3_len.l2_len = tx_pkt->l2_len;
> +			l2_l3_len.l3_len = tx_pkt->l3_len;
>  			vlan_macip_lens.f.vlan_tci = tx_pkt->vlan_tci;
> -			vlan_macip_lens.f.l2_l3_len = tx_pkt->l2_l3_len;
> +			vlan_macip_lens.f.l2_l3_len = l2_l3_len.u16;
> 
>  			/* If new context need be built or reuse the exist ctx. */
>  			ctx = what_advctx_update(txq, tx_ol_req,
> --
> 2.1.0



More information about the dev mailing list