[dpdk-dev] [PATCH v2 13/17] ixgbe: support TCP segmentation offload

Olivier Matz olivier.matz at 6wind.com
Mon May 19 15:56:25 CEST 2014


Implement TSO (TCP segmentation offload) in the ixgbe driver.
The driver can now use the PKT_TX_TCP_SEG mbuf flag and m->hw_offload
to configure hardware TCP segmentation.
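
For reference, a minimal sketch (not part of the patch) of how an
application could request TSO on an outgoing mbuf with this API. The
PKT_TX_TCP_SEG flag and the m->hw_offload fields (l2_len, l3_len,
l4_len, mss) come from this series; the function name, header sizes and
MSS value below are purely illustrative:

  #include <rte_mbuf.h>
  #include <rte_ether.h>
  #include <rte_ip.h>
  #include <rte_tcp.h>

  /* sketch: mark a TCP/IPv4 mbuf for TSO before rte_eth_tx_burst() */
  static void
  prepare_tso_mbuf(struct rte_mbuf *m)
  {
  	/* PKT_TX_TCP_SEG implies IP and TCP checksum offload in the
  	 * ixgbe path (see ixgbe_set_xmit_ctx() below) */
  	m->ol_flags |= PKT_TX_TCP_SEG;
  	m->hw_offload.l2_len = sizeof(struct ether_hdr); /* 14 */
  	m->hw_offload.l3_len = sizeof(struct ipv4_hdr);  /* 20 */
  	m->hw_offload.l4_len = sizeof(struct tcp_hdr);   /* 20 */
  	m->hw_offload.mss = 800;  /* segment size, cf. "tso set 800 0" */
  }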

In this patch, the tx_desc_cksum_flags_to_olinfo() and
tx_desc_ol_flags_to_cmdtype() functions have been reworked to make them
clearer. This should not impact performance, as gcc (version 4.8 in my
case) is smart enough to convert the tests into code that does not
contain any branch instructions.

Validation
==========

platform:

  Tester (linux)   <---->   DUT (DPDK)

Run testpmd on DUT:

  cd dpdk.org/
  make install T=x86_64-default-linuxapp-gcc
  cd x86_64-default-linuxapp-gcc/
  modprobe uio
  insmod kmod/igb_uio.ko
  python ../tools/igb_uio_bind.py -b igb_uio 0000:02:00.0
  echo 0 > /proc/sys/kernel/randomize_va_space
  echo 1000 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
  echo 1000 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages
  mount -t hugetlbfs none /mnt/huge
  ./app/testpmd -c 0x55 -n 4 -m 800 -- -i --port-topology=chained

Disable all offload features on the Tester, and start the capture:

  ethtool -K ixgbe0 rx off tx off tso off gso off gro off lro off
  ip l set ixgbe0 up
  tcpdump -n -e -i ixgbe0 -s 0 -w /tmp/cap

We use the following scapy script for testing:

  def test():
    ############### IPv4
    # checksum TCP
    p=Ether()/IP(src=RandIP(), dst=RandIP())/TCP(flags=0x10)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # checksum UDP
    p=Ether()/IP(src=RandIP(), dst=RandIP())/UDP()/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # bad IP checksum
    p=Ether()/IP(src=RandIP(), dst=RandIP(), chksum=0x1234)/TCP(flags=0x10)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # bad TCP checksum
    p=Ether()/IP(src=RandIP(), dst=RandIP())/TCP(flags=0x10, chksum=0x1234)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # large packet
    p=Ether()/IP(src=RandIP(), dst=RandIP())/TCP(flags=0x10)/Raw(RandString(1400))
    sendp(p, iface="ixgbe0", count=5)
    ############### IPv6
    # checksum TCP
    p=Ether()/IPv6(src=RandIP6(), dst=RandIP6())/TCP(flags=0x10)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # checksum UDP
    p=Ether()/IPv6(src=RandIP6(), dst=RandIP6())/UDP()/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # bad TCP checksum
    p=Ether()/IPv6(src=RandIP6(), dst=RandIP6())/TCP(flags=0x10, chksum=0x1234)/Raw(RandString(50))
    sendp(p, iface="ixgbe0", count=5)
    # large packet
    p=Ether()/IPv6(src=RandIP6(), dst=RandIP6())/TCP(flags=0x10)/Raw(RandString(1400))
    sendp(p, iface="ixgbe0", count=5)

Without hw cksum
----------------

On DUT:

  # disable hw cksum (use sw) in csumonly test, disable tso
  stop
  set fwd csum
  tx_checksum set 0x0 0
  tso set 0 0
  start

On tester:

  >>> test()

Then check the capture file.

With hw cksum
-------------

On DUT:

  # enable hw cksum in csumonly test, disable tso
  stop
  set fwd csum
  tx_checksum set 0xf 0
  tso set 0 0
  start

On tester:

  >>> test()

Then check the capture file.

With TSO
--------

On DUT:

  set fwd csum
  tx_checksum set 0xf 0
  tso set 800 0
  start

On tester:

  >>> test()

Then check the capture file.

Performance tests
=================

Performance tests have been done on v1 of this patch series.
See http://dpdk.org/ml/archives/dev/2014-May/002516.html

Signed-off-by: Olivier Matz <olivier.matz at 6wind.com>
---
 lib/librte_pmd_ixgbe/ixgbe_ethdev.c |   3 +-
 lib/librte_pmd_ixgbe/ixgbe_rxtx.c   | 169 +++++++++++++++++++++++++++---------
 2 files changed, 129 insertions(+), 43 deletions(-)

diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
index e78c208..2d5e524 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
@@ -1751,7 +1751,8 @@ ixgbe_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 		DEV_TX_OFFLOAD_IPV4_CKSUM  |
 		DEV_TX_OFFLOAD_UDP_CKSUM   |
 		DEV_TX_OFFLOAD_TCP_CKSUM   |
-		DEV_TX_OFFLOAD_SCTP_CKSUM;
+		DEV_TX_OFFLOAD_SCTP_CKSUM |
+		DEV_TX_OFFLOAD_TCP_TSO;
 }
 
 /* return 0 means link status changed, -1 means not changed */
diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
index e1eb59d..541dd58 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
@@ -347,13 +347,59 @@ ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 	return nb_tx;
 }
 
+/* When doing TSO, the IP length must not be included in the pseudo
+ * header checksum of the packet given to the hardware */
+static inline void
+ixgbe_fix_tcp_phdr_cksum(struct rte_mbuf *m)
+{
+	char *data;
+	uint16_t *cksum_ptr;
+	uint16_t prev_cksum;
+	uint16_t new_cksum;
+	uint16_t ip_len, ip_paylen;
+	uint32_t tmp;
+	uint8_t ip_version;
+
+	/* get phdr cksum at offset 16 of TCP header */
+	data = rte_pktmbuf_mtod(m, char *);
+	cksum_ptr = (uint16_t *)(data + m->hw_offload.l2_len +
+		m->hw_offload.l3_len + 16);
+	prev_cksum = *cksum_ptr;
+
+	/* get ip_version */
+	ip_version = (*(uint8_t *)(data + m->hw_offload.l2_len)) >> 4;
+
+	/* get ip_len at offset 2 of IP header or offset 4 of IPv6 header */
+	if (ip_version == 4) {
+		/* override ip cksum to 0 */
+		data[m->hw_offload.l2_len + 10] = 0;
+		data[m->hw_offload.l2_len + 11] = 0;
+
+		ip_len = *(uint16_t *)(data + m->hw_offload.l2_len + 2);
+		ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
+			m->hw_offload.l3_len);
+	} else {
+		ip_paylen = *(uint16_t *)(data + m->hw_offload.l2_len + 4);
+	}
+
+	/* calculate the new phdr checksum that doesn't include ip_paylen */
+	tmp = prev_cksum ^ 0xffff;
+	if (tmp < ip_paylen)
+		tmp += 0xffff;
+	tmp -= ip_paylen;
+	new_cksum = tmp;
+
+	/* replace it in the packet */
+	*cksum_ptr = new_cksum;
+}
+
 static inline void
 ixgbe_set_xmit_ctx(struct igb_tx_queue* txq,
 		volatile struct ixgbe_adv_tx_context_desc *ctx_txd,
 		uint32_t ol_flags, union rte_hw_offload hw_offload)
 {
 	uint32_t type_tucmd_mlhl;
-	uint32_t mss_l4len_idx;
+	uint32_t mss_l4len_idx = 0;
 	uint32_t ctx_idx;
 	uint32_t vlan_macip_lens;
 	union rte_hw_offload offload_mask;
@@ -362,44 +408,61 @@ ixgbe_set_xmit_ctx(struct igb_tx_queue* txq,
 	offload_mask.u64 = 0;
 	type_tucmd_mlhl = 0;
 
+	/* Specify which HW CTX to upload. */
+	mss_l4len_idx |= (ctx_idx << IXGBE_ADVTXD_IDX_SHIFT);
+
 	if (ol_flags & PKT_TX_VLAN_PKT) {
 		offload_mask.vlan_tci = 0xffff;
 	}
 
-	if (ol_flags & PKT_TX_IP_CKSUM) {
-		type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4;
+	/* check if TCP segmentation required for this packet */
+	if (ol_flags & PKT_TX_TCP_SEG) {
+		/* implies IP cksum and TCP cksum */
+		type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4 |
+			IXGBE_ADVTXD_TUCMD_L4T_TCP |
+			IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
+
 		offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
 		offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
-	}
+		offload_mask.l4_len = HW_OFFLOAD_L4_LEN_MASK;
+		offload_mask.mss = 0xffff;
+		mss_l4len_idx |= hw_offload.mss << IXGBE_ADVTXD_MSS_SHIFT;
+		mss_l4len_idx |= hw_offload.l4_len << IXGBE_ADVTXD_L4LEN_SHIFT;
+	} else { /* no TSO, check if hardware checksum is needed */
+		if (ol_flags & PKT_TX_IP_CKSUM) {
+			type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4;
+			offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
+			offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
+		}
 
-	/* Specify which HW CTX to upload. */
-	mss_l4len_idx = (ctx_idx << IXGBE_ADVTXD_IDX_SHIFT);
-	switch (ol_flags & PKT_TX_L4_MASK) {
-	case PKT_TX_UDP_CKSUM:
-		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP |
+		switch (ol_flags & PKT_TX_L4_MASK) {
+		case PKT_TX_UDP_CKSUM:
+			type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP |
 				IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
-		mss_l4len_idx |= sizeof(struct udp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
-		offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
-		offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
-		break;
-	case PKT_TX_TCP_CKSUM:
-		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP |
+			mss_l4len_idx |= sizeof(struct udp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
+			offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
+			offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
+			break;
+		case PKT_TX_TCP_CKSUM:
+			type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP |
 				IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
-		mss_l4len_idx |= sizeof(struct tcp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
-		offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
-		offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
-		break;
-	case PKT_TX_SCTP_CKSUM:
-		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP |
+			mss_l4len_idx |= sizeof(struct tcp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
+			offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
+			offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
+			offload_mask.l4_len = HW_OFFLOAD_L4_LEN_MASK;
+			break;
+		case PKT_TX_SCTP_CKSUM:
+			type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP |
 				IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
-		mss_l4len_idx |= sizeof(struct sctp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
-		offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
-		offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
-		break;
-	default:
-		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_RSV |
+			mss_l4len_idx |= sizeof(struct sctp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
+			offload_mask.l2_len = HW_OFFLOAD_L2_LEN_MASK;
+			offload_mask.l3_len = HW_OFFLOAD_L3_LEN_MASK;
+			break;
+		default:
+			type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_RSV |
 				IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
-		break;
+			break;
+		}
 	}
 
 	txq->ctx_cache[ctx_idx].flags = ol_flags;
@@ -446,20 +509,25 @@ what_advctx_update(struct igb_tx_queue *txq, uint32_t flags,
 static inline uint32_t
 tx_desc_cksum_flags_to_olinfo(uint32_t ol_flags)
 {
-	static const uint32_t l4_olinfo[2] = {0, IXGBE_ADVTXD_POPTS_TXSM};
-	static const uint32_t l3_olinfo[2] = {0, IXGBE_ADVTXD_POPTS_IXSM};
-	uint32_t tmp;
-
-	tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
-	tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
+	uint32_t tmp = 0;
+	if ((ol_flags & PKT_TX_L4_MASK) != PKT_TX_L4_NO_CKSUM)
+		tmp |= IXGBE_ADVTXD_POPTS_TXSM;
+	if (ol_flags & PKT_TX_IP_CKSUM)
+		tmp |= IXGBE_ADVTXD_POPTS_IXSM;
+	if (ol_flags & PKT_TX_TCP_SEG)
+		tmp |= IXGBE_ADVTXD_POPTS_TXSM | IXGBE_ADVTXD_POPTS_IXSM;
 	return tmp;
 }
 
 static inline uint32_t
-tx_desc_vlan_flags_to_cmdtype(uint32_t ol_flags)
+tx_desc_ol_flags_to_cmdtype(uint32_t ol_flags)
 {
-	static const uint32_t vlan_cmd[2] = {0, IXGBE_ADVTXD_DCMD_VLE};
-	return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
+	uint32_t cmdtype = 0;
+	if (ol_flags & PKT_TX_VLAN_PKT)
+		cmdtype |= IXGBE_ADVTXD_DCMD_VLE;
+	if (ol_flags & PKT_TX_TCP_SEG)
+		cmdtype |= IXGBE_ADVTXD_DCMD_TSE;
+	return cmdtype;
 }
 
 /* Default RS bit threshold values */
@@ -583,7 +651,8 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 		/* If hardware offload required */
 		tx_ol_req = ol_flags &
-			(PKT_TX_VLAN_PKT | PKT_TX_IP_CKSUM | PKT_TX_L4_MASK);
+			(PKT_TX_VLAN_PKT | PKT_TX_IP_CKSUM | PKT_TX_L4_MASK |
+			PKT_TX_TCP_SEG);
 		if (tx_ol_req) {
 			/* If new context need be built or reuse the exist ctx. */
 			ctx = what_advctx_update(txq, tx_ol_req,
@@ -702,13 +771,26 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		 */
 		cmd_type_len = IXGBE_ADVTXD_DTYP_DATA |
 			IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT;
-		olinfo_status = (pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
+
 #ifdef RTE_LIBRTE_IEEE1588
 		if (ol_flags & PKT_TX_IEEE1588_TMST)
 			cmd_type_len |= IXGBE_ADVTXD_MAC_1588;
 #endif
 
+		olinfo_status = 0;
 		if (tx_ol_req) {
+
+			if (ol_flags & PKT_TX_TCP_SEG) {
+				/* paylen in descriptor is not the packet
+				 * len but the tcp payload len if TSO is on */
+				pkt_len -= (hw_offload.l2_len +
+					hw_offload.l3_len + hw_offload.l4_len);
+
+				/* the pseudo header checksum must be modified:
+				 * it should not include the ip_len */
+				ixgbe_fix_tcp_phdr_cksum(tx_pkt);
+			}
+
 			/*
 			 * Setup the TX Advanced Context Descriptor if required
 			 */
@@ -741,11 +823,13 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			 * This path will go through
 			 * whatever new/reuse the context descriptor
 			 */
-			cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
+			cmd_type_len  |= tx_desc_ol_flags_to_cmdtype(ol_flags);
 			olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
 			olinfo_status |= ctx << IXGBE_ADVTXD_IDX_SHIFT;
 		}
 
+		olinfo_status |= (pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
+
 		m_seg = tx_pkt;
 		do {
 			txd = &txr[tx_id];
@@ -3420,9 +3504,10 @@ ixgbe_dev_tx_init(struct rte_eth_dev *dev)
 	PMD_INIT_FUNC_TRACE();
 	hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 
-	/* Enable TX CRC (checksum offload requirement) */
+	/* Enable TX CRC (checksum offload requirement) and hw padding
+	 * (TSO requirement) */
 	hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0);
-	hlreg0 |= IXGBE_HLREG0_TXCRCEN;
+	hlreg0 |= (IXGBE_HLREG0_TXCRCEN | IXGBE_HLREG0_TXPADEN);
 	IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0);
 
 	/* Setup the Base and Length of the Tx Descriptor Rings */
-- 
1.9.2
