[dpdk-dev] [PATCH v4 2/3] lib/gro: add TCP/IPv4 GRO support

Jiayu Hu jiayu.hu at intel.com
Wed Jun 7 13:08:50 CEST 2017


In this patch, we introduce six APIs to support TCP/IPv4 GRO.
- gro_tcp_tbl_create: create a TCP reassembly table, which is used to
    merge packets.
- gro_tcp_tbl_destroy: free memory space of a TCP reassembly table.
- gro_tcp_tbl_flush: flush packets in the TCP reassembly table.
- gro_tcp_tbl_timeout_flush: flush timeout packets in the TCP
    reassembly table.
- gro_tcp4_reassemble: merge an input packet.
- gro_tcp4_tbl_cksum_update: update TCP and IPv4 header checksums for
    all merged packets in the TCP reassembly table.

In TCP GRO, we use a table structure, called TCP reassembly table, to
reassemble packets. Both TCP/IPv4 and TCP/IPv6 GRO use the same table
structure. A TCP reassembly table includes a flow array and an item
array, where the flow array is used to record flow information and the
item array is used to record packet information.

Each element in the flow array records the information of one flow,
which includes two parts:
- key: the criteria of the same flow. If packets have the same key
    value, they belong to the same flow.
- start_index: the index of the first incoming packet of this flow in
    the item array. With start_index, we can locate the first incoming
    packet of this flow.
Each element in the item array records the information of one packet. It
mainly includes two parts:
- pkt: packet address
- next_pkt_index: index of the next packet of the same flow in the item
    array. All packets of the same flow are chained by next_pkt_index.
    With next_pkt_index, we can locate all packets of the same flow
    one by one.

To process an incoming packet, we need three steps:
a. check if the packet should be processed. Packets with the following
    properties won't be processed:
	- packets without data;
	- packets with wrong checksums;
	- fragmented packets.
b. traverse the flow array to find a flow which the packet belongs to.
    If no such flow is found, insert a new flow and store the packet into
    the item array.
c. locate the first packet of this flow in the item array via
    start_index. Then traverse all packets of this flow one by one via
    next_pkt_index. If a packet that can be merged with the incoming
    packet is found, merge them without updating checksums. If not,
    allocate one item in the item array to store the incoming packet and
    update the next_pkt_index value.

For better performance, we don't update header checksums once two
packets are merged. The header checksums are updated only when packets
are flushed from TCP reassembly tables.

Signed-off-by: Jiayu Hu <jiayu.hu at intel.com>
---
 lib/librte_gro/Makefile      |   1 +
 lib/librte_gro/rte_gro.c     | 150 +++++++++++--
 lib/librte_gro/rte_gro.h     |  34 +--
 lib/librte_gro/rte_gro_tcp.c | 509 +++++++++++++++++++++++++++++++++++++++++++
 lib/librte_gro/rte_gro_tcp.h | 206 +++++++++++++++++
 5 files changed, 869 insertions(+), 31 deletions(-)
 create mode 100644 lib/librte_gro/rte_gro_tcp.c
 create mode 100644 lib/librte_gro/rte_gro_tcp.h

diff --git a/lib/librte_gro/Makefile b/lib/librte_gro/Makefile
index 9f4063a..3495dfc 100644
--- a/lib/librte_gro/Makefile
+++ b/lib/librte_gro/Makefile
@@ -43,6 +43,7 @@ LIBABIVER := 1
 
 # source files
 SRCS-$(CONFIG_RTE_LIBRTE_GRO) += rte_gro.c
+SRCS-$(CONFIG_RTE_LIBRTE_GRO) += rte_gro_tcp.c
 
 # install this header file
 SYMLINK-$(CONFIG_RTE_LIBRTE_GRO)-include += rte_gro.h
diff --git a/lib/librte_gro/rte_gro.c b/lib/librte_gro/rte_gro.c
index ca6b0d2..f2defbd 100644
--- a/lib/librte_gro/rte_gro.c
+++ b/lib/librte_gro/rte_gro.c
@@ -31,11 +31,17 @@
 
 #include <rte_malloc.h>
 #include <rte_mbuf.h>
+#include <rte_ethdev.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
 
 #include "rte_gro.h"
+#include "rte_gro_tcp.h"
 
-static gro_tbl_create_fn tbl_create_functions[GRO_TYPE_MAX_NB];
-static gro_tbl_destroy_fn tbl_destroy_functions[GRO_TYPE_MAX_NB];
+static gro_tbl_create_fn tbl_create_functions[GRO_TYPE_MAX_NB] = {
+	gro_tcp_tbl_create, NULL};
+static gro_tbl_destroy_fn tbl_destroy_functions[GRO_TYPE_MAX_NB] = {
+	gro_tcp_tbl_destroy, NULL};
 
 struct rte_gro_tbl *rte_gro_tbl_create(uint16_t socket_id,
 		uint16_t max_flow_num,
@@ -93,33 +99,145 @@ void rte_gro_tbl_destroy(struct rte_gro_tbl *gro_tbl)
 }
 
 uint16_t
-rte_gro_reassemble_burst(struct rte_mbuf **pkts __rte_unused,
+rte_gro_reassemble_burst(struct rte_mbuf **pkts,
 		const uint16_t nb_pkts,
-		const struct rte_gro_param param __rte_unused)
+		const struct rte_gro_param param)
 {
-	return nb_pkts;
+	struct ether_hdr *eth_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	uint16_t l3proc_type, i;
+	uint16_t nb_after_gro = nb_pkts;
+	const uint64_t item_num = nb_pkts >
+		param.max_flow_num * param.max_item_per_flow ?
+		param.max_flow_num * param.max_item_per_flow :
+		nb_pkts;
+	const uint32_t flow_num = nb_pkts > param.max_flow_num ?
+		param.max_flow_num : nb_pkts;
+
+	/* allocate respective GRO tables for all supported GRO types */
+	struct gro_tcp_tbl tcp_tbl;
+	struct gro_tcp_flow tcp_flows[flow_num];
+	struct gro_tcp_item tcp_items[item_num];
+	struct gro_tcp_rule tcp_rule;
+
+	struct rte_mbuf *unprocess_pkts[nb_pkts];
+	uint16_t unprocess_num = 0;
+	int32_t ret;
+
+	if (unlikely(nb_pkts <= 1))
+		return nb_pkts;
+
+	memset(tcp_flows, 0, sizeof(struct gro_tcp_flow) *
+			flow_num);
+	memset(tcp_items, 0, sizeof(struct gro_tcp_item) *
+			item_num);
+	tcp_tbl.flows = tcp_flows;
+	tcp_tbl.items = tcp_items;
+	tcp_tbl.flow_num = 0;
+	tcp_tbl.item_num = 0;
+	tcp_tbl.max_flow_num = flow_num;
+	tcp_tbl.max_item_num = item_num;
+	tcp_rule.max_packet_size = param.max_packet_size;
+
+	for (i = 0; i < nb_pkts; i++) {
+		eth_hdr = rte_pktmbuf_mtod(pkts[i], struct ether_hdr *);
+		l3proc_type = rte_be_to_cpu_16(eth_hdr->ether_type);
+		if (l3proc_type == ETHER_TYPE_IPv4) {
+			ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+			if (ipv4_hdr->next_proto_id == IPPROTO_TCP &&
+					(param.desired_gro_types &
+					 GRO_TCP_IPV4)) {
+				ret = gro_tcp4_reassemble(pkts[i],
+						&tcp_tbl,
+						&tcp_rule);
+				if (ret > 0)
+					nb_after_gro--;
+				else if (ret < 0)
+					unprocess_pkts[unprocess_num++] =
+						pkts[i];
+			} else
+				unprocess_pkts[unprocess_num++] =
+					pkts[i];
+		} else
+			unprocess_pkts[unprocess_num++] =
+				pkts[i];
+	}
+
+	if (nb_after_gro < nb_pkts) {
+		/* update packets headers and re-arrange GROed packets */
+		if (param.desired_gro_types & GRO_TCP_IPV4) {
+			gro_tcp4_tbl_cksum_update(&tcp_tbl);
+			for (i = 0; i < tcp_tbl.item_num; i++)
+				pkts[i] = tcp_tbl.items[i].pkt;
+		}
+		if (unprocess_num > 0) {
+			memcpy(&pkts[i], unprocess_pkts,
+					sizeof(struct rte_mbuf *) *
+					unprocess_num);
+			i += unprocess_num;
+		}
+		if (nb_pkts > i)
+			memset(&pkts[i], 0,
+					sizeof(struct rte_mbuf *) *
+					(nb_pkts - i));
+	}
+	return nb_after_gro;
 }
 
-int rte_gro_reassemble(struct rte_mbuf *pkt __rte_unused,
-		struct rte_gro_tbl *gro_tbl __rte_unused)
+int rte_gro_reassemble(struct rte_mbuf *pkt,
+		struct rte_gro_tbl *gro_tbl)
 {
+	struct ether_hdr *eth_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	uint16_t l3proc_type;
+	struct gro_tcp_rule tcp_rule;
+
+	if (pkt == NULL)
+		return -1;
+	tcp_rule.max_packet_size = gro_tbl->max_packet_size;
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	l3proc_type = rte_be_to_cpu_16(eth_hdr->ether_type);
+	if (l3proc_type == ETHER_TYPE_IPv4) {
+		ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+		if (ipv4_hdr->next_proto_id == IPPROTO_TCP &&
+				(gro_tbl->desired_gro_types & GRO_TCP_IPV4)) {
+			return gro_tcp4_reassemble(pkt,
+					gro_tbl->tbls[GRO_TCP_IPV4_INDEX],
+					&tcp_rule);
+		}
+	}
 	return -1;
 }
 
-uint16_t rte_gro_flush(struct rte_gro_tbl *gro_tbl __rte_unused,
-		uint64_t desired_gro_types __rte_unused,
-		uint16_t flush_num __rte_unused,
-		struct rte_mbuf **out __rte_unused,
-		const uint16_t max_nb_out __rte_unused)
+uint16_t rte_gro_flush(struct rte_gro_tbl *gro_tbl,
+		uint64_t desired_gro_types,
+		uint16_t flush_num,
+		struct rte_mbuf **out,
+		const uint16_t max_nb_out)
 {
+	desired_gro_types = desired_gro_types &
+		gro_tbl->desired_gro_types;
+	if (desired_gro_types & GRO_TCP_IPV4)
+		return gro_tcp_tbl_flush(
+				gro_tbl->tbls[GRO_TCP_IPV4_INDEX],
+				flush_num,
+				out,
+				max_nb_out);
 	return 0;
 }
 
 uint16_t
-rte_gro_timeout_flush(struct rte_gro_tbl *gro_tbl __rte_unused,
-		uint64_t desired_gro_types __rte_unused,
-		struct rte_mbuf **out __rte_unused,
-		const uint16_t max_nb_out __rte_unused)
+rte_gro_timeout_flush(struct rte_gro_tbl *gro_tbl,
+		uint64_t desired_gro_types,
+		struct rte_mbuf **out,
+		const uint16_t max_nb_out)
 {
+	desired_gro_types = desired_gro_types &
+		gro_tbl->desired_gro_types;
+	if (desired_gro_types & GRO_TCP_IPV4)
+		return gro_tcp_tbl_timeout_flush(
+				gro_tbl->tbls[GRO_TCP_IPV4_INDEX],
+				gro_tbl->max_timeout_cycles,
+				out, max_nb_out);
 	return 0;
 }
diff --git a/lib/librte_gro/rte_gro.h b/lib/librte_gro/rte_gro.h
index 7fe11a6..24e3e34 100644
--- a/lib/librte_gro/rte_gro.h
+++ b/lib/librte_gro/rte_gro.h
@@ -34,7 +34,11 @@
 
 /* maximum number of supported GRO types */
 #define GRO_TYPE_MAX_NB 64
-#define GRO_TYPE_SUPPORT_NB 0	/**< current supported GRO num */
+#define GRO_TYPE_SUPPORT_NB 1	/**< supported GRO types number */
+
+/* TCP/IPv4 GRO flag */
+#define GRO_TCP_IPV4_INDEX 0
+#define GRO_TCP_IPV4 (1ULL << GRO_TCP_IPV4_INDEX)
 
 /**
  * GRO table structure. DPDK GRO uses GRO table to reassemble
@@ -138,9 +142,9 @@ void rte_gro_tbl_destroy(struct rte_gro_tbl *gro_tbl);
  * @return
  *  the number of packets after GROed.
  */
-uint16_t rte_gro_reassemble_burst(struct rte_mbuf **pkts __rte_unused,
-		const uint16_t nb_pkts __rte_unused,
-		const struct rte_gro_param param __rte_unused);
+uint16_t rte_gro_reassemble_burst(struct rte_mbuf **pkts,
+		const uint16_t nb_pkts,
+		const struct rte_gro_param param);
 
 /**
  * This is the main reassembly API used in heavyweight mode, which
@@ -163,8 +167,8 @@ uint16_t rte_gro_reassemble_burst(struct rte_mbuf **pkts __rte_unused,
  *  if merge the packet successfully, return a positive value. If fail
  *  to merge, return zero. If errors happen, return a negative value.
  */
-int rte_gro_reassemble(struct rte_mbuf *pkt __rte_unused,
-		struct rte_gro_tbl *gro_tbl __rte_unused);
+int rte_gro_reassemble(struct rte_mbuf *pkt,
+		struct rte_gro_tbl *gro_tbl);
 
 /**
  * This function flushed packets of desired GRO types from their
@@ -183,11 +187,11 @@ int rte_gro_reassemble(struct rte_mbuf *pkt __rte_unused,
  * @return
  *  the number of flushed packets. If no packets are flushed, return 0.
  */
-uint16_t rte_gro_flush(struct rte_gro_tbl *gro_tbl __rte_unused,
-		uint64_t desired_gro_types __rte_unused,
-		uint16_t flush_num __rte_unused,
-		struct rte_mbuf **out __rte_unused,
-		const uint16_t max_nb_out __rte_unused);
+uint16_t rte_gro_flush(struct rte_gro_tbl *gro_tbl,
+		uint64_t desired_gro_types,
+		uint16_t flush_num,
+		struct rte_mbuf **out,
+		const uint16_t max_nb_out);
 
 /**
  * This function flushes the timeout packets from reassembly tables of
@@ -205,8 +209,8 @@ uint16_t rte_gro_flush(struct rte_gro_tbl *gro_tbl __rte_unused,
  * @return
  *  the number of flushed packets. If no packets are flushed, return 0.
  */
-uint16_t rte_gro_timeout_flush(struct rte_gro_tbl *gro_tbl __rte_unused,
-		uint64_t desired_gro_types __rte_unused,
-		struct rte_mbuf **out __rte_unused,
-		const uint16_t max_nb_out __rte_unused);
+uint16_t rte_gro_timeout_flush(struct rte_gro_tbl *gro_tbl,
+		uint64_t desired_gro_types,
+		struct rte_mbuf **out,
+		const uint16_t max_nb_out);
 #endif
diff --git a/lib/librte_gro/rte_gro_tcp.c b/lib/librte_gro/rte_gro_tcp.c
new file mode 100644
index 0000000..15f28f4
--- /dev/null
+++ b/lib/librte_gro/rte_gro_tcp.c
@@ -0,0 +1,509 @@
+/*-
+ *
+ *   Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_cycles.h>
+#include <rte_ethdev.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "rte_gro_tcp.h"
+
+/**
+ * Create a TCP reassembly table.
+ *
+ * The table consists of a flow array with max_flow_num entries and an
+ * item array with max_flow_num * max_item_per_flow entries. All memory
+ * is zero-initialized and requested for the given socket.
+ *
+ * @param socket_id
+ *  socket passed to rte_zmalloc_socket() for every allocation.
+ * @param max_flow_num
+ *  number of entries in the flow array.
+ * @param max_item_per_flow
+ *  maximum number of packets per flow.
+ * @return
+ *  a pointer to the new table, or NULL if one of the sizes is zero or
+ *  if any allocation fails (nothing is leaked in that case).
+ */
+void *gro_tcp_tbl_create(uint16_t socket_id,
+		uint16_t max_flow_num,
+		uint16_t max_item_per_flow)
+{
+	size_t size;
+	uint32_t entries_num;
+	struct gro_tcp_tbl *tbl;
+
+	/*
+	 * uint16_t operands are promoted to (signed) int before the
+	 * multiplication; widen one of them first so that the product
+	 * is computed as an unsigned 32-bit value.
+	 */
+	entries_num = (uint32_t)max_flow_num * max_item_per_flow;
+	if (entries_num == 0)
+		return NULL;
+
+	tbl = (struct gro_tcp_tbl *)rte_zmalloc_socket(
+			__func__,
+			sizeof(struct gro_tcp_tbl),
+			RTE_CACHE_LINE_SIZE,
+			socket_id);
+	if (tbl == NULL)
+		return NULL;
+
+	size = sizeof(struct gro_tcp_item) * entries_num;
+	tbl->items = (struct gro_tcp_item *)rte_zmalloc_socket(
+			__func__,
+			size,
+			RTE_CACHE_LINE_SIZE,
+			socket_id);
+	if (tbl->items == NULL)
+		goto free_tbl;
+	tbl->max_item_num = entries_num;
+
+	size = sizeof(struct gro_tcp_flow) * max_flow_num;
+	tbl->flows = (struct gro_tcp_flow *)rte_zmalloc_socket(
+			__func__,
+			size, RTE_CACHE_LINE_SIZE,
+			socket_id);
+	if (tbl->flows == NULL)
+		goto free_items;
+	tbl->max_flow_num = max_flow_num;
+	return tbl;
+
+free_items:
+	rte_free(tbl->items);
+free_tbl:
+	rte_free(tbl);
+	return NULL;
+}
+
+/**
+ * Release a TCP reassembly table together with its item and flow
+ * arrays. A NULL table is accepted and ignored.
+ */
+void gro_tcp_tbl_destroy(void *tbl)
+{
+	struct gro_tcp_tbl *table = tbl;
+
+	if (table == NULL)
+		return;
+
+	/* rte_free() does nothing for NULL, so the arrays need no guard */
+	rte_free(table->items);
+	rte_free(table->flows);
+	rte_free(table);
+}
+
+/**
+ * Recompute the TCP checksum and the IPv4 header checksum of a packet
+ * and write them into its headers.
+ *
+ * The packet is read as Ethernet header + IPv4 header + TCP header.
+ * A NULL packet is ignored. If rte_raw_cksum_mbuf() reports an error,
+ * the function returns after both checksum fields have already been
+ * set to zero.
+ */
+static void
+gro_tcp4_cksum_update(struct rte_mbuf *pkt)
+{
+	uint32_t len, offset, cksum;
+	struct ether_hdr *eth_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	struct tcp_hdr *tcp_hdr;
+	uint16_t ipv4_ihl, cksum_pld;
+
+	if (pkt == NULL)
+		return;
+
+	len = pkt->pkt_len;
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	ipv4_ihl = IPv4_HDR_LEN(ipv4_hdr);
+	tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + ipv4_ihl);
+
+	/* offset/len select everything after the IPv4 header */
+	offset = sizeof(struct ether_hdr) + ipv4_ihl;
+	len -= offset;
+
+	/* TCP cksum without IP pseudo header */
+	/* both checksum fields are cleared before anything is summed */
+	ipv4_hdr->hdr_checksum = 0;
+	tcp_hdr->cksum = 0;
+	if (rte_raw_cksum_mbuf(pkt, offset, len, &cksum_pld) < 0) {
+		printf("invalid param for raw_cksum_mbuf\n");
+		return;
+	}
+	/* IP pseudo header cksum */
+	cksum = cksum_pld;
+	cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0);
+
+	/* combine TCP checksum and IP pseudo header checksum */
+	/* fold the carry into 16 bits, complement, and map 0 to 0xffff */
+	cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
+	cksum = (~cksum) & 0xffff;
+	cksum = (cksum == 0) ? 0xffff : cksum;
+	tcp_hdr->cksum = cksum;
+
+	/* update IP header cksum */
+	ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
+}
+
+/**
+ * Recompute the header checksums of every merged packet found in the
+ * first tbl->item_num slots of the item array.
+ */
+void gro_tcp4_tbl_cksum_update(struct gro_tcp_tbl *tbl)
+{
+	uint64_t idx = 0;
+
+	while (idx < tbl->item_num) {
+		struct gro_tcp_item *item = &tbl->items[idx];
+
+		/* packets that were never merged keep their checksums */
+		if (item->is_groed)
+			gro_tcp4_cksum_update(item->pkt);
+		idx++;
+	}
+}
+
+/**
+ * Append the TCP payload of pkt to pkt_src without updating header
+ * checksums.
+ *
+ * The Ethernet, IPv4 and TCP headers of pkt are stripped, pkt is chained
+ * behind the last segment of pkt_src, and the IPv4 total length and the
+ * mbuf metadata of pkt_src are enlarged accordingly.
+ *
+ * @param pkt_src
+ *  the packet already stored in the table; it receives the payload.
+ * @param pkt
+ *  the incoming packet whose payload is appended.
+ * @param rule
+ *  reassembly rule; only max_packet_size is consulted.
+ * @return
+ *  1 if the packets are merged. -1 if the merged packet would exceed
+ *  rule->max_packet_size or if the headers of pkt cannot be stripped;
+ *  pkt_src is left unmodified in that case.
+ */
+static int
+merge_two_tcp4_packets(struct rte_mbuf *pkt_src,
+		struct rte_mbuf *pkt,
+		struct gro_tcp_rule *rule)
+{
+	struct ipv4_hdr *ipv4_hdr1, *ipv4_hdr2;
+	struct tcp_hdr *tcp_hdr1;
+	uint16_t ipv4_ihl1, tcp_hl1, tcp_dl1;
+	struct rte_mbuf *tail;
+
+	/* parse the given packet */
+	ipv4_hdr1 = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt,
+				struct ether_hdr *) + 1);
+	ipv4_ihl1 = IPv4_HDR_LEN(ipv4_hdr1);
+	tcp_hdr1 = (struct tcp_hdr *)((char *)ipv4_hdr1 + ipv4_ihl1);
+	tcp_hl1 = TCP_HDR_LEN(tcp_hdr1);
+	tcp_dl1 = rte_be_to_cpu_16(ipv4_hdr1->total_length) - ipv4_ihl1
+		- tcp_hl1;
+
+	/* parse the original packet */
+	ipv4_hdr2 = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt_src,
+				struct ether_hdr *) + 1);
+
+	/* check reassembly rules */
+	if (pkt_src->pkt_len + tcp_dl1 > rule->max_packet_size)
+		return -1;
+
+	/*
+	 * remove the header of the incoming packet; give up if the
+	 * headers are not fully contained in its first segment.
+	 */
+	if (rte_pktmbuf_adj(pkt, sizeof(struct ether_hdr) +
+				ipv4_ihl1 + tcp_hl1) == NULL)
+		return -1;
+
+	/* chain the two packet together */
+	tail = rte_pktmbuf_lastseg(pkt_src);
+	tail->next = pkt;
+
+	/* update IP header */
+	ipv4_hdr2->total_length = rte_cpu_to_be_16(
+			rte_be_to_cpu_16(
+				ipv4_hdr2->total_length)
+			+ tcp_dl1);
+
+	/*
+	 * update mbuf metadata for the merged packet; the incoming
+	 * packet may itself consist of several segments, so add its
+	 * whole segment count instead of one.
+	 */
+	pkt_src->nb_segs += pkt->nb_segs;
+	pkt_src->pkt_len += pkt->pkt_len;
+	return 1;
+}
+
+/**
+ * Check whether an incoming TCP segment directly follows a stored
+ * packet and carries the same TCP options.
+ *
+ * @param pkt
+ *  the packet already stored in the reassembly table.
+ * @param tcp_hdr
+ *  TCP header of the incoming packet.
+ * @param tcp_hl
+ *  TCP header length of the incoming packet, in bytes.
+ * @return
+ *  1 if the sequence number of the incoming packet equals the sequence
+ *  number of the stored packet plus its payload length, both TCP
+ *  headers have the same length, and their option bytes (if any) are
+ *  identical. Otherwise -1.
+ */
+static int
+check_seq_option(struct rte_mbuf *pkt,
+		struct tcp_hdr *tcp_hdr,
+		uint16_t tcp_hl)
+{
+	struct ipv4_hdr *ipv4_hdr1;
+	struct tcp_hdr *tcp_hdr1;
+	uint16_t ipv4_ihl1, tcp_hl1, tcp_dl1;
+	uint32_t sent_seq1, sent_seq;
+	size_t opt_len;
+
+	ipv4_hdr1 = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt,
+				struct ether_hdr *) + 1);
+	ipv4_ihl1 = IPv4_HDR_LEN(ipv4_hdr1);
+	tcp_hdr1 = (struct tcp_hdr *)((char *)ipv4_hdr1 + ipv4_ihl1);
+	tcp_hl1 = TCP_HDR_LEN(tcp_hdr1);
+	tcp_dl1 = rte_be_to_cpu_16(ipv4_hdr1->total_length) - ipv4_ihl1
+		- tcp_hl1;
+	/* first sequence number after the data of the stored packet */
+	sent_seq1 = rte_be_to_cpu_32(tcp_hdr1->sent_seq) + tcp_dl1;
+	sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+	/* the two packets must be neighbors */
+	if (sent_seq != sent_seq1)
+		return -1;
+
+	/* headers of different length cannot carry equal options */
+	if (tcp_hl1 != tcp_hl || tcp_hl < sizeof(struct tcp_hdr))
+		return -1;
+
+	/* option bytes, when present, must match exactly */
+	opt_len = tcp_hl - sizeof(struct tcp_hdr);
+	if (opt_len > 0 &&
+			memcmp(tcp_hdr1 + 1, tcp_hdr + 1, opt_len) != 0)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * Return the lowest index of an item slot that is not in use, or
+ * INVALID_ITEM_INDEX if every slot of the item array is taken.
+ */
+static uint32_t
+find_an_empty_item(struct gro_tcp_tbl *tbl)
+{
+	uint32_t slot = 0;
+
+	while (slot < tbl->max_item_num) {
+		if (!tbl->items[slot].is_valid)
+			return slot;
+		slot++;
+	}
+	return INVALID_ITEM_INDEX;
+}
+
+/**
+ * Return the lowest index of a flow slot that is not in use, or
+ * INVALID_FLOW_INDEX if every slot of the flow array is taken.
+ */
+static uint16_t
+find_an_empty_flow(struct gro_tcp_tbl *tbl)
+{
+	uint16_t slot = 0;
+
+	while (slot < tbl->max_flow_num) {
+		if (!tbl->flows[slot].is_valid)
+			return slot;
+		slot++;
+	}
+	return INVALID_FLOW_INDEX;
+}
+
+/* record pkt in the free slot item_idx as a new, not yet merged item */
+static void
+gro_tcp_store_pkt(struct gro_tcp_tbl *tbl,
+		uint32_t item_idx,
+		struct rte_mbuf *pkt)
+{
+	tbl->items[item_idx].pkt = pkt;
+	tbl->items[item_idx].next_pkt_idx = INVALID_ITEM_INDEX;
+	tbl->items[item_idx].is_groed = 0;
+	tbl->items[item_idx].is_valid = 1;
+	tbl->items[item_idx].start_time = rte_rdtsc();
+	tbl->item_num++;
+}
+
+/**
+ * Try to merge a TCP/IPv4 packet with a packet stored in the reassembly
+ * table; if that is not possible, store the packet in the table.
+ *
+ * Header checksums of merged packets are not recalculated here.
+ *
+ * @param pkt
+ *  packet to reassemble; read as Ethernet + IPv4 + TCP.
+ * @param tbl
+ *  TCP reassembly table.
+ * @param rule
+ *  reassembly rule handed to merge_two_tcp4_packets().
+ * @return
+ *  1 if the packet is merged into a stored packet, 0 if it is stored in
+ *  the table as a new item, and -1 if it is not processed (malformed or
+ *  unsupported headers, no payload, DF flag not set, bad checksum, or
+ *  no free flow/item slot). A packet for which -1 is returned is left
+ *  unmodified and is not referenced by the table.
+ */
+int32_t
+gro_tcp4_reassemble(struct rte_mbuf *pkt,
+		struct gro_tcp_tbl *tbl,
+		struct gro_tcp_rule *rule)
+{
+	struct ether_hdr *eth_hdr;
+	struct ipv4_hdr *ipv4_hdr;
+	struct tcp_hdr *tcp_hdr;
+	uint16_t ipv4_ihl, tcp_hl, ip_len, tcp_cksum, ip_cksum, cksum;
+
+	struct gro_tcp_flow_key key;
+	uint64_t ol_flags;
+	uint32_t cur_idx, tail_idx, item_idx;
+	uint16_t i, flow_idx;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	ipv4_ihl = IPv4_HDR_LEN(ipv4_hdr);
+
+	/* 1. check if the packet should be processed */
+	if (ipv4_ihl < sizeof(struct ipv4_hdr))
+		return -1;
+	if (ipv4_hdr->next_proto_id != IPPROTO_TCP)
+		return -1;
+	/* only packets carrying the DF flag are accepted */
+	if ((ipv4_hdr->fragment_offset &
+				rte_cpu_to_be_16(IPV4_HDR_DF_MASK))
+			== 0)
+		return -1;
+
+	tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + ipv4_ihl);
+	tcp_hl = TCP_HDR_LEN(tcp_hdr);
+	ip_len = rte_be_to_cpu_16(ipv4_hdr->total_length);
+	/*
+	 * reject inconsistent header lengths (the payload length would
+	 * underflow) as well as packets without payload.
+	 */
+	if (tcp_hl < sizeof(struct tcp_hdr) ||
+			ip_len <= ipv4_ihl + tcp_hl)
+		return -1;
+
+	/**
+	 * 2. if HW rx checksum offload isn't enabled, recalculate the
+	 * checksum in SW. Then, check if the checksum is correct. The
+	 * checksum fields are always restored, so a rejected packet is
+	 * handed back exactly as it was received.
+	 */
+	ol_flags = pkt->ol_flags;
+	if ((ol_flags & PKT_RX_IP_CKSUM_MASK) !=
+			PKT_RX_IP_CKSUM_UNKNOWN) {
+		/* compare only the IP checksum bits of ol_flags */
+		if ((ol_flags & PKT_RX_IP_CKSUM_MASK) ==
+				PKT_RX_IP_CKSUM_BAD)
+			return -1;
+	} else {
+		ip_cksum = ipv4_hdr->hdr_checksum;
+		ipv4_hdr->hdr_checksum = 0;
+		cksum = rte_ipv4_cksum(ipv4_hdr);
+		ipv4_hdr->hdr_checksum = ip_cksum;
+		if (cksum != ip_cksum)
+			return -1;
+	}
+
+	if ((ol_flags & PKT_RX_L4_CKSUM_MASK) !=
+			PKT_RX_L4_CKSUM_UNKNOWN) {
+		/* compare only the L4 checksum bits of ol_flags */
+		if ((ol_flags & PKT_RX_L4_CKSUM_MASK) ==
+				PKT_RX_L4_CKSUM_BAD)
+			return -1;
+	} else {
+		tcp_cksum = tcp_hdr->cksum;
+		tcp_hdr->cksum = 0;
+		cksum = rte_ipv4_udptcp_cksum(ipv4_hdr, tcp_hdr);
+		tcp_hdr->cksum = tcp_cksum;
+		if (cksum != tcp_cksum)
+			return -1;
+	}
+
+	/**
+	 * 3. search for a flow and traverse all packets in the flow
+	 * to find one to merge with the given packet.
+	 *
+	 * Keys are compared with memcmp(), so padding bytes and the
+	 * address words that IPv4 does not use must be zeroed first.
+	 */
+	memset(&key, 0, sizeof(key));
+	key.eth_saddr = eth_hdr->s_addr;
+	key.eth_daddr = eth_hdr->d_addr;
+	key.ip_src_addr[0] = rte_be_to_cpu_32(ipv4_hdr->src_addr);
+	key.ip_dst_addr[0] = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+	key.src_port = rte_be_to_cpu_16(tcp_hdr->src_port);
+	key.dst_port = rte_be_to_cpu_16(tcp_hdr->dst_port);
+	key.recv_ack = rte_be_to_cpu_32(tcp_hdr->recv_ack);
+	key.tcp_flags = tcp_hdr->tcp_flags;
+
+	for (i = 0; i < tbl->max_flow_num; i++) {
+		if (!tbl->flows[i].is_valid ||
+				memcmp(&(tbl->flows[i].key), &key,
+					sizeof(struct gro_tcp_flow_key))
+				!= 0)
+			continue;
+
+		/* search all packets of the matching flow */
+		cur_idx = tbl->flows[i].start_index;
+		tail_idx = cur_idx;
+		while (cur_idx != INVALID_ITEM_INDEX) {
+			if (check_seq_option(tbl->items[cur_idx].pkt,
+						tcp_hdr,
+						tcp_hl) > 0) {
+				if (merge_two_tcp4_packets(
+							tbl->items[cur_idx].pkt,
+							pkt,
+							rule) > 0) {
+					/* successfully merge two packets */
+					tbl->items[cur_idx].is_groed = 1;
+					return 1;
+				}
+				/**
+				 * the neighbor cannot take more data;
+				 * keep the packet as a separate item.
+				 */
+				break;
+			}
+			tail_idx = cur_idx;
+			cur_idx = tbl->items[cur_idx].next_pkt_idx;
+		}
+
+		/**
+		 * fail to merge the given packet into an existed flow,
+		 * add it into the flow.
+		 */
+		item_idx = find_an_empty_item(tbl);
+		/* the item number is beyond the maximum value */
+		if (item_idx == INVALID_ITEM_INDEX)
+			return -1;
+
+		if (tail_idx == INVALID_ITEM_INDEX) {
+			/* the flow holds no packet yet */
+			tbl->flows[i].start_index = item_idx;
+		} else {
+			/**
+			 * the search may have stopped in the middle of
+			 * the chain; link behind the real tail so that
+			 * no stored packet is cut off.
+			 */
+			while (tbl->items[tail_idx].next_pkt_idx !=
+					INVALID_ITEM_INDEX)
+				tail_idx = tbl->items[tail_idx].next_pkt_idx;
+			tbl->items[tail_idx].next_pkt_idx = item_idx;
+		}
+		gro_tcp_store_pkt(tbl, item_idx, pkt);
+		return 0;
+	}
+
+	/**
+	 * merge fail as the given packet is a new flow. Therefore,
+	 * insert a new flow.
+	 */
+	item_idx = find_an_empty_item(tbl);
+	flow_idx = find_an_empty_flow(tbl);
+	/**
+	 * if the flow or item number are beyond the maximum values,
+	 * the inputted packet won't be processed.
+	 */
+	if (item_idx == INVALID_ITEM_INDEX ||
+			flow_idx == INVALID_FLOW_INDEX)
+		return -1;
+	gro_tcp_store_pkt(tbl, item_idx, pkt);
+
+	memcpy(&(tbl->flows[flow_idx].key),
+			&key, sizeof(struct gro_tcp_flow_key));
+	tbl->flows[flow_idx].start_index = item_idx;
+	tbl->flows[flow_idx].is_valid = 1;
+	tbl->flow_num++;
+
+	return 0;
+}
+
+/**
+ * Move packets out of the reassembly table into the out array.
+ *
+ * At most min(tbl->item_num, flush_num, nb_out) packets are taken,
+ * walking the flows in array order and each flow's chain from its
+ * start_index. Merged packets get their checksums updated before they
+ * are written to out. A flow whose chain is fully drained is marked
+ * invalid; a partly drained flow keeps the remaining chain.
+ *
+ * Returns the computed limit described above (0 if nothing is to be
+ * flushed).
+ */
+uint16_t gro_tcp_tbl_flush(struct gro_tcp_tbl *tbl,
+		uint16_t flush_num,
+		struct rte_mbuf **out,
+		const uint16_t nb_out)
+{
+	uint16_t num, k;
+	uint16_t i;
+	uint32_t j;
+
+	k = 0;
+	/* num = min(item_num, flush_num, nb_out) */
+	num = tbl->item_num > flush_num ? flush_num : tbl->item_num;
+	num = num > nb_out ? nb_out : num;
+	if (num == 0)
+		return 0;
+
+	for (i = 0; i < tbl->max_flow_num; i++) {
+		if (tbl->flows[i].is_valid) {
+			j = tbl->flows[i].start_index;
+			while (j != INVALID_ITEM_INDEX) {
+				/* update checksum for GROed packet */
+				if (tbl->items[j].is_groed)
+					gro_tcp4_cksum_update(tbl->items[j].pkt);
+
+				out[k++] = tbl->items[j].pkt;
+				tbl->items[j].is_valid = 0;
+				tbl->item_num--;
+				/* j now refers to the next packet of the flow */
+				j = tbl->items[j].next_pkt_idx;
+
+				if (k == num) {
+					/* delete the flow */
+					if (j == INVALID_ITEM_INDEX) {
+						tbl->flows[i].is_valid = 0;
+						tbl->flow_num--;
+					} else
+						/* update flow information */
+						tbl->flows[i].start_index = j;
+					goto end;
+				}
+			}
+			/* delete the flow, as all of its packets are flushed */
+			tbl->flows[i].is_valid = 0;
+			tbl->flow_num--;
+		}
+	}
+end:
+	return num;
+}
+
+/**
+ * Move timed-out packets out of the reassembly table into the out
+ * array.
+ *
+ * A packet has timed out when at least timeout_cycles TSC cycles have
+ * passed since it was stored. Timed-out packets are unlinked from their
+ * flow, merged ones get their checksums updated, and they are written
+ * to out. Packets that have not timed out stay chained in their flow; a
+ * flow is marked invalid only when its chain becomes empty.
+ *
+ * @param tbl
+ *  TCP reassembly table.
+ * @param timeout_cycles
+ *  maximum time, in TSC cycles, a packet may stay in the table.
+ * @param out
+ *  array receiving the flushed packets.
+ * @param nb_out
+ *  capacity of out.
+ * @return
+ *  the number of packets written to out (at most nb_out).
+ */
+uint16_t
+gro_tcp_tbl_timeout_flush(struct gro_tcp_tbl *tbl,
+		uint64_t timeout_cycles,
+		struct rte_mbuf **out,
+		const uint16_t nb_out)
+{
+	uint16_t k;
+	uint16_t i;
+	uint32_t j, prev, next;
+	uint64_t current_time;
+
+	if (nb_out == 0)
+		return 0;
+	k = 0;
+	current_time = rte_rdtsc();
+
+	for (i = 0; i < tbl->max_flow_num && k < nb_out; i++) {
+		if (!tbl->flows[i].is_valid)
+			continue;
+
+		/* prev is the last packet kept in the chain so far */
+		prev = INVALID_ITEM_INDEX;
+		j = tbl->flows[i].start_index;
+		while (j != INVALID_ITEM_INDEX && k < nb_out) {
+			/* always advance, whether or not j is flushed */
+			next = tbl->items[j].next_pkt_idx;
+
+			if (current_time - tbl->items[j].start_time >=
+					timeout_cycles) {
+				/* update checksum for GROed packet */
+				if (tbl->items[j].is_groed)
+					gro_tcp4_cksum_update(tbl->items[j].pkt);
+
+				out[k++] = tbl->items[j].pkt;
+				tbl->items[j].is_valid = 0;
+				tbl->item_num--;
+
+				/* unlink the flushed packet from the chain */
+				if (prev == INVALID_ITEM_INDEX)
+					tbl->flows[i].start_index = next;
+				else
+					tbl->items[prev].next_pkt_idx = next;
+			} else
+				prev = j;
+
+			j = next;
+		}
+
+		/* delete the flow only if no packet is left in it */
+		if (tbl->flows[i].start_index == INVALID_ITEM_INDEX) {
+			tbl->flows[i].is_valid = 0;
+			tbl->flow_num--;
+		}
+	}
+	return k;
+}
diff --git a/lib/librte_gro/rte_gro_tcp.h b/lib/librte_gro/rte_gro_tcp.h
new file mode 100644
index 0000000..76b2107
--- /dev/null
+++ b/lib/librte_gro/rte_gro_tcp.h
@@ -0,0 +1,206 @@
+/*-
+ *
+ *   Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_GRO_TCP_H_
+#define _RTE_GRO_TCP_H_
+
+/*
+ * Header lengths in bytes. Each length field lives inside a single
+ * octet (IPv4: low nibble of version_ihl, TCP: high nibble of
+ * data_off) and counts 32-bit words, so the extraction is the same for
+ * every host byte order.
+ */
+#define TCP_HDR_LEN(tcph) \
+	((((tcph)->data_off) >> 4) * 4)
+#define IPv4_HDR_LEN(iph) \
+	((((iph)->version_ihl) & 0x0f) * 4)
+
+#define IPV4_HDR_DF_SHIFT 14
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+
+#define INVALID_FLOW_INDEX 0xffffUL
+#define INVALID_ITEM_INDEX 0xffffffffULL
+
+/*
+ * criteria of merging packets: packets whose keys are byte-wise equal
+ * belong to the same flow.
+ */
+struct gro_tcp_flow_key {
+	struct ether_addr eth_saddr;
+	struct ether_addr eth_daddr;
+	uint32_t ip_src_addr[4];	/**< IPv4 stores its address in element 0 */
+	uint32_t ip_dst_addr[4];
+
+	uint32_t recv_ack;	/**< acknowledgment sequence number. */
+	uint16_t src_port;
+	uint16_t dst_port;
+	uint8_t tcp_flags;	/**< TCP flags. */
+};
+
+/* one entry of the flow array */
+struct gro_tcp_flow {
+	struct gro_tcp_flow_key key;	/**< identifies the flow */
+	uint32_t start_index;	/**< the first packet index of the flow */
+	uint8_t is_valid;	/**< non-zero if this entry is in use */
+};
+
+/* one entry of the item array; it describes a single stored packet */
+struct gro_tcp_item {
+	struct rte_mbuf *pkt;	/**< packet address. */
+	/* the time (TSC value) when the packet is added into the table */
+	uint64_t start_time;
+	/* index of the next packet of the same flow in the item array */
+	uint32_t next_pkt_idx;	/**< next packet index. */
+	/* flag to indicate if the packet is GROed */
+	uint8_t is_groed;
+	uint8_t is_valid;	/**< flag indicates if the item is valid */
+};
+
+/**
+ * TCP reassembly table. Both TCP/IPv4 and TCP/IPv6 use the same table
+ * structure.
+ *
+ * items and flows point to arrays of max_item_num and max_flow_num
+ * entries; item_num and flow_num count the entries currently in use.
+ */
+struct gro_tcp_tbl {
+	struct gro_tcp_item *items;	/**< item array */
+	struct gro_tcp_flow *flows;	/**< flow array */
+	uint32_t item_num;	/**< current item number */
+	uint16_t flow_num;	/**< current flow num */
+	uint32_t max_item_num;	/**< item array size */
+	uint16_t max_flow_num;	/**< flow array size */
+};
+
+/* rules to reassemble TCP packets, which are decided by applications */
+struct gro_tcp_rule {
+	/*
+	 * the maximum packet length after merged; a merge that would
+	 * make the stored packet longer than this is refused.
+	 */
+	uint32_t max_packet_size;
+};
+
+/**
+ * This function is to update TCP and IPv4 header checksums
+ * for merged packets in the TCP reassembly table.
+ */
+void gro_tcp4_tbl_cksum_update(struct gro_tcp_tbl *tbl);
+
+/**
+ * This function creates a TCP reassembly table.
+ *
+ * @param socket_id
+ *  socket index where the Ethernet port connects to.
+ * @param max_flow_num
+ *  the maximum number of flows in the TCP GRO table
+ * @param max_item_per_flow
+ *  the maximum packet number per flow.
+ * @return
+ *  if create successfully, return a pointer which points to the
+ *  created TCP GRO table. Otherwise, return NULL.
+ */
+void *gro_tcp_tbl_create(uint16_t socket_id,
+		uint16_t max_flow_num,
+		uint16_t max_item_per_flow);
+
+/**
+ * This function destroys a TCP reassembly table.
+ * @param tbl
+ *  a pointer points to the TCP reassembly table.
+ */
+void gro_tcp_tbl_destroy(void *tbl);
+
+/**
+ * This function searches for a packet in the TCP reassembly table to
+ * merge with the inputted one. To merge two packets is to chain them
+ * together and update packet headers. Note that this function won't
+ * re-calculate IPv4 and TCP checksums.
+ *
+ * If the packet doesn't have data, or with wrong checksums, or is
+ * fragmented etc., errors happen and gro_tcp4_reassemble returns
+ * immediately. If no errors happen, the packet is either merged, or
+ * inserted into the reassembly table.
+ *
+ * If applications want to get packets in the reassembly table, they
+ * need to manually flush the packets.
+ *
+ * @param pkt
+ *  packet to reassemble.
+ * @param tbl
+ *  a pointer that points to a TCP reassembly table.
+ * @param rule
+ *  TCP reassembly criteria defined by applications.
+ * @return
+ *  return a positive value if the input packet is merged, zero if it
+ *  is not merged but is stored in the TCP reassembly table, and a
+ *  negative value if an error happens, in which case the packet is
+ *  not inserted into the reassembly table.
+ */
+int32_t
+gro_tcp4_reassemble(struct rte_mbuf *pkt,
+		struct gro_tcp_tbl *tbl,
+		struct gro_tcp_rule *rule);
+
+/**
+ * This function flushes the packets in a TCP reassembly table to
+ * applications. Before returning the packets, it will update TCP and
+ * IPv4 header checksums.
+ *
+ * @param tbl
+ *  a pointer that points to a TCP GRO table.
+ * @param flush_num
+ *  the number of packets that applications want to flush.
+ * @param out
+ *  pointer array which is used to keep flushed packets.
+ * @param nb_out
+ *  the maximum element number of out.
+ * @return
+ *  the number of packets that are flushed finally.
+ */
+uint16_t
+gro_tcp_tbl_flush(struct gro_tcp_tbl *tbl,
+		uint16_t flush_num,
+		struct rte_mbuf **out,
+		const uint16_t nb_out);
+
+/**
+ * This function flushes timeout packets in a TCP reassembly table to
+ * applications. Before returning the packets, it updates TCP and IPv4
+ * header checksums.
+ *
+ * @param tbl
+ *  a pointer that points to a TCP GRO table.
+ * @param timeout_cycles
+ *  the maximum time that packets can stay in the table.
+ * @param out
+ *  pointer array which is used to keep flushed packets.
+ * @param nb_out
+ *  the maximum element number of out.
+ * @return
+ *  It returns the number of packets that are flushed finally.
+ */
+uint16_t
+gro_tcp_tbl_timeout_flush(struct gro_tcp_tbl *tbl,
+		uint64_t timeout_cycles,
+		struct rte_mbuf **out,
+		const uint16_t nb_out);
+#endif
-- 
2.7.4



More information about the dev mailing list