[dpdk-dev] af_packet dev default "framesz" of 2048B

Lam, Tiago tiago.lam at intel.com
Thu Nov 15 20:02:37 CET 2018

Hi guys,

OvS-DPDK has recently had small a change that changed the data room
available in an mbuf (commit dfaf00e in OvS). This seems to have had the
consequence of breaking the initialisation of eth_af_packets interfaces,
when using default values ("options:dpdk-

After investigating, what seems to be happening is that the
eth_af_packet dev expects an available space of "2048B - TPACKET2_HDRLEN
+ sizeof(struct sockaddr_ll) = 2016B" to be available in the data room
of each mbuf.  Previous to the above commit, OvS would allocate some
extra space, and this would mean there would be enough room for the
checks performed in eth_rx_queue_setup() and eth_dev_mtu_set() in
rte_eth_af_packet.c. However, with the recent commit that isn't the case
anymore, and without that extra space the first check in
eth_rx_queue_setup() will now be hit and setup of a eth_af_packet
interface fails.

What I'm trying to understand here is, the logic behind setting a
default 'framesz' of 2048B and it being hardcoded (instead of being
based on the underlying MTU of the interface, or the mbuf data room
directly). The documentation in [1] for mmap() and setting up buffer
rings mentions the exact same values
(tp_block_size=4096,tp_frame_size=2048), which seem to have been
introduced on the first commit, back in 2014. The only constraint
for the framesize, it seems, its that it fits inside the blocksize (i.e.
doesn't span multiple blocksizes), and is aligned to TPACKET_ALIGNMENT.

Thus, given those constraints, could we instead base the setting of the
framesize like on the below patch? It basically tries to set the
framesize to the MTU of the underlying interface +  TPACKET2_HDRLEN +
sizeof(structsockaddr_ll), aligned to TPACKET_ALIGNMENT. This would
allow applications such as OvS to use this interface by default, based
on a framesize set dynamically, instead of relying on a default of
2048B; If one is setting up an eth_af_packet interface with MTU 9000B
and the underlying MTU is 1500B, for example, this will still fail, and
that's fine as they can always supply the "framesz" argument
However, on the default case, or when one has configured manually the
underlying interface with the expected MTU already, this would work out
of the box.

Any thoughts on this?


diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 95a98c6..451cec3 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -24,6 +24,8 @@
 #include <unistd.h>
 #include <poll.h>

+#define RTE_ROUNDUP(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
 #define ETH_AF_PACKET_IFACE_ARG		"iface"
 #define ETH_AF_PACKET_NUM_Q_ARG		"qpairs"
 #define ETH_AF_PACKET_BLOCKSIZE_ARG	"blocksz"
@@ -535,7 +537,6 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
                        const int sockfd,
                        const unsigned nb_queues,
                        unsigned int blocksize,
-                       unsigned int blockcnt,
                        unsigned int framesize,
                        unsigned int framecnt,
 		       unsigned int qdisc_bypass,
@@ -557,6 +558,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
 	int rc, tpver, discard;
 	int qsockfd = -1;
 	unsigned int i, q, rdsize;
+    unsigned int blockcnt;
 #if defined(PACKET_FANOUT)
 	int fanout_arg;
@@ -587,13 +589,6 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
 		(*internals)->tx_queue[q].map = MAP_FAILED;

-	req = &((*internals)->req);
-	req->tp_block_size = blocksize;
-	req->tp_block_nr = blockcnt;
-	req->tp_frame_size = framesize;
-	req->tp_frame_nr = framecnt;
 	ifnamelen = strlen(pair->value);
 	if (ifnamelen < sizeof(ifr.ifr_name)) {
 		memcpy(ifr.ifr_name, pair->value, ifnamelen);
@@ -622,6 +617,38 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
 		return -1;
 	memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN);
+	if (!framesize) {
+		if (ioctl(sockfd, SIOCGIFMTU, &ifr) == -1) {
+				"%s: ioctl failed (SIOCGIFMTU)",
+				name);
+			framesize = DFLT_FRAME_SIZE;
+		} else {
+			framesize = ifr.ifr_mtu;
+			framesize += TPACKET2_HDRLEN + sizeof(struct sockaddr_ll);
+			framesize = RTE_ROUNDUP(framesize, TPACKET_ALIGNMENT);
+        }
+	}
+	blockcnt = framecnt / (blocksize / framesize);
+	if (!blockcnt) {
+			"%s: invalid AF_PACKET MMAP parameters\n", name);
+		return -1;
+	}
+	RTE_LOG(INFO, PMD, "%s: AF_PACKET MMAP parameters:\n", name);
+	RTE_LOG(INFO, PMD, "%s:\tblock size %d\n", name, blocksize);
+	RTE_LOG(INFO, PMD, "%s:\tblock count %d\n", name, blockcnt);
+	RTE_LOG(INFO, PMD, "%s:\tframe size %d\n", name, framesize);
+	RTE_LOG(INFO, PMD, "%s:\tframe count %d\n", name, framecnt);
+	req = &((*internals)->req);
+	req->tp_block_size = blocksize;
+	req->tp_block_nr = blockcnt;
+	req->tp_frame_nr = framecnt;
+	req->tp_frame_size = framesize;

 	memset(&sockaddr, 0, sizeof(sockaddr));
 	sockaddr.sll_family = AF_PACKET;
@@ -811,9 +838,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 	struct rte_eth_dev *eth_dev = NULL;
 	struct rte_kvargs_pair *pair = NULL;
 	unsigned k_idx;
-	unsigned int blockcount;
 	unsigned int blocksize = DFLT_BLOCK_SIZE;
-	unsigned int framesize = DFLT_FRAME_SIZE;
+	unsigned int framesize = 0;
 	unsigned int framecount = DFLT_FRAME_COUNT;
 	unsigned int qpairs = 1;
 	unsigned int qdisc_bypass = 1;
@@ -887,21 +913,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 		return -1;

-	blockcount = framecount / (blocksize / framesize);
-	if (!blockcount) {
-			"%s: invalid AF_PACKET MMAP parameters", name);
-		return -1;
-	}
-	PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name);
-	PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize);
-	PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount);
-	PMD_LOG(INFO, "%s:\tframe size %d", name, framesize);
-	PMD_LOG(INFO, "%s:\tframe count %d", name, framecount);
 	if (rte_pmd_init_internals(dev, *sockfd, qpairs,
-				   blocksize, blockcount,
+				   blocksize,
 				   framesize, framecount,
 				   &internals, &eth_dev,

More information about the dev mailing list