[dpdk-dev] [RFC-PATCH-v3 6/6] test: add pktdev performance tests

Bruce Richardson bruce.richardson at intel.com
Wed Jun 10 15:07:21 CEST 2015


Add some performance tests for the pktdev library. The tests measure
cycle counts for a ring-based implementation and are based on the ring
performance tests.
They compare ring performance for:
* native ring calls
* calls through pktdev to the ring
* calls through the ring PMD wrapper to the ring
* calls through pktdev to the PMD wrapper to the ring

The patch also makes rte_pktdev.h include rte_mbuf.h and marks
rte_pkt_rx_burst() and rte_pkt_tx_burst() as always_inline.

Signed-off-by: Bruce Richardson <bruce.richardson at intel.com>
---
 app/test/Makefile              |   4 +-
 app/test/test_pktdev_perf.c    | 260 +++++++++++++++++++++++++++++++++++++++++
 lib/librte_pktdev/rte_pktdev.h |   8 +-
 3 files changed, 265 insertions(+), 7 deletions(-)
 create mode 100644 app/test/test_pktdev_perf.c

diff --git a/app/test/Makefile b/app/test/Makefile
index 77e48c1..8697893 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -58,9 +58,7 @@ SRCS-y += test_ring.c
 SRCS-y += test_ring_perf.c
 SRCS-y += test_pmd_perf.c
 
-ifeq ($(CONFIG_RTE_LIBRTE_PKTDEV),y)
-SRCS-y += test_pktdev.c
-endif
+SRCS-$(CONFIG_RTE_LIBRTE_PKTDEV) += test_pktdev.c test_pktdev_perf.c
 
 ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
 SRCS-y += test_table.c
diff --git a/app/test/test_pktdev_perf.c b/app/test/test_pktdev_perf.c
new file mode 100644
index 0000000..6a94e4d
--- /dev/null
+++ b/app/test/test_pktdev_perf.c
@@ -0,0 +1,260 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <rte_ring.h>
+#include <rte_cycles.h>
+#include <rte_launch.h>
+#include <rte_pktdev.h>
+#include <rte_ethdev.h>
+#include <rte_eth_ring.h>
+
+#include "test.h"
+
+/*
+ * Pktdev
+ * ======
+ *
+ * Measures performance of various operations using rdtsc
+ *  * Empty ring dequeue
+ *  * Enqueue/dequeue of single elements on one lcore
+ *  * Enqueue/dequeue of bursts on one lcore
+ */
+
+#define RING_NAME "RING_PERF"
+#define RING_SIZE 4096
+#define MAX_BURST 32 /* burst array capacity; largest bulk_sizes entry */
+
+/*
+ * The burst sizes to enqueue and dequeue in testing
+ * (marked volatile so they won't be seen as compile-time constants)
+ */
+static const volatile unsigned bulk_sizes[] = { 1, 8, 32 };
+
+/* The ring used for the tests, and the devices created on top of it */
+static struct rte_ring *r;
+static struct rte_pktdev *r_pdev; /* pktdev created directly from r */
+static uint8_t ring_ethdev_port; /* ethdev port created from r */
+static struct rte_pktdev *re_pdev; /* pktdev created from ring_ethdev_port */
+
+/* Cycle counts for SC/MC dequeue from an empty ring. Should be 2 or 3 cycles */
+static void
+test_empty_dequeue(void)
+{
+	const unsigned iter_shift = 26; /* log2 of the iteration count */
+	const unsigned iterations = 1<<iter_shift;
+	unsigned i = 0;
+	void *burst[MAX_BURST];
+
+	const uint64_t sc_start = rte_rdtsc();
+	for (i = 0; i < iterations; i++)
+		rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0]);
+	const uint64_t sc_end = rte_rdtsc();
+	/* Repeat the measurement with rte_ring_mc_dequeue_bulk(). */
+	const uint64_t mc_start = rte_rdtsc();
+	for (i = 0; i < iterations; i++)
+		rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0]);
+	const uint64_t mc_end = rte_rdtsc();
+	/* Nothing is enqueued here; print average cycles per dequeue call. */
+	printf("SC empty dequeue: %.2F\n",
+			(double)(sc_end-sc_start) / iterations);
+	printf("MC empty dequeue: %.2F\n",
+			(double)(mc_end-mc_start) / iterations);
+}
+
+/*
+ * Times a single-item enqueue + dequeue on one lcore via ring, pktdev(ring),
+ * ring ethdev and pktdev(ethdev). For comparison with the bulk enq+deq.
+ */
+static void
+test_single_enqueue_dequeue(void)
+{
+	const unsigned iter_shift = 24; /* log2(iterations), used to average */
+	const unsigned iterations = 1<<iter_shift;
+	unsigned i = 0;
+	void *burst = NULL;
+	struct rte_mbuf *mburst[1] = { NULL };
+
+	const uint64_t sc_start = rte_rdtsc_precise(); /* ring */
+	rte_compiler_barrier();
+	for (i = 0; i < iterations; i++) {
+		rte_ring_enqueue_bulk(r, &burst, 1);
+		rte_ring_dequeue_bulk(r, &burst, 1);
+	}
+	const uint64_t sc_end = rte_rdtsc_precise();
+	rte_compiler_barrier();
+
+	const uint64_t pd_start = rte_rdtsc_precise(); /* pktdev(ring) */
+	rte_compiler_barrier();
+	for (i = 0; i < iterations; i++) {
+		rte_pkt_tx_burst(r_pdev, mburst, 1);
+		rte_pkt_rx_burst(r_pdev, mburst, 1);
+	}
+	const uint64_t pd_end = rte_rdtsc_precise();
+	rte_compiler_barrier();
+
+	const uint64_t eth_start = rte_rdtsc_precise(); /* ethdev */
+	rte_compiler_barrier();
+	for (i = 0; i < iterations; i++) {
+		rte_eth_tx_burst(ring_ethdev_port, 0, mburst, 1);
+		rte_eth_rx_burst(ring_ethdev_port, 0, mburst, 1);
+	}
+	const uint64_t eth_end = rte_rdtsc_precise();
+	rte_compiler_barrier();
+
+	const uint64_t pd_eth_start = rte_rdtsc_precise(); /* pktdev(ethdev) */
+	rte_compiler_barrier();
+	for (i = 0; i < iterations; i++) {
+		rte_pkt_tx_burst(re_pdev, mburst, 1);
+		rte_pkt_rx_burst(re_pdev, mburst, 1);
+	}
+	const uint64_t pd_eth_end = rte_rdtsc_precise();
+	rte_compiler_barrier();
+
+	printf("Ring single enq/dequeue      : %"PRIu64"\n",
+			(sc_end-sc_start) >> iter_shift);
+	printf("Pktdev(ring) single enq/deq  : %"PRIu64"\n",
+			(pd_end-pd_start) >> iter_shift);
+	printf("Ethdev single enq/dequeue    : %"PRIu64"\n",
+			(eth_end-eth_start) >> iter_shift);
+	printf("Pktdev(ethdev) single enq/deq: %"PRIu64"\n",
+			(pd_eth_end-pd_eth_start) >> iter_shift);
+}
+
+/* Times bulk enqueue + dequeue on one lcore for each size in bulk_sizes */
+static void
+test_bulk_enqueue_dequeue(void)
+{
+	const unsigned iter_shift = 23; /* log2 of the iteration count */
+	const unsigned iterations = 1<<iter_shift;
+	unsigned sz, i = 0;
+	struct rte_mbuf *burst[MAX_BURST] = {0};
+
+	for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
+		const uint64_t sc_start = rte_rdtsc(); /* ring; not _precise */
+		for (i = 0; i < iterations; i++) {
+			rte_ring_sp_enqueue_bulk(r, (void *)burst, bulk_sizes[sz]);
+			rte_ring_sc_dequeue_bulk(r, (void *)burst, bulk_sizes[sz]);
+		}
+		const uint64_t sc_end = rte_rdtsc();
+		/* pktdev(ring); rte_rdtsc_precise() is used from here on */
+		const uint64_t pd_start = rte_rdtsc_precise();
+		rte_compiler_barrier();
+		for (i = 0; i < iterations; i++) {
+			rte_pkt_tx_burst(r_pdev, burst, bulk_sizes[sz]);
+			rte_pkt_rx_burst(r_pdev, burst, bulk_sizes[sz]);
+		}
+		const uint64_t pd_end = rte_rdtsc_precise();
+		rte_compiler_barrier();
+		/* ethdev */
+		const uint64_t eth_start = rte_rdtsc_precise();
+		rte_compiler_barrier();
+		for (i = 0; i < iterations; i++) {
+			rte_eth_tx_burst(ring_ethdev_port, 0, burst, bulk_sizes[sz]);
+			rte_eth_rx_burst(ring_ethdev_port, 0, burst, bulk_sizes[sz]);
+		}
+		const uint64_t eth_end = rte_rdtsc_precise();
+		rte_compiler_barrier();
+		/* pktdev(ethdev) */
+		const uint64_t pd_eth_start = rte_rdtsc_precise();
+		rte_compiler_barrier();
+		for (i = 0; i < iterations; i++) {
+			rte_pkt_tx_burst(re_pdev, burst, bulk_sizes[sz]);
+			rte_pkt_rx_burst(re_pdev, burst, bulk_sizes[sz]);
+		}
+		const uint64_t pd_eth_end = rte_rdtsc_precise();
+		rte_compiler_barrier();
+		/* cycles per object = total / (iterations * size) */
+		double sc_avg = ((double)(sc_end-sc_start) /
+				(iterations * bulk_sizes[sz]));
+		double pd_avg = ((double)(pd_end-pd_start) /
+				(iterations * bulk_sizes[sz]));
+		double eth_avg = ((double)(eth_end-eth_start) /
+				(iterations * bulk_sizes[sz]));
+		double pd_eth_avg = ((double)(pd_eth_end-pd_eth_start) /
+				(iterations * bulk_sizes[sz]));
+
+		printf("ring bulk enq/dequeue (size: %u): %.1F\n", bulk_sizes[sz],
+				sc_avg);
+		printf("pktdev(ring) bulk enq/deq (%u)  : %.1F\n", bulk_sizes[sz],
+				pd_avg);
+		printf("ethdev bulk enq/dequeue (%u)    : %.1F\n", bulk_sizes[sz],
+				eth_avg);
+		printf("pktdev(ethdev) bulk enq/deq (%u): %.1F\n", bulk_sizes[sz],
+				pd_eth_avg);
+
+		printf("\n");
+	}
+}
+
+static int
+test_pktdev_perf(void)
+{
+	const struct rte_eth_conf port_conf_default = {0};
+	struct rte_mempool *p;
+	/* Create the ring, or reuse an existing ring of the same name. */
+	r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(),
+			RING_F_SP_ENQ|RING_F_SC_DEQ);
+	if (r == NULL && (r = rte_ring_lookup(RING_NAME)) == NULL)
+		return -1;
+	/* NOTE(review): results of the setup calls below are not checked */
+	r_pdev = rte_pktdev_from_ring(r);
+	ring_ethdev_port = rte_eth_from_rings("TEST_RING",
+			&r, 1, &r, 1, /* r as the one RX and one TX ring */
+			rte_socket_id());
+	rte_eth_dev_configure(ring_ethdev_port, 1, 1, &port_conf_default);
+	p = rte_pktmbuf_pool_create("Test pool", 1023, 32, 0, 2048, rte_socket_id());
+	rte_eth_rx_queue_setup(ring_ethdev_port, 0, 128, rte_socket_id(), NULL, p);
+	rte_eth_tx_queue_setup(ring_ethdev_port, 0, 128, rte_socket_id(), NULL);
+
+	re_pdev = rte_pktdev_from_ethport(ring_ethdev_port, 0, 0);
+	/* NOTE(review): only the single-element test runs under this banner */
+	printf("### Testing single element and burst enq/deq ###\n");
+	test_single_enqueue_dequeue();
+
+	printf("\n### Testing empty dequeue ###\n");
+	test_empty_dequeue();
+
+	printf("\n### Testing using a single lcore ###\n");
+	test_bulk_enqueue_dequeue();
+
+	return 0;
+}
+
+static struct test_command pktdev_perf_cmd = {
+	.callback = test_pktdev_perf,
+	.command = "pktdev_perf_autotest",
+};
+REGISTER_TEST_COMMAND(pktdev_perf_cmd);
diff --git a/lib/librte_pktdev/rte_pktdev.h b/lib/librte_pktdev/rte_pktdev.h
index 3acbc0d..4740c67 100644
--- a/lib/librte_pktdev/rte_pktdev.h
+++ b/lib/librte_pktdev/rte_pktdev.h
@@ -46,6 +46,7 @@ extern "C" {
 
 #include <stdint.h>
 #include <rte_ring.h>
+#include <rte_mbuf.h>
 #include <rte_branch_prediction.h>
 
 /* Buffered TX works in bursts of 32 */
@@ -53,9 +54,8 @@ extern "C" {
 
 /*
  * forward definition of data structures.
- * We don't need full mbuf/kni/ethdev headers here
+ * We don't need full kni/ethdev headers here
  */
-struct rte_mbuf;
 struct rte_kni;
 struct rte_eth_dev;
 
@@ -136,7 +136,7 @@ struct rte_pktdev {
  *   of pointers to *rte_mbuf* structures effectively supplied to the
  *   *rx_pkts* array.
  */
-static inline uint16_t
+static inline uint16_t __attribute__((always_inline))
 rte_pkt_rx_burst(struct rte_pktdev *dev, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts)
 {
@@ -168,7 +168,7 @@ rte_pkt_rx_burst(struct rte_pktdev *dev, struct rte_mbuf **rx_pkts,
  *   the transmit ring. The return value can be less than the value of the
  *   *tx_pkts* parameter when the transmit ring is full or has been filled up.
  */
-static inline uint16_t
+static inline uint16_t __attribute__((always_inline))
 rte_pkt_tx_burst(struct rte_pktdev *dev, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts)
 {
-- 
2.4.2



More information about the dev mailing list