[dpdk-dev] [PATCH v2] baseband/turbo_sw: offload cost measurement test

KamilX Chalupnik kamilx.chalupnik at intel.com
Tue Apr 17 16:27:24 CEST 2018


New test created to measure offload cost.
Changes were introduced in the API, the turbo software
driver and the test application.

Signed-off-by: KamilX Chalupnik <kamilx.chalupnik at intel.com>

v2:
- logging macros reverted

---
 app/test-bbdev/test_bbdev_perf.c                 | 333 ++++++++++++++++++-----
 drivers/baseband/turbo_sw/bbdev_turbo_software.c |  83 ++++--
 lib/librte_bbdev/rte_bbdev.h                     |   4 +
 3 files changed, 329 insertions(+), 91 deletions(-)
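
For reference, every latency figure below is printed twice: once in
raw TSC cycles and once in microseconds. A minimal sketch of that
conversion, mirroring the arithmetic in the printf() calls in the
diff (the helper name is illustrative, not part of the patch):

  #include <stdint.h>
  #include <rte_cycles.h>

  /* Convert a TSC cycle count to microseconds, as the printf()
   * calls below compute it:
   * us = cycles * 1000000 / rte_get_tsc_hz()
   */
  static inline double
  cycles_to_us(uint64_t cycles)
  {
          return (double)(cycles * 1000000) /
                          (double)rte_get_tsc_hz();
  }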

diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index 00f3b08..be2e20c 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -83,6 +83,28 @@ struct thread_params {
 	struct test_op_params *op_params;
 };
 
+/* Stores time statistics */
+struct test_time_stats {
+	/* Stores software enqueue total working time */
+	uint64_t enq_sw_tot_time;
+	/* Stores minimum value of software enqueue working time */
+	uint64_t enq_sw_min_time;
+	/* Stores maximum value of software enqueue working time */
+	uint64_t enq_sw_max_time;
+	/* Stores turbo enqueue total working time */
+	uint64_t enq_tur_tot_time;
+	/* Stores minimum value of turbo enqueue working time */
+	uint64_t enq_tur_min_time;
+	/* Stores maximum value of turbo enqueue working time */
+	uint64_t enq_tur_max_time;
+	/* Stores dequeue total working time */
+	uint64_t deq_tot_time;
+	/* Stores minimum value of dequeue working time */
+	uint64_t deq_min_time;
+	/* Stores maximum value of dequeue working time */
+	uint64_t deq_max_time;
+};
+
 typedef int (test_case_function)(struct active_device *ad,
 		struct test_op_params *op_params);
 
@@ -1104,7 +1126,6 @@ dequeue_event_callback(uint16_t dev_id,
 	double in_len;
 
 	struct thread_params *tp = cb_arg;
-
 	RTE_SET_USED(ret_param);
 	queue_id = tp->queue_id;
 
@@ -1649,20 +1670,21 @@ throughput_test(struct active_device *ad,
 }
 
 static int
-operation_latency_test_dec(struct rte_mempool *mempool,
+latency_test_dec(struct rte_mempool *mempool,
 		struct test_buffers *bufs, struct rte_bbdev_dec_op *ref_op,
 		int vector_mask, uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *total_time)
+		uint64_t *total_time, uint64_t *min_time, uint64_t *max_time)
 {
 	int ret = TEST_SUCCESS;
 	uint16_t i, j, dequeued;
 	struct rte_bbdev_dec_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
-	uint64_t start_time = 0;
+	uint64_t start_time = 0, last_time = 0;
 
 	for (i = 0, dequeued = 0; dequeued < num_to_process; ++i) {
 		uint16_t enq = 0, deq = 0;
 		bool first_time = true;
+		last_time = 0;
 
 		if (unlikely(num_to_process - dequeued < burst_sz))
 			burst_sz = num_to_process - dequeued;
@@ -1692,11 +1714,15 @@ operation_latency_test_dec(struct rte_mempool *mempool,
 			deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
 					&ops_deq[deq], burst_sz - deq);
 			if (likely(first_time && (deq > 0))) {
-				*total_time += rte_rdtsc_precise() - start_time;
+				last_time = rte_rdtsc_precise() - start_time;
 				first_time = false;
 			}
 		} while (unlikely(burst_sz != deq));
 
+		*max_time = RTE_MAX(*max_time, last_time);
+		*min_time = RTE_MIN(*min_time, last_time);
+		*total_time += last_time;
+
 		if (test_vector.op_type != RTE_BBDEV_OP_NONE) {
 			ret = validate_dec_op(ops_deq, burst_sz, ref_op,
 					vector_mask);
@@ -1711,20 +1737,21 @@ operation_latency_test_dec(struct rte_mempool *mempool,
 }
 
 static int
-operation_latency_test_enc(struct rte_mempool *mempool,
+latency_test_enc(struct rte_mempool *mempool,
 		struct test_buffers *bufs, struct rte_bbdev_enc_op *ref_op,
 		uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *total_time)
+		uint64_t *total_time, uint64_t *min_time, uint64_t *max_time)
 {
 	int ret = TEST_SUCCESS;
 	uint16_t i, j, dequeued;
 	struct rte_bbdev_enc_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
-	uint64_t start_time = 0;
+	uint64_t start_time = 0, last_time = 0;
 
 	for (i = 0, dequeued = 0; dequeued < num_to_process; ++i) {
 		uint16_t enq = 0, deq = 0;
 		bool first_time = true;
+		last_time = 0;
 
 		if (unlikely(num_to_process - dequeued < burst_sz))
 			burst_sz = num_to_process - dequeued;
@@ -1753,11 +1780,15 @@ operation_latency_test_enc(struct rte_mempool *mempool,
 			deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
 					&ops_deq[deq], burst_sz - deq);
 			if (likely(first_time && (deq > 0))) {
-				*total_time += rte_rdtsc_precise() - start_time;
+				last_time += rte_rdtsc_precise() - start_time;
 				first_time = false;
 			}
 		} while (unlikely(burst_sz != deq));
 
+		*max_time = RTE_MAX(*max_time, last_time);
+		*min_time = RTE_MIN(*min_time, last_time);
+		*total_time += last_time;
+
 		if (test_vector.op_type != RTE_BBDEV_OP_NONE) {
 			ret = validate_enc_op(ops_deq, burst_sz, ref_op);
 			TEST_ASSERT_SUCCESS(ret, "Validation failed!");
@@ -1771,7 +1802,7 @@ operation_latency_test_enc(struct rte_mempool *mempool,
 }
 
 static int
-operation_latency_test(struct active_device *ad,
+latency_test(struct active_device *ad,
 		struct test_op_params *op_params)
 {
 	int iter;
@@ -1781,9 +1812,12 @@ operation_latency_test(struct active_device *ad,
 	const uint16_t queue_id = ad->queue_ids[0];
 	struct test_buffers *bufs = NULL;
 	struct rte_bbdev_info info;
-	uint64_t total_time = 0;
+	uint64_t total_time, min_time, max_time;
 	const char *op_type_str;
 
+	total_time = max_time = 0;
+	min_time = UINT64_MAX;
+
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
 			"BURST_SIZE should be <= %u", MAX_BURST);
 
@@ -1798,36 +1832,65 @@ operation_latency_test(struct active_device *ad,
 			info.dev_name, burst_sz, num_to_process, op_type_str);
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
-		iter = operation_latency_test_dec(op_params->mp, bufs,
+		iter = latency_test_dec(op_params->mp, bufs,
 				op_params->ref_dec_op, op_params->vector_mask,
 				ad->dev_id, queue_id, num_to_process,
-				burst_sz, &total_time);
+				burst_sz, &total_time, &min_time, &max_time);
 	else
-		iter = operation_latency_test_enc(op_params->mp, bufs,
+		iter = latency_test_enc(op_params->mp, bufs,
 				op_params->ref_enc_op, ad->dev_id, queue_id,
-				num_to_process, burst_sz, &total_time);
+				num_to_process, burst_sz, &total_time,
+				&min_time, &max_time);
 
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\toperation avg. latency: %lg cycles, %lg us\n",
+	printf("\toperation latency:\n"
+			"\t\tavg latency: %lg cycles, %lg us\n"
+			"\t\tmin latency: %lg cycles, %lg us\n"
+			"\t\tmax latency: %lg cycles, %lg us\n",
 			(double)total_time / (double)iter,
 			(double)(total_time * 1000000) / (double)iter /
+			(double)rte_get_tsc_hz(), (double)min_time,
+			(double)(min_time * 1000000) / (double)rte_get_tsc_hz(),
+			(double)max_time, (double)(max_time * 1000000) /
 			(double)rte_get_tsc_hz());
 
 	return TEST_SUCCESS;
 }
 
 static int
+get_bbdev_queue_stats(uint16_t dev_id, uint16_t queue_id,
+		struct rte_bbdev_stats *stats)
+{
+	struct rte_bbdev *dev = &rte_bbdev_devices[dev_id];
+	struct rte_bbdev_stats *q_stats;
+
+	if (queue_id >= dev->data->num_queues)
+		return -1;
+
+	q_stats = &dev->data->queues[queue_id].queue_stats;
+
+	stats->enqueued_count = q_stats->enqueued_count;
+	stats->dequeued_count = q_stats->dequeued_count;
+	stats->enqueue_err_count = q_stats->enqueue_err_count;
+	stats->dequeue_err_count = q_stats->dequeue_err_count;
+	stats->turbo_perf_time = q_stats->turbo_perf_time;
+
+	return 0;
+}
+
+static int
 offload_latency_test_dec(struct rte_mempool *mempool, struct test_buffers *bufs,
 		struct rte_bbdev_dec_op *ref_op, uint16_t dev_id,
 		uint16_t queue_id, const uint16_t num_to_process,
-		uint16_t burst_sz, uint64_t *enq_total_time,
-		uint64_t *deq_total_time)
+		uint16_t burst_sz, struct test_time_stats *time_st)
 {
-	int i, dequeued;
+	int i, dequeued, ret;
 	struct rte_bbdev_dec_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
 	uint64_t enq_start_time, deq_start_time;
+	uint64_t enq_sw_last_time, deq_last_time;
+	struct rte_bbdev_stats stats;
 
 	for (i = 0, dequeued = 0; dequeued < num_to_process; ++i) {
 		uint16_t enq = 0, deq = 0;
@@ -1843,24 +1906,54 @@ offload_latency_test_dec(struct rte_mempool *mempool, struct test_buffers *bufs,
 					bufs->soft_outputs,
 					ref_op);
 
-		/* Start time measurment for enqueue function offload latency */
-		enq_start_time = rte_rdtsc();
+		/* Start time measurement for enqueue offload latency */
+		enq_start_time = rte_rdtsc_precise();
 		do {
 			enq += rte_bbdev_enqueue_dec_ops(dev_id, queue_id,
 					&ops_enq[enq], burst_sz - enq);
 		} while (unlikely(burst_sz != enq));
-		*enq_total_time += rte_rdtsc() - enq_start_time;
+
+		ret = get_bbdev_queue_stats(dev_id, queue_id, &stats);
+		TEST_ASSERT_SUCCESS(ret,
+				"Failed to get stats for queue (%u) of device (%u)",
+				queue_id, dev_id);
+
+		enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
+				stats.turbo_perf_time;
+		time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
+				enq_sw_last_time);
+		time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
+				enq_sw_last_time);
+		time_st->enq_sw_tot_time += enq_sw_last_time;
+
+		time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
+				stats.turbo_perf_time);
+		time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
+				stats.turbo_perf_time);
+		time_st->enq_tur_tot_time += stats.turbo_perf_time;
 
 		/* ensure enqueue has been completed */
 		rte_delay_ms(10);
 
-		/* Start time measurment for dequeue function offload latency */
-		deq_start_time = rte_rdtsc();
+		/* Start time measurement for dequeue offload latency */
+		deq_start_time = rte_rdtsc_precise();
+		/* Dequeue one operation */
 		do {
 			deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
+					&ops_deq[deq], 1);
+		} while (unlikely(deq != 1));
+
+		deq_last_time = rte_rdtsc_precise() - deq_start_time;
+		time_st->deq_max_time = RTE_MAX(time_st->deq_max_time,
+				deq_last_time);
+		time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
+				deq_last_time);
+		time_st->deq_tot_time += deq_last_time;
+
+		/* Dequeue remaining operations if needed */
+		while (burst_sz != deq)
+			deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
 					&ops_deq[deq], burst_sz - deq);
-		} while (unlikely(burst_sz != deq));
-		*deq_total_time += rte_rdtsc() - deq_start_time;
 
 		rte_bbdev_dec_op_free_bulk(ops_enq, deq);
 		dequeued += deq;
@@ -1873,12 +1966,13 @@ static int
 offload_latency_test_enc(struct rte_mempool *mempool, struct test_buffers *bufs,
 		struct rte_bbdev_enc_op *ref_op, uint16_t dev_id,
 		uint16_t queue_id, const uint16_t num_to_process,
-		uint16_t burst_sz, uint64_t *enq_total_time,
-		uint64_t *deq_total_time)
+		uint16_t burst_sz, struct test_time_stats *time_st)
 {
-	int i, dequeued;
+	int i, dequeued, ret;
 	struct rte_bbdev_enc_op *ops_enq[MAX_BURST], *ops_deq[MAX_BURST];
 	uint64_t enq_start_time, deq_start_time;
+	uint64_t enq_sw_last_time, deq_last_time;
+	struct rte_bbdev_stats stats;
 
 	for (i = 0, dequeued = 0; dequeued < num_to_process; ++i) {
 		uint16_t enq = 0, deq = 0;
@@ -1893,24 +1987,53 @@ offload_latency_test_enc(struct rte_mempool *mempool, struct test_buffers *bufs,
 					bufs->hard_outputs,
 					ref_op);
 
-		/* Start time measurment for enqueue function offload latency */
-		enq_start_time = rte_rdtsc();
+		/* Start time measurement for enqueue offload latency */
+		enq_start_time = rte_rdtsc_precise();
 		do {
 			enq += rte_bbdev_enqueue_enc_ops(dev_id, queue_id,
 					&ops_enq[enq], burst_sz - enq);
 		} while (unlikely(burst_sz != enq));
-		*enq_total_time += rte_rdtsc() - enq_start_time;
+
+		ret = get_bbdev_queue_stats(dev_id, queue_id, &stats);
+		TEST_ASSERT_SUCCESS(ret,
+				"Failed to get stats for queue (%u) of device (%u)",
+				queue_id, dev_id);
+
+		enq_sw_last_time = rte_rdtsc_precise() - enq_start_time -
+				stats.turbo_perf_time;
+		time_st->enq_sw_max_time = RTE_MAX(time_st->enq_sw_max_time,
+				enq_sw_last_time);
+		time_st->enq_sw_min_time = RTE_MIN(time_st->enq_sw_min_time,
+				enq_sw_last_time);
+		time_st->enq_sw_tot_time += enq_sw_last_time;
+
+		time_st->enq_tur_max_time = RTE_MAX(time_st->enq_tur_max_time,
+				stats.turbo_perf_time);
+		time_st->enq_tur_min_time = RTE_MIN(time_st->enq_tur_min_time,
+				stats.turbo_perf_time);
+		time_st->enq_tur_tot_time += stats.turbo_perf_time;
 
 		/* ensure enqueue has been completed */
 		rte_delay_ms(10);
 
-		/* Start time measurment for dequeue function offload latency */
-		deq_start_time = rte_rdtsc();
+		/* Start time measurement for dequeue offload latency */
+		deq_start_time = rte_rdtsc_precise();
+		/* Dequeue one operation */
 		do {
 			deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
+					&ops_deq[deq], 1);
+		} while (unlikely(deq != 1));
+
+		deq_last_time = rte_rdtsc_precise() - deq_start_time;
+		time_st->deq_max_time = RTE_MAX(time_st->deq_max_time,
+				deq_last_time);
+		time_st->deq_min_time = RTE_MIN(time_st->deq_min_time,
+				deq_last_time);
+		time_st->deq_tot_time += deq_last_time;
+
+		while (burst_sz != deq)
+			deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
 					&ops_deq[deq], burst_sz - deq);
-		} while (unlikely(burst_sz != deq));
-		*deq_total_time += rte_rdtsc() - deq_start_time;
 
 		rte_bbdev_enc_op_free_bulk(ops_enq, deq);
 		dequeued += deq;
@@ -1920,11 +2043,10 @@ offload_latency_test_enc(struct rte_mempool *mempool, struct test_buffers *bufs,
 }
 
 static int
-offload_latency_test(struct active_device *ad,
+offload_cost_test(struct active_device *ad,
 		struct test_op_params *op_params)
 {
 	int iter;
-	uint64_t enq_total_time = 0, deq_total_time = 0;
 	uint16_t burst_sz = op_params->burst_sz;
 	const uint16_t num_to_process = op_params->num_to_process;
 	const enum rte_bbdev_op_type op_type = test_vector.op_type;
@@ -1932,6 +2054,12 @@ offload_latency_test(struct active_device *ad,
 	struct test_buffers *bufs = NULL;
 	struct rte_bbdev_info info;
 	const char *op_type_str;
+	struct test_time_stats time_st;
+
+	memset(&time_st, 0, sizeof(struct test_time_stats));
+	time_st.enq_sw_min_time = UINT64_MAX;
+	time_st.enq_tur_min_time = UINT64_MAX;
+	time_st.deq_min_time = UINT64_MAX;
 
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
 			"BURST_SIZE should be <= %u", MAX_BURST);
@@ -1949,26 +2077,51 @@ offload_latency_test(struct active_device *ad,
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
 		iter = offload_latency_test_dec(op_params->mp, bufs,
 				op_params->ref_dec_op, ad->dev_id, queue_id,
-				num_to_process, burst_sz, &enq_total_time,
-				&deq_total_time);
+				num_to_process, burst_sz, &time_st);
 	else
 		iter = offload_latency_test_enc(op_params->mp, bufs,
 				op_params->ref_enc_op, ad->dev_id, queue_id,
-				num_to_process, burst_sz, &enq_total_time,
-				&deq_total_time);
+				num_to_process, burst_sz, &time_st);
 
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\tenq offload avg. latency: %lg cycles, %lg us\n",
-			(double)enq_total_time / (double)iter,
-			(double)(enq_total_time * 1000000) / (double)iter /
-			(double)rte_get_tsc_hz());
-
-	printf("\tdeq offload avg. latency: %lg cycles, %lg us\n",
-			(double)deq_total_time / (double)iter,
-			(double)(deq_total_time * 1000000) / (double)iter /
-			(double)rte_get_tsc_hz());
+	printf("\tenq offload cost latency:\n"
+			"\t\tsoftware avg %lg cycles, %lg us\n"
+			"\t\tsoftware min %lg cycles, %lg us\n"
+			"\t\tsoftware max %lg cycles, %lg us\n"
+			"\t\tturbo avg %lg cycles, %lg us\n"
+			"\t\tturbo min %lg cycles, %lg us\n"
+			"\t\tturbo max %lg cycles, %lg us\n",
+			(double)time_st.enq_sw_tot_time / (double)iter,
+			(double)(time_st.enq_sw_tot_time * 1000000) /
+			(double)iter / (double)rte_get_tsc_hz(),
+			(double)time_st.enq_sw_min_time,
+			(double)(time_st.enq_sw_min_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.enq_sw_max_time,
+			(double)(time_st.enq_sw_max_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.enq_tur_tot_time /
+			(double)iter,
+			(double)(time_st.enq_tur_tot_time * 1000000) /
+			(double)iter / (double)rte_get_tsc_hz(),
+			(double)time_st.enq_tur_min_time,
+			(double)(time_st.enq_tur_min_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.enq_tur_max_time,
+			(double)(time_st.enq_tur_max_time * 1000000) /
+			rte_get_tsc_hz());
+
+	printf("\tdeq offload cost latency - one op:\n"
+			"\t\tavg %lg cycles, %lg us\n"
+			"\t\tmin %lg cycles, %lg us\n"
+			"\t\tmax %lg cycles, %lg us\n",
+			(double)time_st.deq_tot_time / (double)iter,
+			(double)(time_st.deq_tot_time * 1000000) /
+			(double)iter / (double)rte_get_tsc_hz(),
+			(double)time_st.deq_min_time,
+			(double)(time_st.deq_min_time * 1000000) /
+			rte_get_tsc_hz(), (double)time_st.deq_max_time,
+			(double)(time_st.deq_max_time * 1000000) /
+			rte_get_tsc_hz());
 
 	return TEST_SUCCESS;
 }
@@ -1976,21 +2129,28 @@ offload_latency_test(struct active_device *ad,
 static int
 offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *deq_total_time)
+		uint64_t *deq_tot_time, uint64_t *deq_min_time,
+		uint64_t *deq_max_time)
 {
 	int i, deq_total;
 	struct rte_bbdev_dec_op *ops[MAX_BURST];
-	uint64_t deq_start_time;
+	uint64_t deq_start_time, deq_last_time;
 
 	/* Test deq offload latency from an empty queue */
-	deq_start_time = rte_rdtsc_precise();
+
 	for (i = 0, deq_total = 0; deq_total < num_to_process;
 			++i, deq_total += burst_sz) {
+		deq_start_time = rte_rdtsc_precise();
+
 		if (unlikely(num_to_process - deq_total < burst_sz))
 			burst_sz = num_to_process - deq_total;
 		rte_bbdev_dequeue_dec_ops(dev_id, queue_id, ops, burst_sz);
+
+		deq_last_time = rte_rdtsc_precise() - deq_start_time;
+		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
+		*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
+		*deq_tot_time += deq_last_time;
 	}
-	*deq_total_time = rte_rdtsc_precise() - deq_start_time;
 
 	return i;
 }
@@ -1998,21 +2158,27 @@ offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
 static int
 offload_latency_empty_q_test_enc(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *deq_total_time)
+		uint64_t *deq_tot_time, uint64_t *deq_min_time,
+		uint64_t *deq_max_time)
 {
 	int i, deq_total;
 	struct rte_bbdev_enc_op *ops[MAX_BURST];
-	uint64_t deq_start_time;
+	uint64_t deq_start_time, deq_last_time;
 
 	/* Test deq offload latency from an empty queue */
-	deq_start_time = rte_rdtsc_precise();
 	for (i = 0, deq_total = 0; deq_total < num_to_process;
 			++i, deq_total += burst_sz) {
+		deq_start_time = rte_rdtsc_precise();
+
 		if (unlikely(num_to_process - deq_total < burst_sz))
 			burst_sz = num_to_process - deq_total;
 		rte_bbdev_dequeue_enc_ops(dev_id, queue_id, ops, burst_sz);
+
+		deq_last_time = rte_rdtsc_precise() - deq_start_time;
+		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
+		*deq_min_time = RTE_MIN(*deq_min_time, deq_last_time);
+		*deq_tot_time += deq_last_time;
 	}
-	*deq_total_time = rte_rdtsc_precise() - deq_start_time;
 
 	return i;
 }
@@ -2022,7 +2188,7 @@ offload_latency_empty_q_test(struct active_device *ad,
 		struct test_op_params *op_params)
 {
 	int iter;
-	uint64_t deq_total_time = 0;
+	uint64_t deq_tot_time, deq_min_time, deq_max_time;
 	uint16_t burst_sz = op_params->burst_sz;
 	const uint16_t num_to_process = op_params->num_to_process;
 	const enum rte_bbdev_op_type op_type = test_vector.op_type;
@@ -2030,6 +2196,9 @@ offload_latency_empty_q_test(struct active_device *ad,
 	struct rte_bbdev_info info;
 	const char *op_type_str;
 
+	deq_tot_time = deq_max_time = 0;
+	deq_min_time = UINT64_MAX;
+
 	TEST_ASSERT_SUCCESS((burst_sz > MAX_BURST),
 			"BURST_SIZE should be <= %u", MAX_BURST);
 
@@ -2044,18 +2213,26 @@ offload_latency_empty_q_test(struct active_device *ad,
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
 		iter = offload_latency_empty_q_test_dec(ad->dev_id, queue_id,
-				num_to_process, burst_sz, &deq_total_time);
+				num_to_process, burst_sz, &deq_tot_time,
+				&deq_min_time, &deq_max_time);
 	else
 		iter = offload_latency_empty_q_test_enc(ad->dev_id, queue_id,
-				num_to_process, burst_sz, &deq_total_time);
+				num_to_process, burst_sz, &deq_tot_time,
+				&deq_min_time, &deq_max_time);
 
 	if (iter <= 0)
 		return TEST_FAILED;
 
-	printf("\tempty deq offload avg. latency: %lg cycles, %lg us\n",
-			(double)deq_total_time / (double)iter,
-			(double)(deq_total_time * 1000000) / (double)iter /
-			(double)rte_get_tsc_hz());
+	printf("\tempty deq offload:\n"
+			"\t\tavg. latency: %lg cycles, %lg us\n"
+			"\t\tmin. latency: %lg cycles, %lg us\n"
+			"\t\tmax. latency: %lg cycles, %lg us\n",
+			(double)deq_tot_time / (double)iter,
+			(double)(deq_tot_time * 1000000) / (double)iter /
+			(double)rte_get_tsc_hz(), (double)deq_min_time,
+			(double)(deq_min_time * 1000000) / rte_get_tsc_hz(),
+			(double)deq_max_time, (double)(deq_max_time * 1000000) /
+			rte_get_tsc_hz());
 
 	return TEST_SUCCESS;
 }
@@ -2067,9 +2244,9 @@ throughput_tc(void)
 }
 
 static int
-offload_latency_tc(void)
+offload_cost_tc(void)
 {
-	return run_test_case(offload_latency_test);
+	return run_test_case(offload_cost_test);
 }
 
 static int
@@ -2079,9 +2256,9 @@ offload_latency_empty_q_tc(void)
 }
 
 static int
-operation_latency_tc(void)
+latency_tc(void)
 {
-	return run_test_case(operation_latency_test);
+	return run_test_case(latency_test);
 }
 
 static int
@@ -2105,7 +2282,7 @@ static struct unit_test_suite bbdev_validation_testsuite = {
 	.setup = testsuite_setup,
 	.teardown = testsuite_teardown,
 	.unit_test_cases = {
-		TEST_CASE_ST(ut_setup, ut_teardown, operation_latency_tc),
+		TEST_CASE_ST(ut_setup, ut_teardown, latency_tc),
 		TEST_CASES_END() /**< NULL terminate unit test array */
 	}
 };
@@ -2115,9 +2292,18 @@ static struct unit_test_suite bbdev_latency_testsuite = {
 	.setup = testsuite_setup,
 	.teardown = testsuite_teardown,
 	.unit_test_cases = {
-		TEST_CASE_ST(ut_setup, ut_teardown, offload_latency_tc),
+		TEST_CASE_ST(ut_setup, ut_teardown, latency_tc),
+		TEST_CASES_END() /**< NULL terminate unit test array */
+	}
+};
+
+static struct unit_test_suite bbdev_offload_cost_testsuite = {
+	.suite_name = "BBdev Offload Cost Tests",
+	.setup = testsuite_setup,
+	.teardown = testsuite_teardown,
+	.unit_test_cases = {
+		TEST_CASE_ST(ut_setup, ut_teardown, offload_cost_tc),
 		TEST_CASE_ST(ut_setup, ut_teardown, offload_latency_empty_q_tc),
-		TEST_CASE_ST(ut_setup, ut_teardown, operation_latency_tc),
 		TEST_CASES_END() /**< NULL terminate unit test array */
 	}
 };
@@ -2135,4 +2321,5 @@ static struct unit_test_suite bbdev_interrupt_testsuite = {
 REGISTER_TEST_COMMAND(throughput, bbdev_throughput_testsuite);
 REGISTER_TEST_COMMAND(validation, bbdev_validation_testsuite);
 REGISTER_TEST_COMMAND(latency, bbdev_latency_testsuite);
+REGISTER_TEST_COMMAND(offload, bbdev_offload_cost_testsuite);
 REGISTER_TEST_COMMAND(interrupt, bbdev_interrupt_testsuite);
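
All of the latency paths above share one accumulation pattern: minima
start at UINT64_MAX so the first sample always wins, totals and maxima
start at zero, and every measured burst updates all three. A condensed
sketch of that pattern (the function name is illustrative, not part of
the patch):

  #include <stdint.h>
  #include <rte_common.h>
  #include <rte_cycles.h>

  /* Fold one latency sample, measured since 'start', into running
   * total/min/max counters; the average is total / iterations and
   * is computed only at print time.
   */
  static inline void
  record_latency_sample(uint64_t start, uint64_t *tot,
                  uint64_t *min, uint64_t *max)
  {
          uint64_t last = rte_rdtsc_precise() - start;

          *max = RTE_MAX(*max, last);
          *min = RTE_MIN(*min, last);
          *tot += last;
  }
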
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 302abf5..2a65d46 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -9,6 +9,7 @@
 #include <rte_malloc.h>
 #include <rte_ring.h>
 #include <rte_kvargs.h>
+#include <rte_cycles.h>
 
 #include <rte_bbdev.h>
 #include <rte_bbdev_pmd.h>
@@ -454,7 +455,8 @@ static inline void
 process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		uint8_t cb_idx, uint8_t c, uint16_t k, uint16_t ncb,
 		uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out,
-		uint16_t in_offset, uint16_t out_offset, uint16_t total_left)
+		uint16_t in_offset, uint16_t out_offset, uint16_t total_left,
+		struct rte_bbdev_stats *q_stats)
 {
 	int ret;
 	int16_t k_idx;
@@ -462,10 +464,16 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 	uint8_t *in, *out0, *out1, *out2, *tmp_out, *rm_out;
 	struct rte_bbdev_op_turbo_enc *enc = &op->turbo_enc;
 	struct bblib_crc_request crc_req;
+	struct bblib_crc_response crc_resp;
 	struct bblib_turbo_encoder_request turbo_req;
 	struct bblib_turbo_encoder_response turbo_resp;
 	struct bblib_rate_match_dl_request rm_req;
 	struct bblib_rate_match_dl_response rm_resp;
+#ifdef RTE_TEST_BBDEV
+	uint64_t start_time;
+#else
+	RTE_SET_USED(q_stats);
+#endif
 
 	k_idx = compute_idx(k);
 	in = rte_pktmbuf_mtod_offset(m_in, uint8_t *, in_offset);
@@ -482,13 +490,20 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		 * it by 3 CRC bytes
 		 */
 		rte_memcpy(q->enc_in, in, (k - 24) >> 3);
-		crc_req.data = q->enc_in;
+		crc_req.data = in;
 		crc_req.len = (k - 24) >> 3;
-		if (bblib_lte_crc24a_gen(&crc_req) == -1) {
-			op->status |= 1 << RTE_BBDEV_CRC_ERROR;
-			rte_bbdev_log(ERR, "CRC24a generation failed");
-			return;
-		}
+		crc_resp.data = q->enc_in;
+
+#ifdef RTE_TEST_BBDEV
+		start_time = rte_rdtsc_precise();
+#endif
+
+		bblib_lte_crc24a_gen(&crc_req, &crc_resp);
+
+#ifdef RTE_TEST_BBDEV
+		q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time;
+#endif
+
 		in = q->enc_in;
 	} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
 		/* CRC24B */
@@ -501,13 +516,20 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		 * it by 3 CRC bytes
 		 */
 		rte_memcpy(q->enc_in, in, (k - 24) >> 3);
-		crc_req.data = q->enc_in;
+		crc_req.data = in;
 		crc_req.len = (k - 24) >> 3;
-		if (bblib_lte_crc24b_gen(&crc_req) == -1) {
-			op->status |= 1 << RTE_BBDEV_CRC_ERROR;
-			rte_bbdev_log(ERR, "CRC24b generation failed");
-			return;
-		}
+		crc_resp.data = q->enc_in;
+
+#ifdef RTE_TEST_BBDEV
+		start_time = rte_rdtsc_precise();
+#endif
+
+		bblib_lte_crc24b_gen(&crc_req, &crc_resp);
+
+#ifdef RTE_TEST_BBDEV
+		q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time;
+#endif
+
 		in = q->enc_in;
 	} else {
 		ret = is_enc_input_valid(k, k_idx, total_left);
@@ -533,12 +555,21 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 	turbo_resp.output_win_0 = out0;
 	turbo_resp.output_win_1 = out1;
 	turbo_resp.output_win_2 = out2;
+
+#ifdef RTE_TEST_BBDEV
+	start_time = rte_rdtsc_precise();
+#endif
+
 	if (bblib_turbo_encoder(&turbo_req, &turbo_resp) != 0) {
 		op->status |= 1 << RTE_BBDEV_DRV_ERROR;
 		rte_bbdev_log(ERR, "Turbo Encoder failed");
 		return;
 	}
 
+#ifdef RTE_TEST_BBDEV
+	q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time;
+#endif
+
 	/* Rate-matching */
 	if (enc->op_flags & RTE_BBDEV_TURBO_RATE_MATCH) {
 		/* get output data starting address */
@@ -588,11 +619,20 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		else
 			rm_req.bypass_rvidx = 0;
 
+#ifdef RTE_TEST_BBDEV
+		start_time = rte_rdtsc_precise();
+#endif
+
 		if (bblib_rate_match_dl(&rm_req, &rm_resp) != 0) {
 			op->status |= 1 << RTE_BBDEV_DRV_ERROR;
 			rte_bbdev_log(ERR, "Rate matching failed");
 			return;
 		}
+
+#ifdef RTE_TEST_BBDEV
+		q_stats->turbo_perf_time += rte_rdtsc_precise() - start_time;
+#endif
+
 		enc->output.length += rm_resp.OutputLen;
 	} else {
 		/* Rate matching is bypassed */
@@ -637,7 +677,8 @@ process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 }
 
 static inline void
-enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op)
+enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
+		struct rte_bbdev_stats *queue_stats)
 {
 	uint8_t c, r, crc24_bits = 0;
 	uint16_t k, ncb;
@@ -692,7 +733,8 @@ enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op)
 		}
 
 		process_enc_cb(q, op, r, c, k, ncb, e, m_in,
-				m_out, in_offset, out_offset, total_left);
+				m_out, in_offset, out_offset, total_left,
+				queue_stats);
 		/* Update total_left */
 		total_left -= (k - crc24_bits) >> 3;
 		/* Update offsets for next CBs (if exist) */
@@ -714,12 +756,15 @@ enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op)
 
 static inline uint16_t
 enqueue_enc_all_ops(struct turbo_sw_queue *q, struct rte_bbdev_enc_op **ops,
-		uint16_t nb_ops)
+		uint16_t nb_ops, struct rte_bbdev_stats *queue_stats)
 {
 	uint16_t i;
+#ifdef RTE_TEST_BBDEV
+	queue_stats->turbo_perf_time = 0;
+#endif
 
 	for (i = 0; i < nb_ops; ++i)
-		enqueue_enc_one_op(q, ops[i]);
+		enqueue_enc_one_op(q, ops[i], queue_stats);
 
 	return rte_ring_enqueue_burst(q->processed_pkts, (void **)ops, nb_ops,
 			NULL);
@@ -898,6 +943,8 @@ process_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
 	turbo_req.k = k;
 	turbo_req.k_idx = k_idx;
 	turbo_req.max_iter_num = dec->iter_max;
+	turbo_req.early_term_disable = !check_bit(dec->op_flags,
+			RTE_BBDEV_TURBO_EARLY_TERMINATION);
 	turbo_resp.ag_buf = q->ag;
 	turbo_resp.cb_buf = q->code_block;
 	turbo_resp.output = out;
@@ -1004,7 +1051,7 @@ enqueue_enc_ops(struct rte_bbdev_queue_data *q_data,
 	struct turbo_sw_queue *q = queue;
 	uint16_t nb_enqueued = 0;
 
-	nb_enqueued = enqueue_enc_all_ops(q, ops, nb_ops);
+	nb_enqueued = enqueue_enc_all_ops(q, ops, nb_ops, &q_data->queue_stats);
 
 	q_data->queue_stats.enqueue_err_count += nb_ops - nb_enqueued;
 	q_data->queue_stats.enqueued_count += nb_enqueued;
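
The driver-side changes above repeat one instrumentation pattern on the
encode path: each bblib call is bracketed by rte_rdtsc_precise() reads
whose delta accumulates into the queue's turbo_perf_time, and all of it
compiles away unless RTE_TEST_BBDEV is defined. A hedged sketch of that
pattern as a macro (TIME_BBLIB_CALL is illustrative; the patch
open-codes the same logic at each call site):

  #include <rte_cycles.h>

  /* Time a bblib call into q_stats->turbo_perf_time; in non-test
   * builds the call runs untimed and no TSC reads are issued.
   */
  #ifdef RTE_TEST_BBDEV
  #define TIME_BBLIB_CALL(q_stats, call) do {                      \
          uint64_t __start = rte_rdtsc_precise();                  \
          (call);                                                  \
          (q_stats)->turbo_perf_time +=                            \
                          rte_rdtsc_precise() - __start;           \
  } while (0)
  #else
  #define TIME_BBLIB_CALL(q_stats, call) (call)
  #endif
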
diff --git a/lib/librte_bbdev/rte_bbdev.h b/lib/librte_bbdev/rte_bbdev.h
index 5e7e495..395acf6 100644
--- a/lib/librte_bbdev/rte_bbdev.h
+++ b/lib/librte_bbdev/rte_bbdev.h
@@ -239,6 +239,10 @@ struct rte_bbdev_stats {
 	uint64_t enqueue_err_count;
 	/** Total error count on operations dequeued */
 	uint64_t dequeue_err_count;
+#ifdef RTE_TEST_BBDEV
+	/** Stores turbo decoder/encoder working time. */
+	uint64_t turbo_perf_time;
+#endif
 };
 
 /**
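
With the patch applied, the new suite is registered via
REGISTER_TEST_COMMAND(offload, bbdev_offload_cost_testsuite) above, so
it should be selectable like the existing latency and throughput
suites, for example (invocation illustrative; exact test-bbdev.py
options may differ between DPDK versions):

  ./test-bbdev.py -p <path-to-testbbdev> -c offload -v <test-vector>
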
-- 
2.5.5
