[dpdk-dev] [RFC PATCH 6/7] app/test: test dmadev instance failure handling

Bruce Richardson bruce.richardson at intel.com
Thu Aug 26 20:33:00 CEST 2021


Add a series of tests to inject bad copy operations into a dmadev to
test the error handling and reporting capabilities. Various combinations
of errors in various positions in a burst are tested, as are errors in
bursts with fence flag set, and multiple errors in a single burst.

Signed-off-by: Bruce Richardson <bruce.richardson at intel.com>
---
 app/test/test_dmadev.c | 395 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 395 insertions(+)

diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
index f3ebac2812..9b34632cbc 100644
--- a/app/test/test_dmadev.c
+++ b/app/test/test_dmadev.c
@@ -268,6 +268,387 @@ test_enqueue_copies(int dev_id, uint16_t vchan)
 			|| do_multi_copies(dev_id, vchan, 0, 0, 1);
 }
 
+/*
+ * Failure handling test cases - global macros and variables for those tests.
+ *
+ * COMP_BURST_SZ: number of copy jobs enqueued in each test burst.
+ * OPT_FENCE(idx): expands to RTE_DMA_OP_FLAG_FENCE for the job at index 8 when
+ * fencing is enabled, and to 0 otherwise. NOTE: the macro reads a variable named
+ * "fence" from the scope in which it is expanded, so it may only be used inside
+ * functions that have such a variable/parameter.
+ */
+#define COMP_BURST_SZ	16
+#define OPT_FENCE(idx) ((fence && (idx) == 8) ? RTE_DMA_OP_FLAG_FENCE : 0)
+
+/*
+ * Failure test using whole-burst gathers.
+ *
+ * Enqueues COMP_BURST_SZ copies in which the job at position fail_idx is given a
+ * source address of 0, submits them, and then checks that:
+ *  - rte_dmadev_completed() returns exactly fail_idx jobs, sets the error flag and
+ *    reports the id just before the failing job as the last completed one;
+ *  - repeated rte_dmadev_completed() calls keep returning 0 with the same error
+ *    flag and index;
+ *  - rte_dmadev_completed_status() accounts for all remaining jobs, the first of
+ *    which is not successful and the rest either successful or not attempted.
+ *
+ * The "fence" parameter is consumed through the OPT_FENCE() macro.
+ *
+ * Returns 0 if every check passes, -1 otherwise.
+ */
+static int
+test_failure_in_full_burst(int dev_id, uint16_t vchan, bool fence,
+		struct rte_mbuf **srcs, struct rte_mbuf **dsts, unsigned int fail_idx)
+{
+	/* Test single full batch statuses with failures */
+	enum rte_dma_status_code status[COMP_BURST_SZ];
+	uint16_t invalid_addr_id = 0;
+	uint16_t last_ok_id;
+	uint16_t idx;
+	uint16_t count, status_count;
+	unsigned int j;
+	bool error = false;
+
+	for (j = 0; j < COMP_BURST_SZ; j++) {
+		int id = rte_dmadev_copy(dev_id, vchan,
+				(j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
+				dsts[j]->buf_iova + dsts[j]->data_off,
+				COPY_LEN, OPT_FENCE(j));
+		if (id < 0) {
+			PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", j);
+			return -1;
+		}
+		if (j == fail_idx)
+			invalid_addr_id = id;
+	}
+	rte_dmadev_submit(dev_id, vchan);
+	usleep(10);
+
+	/* Do the "- 1" in uint16_t arithmetic: written inline, "invalid_addr_id - 1" is
+	 * promoted to int, so for an id of 0 it would be -1 and could never match idx.
+	 * NOTE(review): assumes job ids wrap modulo 2^16 - confirm against the dmadev API.
+	 */
+	last_ok_id = (uint16_t)(invalid_addr_id - 1);
+
+	count = rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error);
+	if (count != fail_idx) {
+		PRINT_ERR("Error with rte_dmadev_completed for failure test. Got returned %u not %u.\n",
+				count, fail_idx);
+		rte_dmadev_dump(dev_id, stdout);
+		return -1;
+	}
+	if (error == false) {
+		PRINT_ERR("Error, missing expected failed copy, %u. has_error is not set\n",
+				fail_idx);
+		return -1;
+	}
+	if (idx != last_ok_id) {
+		PRINT_ERR("Error, missing expected failed copy, %u. Got last idx %u, not %u\n",
+				fail_idx, idx, last_ok_id);
+		return -1;
+	}
+
+	/* all checks ok, now verify calling completed() again always returns 0 */
+	for (j = 0; j < 10; j++) {
+		if (rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error) != 0
+				|| error == false || idx != last_ok_id) {
+			PRINT_ERR("Error with follow-up completed calls for fail idx %u\n",
+					fail_idx);
+			return -1;
+		}
+	}
+
+	status_count = rte_dmadev_completed_status(dev_id, vchan, COMP_BURST_SZ,
+			&idx, status);
+	/* some HW may stop on error and be restarted after getting error status for single value
+	 * To handle this case, if we get just one error back, wait for more completions and get
+	 * status for rest of the burst
+	 */
+	if (status_count == 1) {
+		usleep(10);
+		status_count += rte_dmadev_completed_status(dev_id, vchan, COMP_BURST_SZ - 1,
+					&idx, &status[1]);
+	}
+	/* check that at this point we have all status values */
+	if (status_count != COMP_BURST_SZ - count) {
+		PRINT_ERR("Error with completed_status calls for fail idx %u. Got %u not %u\n",
+				fail_idx, status_count, COMP_BURST_SZ - count);
+		return -1;
+	}
+	/* now verify just one failure followed by multiple successful or skipped entries */
+	if (status[0] == RTE_DMA_STATUS_SUCCESSFUL) {
+		PRINT_ERR("Error with status returned for fail idx %u. First status was not failure\n",
+				fail_idx);
+		return -1;
+	}
+	for (j = 1; j < status_count; j++) {
+		/* after a failure in a burst, depending on ordering/fencing,
+		 * operations may be successful or skipped because of previous error.
+		 */
+		if (status[j] != RTE_DMA_STATUS_SUCCESSFUL
+				&& status[j] != RTE_DMA_STATUS_NOT_ATTEMPTED) {
+			PRINT_ERR("Error with status calls for fail idx %u. Status for job %u (of %u) is not successful\n",
+					fail_idx, count + j, COMP_BURST_SZ);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Failure test gathering completions one job at a time.
+ *
+ * Enqueues COMP_BURST_SZ copies in which the job at position fail_idx is given a
+ * source address of 0 and submits them. Completions are then collected singly
+ * with rte_dmadev_completed() until the error flag is raised, after which the
+ * remaining jobs are collected singly with rte_dmadev_completed_status(). The
+ * first status gathered must not be successful; all later ones must be either
+ * successful or not attempted.
+ *
+ * The "fence" parameter is consumed through the OPT_FENCE() macro.
+ *
+ * Returns 0 if every check passes, -1 otherwise.
+ */
+static int
+test_individual_status_query_with_failure(int dev_id, uint16_t vchan, bool fence,
+		struct rte_mbuf **srcs, struct rte_mbuf **dsts, unsigned int fail_idx)
+{
+	/* Test gathering batch statuses one at a time */
+	enum rte_dma_status_code status[COMP_BURST_SZ];
+	uint16_t invalid_addr_id = 0;
+	uint16_t last_ok_id;
+	uint16_t idx;
+	uint16_t count = 0, status_count = 0;
+	unsigned int j;
+	bool error = false;
+
+	for (j = 0; j < COMP_BURST_SZ; j++) {
+		int id = rte_dmadev_copy(dev_id, vchan,
+				(j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
+				dsts[j]->buf_iova + dsts[j]->data_off,
+				COPY_LEN, OPT_FENCE(j));
+		if (id < 0) {
+			PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", j);
+			return -1;
+		}
+		if (j == fail_idx)
+			invalid_addr_id = id;
+	}
+	rte_dmadev_submit(dev_id, vchan);
+	usleep(10);
+
+	/* use regular "completed" until we hit error */
+	while (!error) {
+		uint16_t n = rte_dmadev_completed(dev_id, vchan, 1, &idx, &error);
+		count += n;
+		if (n > 1 || count >= COMP_BURST_SZ) {
+			PRINT_ERR("Error - too many completions got\n");
+			return -1;
+		}
+		if (n == 0 && !error) {
+			PRINT_ERR("Error, unexpectedly got zero completions after %u completed\n",
+					count);
+			return -1;
+		}
+	}
+
+	/* Do the "- 1" in uint16_t arithmetic: written inline, "invalid_addr_id - 1" is
+	 * promoted to int, so for an id of 0 it would be -1 and could never match idx.
+	 * NOTE(review): assumes job ids wrap modulo 2^16 - confirm against the dmadev API.
+	 */
+	last_ok_id = (uint16_t)(invalid_addr_id - 1);
+	if (idx != last_ok_id) {
+		PRINT_ERR("Error, last successful index not as expected, got %u, expected %u\n",
+				idx, last_ok_id);
+		return -1;
+	}
+
+	/* use completed_status until we hit end of burst */
+	while (count + status_count < COMP_BURST_SZ) {
+		uint16_t n = rte_dmadev_completed_status(dev_id, vchan, 1, &idx,
+				&status[status_count]);
+		usleep(10); /* allow delay to ensure jobs are completed */
+		status_count += n;
+		if (n != 1) {
+			PRINT_ERR("Error: unexpected number of completions received, %u, not 1\n",
+					n);
+			return -1;
+		}
+	}
+
+	/* check for single failure */
+	if (status[0] == RTE_DMA_STATUS_SUCCESSFUL) {
+		PRINT_ERR("Error, unexpected successful DMA transaction\n");
+		return -1;
+	}
+	for (j = 1; j < status_count; j++) {
+		if (status[j] != RTE_DMA_STATUS_SUCCESSFUL
+				&& status[j] != RTE_DMA_STATUS_NOT_ATTEMPTED) {
+			PRINT_ERR("Error, unexpected DMA error reported\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Failure test collecting only the single failing job via completed_status().
+ *
+ * Enqueues COMP_BURST_SZ unfenced copies in which the job at position fail_idx is
+ * given a source address of 0 and submits them. The jobs before the failure are
+ * gathered with rte_dmadev_completed(), exactly one status (which must not be
+ * successful) is then read with rte_dmadev_completed_status(), and the rest of
+ * the burst is gathered with rte_dmadev_completed() again, which must report no
+ * further error. The three counts must add up to COMP_BURST_SZ.
+ *
+ * Returns 0 if every check passes, -1 otherwise.
+ */
+static int
+test_single_item_status_query_with_failure(int dev_id, uint16_t vchan,
+		struct rte_mbuf **srcs, struct rte_mbuf **dsts, unsigned int fail_idx)
+{
+	/* When error occurs just collect a single error using "completed_status()"
+	 * before going back to completed() calls
+	 */
+	enum rte_dma_status_code status;
+	uint16_t invalid_addr_id = 0;
+	uint16_t last_ok_id;
+	uint16_t idx;
+	uint16_t count, status_count, count2;
+	unsigned int j;
+	bool error = false;
+
+	for (j = 0; j < COMP_BURST_SZ; j++) {
+		int id = rte_dmadev_copy(dev_id, vchan,
+				(j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
+				dsts[j]->buf_iova + dsts[j]->data_off,
+				COPY_LEN, 0);
+		if (id < 0) {
+			PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", j);
+			return -1;
+		}
+		if (j == fail_idx)
+			invalid_addr_id = id;
+	}
+	rte_dmadev_submit(dev_id, vchan);
+	usleep(10);
+
+	/* Do the "- 1" in uint16_t arithmetic: written inline, "invalid_addr_id - 1" is
+	 * promoted to int, so for an id of 0 it would be -1 and could never match idx.
+	 * NOTE(review): assumes job ids wrap modulo 2^16 - confirm against the dmadev API.
+	 */
+	last_ok_id = (uint16_t)(invalid_addr_id - 1);
+
+	/* get up to the error point */
+	count = rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error);
+	if (count != fail_idx) {
+		PRINT_ERR("Error with rte_dmadev_completed for failure test. Got returned %u not %u.\n",
+				count, fail_idx);
+		rte_dmadev_dump(dev_id, stdout);
+		return -1;
+	}
+	if (error == false) {
+		PRINT_ERR("Error, missing expected failed copy, %u. has_error is not set\n",
+				fail_idx);
+		return -1;
+	}
+	if (idx != last_ok_id) {
+		PRINT_ERR("Error, missing expected failed copy, %u. Got last idx %u, not %u\n",
+				fail_idx, idx, last_ok_id);
+		return -1;
+	}
+
+	/* get the error code */
+	status_count = rte_dmadev_completed_status(dev_id, vchan, 1, &idx, &status);
+	if (status_count != 1) {
+		/* exactly one status was requested, so 1 is the expected value to report */
+		PRINT_ERR("Error with completed_status calls for fail idx %u. Got %u not %u\n",
+				fail_idx, status_count, 1u);
+		return -1;
+	}
+	if (status == RTE_DMA_STATUS_SUCCESSFUL) {
+		PRINT_ERR("Error with status returned for fail idx %u. First status was not failure\n",
+				fail_idx);
+		return -1;
+	}
+	usleep(10); /* delay in case more time needed after error handled to complete other jobs */
+
+	/* get the rest of the completions without status */
+	count2 = rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error);
+	if (error == true) {
+		PRINT_ERR("Error, got further errors post completed_status() call, for failure case %u.\n",
+				fail_idx);
+		return -1;
+	}
+	if (count + status_count + count2 != COMP_BURST_SZ) {
+		PRINT_ERR("Error, incorrect number of completions received, got %u not %u\n",
+				count + status_count + count2, COMP_BURST_SZ);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Enqueue and submit one burst of COMP_BURST_SZ copies, using a source address of 0
+ * for every job whose position is listed in fail[]. Returns 0 on success, -1 on error.
+ */
+static int
+multi_failure_enqueue_burst(int dev_id, uint16_t vchan, struct rte_mbuf **srcs,
+		struct rte_mbuf **dsts, const unsigned int *fail, size_t num_fail)
+{
+	unsigned int i, j;
+
+	for (j = 0; j < COMP_BURST_SZ; j++) {
+		uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
+		/* set up for failure if the current index is anywhere in the fails array */
+		for (i = 0; i < num_fail; i++)
+			if (j == fail[i])
+				src = 0;
+
+		int id = rte_dmadev_copy(dev_id, vchan,
+				src, dsts[j]->buf_iova + dsts[j]->data_off,
+				COPY_LEN, 0);
+		if (id < 0) {
+			PRINT_ERR("Error with rte_dmadev_copy for buffer %u\n", j);
+			return -1;
+		}
+	}
+	rte_dmadev_submit(dev_id, vchan);
+	usleep(10);
+
+	return 0;
+}
+
+/*
+ * Test a burst containing several failing jobs.
+ *
+ * The burst (COMP_BURST_SZ copies, with a zero source address at every position
+ * listed in fail[0..num_fail-1]) is run twice:
+ *  1. all statuses are gathered with rte_dmadev_completed_status() and the number
+ *     of non-successful entries must equal num_fail;
+ *  2. completions are gathered with rte_dmadev_completed(), and each time the error
+ *     flag is set a single status is fetched with rte_dmadev_completed_status();
+ *     the number of statuses fetched this way must equal num_fail.
+ *
+ * Returns 0 if every check passes, -1 otherwise.
+ */
+static int
+test_multi_failure(int dev_id, uint16_t vchan, struct rte_mbuf **srcs, struct rte_mbuf **dsts,
+		const unsigned int *fail, size_t num_fail)
+{
+	/* test having multiple errors in one go */
+	enum rte_dma_status_code status[COMP_BURST_SZ];
+	unsigned int i;
+	uint16_t count, err_count = 0;
+	bool error = false;
+
+	/* enqueue and gather completions in one go */
+	if (multi_failure_enqueue_burst(dev_id, vchan, srcs, dsts, fail, num_fail) < 0)
+		return -1;
+
+	count = rte_dmadev_completed_status(dev_id, vchan, COMP_BURST_SZ, NULL, status);
+	while (count < COMP_BURST_SZ) {
+		usleep(10);
+
+		uint16_t ret = rte_dmadev_completed_status(dev_id, vchan, COMP_BURST_SZ - count,
+				NULL, &status[count]);
+		if (ret == 0) {
+			PRINT_ERR("Error getting all completions for jobs. Got %u of %u\n",
+					count, COMP_BURST_SZ);
+			return -1;
+		}
+		count += ret;
+	}
+	for (i = 0; i < count; i++) {
+		if (status[i] != RTE_DMA_STATUS_SUCCESSFUL)
+			err_count++;
+	}
+	if (err_count != num_fail) {
+		PRINT_ERR("Error: Invalid number of failed completions returned, %u; expected %zu\n",
+			err_count, num_fail);
+		return -1;
+	}
+
+	/* enqueue and gather completions in bursts, but getting errors one at a time */
+	if (multi_failure_enqueue_burst(dev_id, vchan, srcs, dsts, fail, num_fail) < 0)
+		return -1;
+
+	count = 0;
+	err_count = 0;
+	while (count + err_count < COMP_BURST_SZ) {
+		count += rte_dmadev_completed(dev_id, vchan, COMP_BURST_SZ, NULL, &error);
+		if (error) {
+			uint16_t ret = rte_dmadev_completed_status(dev_id, vchan, 1,
+					NULL, status);
+			if (ret != 1) {
+				PRINT_ERR("Error getting error-status for completions\n");
+				return -1;
+			}
+			err_count += ret;
+			usleep(10);
+		}
+	}
+	if (err_count != num_fail) {
+		/* num_fail is a size_t, so it must be printed with %zu rather than %lu */
+		PRINT_ERR("Error: Incorrect number of failed completions received, got %u not %zu\n",
+				err_count, num_fail);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run the failure-handling tests on one device/vchan.
+ *
+ * Allocates COMP_BURST_SZ source and destination mbufs from "pool", runs the three
+ * single-failure tests once for each failing position in fail[], then runs the
+ * multi-failure test with all of those positions failing together. "fence" is
+ * passed through to the tests which take it.
+ *
+ * All mbufs are released before returning, whether the tests pass or fail.
+ *
+ * Returns 0 if every test passes, -1 on allocation failure or on the first
+ * failing test.
+ */
+static int
+test_completion_status(int dev_id, uint16_t vchan, bool fence)
+{
+	const unsigned int fail[] = {0, 7, 14, 15};
+	/* NULL-initialised so the cleanup loop can run after a partial allocation */
+	struct rte_mbuf *srcs[COMP_BURST_SZ] = {NULL}, *dsts[COMP_BURST_SZ] = {NULL};
+	unsigned int i;
+	int ret = -1;
+
+	for (i = 0; i < COMP_BURST_SZ; i++) {
+		srcs[i] = rte_pktmbuf_alloc(pool);
+		dsts[i] = rte_pktmbuf_alloc(pool);
+		/* the tests dereference every mbuf, so stop here rather than crash later */
+		if (srcs[i] == NULL || dsts[i] == NULL) {
+			PRINT_ERR("Error allocating mbufs for completion status tests\n");
+			goto out;
+		}
+	}
+
+	for (i = 0; i < RTE_DIM(fail); i++) {
+		if (test_failure_in_full_burst(dev_id, vchan, fence, srcs, dsts, fail[i]) < 0)
+			goto out;
+
+		if (test_individual_status_query_with_failure(dev_id, vchan, fence,
+				srcs, dsts, fail[i]) < 0)
+			goto out;
+
+		/* test is run the same fenced, or unfenced, but no harm in running it twice */
+		if (test_single_item_status_query_with_failure(dev_id, vchan,
+				srcs, dsts, fail[i]) < 0)
+			goto out;
+	}
+
+	if (test_multi_failure(dev_id, vchan, srcs, dsts, fail, RTE_DIM(fail)) < 0)
+		goto out;
+
+	ret = 0;
+out:
+	/* NOTE(review): relies on rte_pktmbuf_free() accepting NULL - confirm in mbuf API docs */
+	for (i = 0; i < COMP_BURST_SZ; i++) {
+		rte_pktmbuf_free(srcs[i]);
+		rte_pktmbuf_free(dsts[i]);
+	}
+	return ret;
+}
+
 static int
 test_dmadev_instance(uint16_t dev_id)
 {
@@ -340,6 +721,20 @@ test_dmadev_instance(uint16_t dev_id)
 	}
 	printf("\n");
 
+	/* to test error handling we can provide null pointers for source or dest in copies. This
+	 * requires VA mode in DPDK, since NULL(0) is a valid physical address.
+	 */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+		printf("DMA Dev: %u, Running Completion Handling Tests\n", dev_id);
+		if (test_completion_status(dev_id, vchan, false) != 0) /* without fences */
+			goto err;
+		if (test_completion_status(dev_id, vchan, true) != 0) /* with fences */
+			goto err;
+		rte_dmadev_stats_get(dev_id, 0, &stats);
+		printf("Ops submitted: %"PRIu64"\t", stats.submitted);
+		printf("Ops completed: %"PRIu64"\n", stats.completed);
+	}
+
 
 	rte_mempool_free(pool);
 	rte_dmadev_stop(dev_id);
-- 
2.30.2



More information about the dev mailing list