[PATCH v2 3/3] dma/ae4dma: add data path operations

Raghavendra Ningoji raghavendra.ningoji at amd.com
Mon May 25 20:42:44 CEST 2026
Previous message (by thread): [PATCH v2 2/3] dma/ae4dma: add control path operations
Next message (by thread): [PATCH] dma/ae4dma: add AMD AE4DMA DMA PMD
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
Implement the dmadev fast path for the AMD AE4DMA PMD.

This commit adds:
 - copy enqueue (rte_dma_copy): write an AE4DMA descriptor for a
   memory-to-memory transfer; on RTE_DMA_OP_FLAG_SUBMIT the doorbell
   is rung immediately.
 - submit (rte_dma_submit): advance the per-queue write_idx
   register to expose pending descriptors to the hardware.
 - completion (rte_dma_completed / rte_dma_completed_status):
   completion is detected via the hardware's per-queue read_idx
   register, which the engine advances as it processes descriptors.
   The descriptor status / err_code bytes are read only to classify
   each drained slot as success or failure, and HW error codes are
   translated to the dmadev RTE_DMA_STATUS_* enumeration.
 - burst capacity (rte_dma_burst_capacity): report the number of
   free descriptor slots, taking into account the one slot reserved
   to distinguish full from empty on the power-of-two ring.

The fast path entry points are wired through fp_obj in
ae4dma_dmadev_create(). The fill capability is not advertised;
fp_obj->fill is left zero-initialised.

Signed-off-by: Raghavendra Ningoji <raghavendra.ningoji at amd.com>
---
 doc/guides/dmadevs/ae4dma.rst      |  22 +++
 drivers/dma/ae4dma/ae4dma_dmadev.c | 288 +++++++++++++++++++++++++++++
 2 files changed, 310 insertions(+)

diff --git a/doc/guides/dmadevs/ae4dma.rst b/doc/guides/dmadevs/ae4dma.rst
index a85c1d92ca..37a2096ccf 100644
--- a/doc/guides/dmadevs/ae4dma.rst
+++ b/doc/guides/dmadevs/ae4dma.rst
@@ -51,3 +51,25 @@ On probe the PMD performs the following steps for each PCI function:
   IOVA-contiguous memory, programs the queue base address and ring
   depth into the per-queue registers, and enables the queue.
 * Interrupts are masked; completion is polled by the application.
+
+Usage
+-----
+
+Once a dmadev has been started, copies are submitted with
+``rte_dma_copy()`` and completions are reaped with ``rte_dma_completed()``
+or ``rte_dma_completed_status()``. See the
+:ref:`Enqueue / Dequeue API <dmadev_enqueue_dequeue>` section of the
+dmadev library documentation for details.
+
+Limitations
+-----------
+
+* Only memory-to-memory copies are supported. Fill, scatter-gather and
+  any other operation types are not advertised in
+  ``rte_dma_info::dev_capa``.
+* The maximum number of descriptors per virtual channel is fixed by
+  hardware at 32. The PMD rounds the requested ring size up to a power
+  of two, clamps it to 32, and keeps one slot free (31 usable at most).
+* Only a single virtual channel per dmadev is supported; use the 16
+  per-PCI-function dmadevs to obtain channel-level parallelism.
+* Interrupt-driven completion is not supported.
diff --git a/drivers/dma/ae4dma/ae4dma_dmadev.c b/drivers/dma/ae4dma/ae4dma_dmadev.c
index dfda723c13..0f223fc40c 100644
--- a/drivers/dma/ae4dma/ae4dma_dmadev.c
+++ b/drivers/dma/ae4dma/ae4dma_dmadev.c
@@ -167,6 +167,73 @@ ae4dma_dev_close(struct rte_dma_dev *dev)
 	cmd_q->qbase_phys_addr = 0;
 	return 0;
 }
+
+/* Doorbell: write next_write to the HW write_idx register and update the submitted counter. */
+static inline void
+__submit(struct ae4dma_dmadev *ae4dma)
+{
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t write_idx = cmd_q->next_write;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+
+	AE4DMA_WRITE_REG(&cmd_q->hwq_regs->write_idx, write_idx);
+	if (nb != 0) /* avoid a modulo by zero; adds the ring distance last_write -> next_write */
+		cmd_q->stats.submitted += (uint16_t)((cmd_q->next_write - cmd_q->last_write +
+				nb) % nb);
+	cmd_q->last_write = cmd_q->next_write; /* the next delta is measured from here */
+}
+
+static int
+ae4dma_submit(void *dev_private, uint16_t vchan __rte_unused)
+{
+	struct ae4dma_dmadev *ae4dma = dev_private;
+
+	__submit(ae4dma); /* ring the doorbell for everything written so far */
+	return 0; /* always reports success; vchan is ignored */
+}
+
+/* Fill the descriptor at next_write for a copy; returns its ring slot, -EINVAL or -ENOSPC. */
+static inline int
+__write_desc_copy(void *dev_private, rte_iova_t src, rte_iova_t dst,
+		uint32_t len, uint64_t flags)
+{
+	struct ae4dma_dmadev *ae4dma = dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	struct ae4dma_desc *dma_desc;
+	uint16_t ret;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+	uint16_t write = cmd_q->next_write;
+
+	if (nb == 0)
+		return -EINVAL;
+
+	/* Reserve one slot to distinguish full from empty (power-of-two ring). */
+	if ((uint32_t)cmd_q->ring_buff_count >= (uint32_t)(nb - 1))
+		return -ENOSPC;
+
+	dma_desc = &cmd_q->qbase_desc[write];
+	memset(dma_desc, 0, sizeof(*dma_desc)); /* every field not set below stays zero */
+	dma_desc->length = len;
+	dma_desc->src_hi = upper_32_bits(src);
+	dma_desc->src_lo = lower_32_bits(src);
+	dma_desc->dst_hi = upper_32_bits(dst);
+	dma_desc->dst_lo = lower_32_bits(dst);
+	cmd_q->ring_buff_count++;
+	cmd_q->next_write = (uint16_t)((write + 1) % nb);
+	ret = write; /* NOTE(review): ring slot, not a running ring_idx - confirm */
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT) /* caller asked for an immediate doorbell */
+		__submit(ae4dma);
+	return ret;
+}
+
+/* dmadev copy hook: hands the request to __write_desc_copy(); vchan is unused. */
+static int
+ae4dma_enqueue_copy(void *dev_private, uint16_t vchan __rte_unused,
+		rte_iova_t src, rte_iova_t dst, uint32_t length, uint64_t flags)
+{
+	return __write_desc_copy(dev_private, src, dst, length, flags);
+}
+
 /* Dump DMA device info. */
 static int
 ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f)
@@ -197,6 +264,220 @@ ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f)
 		cmd_q->stats.errors);
 	return 0;
 }
+
+/* Map an AE4DMA descriptor err_code to rte_dma_status_code (unlisted codes -> ERROR_UNKNOWN). */
+static inline enum rte_dma_status_code
+__translate_status_ae4dma_to_dma(enum ae4dma_dma_err status)
+{
+	AE4DMA_PMD_DEBUG("ae4dma desc status = %d", status);
+
+	switch (status) {
+	case AE4DMA_DMA_ERR_NO_ERR:
+		return RTE_DMA_STATUS_SUCCESSFUL;
+	case AE4DMA_DMA_ERR_INV_LEN:
+		return RTE_DMA_STATUS_INVALID_LENGTH;
+	case AE4DMA_DMA_ERR_INV_SRC:
+		return RTE_DMA_STATUS_INVALID_SRC_ADDR;
+	case AE4DMA_DMA_ERR_INV_DST:
+		return RTE_DMA_STATUS_INVALID_DST_ADDR;
+	case AE4DMA_DMA_ERR_INV_ALIGN:
+		/* "POISION" is DPDK's enum spelling. NOTE(review): confirm this mapping. */
+		return RTE_DMA_STATUS_DATA_POISION;
+	case AE4DMA_DMA_ERR_INV_HEADER:
+	case AE4DMA_DMA_ERR_INV_STATUS:
+		return RTE_DMA_STATUS_ERROR_UNKNOWN;
+	default:
+		return RTE_DMA_STATUS_ERROR_UNKNOWN;
+	}
+}
+
+/*
+ * Scan HW queue for completed descriptors (non-blocking).
+ *
+ * The AE4DMA engine signals completion by advancing the per-queue
+ * `read_idx` register; it does not (reliably) write a status value
+ * back into the descriptor. The HW `read_idx` register is therefore
+ * the source of truth; `dw1.status` / `dw1.err_code` only classify each
+ * completion as success or failure. Raw err_code values are saved in
+ * cmd_q->status[0..ret-1]; stats, ring_buff_count, next_read are updated.
+ *
+ * @param cmd_q
+ *   The AE4DMA command queue.
+ * @param max_ops
+ *   Maximum descriptors to process this call.
+ * @param[out] failed_count
+ *   Number of completed descriptors that did not report success.
+ * @return
+ *   Number of descriptors completed (success + failure), <= max_ops.
+ */
+static inline uint16_t
+ae4dma_scan_hwq(struct ae4dma_cmd_queue *cmd_q, uint16_t max_ops,
+		uint16_t *failed_count)
+{
+	volatile struct ae4dma_desc *hw_desc;
+	uint16_t events_count = 0, fails = 0;
+	uint16_t tail;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+	uint16_t mask;
+	uint16_t hw_read_idx;
+	uint16_t in_flight;
+	uint16_t scan_cap;
+
+	if (nb == 0 || cmd_q->ring_buff_count == 0) {
+		*failed_count = 0;
+		return 0;
+	}
+	mask = nb - 1;
+
+	hw_read_idx = (uint16_t)(AE4DMA_READ_REG(&cmd_q->hwq_regs->read_idx) & mask);
+	tail = cmd_q->next_read;
+
+	/*
+	 * Descriptors completed since our last visit live in the
+	 * half-open ring range [tail, hw_read_idx). Despite its name,
+	 * in_flight counts those finished slots; zero means no progress.
+	 */
+	in_flight = (uint16_t)((hw_read_idx - tail) & mask);
+	if (in_flight == 0) {
+		*failed_count = 0;
+		return 0;
+	}
+
+	scan_cap = max_ops;
+	if (scan_cap > AE4DMA_DESCRIPTORS_PER_CMDQ)
+		scan_cap = AE4DMA_DESCRIPTORS_PER_CMDQ;
+	if (scan_cap > in_flight)
+		scan_cap = in_flight;
+	if (scan_cap > cmd_q->ring_buff_count)
+		scan_cap = (uint16_t)cmd_q->ring_buff_count;
+
+	while (events_count < scan_cap) {
+		uint8_t hw_status;
+		uint8_t hw_err;
+
+		hw_desc = &cmd_q->qbase_desc[tail];
+		hw_status = hw_desc->dw1.status;
+		hw_err = hw_desc->dw1.err_code;
+
+		/*
+		 * read_idx advancing is the definitive completion
+		 * signal. The per-descriptor status byte is informational
+		 * and may not yet be written when we observe it:
+		 *
+		 *   AE4DMA_DMA_DESC_ERROR (4)
+		 *     Hard failure - err_code names the precise cause.
+		 *   AE4DMA_DMA_DESC_COMPLETED (3) or 0
+		 *     Success.
+		 *   AE4DMA_DMA_DESC_VALIDATED (1) / _PROCESSED (2)
+		 *     Benign race: HW had not finished updating the
+		 *     status byte at the instant we read it. Since
+		 *     read_idx has moved past this slot, treat it as
+		 *     success unless err_code says otherwise.
+		 *
+		 * A non-zero err_code is treated as a failure regardless
+		 * of the observed status value.
+		 */
+		if (hw_status == AE4DMA_DMA_DESC_ERROR ||
+				hw_err != AE4DMA_DMA_ERR_NO_ERR) {
+			fails++;
+			AE4DMA_PMD_WARN("Desc failed: status=%u err=%u",
+					hw_status, hw_err);
+		}
+		cmd_q->status[events_count] = (enum ae4dma_dma_err)hw_err;
+		cmd_q->ring_buff_count--;
+		events_count++;
+		tail = (tail + 1) & mask;
+	}
+
+	cmd_q->stats.completed += events_count;
+	cmd_q->stats.errors += fails;
+	cmd_q->next_read = tail;
+	*failed_count = fails;
+	return events_count;
+}
+
+/* Reap up to max_ops completions: returns how many succeeded, sets *has_error on any failure. */
+static uint16_t
+ae4dma_completed(void *dev_private, uint16_t vchan __rte_unused,
+		const uint16_t max_ops, uint16_t *last_idx, bool *has_error)
+{
+	struct ae4dma_dmadev *ae4dma = dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t cpl_count, sl_count;
+	uint16_t err_count = 0;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+
+	*has_error = false;
+
+	cpl_count = ae4dma_scan_hwq(cmd_q, max_ops, &err_count);
+
+	if (cpl_count > max_ops) /* defensive: the scan already caps at max_ops */
+		cpl_count = max_ops;
+
+	if (cpl_count > 0 && last_idx != NULL) /* report the slot just before next_read */
+		*last_idx = (uint16_t)((cmd_q->next_read - 1 + nb) % nb);
+
+	sl_count = cpl_count - err_count; /* NOTE(review): failed ops are consumed too - confirm */
+	if (err_count)
+		*has_error = true;
+
+	return sl_count;
+}
+
+static uint16_t
+ae4dma_completed_status(void *dev_private, uint16_t vchan __rte_unused,
+		uint16_t max_ops, uint16_t *last_idx,
+		enum rte_dma_status_code *status)
+{
+	struct ae4dma_dmadev *ae4dma = dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t cpl_count;
+	uint16_t i;
+	uint16_t err_count = 0;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+
+	cpl_count = ae4dma_scan_hwq(cmd_q, max_ops, &err_count); /* fills cmd_q->status[] */
+
+	if (cpl_count > max_ops) /* defensive: the scan already caps at max_ops */
+		cpl_count = max_ops;
+
+	if (cpl_count > 0 && last_idx != NULL) /* report the slot just before next_read */
+		*last_idx = (uint16_t)((cmd_q->next_read - 1 + nb) % nb);
+
+	if (likely(err_count == 0)) { /* fast path: nothing to translate */
+		for (i = 0; i < cpl_count; i++)
+			status[i] = RTE_DMA_STATUS_SUCCESSFUL;
+	} else { /* map each raw err_code saved by the scan to an rte_dma_status_code */
+		for (i = 0; i < cpl_count; i++)
+			status[i] = __translate_status_ae4dma_to_dma(cmd_q->status[i]);
+	}
+
+	return cpl_count; /* successes and failures are both counted */
+}
+
+/* Free slots in the ring: nb - 1 - used; 0 if nb < 2 or nb is not a power of two. */
+static uint16_t
+ae4dma_burst_capacity(const void *dev_private, uint16_t vchan __rte_unused)
+{
+	const struct ae4dma_dmadev *ae4dma = dev_private;
+	const struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+	uint16_t mask;
+	uint16_t read_idx = cmd_q->next_read;
+	uint16_t write_idx = cmd_q->next_write;
+	uint16_t used;
+
+	if (nb < 2 || !rte_is_power_of_2(nb))
+		return 0;
+
+	mask = nb - 1;
+	used = (uint16_t)((write_idx - read_idx) & mask); /* slots written but not yet reaped */
+	/* One slot reserved (same rule as enqueue). */
+	if (used >= nb - 1)
+		return 0;
+	return (uint16_t)(nb - 1 - used);
+}
+
 /* Retrieve the generic stats of a DMA device. */
 static int
 ae4dma_stats_get(const struct rte_dma_dev *dev, uint16_t vchan __rte_unused,
@@ -357,6 +638,13 @@ ae4dma_dmadev_create(const char *name, struct rte_pci_device *dev, uint8_t qn)
 	dmadev->fp_obj->dev_private = dmadev->data->dev_private;
 	dmadev->dev_ops = &ae4dma_dmadev_ops;
 
+	dmadev->fp_obj->burst_capacity = ae4dma_burst_capacity;
+	dmadev->fp_obj->completed = ae4dma_completed;
+	dmadev->fp_obj->completed_status = ae4dma_completed_status;
+	dmadev->fp_obj->copy = ae4dma_enqueue_copy;
+	dmadev->fp_obj->submit = ae4dma_submit;
+	/* fill capability not advertised: leave fp_obj->fill as zero-initialised. */
+
 	ae4dma = dmadev->data->dev_private;
 	ae4dma->dmadev = dmadev;
 	ae4dma->pci = dev;
-- 
2.34.1
Previous message (by thread): [PATCH v2 2/3] dma/ae4dma: add control path operations
Next message (by thread): [PATCH] dma/ae4dma: add AMD AE4DMA DMA PMD
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
More information about the dev mailing list