[PATCH v6 33/34] ml/cnxk: enable fast-path ops for TVM models
Srikanth Yalavarthi
syalavarthi at marvell.com
Wed Oct 18 15:54:18 CEST 2023
From: Anup Prabhu <aprabhu at marvell.com>
Enable fast-path ops support for TVM models. Hybrid and LLVM
model sub-types use TVMDP library function calls to execute
inference operations.

For TVM MRVL model sub-types, which contain a single MRVL layer,
the driver enqueues inference requests directly to hardware.
Signed-off-by: Anup Prabhu <aprabhu at marvell.com>
Signed-off-by: Srikanth Yalavarthi <syalavarthi at marvell.com>
---
doc/guides/rel_notes/release_23_11.rst | 3 +
drivers/ml/cnxk/cn10k_ml_ops.c | 4 -
drivers/ml/cnxk/cnxk_ml_io.h | 6 ++
drivers/ml/cnxk/cnxk_ml_ops.c | 4 +
drivers/ml/cnxk/cnxk_ml_ops.h | 5 +
drivers/ml/cnxk/mvtvm_ml_model.c | 20 ++++
drivers/ml/cnxk/mvtvm_ml_model.h | 6 ++
drivers/ml/cnxk/mvtvm_ml_ops.c | 124 +++++++++++++++++++++++++
drivers/ml/cnxk/mvtvm_ml_ops.h | 43 +++++++++
9 files changed, 211 insertions(+), 4 deletions(-)
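
For context, a minimal sketch (not part of the patch) of how an application
could exercise these fast-path ops through the public mldev API; dev_id,
qp_id and a fully prepared struct rte_ml_op (model_id, input/output segments,
mempool) are assumed to be set up elsewhere.

#include <rte_mldev.h>
#include <rte_pause.h>

static int
run_one_inference(int16_t dev_id, uint16_t qp_id, struct rte_ml_op *op)
{
	struct rte_ml_op *done = NULL;

	/* cnxk_ml_enqueue_burst() dispatches per model: TVMDP calls for
	 * LLVM/Hybrid sub-types, direct hardware job submission for
	 * single-layer MRVL models. */
	if (rte_ml_enqueue_burst(dev_id, qp_id, &op, 1) != 1)
		return -1;

	/* Poll until the request completes. */
	while (rte_ml_dequeue_burst(dev_id, qp_id, &done, 1) == 0)
		rte_pause();

	return (done->status == RTE_ML_OP_STATUS_SUCCESS) ? 0 : -1;
}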
diff --git a/doc/guides/rel_notes/release_23_11.rst b/doc/guides/rel_notes/release_23_11.rst
index 0a6fc76a9d..5fcf2a1897 100644
--- a/doc/guides/rel_notes/release_23_11.rst
+++ b/doc/guides/rel_notes/release_23_11.rst
@@ -243,6 +243,9 @@ New Features
Added dispatcher library which purpose is to help decouple different
parts (modules) of an eventdev-based application.
+* **Updated Marvell cnxk mldev driver.**
+
+ * Added support for models compiled using TVM framework.
Removed Items
-------------
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 01b0a44caa..b9d30278c6 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -371,10 +371,6 @@ cn10k_ml_dev_configure(struct cnxk_ml_dev *cnxk_mldev, const struct rte_ml_dev_c
else
cn10k_mldev->ml_jcmdq_enqueue = roc_ml_jcmdq_enqueue_lf;
- cnxk_mldev->mldev->enqueue_burst = cnxk_ml_enqueue_burst;
- cnxk_mldev->mldev->dequeue_burst = cnxk_ml_dequeue_burst;
- cnxk_mldev->mldev->op_error_get = cn10k_ml_op_error_get;
-
return 0;
}
diff --git a/drivers/ml/cnxk/cnxk_ml_io.h b/drivers/ml/cnxk/cnxk_ml_io.h
index 5de166c252..6d5d25a7c9 100644
--- a/drivers/ml/cnxk/cnxk_ml_io.h
+++ b/drivers/ml/cnxk/cnxk_ml_io.h
@@ -47,6 +47,12 @@ struct cnxk_ml_io {
/* Scale */
float scale;
+
+ /* Dequantized offset */
+ uint32_t offset_d;
+
+ /* Quantized offset */
+ uint32_t offset_q;
};
/* Model / Layer IO structure */
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.c b/drivers/ml/cnxk/cnxk_ml_ops.c
index fd2c46ac1f..608e9fc4ca 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.c
+++ b/drivers/ml/cnxk/cnxk_ml_ops.c
@@ -632,6 +632,10 @@ cnxk_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *co
cnxk_mldev->max_nb_layers =
cnxk_mldev->cn10k_mldev.fw.req->cn10k_req.jd.fw_load.cap.s.max_models;
+ cnxk_mldev->mldev->enqueue_burst = cnxk_ml_enqueue_burst;
+ cnxk_mldev->mldev->dequeue_burst = cnxk_ml_dequeue_burst;
+ cnxk_mldev->mldev->op_error_get = cn10k_ml_op_error_get;
+
/* Allocate and initialize index_map */
if (cnxk_mldev->index_map == NULL) {
cnxk_mldev->index_map =
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.h b/drivers/ml/cnxk/cnxk_ml_ops.h
index ab32676b3e..7b49793a57 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.h
+++ b/drivers/ml/cnxk/cnxk_ml_ops.h
@@ -24,6 +24,11 @@ struct cnxk_ml_req {
union {
/* CN10K */
struct cn10k_ml_req cn10k_req;
+
+#ifdef RTE_MLDEV_CNXK_ENABLE_MVTVM
+ /* MVTVM */
+ struct mvtvm_ml_req mvtvm_req;
+#endif
};
/* Address of status field */
diff --git a/drivers/ml/cnxk/mvtvm_ml_model.c b/drivers/ml/cnxk/mvtvm_ml_model.c
index 4c12f584d5..1dfd0d176a 100644
--- a/drivers/ml/cnxk/mvtvm_ml_model.c
+++ b/drivers/ml/cnxk/mvtvm_ml_model.c
@@ -198,6 +198,16 @@ mvtvm_ml_model_io_info_set(struct cnxk_ml_model *model)
model->mvtvm.info.total_input_sz_d += model->mvtvm.info.input[i].sz_d;
model->mvtvm.info.total_input_sz_q += model->mvtvm.info.input[i].sz_q;
+ model->mvtvm.info.input[i].offset_d = model->mvtvm.info.total_input_sz_d;
+ model->mvtvm.info.input[i].offset_q = model->mvtvm.info.total_input_sz_q;
+
+ model->mvtvm.input_tensor[i].device = metadata->input[i].device;
+ model->mvtvm.input_tensor[i].ndim = metadata->input[i].ndim;
+ model->mvtvm.input_tensor[i].dtype = metadata->input[i].datatype;
+ model->mvtvm.input_tensor[i].shape = metadata->input[i].shape;
+ model->mvtvm.input_tensor[i].strides = NULL;
+ model->mvtvm.input_tensor[i].byte_offset = model->mvtvm.info.input[i].offset_q;
+
plt_ml_dbg("model_id = %u, input[%u] - sz_d = %u sz_q = %u", model->model_id, i,
model->mvtvm.info.input[i].sz_d, model->mvtvm.info.input[i].sz_q);
}
@@ -231,6 +241,16 @@ mvtvm_ml_model_io_info_set(struct cnxk_ml_model *model)
model->mvtvm.info.total_output_sz_d += model->mvtvm.info.output[i].sz_d;
model->mvtvm.info.total_output_sz_q += model->mvtvm.info.output[i].sz_q;
+ model->mvtvm.info.output[i].offset_d = model->mvtvm.info.total_output_sz_d;
+ model->mvtvm.info.output[i].offset_q = model->mvtvm.info.total_output_sz_q;
+
+ model->mvtvm.output_tensor[i].device = metadata->output[i].device;
+ model->mvtvm.output_tensor[i].ndim = metadata->output[i].ndim;
+ model->mvtvm.output_tensor[i].dtype = metadata->output[i].datatype;
+ model->mvtvm.output_tensor[i].shape = metadata->output[i].shape;
+ model->mvtvm.output_tensor[i].strides = NULL;
+ model->mvtvm.output_tensor[i].byte_offset = model->mvtvm.info.output[i].offset_q;
+
plt_ml_dbg("model_id = %u, output[%u] - sz_d = %u sz_q = %u", model->model_id, i,
model->mvtvm.info.output[i].sz_d, model->mvtvm.info.output[i].sz_q);
}
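
The hunks above pre-fill per-model DLTensor templates at load time so that
the enqueue path only has to patch the data pointer per request. A condensed,
illustrative sketch of that pattern (helper names are hypothetical, not
driver code):

#include <dlpack/dlpack.h>
#include <string.h>

/* Fill the static fields once per model; data is patched at enqueue time. */
static void
fill_tensor_template(DLTensor *tmpl, DLDevice device, int32_t ndim,
		     DLDataType dtype, int64_t *shape)
{
	tmpl->device = device;
	tmpl->ndim = ndim;
	tmpl->dtype = dtype;
	tmpl->shape = shape;   /* shape memory owned by the model metadata */
	tmpl->strides = NULL;  /* compact layout */
	tmpl->byte_offset = 0;
	tmpl->data = NULL;     /* patched with the op buffer at enqueue time */
}

/* Bind a request tensor to an op buffer by copying the template. */
static void
bind_request_tensor(DLTensor *req_tensor, const DLTensor *tmpl, void *op_buffer)
{
	memcpy(req_tensor, tmpl, sizeof(*req_tensor));
	req_tensor->data = op_buffer;
	req_tensor->byte_offset = 0;
}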
diff --git a/drivers/ml/cnxk/mvtvm_ml_model.h b/drivers/ml/cnxk/mvtvm_ml_model.h
index 66c3af18e1..7ffce38094 100644
--- a/drivers/ml/cnxk/mvtvm_ml_model.h
+++ b/drivers/ml/cnxk/mvtvm_ml_model.h
@@ -69,6 +69,12 @@ struct mvtvm_ml_model_data {
/* Stats for burst ops */
struct mvtvm_ml_model_xstats *burst_xstats;
+
+ /* Input Tensor */
+ DLTensor input_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
+
+ /* Output Tensor */
+ DLTensor output_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
};
enum cnxk_ml_model_type mvtvm_ml_model_type_get(struct rte_ml_model_params *params);
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.c b/drivers/ml/cnxk/mvtvm_ml_ops.c
index 776675843a..1e74b82a0a 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.c
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.c
@@ -19,6 +19,12 @@
/* ML model macros */
#define MVTVM_ML_MODEL_MEMZONE_NAME "ml_mvtvm_model_mz"
+__rte_hot static void
+mvtvm_ml_set_poll_addr(struct cnxk_ml_req *req)
+{
+ req->status = &req->mvtvm_req.status;
+}
+
void
mvtvm_ml_model_xstat_name_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
uint16_t stat_id, uint16_t entry, char *suffix)
@@ -242,6 +248,7 @@ mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
callback->tvmrt_free = cn10k_ml_free;
callback->tvmrt_quantize = mvtvm_ml_io_quantize;
callback->tvmrt_dequantize = mvtvm_ml_io_dequantize;
+ callback->tvmrt_inference = cn10k_ml_inference_sync;
} else {
callback = NULL;
}
@@ -285,6 +292,19 @@ mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
model->mvtvm.burst_xstats[qp_id].dequeued_count = 0;
}
+ /* Set model specific fast path functions */
+ if (model->subtype == ML_CNXK_MODEL_SUBTYPE_TVM_MRVL) {
+ model->enqueue_single = cn10k_ml_enqueue_single;
+ model->result_update = cn10k_ml_result_update;
+ model->set_error_code = cn10k_ml_set_error_code;
+ model->set_poll_addr = cn10k_ml_set_poll_addr;
+ } else {
+ model->enqueue_single = mvtvm_ml_enqueue_single;
+ model->result_update = mvtvm_ml_result_update;
+ model->set_error_code = mvtvm_ml_set_error_code;
+ model->set_poll_addr = mvtvm_ml_set_poll_addr;
+ }
+
return 0;
error:
@@ -495,3 +515,107 @@ mvtvm_ml_io_dequantize(void *device, uint16_t model_id, const char *layer_name,
return 0;
}
+
+static int
+mvtvm_ml_model_run(struct cnxk_ml_model *model, struct rte_ml_op *op, struct cnxk_ml_req *req)
+{
+ uint8_t i;
+
+ rte_memcpy(req->mvtvm_req.input_tensor, model->mvtvm.input_tensor,
+ model->mvtvm.metadata.model.num_input * sizeof(DLTensor));
+ for (i = 0; i < model->mvtvm.metadata.model.num_input; i++) {
+ req->mvtvm_req.input_tensor[i].data = op->input[i]->addr;
+ req->mvtvm_req.input_tensor[i].byte_offset = 0;
+ }
+
+ rte_memcpy(req->mvtvm_req.output_tensor, model->mvtvm.output_tensor,
+ model->mvtvm.metadata.model.num_output * sizeof(DLTensor));
+ for (i = 0; i < model->mvtvm.metadata.model.num_output; i++) {
+ req->mvtvm_req.output_tensor[i].data = op->output[i]->addr;
+ req->mvtvm_req.output_tensor[i].byte_offset = 0;
+ }
+
+ tvmdp_model_run(model->model_id, model->mvtvm.metadata.model.num_input,
+ req->mvtvm_req.input_tensor, model->mvtvm.metadata.model.num_output,
+ req->mvtvm_req.output_tensor, &req->mvtvm_req.result,
+ &req->mvtvm_req.status);
+
+ plt_write64(ML_CNXK_POLL_JOB_FINISH, req->status);
+
+ return 0;
+}
+
+__rte_hot void
+mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype)
+{
+ RTE_SET_USED(stype);
+
+ req->mvtvm_req.result.error_code = etype;
+}
+
+__rte_hot bool
+mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, uint16_t layer_id,
+ struct cnxk_ml_qp *qp, uint64_t head)
+{
+ struct cnxk_ml_model *model;
+ struct cnxk_ml_queue *queue;
+ struct cnxk_ml_req *req;
+
+ RTE_SET_USED(layer_id);
+
+ queue = &qp->queue;
+ req = &queue->reqs[head];
+ model = cnxk_mldev->mldev->data->models[op->model_id];
+
+ model->set_poll_addr(req);
+ memset(&req->mvtvm_req.result, 0, sizeof(struct mvtvm_ml_result));
+ req->mvtvm_req.result.error_code = 0x0;
+ req->mvtvm_req.result.user_ptr = op->user_ptr;
+
+ cnxk_ml_set_poll_ptr(req);
+ mvtvm_ml_model_run(model, op, req);
+ req->timeout = plt_tsc_cycles() + queue->wait_cycles;
+ req->op = op;
+
+ return true;
+}
+
+__rte_hot void
+mvtvm_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request)
+{
+ struct mvtvm_ml_model_xstats *xstats;
+ struct mvtvm_ml_result *result;
+ struct cnxk_ml_model *model;
+ struct cnxk_ml_req *req;
+ uint64_t tvm_rt_latency;
+ struct cnxk_ml_qp *qp;
+ struct rte_ml_op *op;
+
+ req = (struct cnxk_ml_req *)request;
+ result = &req->mvtvm_req.result;
+ op = req->op;
+ qp = cnxk_mldev->mldev->data->queue_pairs[qp_id];
+ op->impl_opaque = result->error_code;
+
+ if (likely(result->error_code == 0)) {
+ qp->stats.dequeued_count++;
+ op->status = RTE_ML_OP_STATUS_SUCCESS;
+
+ model = cnxk_mldev->mldev->data->models[op->model_id];
+ xstats = &model->mvtvm.burst_xstats[qp_id];
+
+ if (unlikely(xstats->dequeued_count == xstats->tvm_rt_reset_count)) {
+ xstats->tvm_rt_latency_min = UINT64_MAX;
+ xstats->tvm_rt_latency_max = 0;
+ }
+ tvm_rt_latency = result->stats.end_ns - result->stats.start_ns;
+ xstats->tvm_rt_latency = tvm_rt_latency;
+ xstats->tvm_rt_latency_tot += tvm_rt_latency;
+ xstats->tvm_rt_latency_min = RTE_MIN(xstats->tvm_rt_latency_min, tvm_rt_latency);
+ xstats->tvm_rt_latency_max = RTE_MAX(xstats->tvm_rt_latency_max, tvm_rt_latency);
+ xstats->dequeued_count++;
+ } else {
+ qp->stats.dequeue_err_count++;
+ op->status = RTE_ML_OP_STATUS_ERROR;
+ }
+}
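
For reference, the counters updated in mvtvm_ml_result_update() allow an
average TVM runtime latency to be derived at xstats-get time; a hedged
sketch with a hypothetical helper name:

/* Hypothetical helper: average TVM runtime latency since the last reset,
 * derived from the counters maintained in mvtvm_ml_result_update(). */
static inline uint64_t
tvm_rt_latency_avg(const struct mvtvm_ml_model_xstats *xstats)
{
	uint64_t count = xstats->dequeued_count - xstats->tvm_rt_reset_count;

	return (count != 0) ? xstats->tvm_rt_latency_tot / count : 0;
}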
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.h b/drivers/ml/cnxk/mvtvm_ml_ops.h
index 4cabe30a82..cb4b219743 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.h
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.h
@@ -16,6 +16,44 @@
struct cnxk_ml_dev;
struct cnxk_ml_model;
struct cnxk_ml_layer;
+struct cnxk_ml_qp;
+struct cnxk_ml_req;
+
+/* Inference stats */
+struct mvtvm_ml_stats {
+ /* Start ns */
+ uint64_t start_ns;
+
+ /* End ns */
+ uint64_t end_ns;
+};
+
+/* Result structure */
+struct mvtvm_ml_result {
+ /* Job error code */
+ uint64_t error_code;
+
+ /* Inference stats */
+ struct mvtvm_ml_stats stats;
+
+ /* User context pointer */
+ void *user_ptr;
+};
+
+/* MVTVM specific request */
+struct mvtvm_ml_req {
+ /* Input tensors */
+ DLTensor input_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
+
+ /* Output tensors */
+ DLTensor output_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
+
+ /* Status field for poll mode requests */
+ volatile uint64_t status;
+
+ /* Result */
+ struct mvtvm_ml_result result;
+};
int mvtvm_ml_dev_configure(struct cnxk_ml_dev *cnxk_mldev, const struct rte_ml_dev_config *conf);
int mvtvm_ml_dev_close(struct cnxk_ml_dev *cnxk_mldev);
@@ -29,6 +67,11 @@ int mvtvm_ml_io_quantize(void *device, uint16_t model_id, const char *layer_name
int mvtvm_ml_io_dequantize(void *device, uint16_t model_id, const char *layer_name, void *qbuffer,
const DLTensor **deq_tensor);
+__rte_hot bool mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+ uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
+__rte_hot void mvtvm_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
+__rte_hot void mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);
+
void mvtvm_ml_model_xstat_name_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
uint16_t stat_id, uint16_t entry, char *suffix);
uint64_t mvtvm_ml_model_xstat_get(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
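
The volatile status field added here completes the poll-mode handshake:
mvtvm_ml_set_poll_addr() points req->status at mvtvm_req.status,
mvtvm_ml_model_run() writes ML_CNXK_POLL_JOB_FINISH to it, and the common
dequeue path spins on that address. An illustrative (non-driver) check of
the completion condition:

/* Illustrative only: dequeue-side completion check on the request status
 * field, assuming ML_CNXK_POLL_JOB_FINISH marks a finished job. */
static inline bool
request_is_done(const struct cnxk_ml_req *req)
{
	return plt_read64(req->status) == ML_CNXK_POLL_JOB_FINISH;
}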
--
2.42.0