[dpdk-dev] [PATCH v3 06/10] baseband/turbo_sw: extension of turbosw PMD for 5G

Nicolas Chautru nicolas.chautru at intel.com
Fri Jun 21 18:59:08 CEST 2019


The implementation is still based on the Intel SDK libraries,
optimized for the AVX512 instruction set and 5GNR.
It can also be built for AVX2 with 4G capability only, or
without any SDK dependency for maintenance.

Signed-off-by: Nicolas Chautru <nicolas.chautru at intel.com>
---
 config/common_base                               |   1 +
 drivers/baseband/turbo_sw/Makefile               |  15 +-
 drivers/baseband/turbo_sw/bbdev_turbo_software.c | 686 ++++++++++++++++++++++-
 drivers/baseband/turbo_sw/meson.build            |  10 +
 mk/rte.app.mk                                    |   8 +-
 5 files changed, 707 insertions(+), 13 deletions(-)

diff --git a/config/common_base b/config/common_base
index cc13025..67e1cc6 100644
--- a/config/common_base
+++ b/config/common_base
@@ -530,6 +530,7 @@ CONFIG_RTE_LIBRTE_BBDEV_DEBUG=n
 CONFIG_RTE_BBDEV_MAX_DEVS=128
 CONFIG_RTE_BBDEV_OFFLOAD_COST=y
 CONFIG_RTE_BBDEV_SDK_AVX2=n
+CONFIG_RTE_BBDEV_SDK_AVX512=n
 
 #
 # Compile PMD for NULL bbdev device
diff --git a/drivers/baseband/turbo_sw/Makefile b/drivers/baseband/turbo_sw/Makefile
index 414d0d9..4aa05d2 100644
--- a/drivers/baseband/turbo_sw/Makefile
+++ b/drivers/baseband/turbo_sw/Makefile
@@ -3,7 +3,6 @@
 
 include $(RTE_SDK)/mk/rte.vars.mk
 
-
 # library name
 LIB = librte_pmd_bbdev_turbo_sw.a
 
@@ -34,6 +33,20 @@ LDLIBS += -L$(FLEXRAN_SDK)/lib_common -lcommon
 LDLIBS += -lstdc++ -lirc -limf -lipps -lsvml
 endif
 
+ifeq ($(CONFIG_RTE_BBDEV_SDK_AVX512),y)
+ifeq ($(CONFIG_RTE_BBDEV_SDK_AVX2),n)
+$(error "CONFIG_RTE_BBDEV_SDK_AVX512 requires CONFIG_RTE_BBDEV_SDK_AVX2 set")
+endif
+CFLAGS += -I$(FLEXRAN_SDK)/lib_ldpc_encoder_5gnr
+CFLAGS += -I$(FLEXRAN_SDK)/lib_ldpc_decoder_5gnr
+CFLAGS += -I$(FLEXRAN_SDK)/lib_LDPC_ratematch_5gnr
+CFLAGS += -I$(FLEXRAN_SDK)/lib_rate_dematching_5gnr
+LDLIBS += -L$(FLEXRAN_SDK)/lib_ldpc_encoder_5gnr -lldpc_encoder_5gnr
+LDLIBS += -L$(FLEXRAN_SDK)/lib_ldpc_decoder_5gnr -lldpc_decoder_5gnr
+LDLIBS += -L$(FLEXRAN_SDK)/lib_LDPC_ratematch_5gnr -lLDPC_ratematch_5gnr
+LDLIBS += -L$(FLEXRAN_SDK)/lib_rate_dematching_5gnr -lrate_dematching_5gnr
+endif
+
 # library version
 LIBABIVER := 1
 
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index 374d177..2f06369 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -14,11 +14,24 @@
 #include <rte_bbdev.h>
 #include <rte_bbdev_pmd.h>
 
+#include <rte_common.h>
+#include <rte_hexdump.h>
+#include <rte_log.h>
+
 #ifdef RTE_BBDEV_SDK_AVX2
+#include <ipp.h>
+#include <ipps.h>
 #include <phy_turbo.h>
 #include <phy_crc.h>
 #include <phy_rate_match.h>
 #endif
+#ifdef RTE_BBDEV_SDK_AVX512
+#include <bit_reverse.h>
+#include <phy_ldpc_encoder_5gnr.h>
+#include <phy_ldpc_decoder_5gnr.h>
+#include <phy_LDPC_ratematch_5gnr.h>
+#include <phy_rate_dematching_5gnr.h>
+#endif
 
 #define DRIVER_NAME baseband_turbo_sw
 
@@ -84,6 +97,7 @@ struct turbo_sw_queue {
 	enum rte_bbdev_op_type type;
 } __rte_cache_aligned;
 
+
 #ifdef RTE_BBDEV_SDK_AVX2
 static inline char *
 mbuf_append(struct rte_mbuf *m_head, struct rte_mbuf *m, uint16_t len)
@@ -158,7 +172,8 @@ struct turbo_sw_queue {
 					RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP |
 					RTE_BBDEV_TURBO_EARLY_TERMINATION,
 				.max_llr_modulus = 16,
-				.num_buffers_src = RTE_BBDEV_TURBO_MAX_CODE_BLOCKS,
+				.num_buffers_src =
+						RTE_BBDEV_TURBO_MAX_CODE_BLOCKS,
 				.num_buffers_hard_out =
 						RTE_BBDEV_TURBO_MAX_CODE_BLOCKS,
 				.num_buffers_soft_out = 0,
@@ -172,25 +187,60 @@ struct turbo_sw_queue {
 						RTE_BBDEV_TURBO_CRC_24A_ATTACH |
 						RTE_BBDEV_TURBO_RATE_MATCH |
 						RTE_BBDEV_TURBO_RV_INDEX_BYPASS,
-				.num_buffers_src = RTE_BBDEV_TURBO_MAX_CODE_BLOCKS,
-				.num_buffers_dst = RTE_BBDEV_TURBO_MAX_CODE_BLOCKS,
+				.num_buffers_src =
+						RTE_BBDEV_TURBO_MAX_CODE_BLOCKS,
+				.num_buffers_dst =
+						RTE_BBDEV_TURBO_MAX_CODE_BLOCKS,
 			}
 		},
 #endif
+#ifdef RTE_BBDEV_SDK_AVX512
+		{
+			.type   = RTE_BBDEV_OP_LDPC_ENC,
+			.cap.ldpc_enc = {
+				.capability_flags =
+						RTE_BBDEV_LDPC_RATE_MATCH |
+						RTE_BBDEV_LDPC_CRC_24A_ATTACH |
+						RTE_BBDEV_LDPC_CRC_24B_ATTACH,
+				.num_buffers_src =
+						RTE_BBDEV_LDPC_MAX_CODE_BLOCKS,
+				.num_buffers_dst =
+						RTE_BBDEV_LDPC_MAX_CODE_BLOCKS,
+			}
+		},
+		{
+		.type   = RTE_BBDEV_OP_LDPC_DEC,
+		.cap.ldpc_dec = {
+			.capability_flags =
+					RTE_BBDEV_LDPC_CRC_TYPE_24B_CHECK |
+					RTE_BBDEV_LDPC_CRC_TYPE_24A_CHECK |
+					RTE_BBDEV_LDPC_CRC_TYPE_24B_DROP |
+					RTE_BBDEV_LDPC_HQ_COMBINE_IN_ENABLE |
+					RTE_BBDEV_LDPC_HQ_COMBINE_OUT_ENABLE |
+					RTE_BBDEV_LDPC_ITERATION_STOP_ENABLE,
+			.llr_size = 8,
+			.llr_decimals = 2,
+			.harq_memory_size = 0,
+			.num_buffers_src =
+					RTE_BBDEV_LDPC_MAX_CODE_BLOCKS,
+			.num_buffers_hard_out =
+					RTE_BBDEV_LDPC_MAX_CODE_BLOCKS,
+			.num_buffers_soft_out = 0,
+		}
+		},
+#endif
 		RTE_BBDEV_END_OF_CAPABILITIES_LIST()
 	};
 
 	static struct rte_bbdev_queue_conf default_queue_conf = {
 		.queue_size = RTE_BBDEV_QUEUE_SIZE_LIMIT,
 	};
-
 #ifdef RTE_BBDEV_SDK_AVX2
 	static const enum rte_cpu_flag_t cpu_flag = RTE_CPUFLAG_SSE4_2;
 	dev_info->cpu_flag_reqs = &cpu_flag;
 #else
 	dev_info->cpu_flag_reqs = NULL;
 #endif
-
 	default_queue_conf.socket = dev->data->socket_id;
 
 	dev_info->driver_name = RTE_STR(DRIVER_NAME);
@@ -277,7 +327,7 @@ struct turbo_sw_queue {
 		return -ENAMETOOLONG;
 	}
 	q->enc_in = rte_zmalloc_socket(name,
-			(RTE_BBDEV_TURBO_MAX_CB_SIZE >> 3) * sizeof(*q->enc_in),
+			(RTE_BBDEV_LDPC_MAX_CB_SIZE >> 3) * sizeof(*q->enc_in),
 			RTE_CACHE_LINE_SIZE, queue_conf->socket);
 	if (q->enc_in == NULL) {
 		rte_bbdev_log(ERR,
@@ -285,7 +335,7 @@ struct turbo_sw_queue {
 		goto free_q;
 	}
 
-	/* Allocate memory for Aplha Gamma temp buffer. */
+	/* Allocate memory for Alpha Gamma temp buffer. */
 	ret = snprintf(name, RTE_RING_NAMESIZE, RTE_STR(DRIVER_NAME)"_ag%u:%u",
 			dev->data->dev_id, q_id);
 	if ((ret < 0) || (ret >= (int)RTE_RING_NAMESIZE)) {
@@ -420,6 +470,7 @@ struct turbo_sw_queue {
 };
 
 #ifdef RTE_BBDEV_SDK_AVX2
+#ifdef RTE_LIBRTE_BBDEV_DEBUG
 /* Checks if the encoder input buffer is correct.
  * Returns 0 if it's valid, -1 otherwise.
  */
@@ -475,16 +526,21 @@ struct turbo_sw_queue {
 	return 0;
 }
 #endif
+#endif
 
 static inline void
 process_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		uint8_t r, uint8_t c, uint16_t k, uint16_t ncb,
 		uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out_head,
-		struct rte_mbuf *m_out, uint16_t in_offset, uint16_t out_offset,
+		struct rte_mbuf *m_out,	uint16_t in_offset, uint16_t out_offset,
 		uint16_t in_length, struct rte_bbdev_stats *q_stats)
 {
 #ifdef RTE_BBDEV_SDK_AVX2
+#ifdef RTE_LIBRTE_BBDEV_DEBUG
 	int ret;
+#else
+	RTE_SET_USED(in_length);
+#endif
 	int16_t k_idx;
 	uint16_t m;
 	uint8_t *in, *out0, *out1, *out2, *tmp_out, *rm_out;
@@ -508,11 +564,14 @@ struct turbo_sw_queue {
 	/* CRC24A (for TB) */
 	if ((enc->op_flags & RTE_BBDEV_TURBO_CRC_24A_ATTACH) &&
 		(enc->code_block_mode == 1)) {
+#ifdef RTE_LIBRTE_BBDEV_DEBUG
 		ret = is_enc_input_valid(k - 24, k_idx, in_length);
 		if (ret != 0) {
 			op->status |= 1 << RTE_BBDEV_DATA_ERROR;
 			return;
 		}
+#endif
+
 		crc_req.data = in;
 		crc_req.len = k - 24;
 		/* Check if there is a room for CRC bits if not use
@@ -541,11 +600,14 @@ struct turbo_sw_queue {
 #endif
 	} else if (enc->op_flags & RTE_BBDEV_TURBO_CRC_24B_ATTACH) {
 		/* CRC24B */
+#ifdef RTE_LIBRTE_BBDEV_DEBUG
 		ret = is_enc_input_valid(k - 24, k_idx, in_length);
 		if (ret != 0) {
 			op->status |= 1 << RTE_BBDEV_DATA_ERROR;
 			return;
 		}
+#endif
+
 		crc_req.data = in;
 		crc_req.len = k - 24;
 		/* Check if there is a room for CRC bits if this is the last
@@ -572,13 +634,16 @@ struct turbo_sw_queue {
 #ifdef RTE_BBDEV_OFFLOAD_COST
 		q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
 #endif
-	} else {
+	}
+#ifdef RTE_LIBRTE_BBDEV_DEBUG
+	else {
 		ret = is_enc_input_valid(k, k_idx, in_length);
 		if (ret != 0) {
 			op->status |= 1 << RTE_BBDEV_DATA_ERROR;
 			return;
 		}
 	}
+#endif
 
 	/* Turbo encoder */
 
@@ -754,6 +819,143 @@ struct turbo_sw_queue {
 #endif
 }
 
+/* LDPC encode one CB: optional CRC attach, encode, rate-match, copy out. */
+static inline void
+process_ldpc_enc_cb(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
+		uint32_t e, struct rte_mbuf *m_in, struct rte_mbuf *m_out_head,
+		struct rte_mbuf *m_out,	uint16_t in_offset, uint16_t out_offset,
+		uint16_t seg_total_left, struct rte_bbdev_stats *q_stats)
+{
+#ifdef RTE_BBDEV_SDK_AVX512
+	RTE_SET_USED(seg_total_left);
+	uint8_t *in, *rm_out;
+	struct rte_bbdev_op_ldpc_enc *enc = &op->ldpc_enc;
+	struct bblib_ldpc_encoder_5gnr_request ldpc_req;
+	struct bblib_ldpc_encoder_5gnr_response ldpc_resp;
+	struct bblib_LDPC_ratematch_5gnr_request rm_req;
+	struct bblib_LDPC_ratematch_5gnr_response rm_resp;
+	struct bblib_crc_request crc_req;
+	struct bblib_crc_response crc_resp;
+	uint16_t msgLen, puntBits, parity_offset, out_len;
+	uint16_t K = (enc->basegraph == 1 ? 22 : 10) * enc->z_c;
+	uint16_t in_length_in_bits = K - enc->n_filler;
+	uint16_t in_length_in_bytes = (in_length_in_bits + 7) >> 3;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	uint64_t start_time = rte_rdtsc_precise();
+#else
+	RTE_SET_USED(q_stats);
+#endif
+
+	in = rte_pktmbuf_mtod_offset(m_in, uint8_t *, in_offset);
+
+	/* Masking the Filler bits explicitly, plus the 3 bytes ahead of them */
+	memset(q->enc_in  + (in_length_in_bytes - 3), 0,
+			((K + 7) >> 3) - (in_length_in_bytes - 3));
+	/* CRC Generation */
+	if (enc->op_flags & RTE_BBDEV_LDPC_CRC_24A_ATTACH) {
+		rte_memcpy(q->enc_in, in, in_length_in_bytes - 3);
+		crc_req.data = in;
+		crc_req.len = in_length_in_bits - 24;
+		crc_resp.data = q->enc_in;
+		bblib_lte_crc24a_gen(&crc_req, &crc_resp);
+	} else if (enc->op_flags & RTE_BBDEV_LDPC_CRC_24B_ATTACH) {
+		rte_memcpy(q->enc_in, in, in_length_in_bytes - 3);
+		crc_req.data = in;
+		crc_req.len = in_length_in_bits - 24;
+		crc_resp.data = q->enc_in;
+		bblib_lte_crc24b_gen(&crc_req, &crc_resp);
+	} else
+		rte_memcpy(q->enc_in, in, in_length_in_bytes);
+
+	/* LDPC Encoding */
+	ldpc_req.Zc = enc->z_c;
+	ldpc_req.baseGraph = enc->basegraph;
+	/* Number of rows set to maximum */
+	ldpc_req.nRows = ldpc_req.baseGraph == 1 ? 46 : 42;
+	ldpc_req.numberCodeblocks = 1;
+	ldpc_req.input[0] = (int8_t *) q->enc_in;
+	ldpc_resp.output[0] = (int8_t *) q->enc_out;
+
+	bblib_bit_reverse(ldpc_req.input[0], in_length_in_bytes << 3);
+
+	if (bblib_ldpc_encoder_5gnr(&ldpc_req, &ldpc_resp) != 0) {
+		op->status |= 1 << RTE_BBDEV_DRV_ERROR;
+		rte_bbdev_log(ERR, "LDPC Encoder failed");
+		return;
+	}
+
+	/*
+	 * Systematic + Parity : Recreating stream with filler bits, ideally
+	 * the bit select could handle this in the RM SDK
+	 */
+	msgLen = (ldpc_req.baseGraph == 1 ? 22 : 10) * ldpc_req.Zc;
+	puntBits = 2 * ldpc_req.Zc;
+	parity_offset = msgLen - puntBits;
+	ippsCopyBE_1u(((uint8_t *) ldpc_req.input[0]) + (puntBits / 8),
+			puntBits%8, q->adapter_output, 0, parity_offset);
+	ippsCopyBE_1u(q->enc_out, 0, q->adapter_output + (parity_offset / 8),
+			parity_offset % 8, ldpc_req.nRows * ldpc_req.Zc);
+
+	out_len = (e + 7) >> 3;
+	/* get output data starting address */
+	rm_out = (uint8_t *)mbuf_append(m_out_head, m_out, out_len);
+	if (rm_out == NULL) {
+		op->status |= 1 << RTE_BBDEV_DATA_ERROR;
+		rte_bbdev_log(ERR,
+				"Too little space in output mbuf");
+		return;
+	}
+	/*
+	 * rte_bbdev_op_data.offset can be different than the offset
+	 * of the appended bytes
+	 */
+	rm_out = rte_pktmbuf_mtod_offset(m_out, uint8_t *, out_offset);
+
+	/* Rate-Matching */
+	rm_req.E = e;
+	rm_req.Ncb = enc->n_cb;
+	rm_req.Qm = enc->q_m;
+	rm_req.Zc = enc->z_c;
+	rm_req.baseGraph = enc->basegraph;
+	rm_req.input = q->adapter_output;
+	rm_req.nLen = enc->n_filler;
+	rm_req.nullIndex = parity_offset - enc->n_filler;
+	rm_req.rvidx = enc->rv_index;
+	rm_resp.output = q->deint_output;
+
+	if (bblib_LDPC_ratematch_5gnr(&rm_req, &rm_resp) != 0) {
+		op->status |= 1 << RTE_BBDEV_DRV_ERROR;
+		rte_bbdev_log(ERR, "Rate matching failed");
+		return;
+	}
+
+	/* RM SDK may provide non zero bits on last byte */
+	if ((e % 8) != 0)
+		q->deint_output[out_len-1] &= (1 << (e % 8)) - 1;
+
+	bblib_bit_reverse((int8_t *) q->deint_output, out_len << 3);
+
+	rte_memcpy(rm_out, q->deint_output, out_len);
+	enc->output.length += out_len;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
+#else
+	RTE_SET_USED(q);
+	RTE_SET_USED(op);
+	RTE_SET_USED(e);
+	RTE_SET_USED(m_in);
+	RTE_SET_USED(m_out_head);
+	RTE_SET_USED(m_out);
+	RTE_SET_USED(in_offset);
+	RTE_SET_USED(out_offset);
+	RTE_SET_USED(seg_total_left);
+	RTE_SET_USED(q_stats);
+#endif
+}
+
 static inline void
 enqueue_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
 		struct rte_bbdev_stats *queue_stats)
@@ -847,6 +1049,93 @@ struct turbo_sw_queue {
 	}
 }
 
+/* Encode one LDPC operation, one code block at a time. Failures are
+ * reported through op->status; nothing is returned.
+ */
+static inline void
+enqueue_ldpc_enc_one_op(struct turbo_sw_queue *q, struct rte_bbdev_enc_op *op,
+		struct rte_bbdev_stats *queue_stats)
+{
+	uint8_t c, r, crc24_bits = 0;
+	uint32_t e;
+	struct rte_bbdev_op_ldpc_enc *enc = &op->ldpc_enc;
+	uint16_t in_offset = enc->input.offset;
+	uint16_t out_offset = enc->output.offset;
+	struct rte_mbuf *m_in = enc->input.data;
+	struct rte_mbuf *m_out = enc->output.data;
+	struct rte_mbuf *m_out_head = enc->output.data;
+	uint32_t in_length, mbuf_total_left = enc->input.length;
+	uint16_t seg_total_left;
+
+	/* Clear op status */
+	op->status = 0;
+
+	if (mbuf_total_left > RTE_BBDEV_TURBO_MAX_TB_SIZE >> 3) {
+		rte_bbdev_log(ERR, "TB size (%u) is too big, max: %d",
+			mbuf_total_left, RTE_BBDEV_TURBO_MAX_TB_SIZE >> 3);
+		op->status = 1 << RTE_BBDEV_DATA_ERROR;
+		return;
+	}
+
+	if (m_in == NULL || m_out == NULL) {
+		rte_bbdev_log(ERR, "Invalid mbuf pointer");
+		op->status = 1 << RTE_BBDEV_DATA_ERROR;
+		return;
+	}
+
+	if ((enc->op_flags & RTE_BBDEV_LDPC_CRC_24B_ATTACH) ||
+		(enc->op_flags & RTE_BBDEV_LDPC_CRC_24A_ATTACH))
+		crc24_bits = 24;
+
+	if (enc->code_block_mode == 0) { /* For Transport Block mode */
+		c = enc->tb_params.c;
+		r = enc->tb_params.r;
+	} else { /* For Code Block mode */
+		c = 1;
+		r = 0;
+	}
+
+	/* Stop if an mbuf chain ends early; reported as a mismatch below */
+	while (mbuf_total_left > 0 && r < c &&
+			m_in != NULL && m_out != NULL) {
+		seg_total_left = rte_pktmbuf_data_len(m_in) - in_offset;
+
+		if (enc->code_block_mode == 0)
+			e = (r < enc->tb_params.cab) ?
+				enc->tb_params.ea : enc->tb_params.eb;
+		else
+			e = enc->cb_params.e;
+
+		process_ldpc_enc_cb(q, op, e, m_in, m_out_head,
+				m_out, in_offset, out_offset, seg_total_left,
+				queue_stats);
+		/* Input bytes consumed by this CB: K less CRC and filler */
+		in_length = (enc->basegraph == 1 ? 22 : 10) * enc->z_c;
+		in_length = ((in_length - crc24_bits - enc->n_filler) >> 3);
+		mbuf_total_left -= in_length;
+		/* Update offsets for next CBs (if exist) */
+		in_offset += in_length;
+		out_offset += (e + 7) >> 3;
+
+		if (seg_total_left == in_length) {
+			/* Segment fully consumed: go to the next mbuf */
+			m_in = m_in->next;
+			m_out = m_out->next;
+			in_offset = 0;
+			out_offset = 0;
+		}
+		r++;
+	}
+
+	/* check if all input data was processed */
+	if (mbuf_total_left != 0) {
+		op->status |= 1 << RTE_BBDEV_DATA_ERROR;
+		rte_bbdev_log(ERR,
+				"Mismatch between mbuf length and included CBs sizes %u",
+				mbuf_total_left);
+	}
+}
+
 static inline uint16_t
 enqueue_enc_all_ops(struct turbo_sw_queue *q, struct rte_bbdev_enc_op **ops,
 		uint16_t nb_ops, struct rte_bbdev_stats *queue_stats)
@@ -863,6 +1152,23 @@ struct turbo_sw_queue {
 			NULL);
 }
 
+/* Process a burst of LDPC encode ops, then queue them as processed. */
+static inline uint16_t
+enqueue_ldpc_enc_all_ops(struct turbo_sw_queue *q,
+		struct rte_bbdev_enc_op **ops,
+		uint16_t nb_ops, struct rte_bbdev_stats *queue_stats)
+{
+	uint16_t idx = 0;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	queue_stats->acc_offload_cycles = 0;
+#endif
+	while (idx < nb_ops)
+		enqueue_ldpc_enc_one_op(q, ops[idx++], queue_stats);
+	return rte_ring_enqueue_burst(q->processed_pkts, (void **)ops,
+			nb_ops, NULL);
+}
+
 #ifdef RTE_BBDEV_SDK_AVX2
 static inline void
 move_padding_bytes(const uint8_t *in, uint8_t *out, uint16_t k,
@@ -887,7 +1193,11 @@ struct turbo_sw_queue {
 		struct rte_bbdev_stats *q_stats)
 {
 #ifdef RTE_BBDEV_SDK_AVX2
+#ifdef RTE_LIBRTE_BBDEV_DEBUG
 	int ret;
+#else
+	RTE_SET_USED(in_length);
+#endif
 	int32_t k_idx;
 	int32_t iter_cnt;
 	uint8_t *in, *out, *adapter_input;
@@ -905,11 +1215,13 @@ struct turbo_sw_queue {
 
 	k_idx = compute_idx(k);
 
+#ifdef RTE_LIBRTE_BBDEV_DEBUG
 	ret = is_dec_input_valid(k_idx, kw, in_length);
 	if (ret != 0) {
 		op->status |= 1 << RTE_BBDEV_DATA_ERROR;
 		return;
 	}
+#endif
 
 	in = rte_pktmbuf_mtod_offset(m_in, uint8_t *, in_offset);
 	ncb = kw;
@@ -925,11 +1237,12 @@ struct turbo_sw_queue {
 		deint_resp.pinteleavebuffer = q->deint_output;
 
 #ifdef RTE_BBDEV_OFFLOAD_COST
-		start_time = rte_rdtsc_precise();
+	start_time = rte_rdtsc_precise();
 #endif
+		/* Sub-block De-Interleaving */
 		bblib_deinterleave_ul(&deint_req, &deint_resp);
 #ifdef RTE_BBDEV_OFFLOAD_COST
-		q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
 #endif
 	} else
 		move_padding_bytes(in, q->deint_output, k, ncb);
@@ -1022,6 +1335,202 @@ struct turbo_sw_queue {
 }
 
 static inline void
+process_ldpc_dec_cb(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
+		uint8_t c, uint16_t out_length, uint16_t e,
+		struct rte_mbuf *m_in,
+		struct rte_mbuf *m_out_head, struct rte_mbuf *m_out,
+		struct rte_mbuf *m_harq_in,
+		struct rte_mbuf *m_harq_out_head, struct rte_mbuf *m_harq_out,
+		uint16_t in_offset, uint16_t out_offset,
+		uint16_t harq_in_offset, uint16_t harq_out_offset,
+		bool check_crc_24b,
+		uint16_t crc24_overlap, uint16_t in_length,
+		struct rte_bbdev_stats *q_stats)
+{
+#ifdef RTE_BBDEV_SDK_AVX512
+	RTE_SET_USED(in_length);
+	RTE_SET_USED(c);
+	uint8_t *in, *out, *harq_in, *harq_out, *adapter_input;
+	struct bblib_rate_dematching_5gnr_request derm_req;
+	struct bblib_rate_dematching_5gnr_response derm_resp;
+	struct bblib_ldpc_decoder_5gnr_request dec_req;
+	struct bblib_ldpc_decoder_5gnr_response dec_resp;
+	struct bblib_crc_request crc_req;
+	struct bblib_crc_response crc_resp;
+	struct rte_bbdev_op_ldpc_dec *dec = &op->ldpc_dec;
+	uint16_t K, parity_offset, sys_cols, outLenWithCrc;
+	int16_t deRmOutSize, numRows;
+	/* One LDPC CB: rate de-matching, decode, optional CRC check, outputs */
+	/* Compute some LDPC BG lengths */
+	outLenWithCrc = out_length + (crc24_overlap >> 3);
+	sys_cols = (dec->basegraph == 1) ? 22 : 10;
+	K = sys_cols * dec->z_c;
+	parity_offset = K - 2 * dec->z_c;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	uint64_t start_time = rte_rdtsc_precise();
+#else
+	RTE_SET_USED(q_stats);
+#endif
+
+	in = rte_pktmbuf_mtod_offset(m_in, uint8_t *, in_offset);
+
+	if (check_bit(dec->op_flags, RTE_BBDEV_LDPC_HQ_COMBINE_IN_ENABLE)) {
+		/**
+		 *  Single contiguous block from the first LLR of the
+		 *  circular buffer.
+		 */
+		harq_in = NULL;
+		if (m_harq_in != NULL)
+			harq_in = rte_pktmbuf_mtod_offset(m_harq_in,
+				uint8_t *, harq_in_offset);
+		if (harq_in == NULL) {
+			op->status |= 1 << RTE_BBDEV_DATA_ERROR;
+			rte_bbdev_log(ERR, "No space in harq input mbuf");
+			return;
+		}
+		uint16_t harq_in_length = RTE_MIN(
+				dec->harq_combined_input.length,
+				(uint32_t) dec->n_cb);
+		memset(q->ag + harq_in_length, 0,
+				dec->n_cb - harq_in_length);
+		rte_memcpy(q->ag, harq_in, harq_in_length);
+	}
+
+	derm_req.p_in = (int8_t *) in;
+	derm_req.p_harq = q->ag; /* This doesn't include the filler bits */
+	derm_req.base_graph = dec->basegraph;
+	derm_req.zc = dec->z_c;
+	derm_req.ncb = dec->n_cb;
+	derm_req.e = e;
+	derm_req.k0 = 0; /* Actual output from SDK */
+	derm_req.isretx = check_bit(dec->op_flags,
+			RTE_BBDEV_LDPC_HQ_COMBINE_IN_ENABLE);
+	derm_req.rvid = dec->rv_index;
+	derm_req.modulation_order = dec->q_m;
+	derm_req.start_null_index = parity_offset - dec->n_filler;
+	derm_req.num_of_null = dec->n_filler;
+
+	bblib_rate_dematching_5gnr(&derm_req, &derm_resp);
+
+	/* Compute RM out size and number of rows */
+	deRmOutSize = RTE_MIN(
+			derm_req.k0 + derm_req.e -
+			((derm_req.k0 < derm_req.start_null_index) ?
+					0 : dec->n_filler),
+			dec->n_cb - dec->n_filler);
+	if (m_harq_in != NULL)
+		deRmOutSize = RTE_MAX(deRmOutSize,
+				RTE_MIN(dec->n_cb - dec->n_filler,
+						m_harq_in->data_len));
+	numRows = ((deRmOutSize + dec->n_filler + dec->z_c - 1) / dec->z_c)
+			- sys_cols + 2;
+	numRows = RTE_MAX(4, numRows);
+
+	/* get output data starting address */
+	out = (uint8_t *)mbuf_append(m_out_head, m_out, out_length);
+	if (out == NULL) {
+		op->status |= 1 << RTE_BBDEV_DATA_ERROR;
+		rte_bbdev_log(ERR,
+				"Too little space in LDPC decoder output mbuf");
+		return;
+	}
+
+	/* rte_bbdev_op_data.offset can be different than the offset
+	 * of the appended bytes
+	 */
+	out = rte_pktmbuf_mtod_offset(m_out, uint8_t *, out_offset);
+	adapter_input = q->enc_out;
+
+	dec_req.Zc = dec->z_c;
+	dec_req.baseGraph = dec->basegraph;
+	dec_req.nRows = numRows;
+	dec_req.numChannelLlrs = deRmOutSize;
+	dec_req.varNodes = derm_req.p_harq;
+	dec_req.numFillerBits = dec->n_filler;
+	dec_req.maxIterations = dec->iter_max;
+	dec_req.enableEarlyTermination = check_bit(dec->op_flags,
+			RTE_BBDEV_LDPC_ITERATION_STOP_ENABLE);
+	dec_resp.varNodes = (int16_t *) q->adapter_output;
+	dec_resp.compactedMessageBytes = q->enc_out;
+
+	bblib_ldpc_decoder_5gnr(&dec_req, &dec_resp);
+
+	dec->iter_count = RTE_MAX(dec_resp.iterationAtTermination,
+			dec->iter_count);
+	if (!dec_resp.parityPassedAtTermination)
+		op->status |= 1 << RTE_BBDEV_SYNDROME_ERROR;
+
+	bblib_bit_reverse((int8_t *) q->enc_out, outLenWithCrc << 3);
+
+	if (check_bit(dec->op_flags, RTE_BBDEV_LDPC_CRC_TYPE_24A_CHECK) ||
+			check_bit(dec->op_flags,
+					RTE_BBDEV_LDPC_CRC_TYPE_24B_CHECK)) {
+		crc_req.data = adapter_input;
+		crc_req.len  = K - dec->n_filler - 24;
+		crc_resp.check_passed = false;
+		crc_resp.data = adapter_input;
+		if (check_crc_24b)
+			bblib_lte_crc24b_check(&crc_req, &crc_resp);
+		else
+			bblib_lte_crc24a_check(&crc_req, &crc_resp);
+		if (!crc_resp.check_passed)
+			op->status |= 1 << RTE_BBDEV_CRC_ERROR;
+	}
+	/* Cycle count ends here; the output copies below are not included */
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	q_stats->acc_offload_cycles += rte_rdtsc_precise() - start_time;
+#endif
+	if (check_bit(dec->op_flags, RTE_BBDEV_LDPC_HQ_COMBINE_OUT_ENABLE)) {
+		harq_out = NULL;
+		if (m_harq_out != NULL) {
+			/* Initialize HARQ data length since we overwrite */
+			m_harq_out->data_len = 0;
+			/* Check there is enough space
+			 * in the HARQ outbound buffer
+			 */
+			harq_out = (uint8_t *)mbuf_append(m_harq_out_head,
+					m_harq_out, deRmOutSize);
+		}
+		if (harq_out == NULL) {
+			op->status |= 1 << RTE_BBDEV_DATA_ERROR;
+			rte_bbdev_log(ERR, "No space in HARQ output mbuf");
+			return;
+		}
+		/* get output data starting address and overwrite the data */
+		harq_out = rte_pktmbuf_mtod_offset(m_harq_out, uint8_t *,
+				harq_out_offset);
+		rte_memcpy(harq_out, derm_req.p_harq, deRmOutSize);
+		dec->harq_combined_output.length += deRmOutSize;
+	}
+	/* adapter_input aliases q->enc_out (decoder compactedMessageBytes) */
+	rte_memcpy(out, adapter_input, out_length);
+	dec->hard_output.length += out_length;
+#else
+	RTE_SET_USED(q);
+	RTE_SET_USED(op);
+	RTE_SET_USED(c);
+	RTE_SET_USED(out_length);
+	RTE_SET_USED(e);
+	RTE_SET_USED(m_in);
+	RTE_SET_USED(m_out_head);
+	RTE_SET_USED(m_out);
+	RTE_SET_USED(m_harq_in);
+	RTE_SET_USED(m_harq_out_head);
+	RTE_SET_USED(m_harq_out);
+	RTE_SET_USED(harq_in_offset);
+	RTE_SET_USED(harq_out_offset);
+	RTE_SET_USED(in_offset);
+	RTE_SET_USED(out_offset);
+	RTE_SET_USED(check_crc_24b);
+	RTE_SET_USED(crc24_overlap);
+	RTE_SET_USED(in_length);
+	RTE_SET_USED(q_stats);
+#endif
+}
+
+
+static inline void
 enqueue_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
 		struct rte_bbdev_stats *queue_stats)
 {
@@ -1080,6 +1589,7 @@ struct turbo_sw_queue {
 				in_offset, out_offset, check_bit(dec->op_flags,
 				RTE_BBDEV_TURBO_CRC_TYPE_24B), crc24_overlap,
 				seg_total_left, queue_stats);
+
 		/* To keep CRC24 attached to end of Code block, use
 		 * RTE_BBDEV_TURBO_DEC_TB_CRC_24B_KEEP flag as it
 		 * removed by default once verified.
@@ -1101,6 +1611,103 @@ struct turbo_sw_queue {
 		}
 		r++;
 	}
+
+	if (mbuf_total_left != 0) {
+		op->status |= 1 << RTE_BBDEV_DATA_ERROR;
+		rte_bbdev_log(ERR,
+				"Mismatch between mbuf length and included Circular buffer sizes");
+	}
+}
+
+static inline void
+enqueue_ldpc_dec_one_op(struct turbo_sw_queue *q, struct rte_bbdev_dec_op *op,
+		struct rte_bbdev_stats *queue_stats)
+{
+	uint8_t c, r = 0;
+	uint16_t e, out_length;
+	uint16_t crc24_overlap = 0;
+	struct rte_bbdev_op_ldpc_dec *dec = &op->ldpc_dec;
+	struct rte_mbuf *m_in = dec->input.data;
+	struct rte_mbuf *m_harq_in = dec->harq_combined_input.data;
+	struct rte_mbuf *m_harq_out = dec->harq_combined_output.data;
+	struct rte_mbuf *m_harq_out_head = dec->harq_combined_output.data;
+	struct rte_mbuf *m_out = dec->hard_output.data;
+	struct rte_mbuf *m_out_head = dec->hard_output.data;
+	uint16_t in_offset = dec->input.offset;
+	uint16_t harq_in_offset = dec->harq_combined_input.offset;
+	uint16_t harq_out_offset = dec->harq_combined_output.offset;
+	uint16_t out_offset = dec->hard_output.offset;
+	uint32_t mbuf_total_left = dec->input.length;
+	uint16_t seg_total_left;
+
+	/* Clear op status */
+	op->status = 0;
+
+	if (m_in == NULL || m_out == NULL) {
+		rte_bbdev_log(ERR, "Invalid mbuf pointer");
+		op->status = 1 << RTE_BBDEV_DATA_ERROR;
+		return;
+	}
+
+	if (dec->code_block_mode == 0) { /* For Transport Block mode */
+		c = dec->tb_params.c;
+		e = dec->tb_params.ea;
+	} else { /* For Code Block mode */
+		c = 1;
+		e = dec->cb_params.e;
+	}
+
+	if (check_bit(dec->op_flags, RTE_BBDEV_LDPC_CRC_TYPE_24B_DROP))
+		crc24_overlap = 24;
+
+	out_length = (dec->basegraph == 1 ? 22 : 10) * dec->z_c; /* K */
+	out_length = ((out_length - crc24_overlap - dec->n_filler) >> 3);
+
+	while (mbuf_total_left > 0) {
+		if (dec->code_block_mode == 0)
+			e = (r < dec->tb_params.cab) ?
+				dec->tb_params.ea : dec->tb_params.eb;
+
+		seg_total_left = rte_pktmbuf_data_len(m_in) - in_offset;
+
+		process_ldpc_dec_cb(q, op, c, out_length, e,
+				m_in, m_out_head, m_out,
+				m_harq_in, m_harq_out_head, m_harq_out,
+				in_offset, out_offset, harq_in_offset,
+				harq_out_offset,
+				check_bit(dec->op_flags,
+				RTE_BBDEV_LDPC_CRC_TYPE_24B_CHECK),
+				crc24_overlap,
+				seg_total_left, queue_stats);
+
+		/* To keep CRC24 attached to end of Code block, use
+		 * RTE_BBDEV_LDPC_DEC_TB_CRC_24B_KEEP flag as it
+		 * removed by default once verified.
+		 */
+
+		mbuf_total_left -= e;
+
+		/* Update offsets */
+		if (seg_total_left == e) {
+			/* Go to the next mbuf */
+			m_in = m_in->next;
+			m_out = m_out->next;
+			if (m_harq_in != NULL)
+				m_harq_in = m_harq_in->next;
+			if (m_harq_out != NULL)
+				m_harq_out = m_harq_out->next;
+			in_offset = 0;
+			out_offset = 0;
+			harq_in_offset = 0;
+			harq_out_offset = 0;
+		} else {
+			/* Update offsets for next CBs (if exist) */
+			in_offset += e;
+			out_offset += out_length;
+		}
+		r++;
+	}
+
 	if (mbuf_total_left != 0) {
 		op->status |= 1 << RTE_BBDEV_DATA_ERROR;
 		rte_bbdev_log(ERR,
@@ -1124,6 +1731,23 @@ struct turbo_sw_queue {
 			NULL);
 }
 
+/* Process a burst of LDPC decode ops, then queue them as processed. */
+static inline uint16_t
+enqueue_ldpc_dec_all_ops(struct turbo_sw_queue *q,
+		struct rte_bbdev_dec_op **ops,
+		uint16_t nb_ops, struct rte_bbdev_stats *queue_stats)
+{
+	uint16_t idx = 0;
+
+#ifdef RTE_BBDEV_OFFLOAD_COST
+	queue_stats->acc_offload_cycles = 0;
+#endif
+	while (idx < nb_ops)
+		enqueue_ldpc_dec_one_op(q, ops[idx++], queue_stats);
+	return rte_ring_enqueue_burst(q->processed_pkts, (void **)ops,
+			nb_ops, NULL);
+}
+
 /* Enqueue burst */
 static uint16_t
 enqueue_enc_ops(struct rte_bbdev_queue_data *q_data,
@@ -1143,6 +1767,24 @@ struct turbo_sw_queue {
 
 /* Enqueue burst */
 static uint16_t
+enqueue_ldpc_enc_ops(struct rte_bbdev_queue_data *q_data,
+		struct rte_bbdev_enc_op **ops, uint16_t nb_ops)
+{
+	struct rte_bbdev_stats *stats = &q_data->queue_stats;
+	struct turbo_sw_queue *q = q_data->queue_private;
+	uint16_t nb_done;
+
+	/* Ops are processed inline; the count is what fit in the ring */
+	nb_done = enqueue_ldpc_enc_all_ops(q, ops, nb_ops, stats);
+
+	/* Anything that did not fit counts as an enqueue error */
+	stats->enqueue_err_count += nb_ops - nb_done;
+	stats->enqueued_count += nb_done;
+	return nb_done;
+}
+
+/* Enqueue burst */
+static uint16_t
 enqueue_dec_ops(struct rte_bbdev_queue_data *q_data,
 		 struct rte_bbdev_dec_op **ops, uint16_t nb_ops)
 {
@@ -1158,6 +1800,24 @@ struct turbo_sw_queue {
 	return nb_enqueued;
 }
 
+/* Enqueue a burst of LDPC decode operations */
+static uint16_t
+enqueue_ldpc_dec_ops(struct rte_bbdev_queue_data *q_data,
+		struct rte_bbdev_dec_op **ops, uint16_t nb_ops)
+{
+	struct rte_bbdev_stats *stats = &q_data->queue_stats;
+	struct turbo_sw_queue *q = q_data->queue_private;
+	uint16_t nb_done;
+
+	/* Ops are processed inline; the count is what fit in the ring */
+	nb_done = enqueue_ldpc_dec_all_ops(q, ops, nb_ops, stats);
+
+	/* Anything that did not fit counts as an enqueue error */
+	stats->enqueue_err_count += nb_ops - nb_done;
+	stats->enqueued_count += nb_done;
+	return nb_done;
+}
+
 /* Dequeue decode burst */
 static uint16_t
 dequeue_dec_ops(struct rte_bbdev_queue_data *q_data,
@@ -1270,6 +1930,10 @@ struct turbo_sw_queue {
 	bbdev->dequeue_dec_ops = dequeue_dec_ops;
 	bbdev->enqueue_enc_ops = enqueue_enc_ops;
 	bbdev->enqueue_dec_ops = enqueue_dec_ops;
+	bbdev->dequeue_ldpc_enc_ops = dequeue_enc_ops;
+	bbdev->dequeue_ldpc_dec_ops = dequeue_dec_ops;
+	bbdev->enqueue_ldpc_enc_ops = enqueue_ldpc_enc_ops;
+	bbdev->enqueue_ldpc_dec_ops = enqueue_ldpc_dec_ops;
 	((struct bbdev_private *) bbdev->data->dev_private)->max_nb_queues =
 			init_params->queues_num;
 
diff --git a/drivers/baseband/turbo_sw/meson.build b/drivers/baseband/turbo_sw/meson.build
index 438b5a7..33345aa 100644
--- a/drivers/baseband/turbo_sw/meson.build
+++ b/drivers/baseband/turbo_sw/meson.build
@@ -23,6 +23,16 @@ if dpdk_conf.has('RTE_BBDEV_SDK_AVX2')
 		includes += include_directories(path + '/lib_common')
 	endif
 endif
+if dpdk_conf.has('RTE_BBDEV_SDK_AVX512')
+	ext_deps += cc.find_library('libldpc_encoder_5gnr', dirs: [path + '/lib_ldpc_encoder_5gnr'], required: true)
+	ext_deps += cc.find_library('libldpc_decoder_5gnr', dirs: [path + '/lib_ldpc_decoder_5gnr'], required: true)
+	ext_deps += cc.find_library('libLDPC_ratematch_5gnr', dirs: [path + '/lib_LDPC_ratematch_5gnr'], required: true)
+	ext_deps += cc.find_library('librate_dematching_5gnr', dirs: [path + '/lib_rate_dematching_5gnr'], required: true)
+	includes += include_directories(path + '/lib_ldpc_encoder_5gnr')
+	includes += include_directories(path + '/lib_ldpc_decoder_5gnr')
+	includes += include_directories(path + '/lib_LDPC_ratematch_5gnr')
+	includes += include_directories(path + '/lib_rate_dematching_5gnr')
+endif
 
 deps += ['bbdev', 'bus_vdev', 'ring']
 name = 'bbdev_turbo_sw'
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index cdd6073..a586388 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -226,7 +226,13 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -L$(FLEXRAN_SDK)/lib_crc -lcr
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -L$(FLEXRAN_SDK)/lib_turbo -lturbo
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -L$(FLEXRAN_SDK)/lib_rate_matching -lrate_matching
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -L$(FLEXRAN_SDK)/lib_common -lcommon
-_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -lirc -limf -lstdc++ -lipps
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -lirc -limf -lstdc++ -lipps -lsvml
+ifeq ($(CONFIG_RTE_BBDEV_SDK_AVX512),y)
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -L$(FLEXRAN_SDK)/lib_LDPC_ratematch_5gnr -lLDPC_ratematch_5gnr
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -L$(FLEXRAN_SDK)/lib_ldpc_encoder_5gnr -lldpc_encoder_5gnr
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -L$(FLEXRAN_SDK)/lib_ldpc_decoder_5gnr -lldpc_decoder_5gnr
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_TURBO_SW) += -L$(FLEXRAN_SDK)/lib_rate_dematching_5gnr -lrate_dematching_5gnr
+endif # CONFIG_RTE_BBDEV_SDK_AVX512
 endif # CONFIG_RTE_BBDEV_SDK_AVX2
 endif # CONFIG_RTE_LIBRTE_BBDEV
 
-- 
1.8.3.1



More information about the dev mailing list