[dpdk-dev] [PATCH v3 2/4] mbuf: create packet pool with external memory buffers

Slava Ovsiienko viacheslavo at mellanox.com
Wed Jan 15 19:13:48 CET 2020


> -----Original Message-----
> From: Olivier Matz <olivier.matz at 6wind.com>
> Sent: Tuesday, January 14, 2020 18:05
> To: Slava Ovsiienko <viacheslavo at mellanox.com>
> Cc: dev at dpdk.org; Matan Azrad <matan at mellanox.com>; Raslan Darawsheh
> <rasland at mellanox.com>; Ori Kam <orika at mellanox.com>; Shahaf Shuler
> <shahafs at mellanox.com>; stephen at networkplumber.org
> Subject: Re: [PATCH v3 2/4] mbuf: create packet pool with external memory
> buffers
> 
> On Tue, Jan 14, 2020 at 09:15:03AM +0000, Viacheslav Ovsiienko wrote:
> > The dedicated routine rte_pktmbuf_pool_create_extbuf() is provided to
> > create mbuf pool with data buffers located in the pinned external
> > memory. The application provides the external memory description and
> > routine initialises each mbuf with appropriate virtual and physical
> > buffer address.
> > It is entirely application responsibility to register external memory
> > with rte_extmem_register() API, map this memory, etc.
> >
> > The new introduced flag RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF is set in
> > private pool structure, specifying the new special pool type. The
> > allocated mbufs from pool of this kind will have the EXT_ATTACHED_MBUF
> > flag set and NULL shared info pointer, because external buffers are
> > not supposed to be freed and sharing management is not needed. Also,
> > these mbufs can not be attached to other mbufs (not intended to be
> > indirect).
> >
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
> > ---
> >  lib/librte_mbuf/rte_mbuf.c           | 144
> ++++++++++++++++++++++++++++++++++-
> >  lib/librte_mbuf/rte_mbuf.h           |  86 ++++++++++++++++++++-
> >  lib/librte_mbuf/rte_mbuf_version.map |   1 +
> >  3 files changed, 228 insertions(+), 3 deletions(-)
> >
> > diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> > index 8fa7f49..d151469 100644
> > --- a/lib/librte_mbuf/rte_mbuf.c
> > +++ b/lib/librte_mbuf/rte_mbuf.c
> > @@ -59,9 +59,9 @@
> >  	}
> >
> >  	RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf) +
> > -		user_mbp_priv->mbuf_data_room_size +
> > +		((user_mbp_priv->flags &
> RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) ?
> > +		0 : user_mbp_priv->mbuf_data_room_size) +
> >  		user_mbp_priv->mbuf_priv_size);
> 
> Is this check really needed?
It seems so, it is in separated routine, which might be called externally.

> 
> > -	RTE_ASSERT(user_mbp_priv->flags == 0);
> 
> We can keep
>  RTE_ASSERT(user_mbp_priv->flags &
> ~RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF == 0);
OK, thanks.

> 
> >
> >  	mbp_priv = rte_mempool_get_priv(mp);
> >  	memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv)); @@ -107,6
> > +107,63 @@
> >  	m->next = NULL;
> >  }
> >
> > +/*
> > + * pktmbuf constructor for the pool with pinned external buffer,
> > + * given as a callback function to rte_mempool_obj_iter() in
> > + * rte_pktmbuf_pool_create_extbuf(). Set the fields of a packet
> > + * mbuf to their default values.
> > + */
> > +void
> > +rte_pktmbuf_init_extmem(struct rte_mempool *mp,
> > +			void *opaque_arg,
> > +			void *_m,
> > +			__attribute__((unused)) unsigned int i) {
> > +	struct rte_mbuf *m = _m;
> > +	struct rte_pktmbuf_extmem_init_ctx *ctx = opaque_arg;
> > +	struct rte_pktmbuf_extmem *ext_mem;
> > +	uint32_t mbuf_size, buf_len, priv_size;
> > +
> > +	priv_size = rte_pktmbuf_priv_size(mp);
> > +	mbuf_size = sizeof(struct rte_mbuf) + priv_size;
> > +	buf_len = rte_pktmbuf_data_room_size(mp);
> > +
> > +	RTE_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) ==
> priv_size);
> > +	RTE_ASSERT(mp->elt_size >= mbuf_size);
> > +	RTE_ASSERT(buf_len <= UINT16_MAX);
> > +
> > +	memset(m, 0, mbuf_size);
> > +	m->priv_size = priv_size;
> > +	m->buf_len = (uint16_t)buf_len;
> > +
> > +	/* set the data buffer pointers to external memory */
> > +	ext_mem = ctx->ext_mem + ctx->ext;
> > +
> > +	RTE_ASSERT(ctx->ext < ctx->ext_num);
> > +	RTE_ASSERT(ctx->off < ext_mem->buf_len);
> > +
> > +	m->buf_addr = RTE_PTR_ADD(ext_mem->buf_ptr, ctx->off);
> > +	m->buf_iova = ext_mem->buf_iova == RTE_BAD_IOVA ?
> > +		      RTE_BAD_IOVA : (ext_mem->buf_iova + ctx->off);
> > +
> > +	ctx->off += ext_mem->elt_size;
> > +	if (ctx->off >= ext_mem->buf_len) {
> > +		ctx->off = 0;
> > +		++ctx->ext;
> > +	}
> > +	/* keep some headroom between start of buffer and data */
> > +	m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m-
> >buf_len);
> > +
> > +	/* init some constant fields */
> > +	m->pool = mp;
> > +	m->nb_segs = 1;
> > +	m->port = MBUF_INVALID_PORT;
> > +	m->ol_flags = EXT_ATTACHED_MBUF;
> > +	rte_mbuf_refcnt_set(m, 1);
> > +	m->next = NULL;
> > +}
> > +
> > +
> >  /* Helper to create a mbuf pool with given mempool ops name*/  struct
> > rte_mempool *  rte_pktmbuf_pool_create_by_ops(const char *name,
> > unsigned int n, @@ -169,6 +226,89 @@ struct rte_mempool *
> >  			data_room_size, socket_id, NULL);
> >  }
> >
> > +/* Helper to create a mbuf pool with pinned external data buffers. */
> > +struct rte_mempool * rte_pktmbuf_pool_create_extbuf(const char *name,
> > +unsigned int n,
> > +	unsigned int cache_size, uint16_t priv_size,
> > +	uint16_t data_room_size, int socket_id,
> > +	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num) {
> > +	struct rte_mempool *mp;
> > +	struct rte_pktmbuf_pool_private mbp_priv;
> > +	struct rte_pktmbuf_extmem_init_ctx init_ctx;
> > +	const char *mp_ops_name;
> > +	unsigned int elt_size;
> > +	unsigned int i, n_elts = 0;
> > +	int ret;
> > +
> > +	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
> > +		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
> > +			priv_size);
> > +		rte_errno = EINVAL;
> > +		return NULL;
> > +	}
> > +	/* Check the external memory descriptors. */
> > +	for (i = 0; i < ext_num; i++) {
> > +		struct rte_pktmbuf_extmem *extm = ext_mem + i;
> > +
> > +		if (!extm->elt_size || !extm->buf_len || !extm->buf_ptr) {
> > +			RTE_LOG(ERR, MBUF, "invalid extmem descriptor\n");
> > +			rte_errno = EINVAL;
> > +			return NULL;
> > +		}
> > +		if (data_room_size > extm->elt_size) {
> > +			RTE_LOG(ERR, MBUF, "ext elt_size=%u is too small\n",
> > +				priv_size);
> > +			rte_errno = EINVAL;
> > +			return NULL;
> > +		}
> > +		n_elts += extm->buf_len / extm->elt_size;
> > +	}
> > +	/* Check whether enough external memory provided. */
> > +	if (n_elts < n) {
> > +		RTE_LOG(ERR, MBUF, "not enough extmem\n");
> > +		rte_errno = ENOMEM;
> > +		return NULL;
> > +	}
> > +	elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size;
> > +	memset(&mbp_priv, 0, sizeof(mbp_priv));
> > +	mbp_priv.mbuf_data_room_size = data_room_size;
> > +	mbp_priv.mbuf_priv_size = priv_size;
> > +	mbp_priv.flags = RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
> > +
> > +	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
> > +		 sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
> > +	if (mp == NULL)
> > +		return NULL;
> > +
> > +	mp_ops_name = rte_mbuf_best_mempool_ops();
> > +	ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
> > +	if (ret != 0) {
> > +		RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
> > +		rte_mempool_free(mp);
> > +		rte_errno = -ret;
> > +		return NULL;
> > +	}
> > +	rte_pktmbuf_pool_init(mp, &mbp_priv);
> > +
> > +	ret = rte_mempool_populate_default(mp);
> > +	if (ret < 0) {
> > +		rte_mempool_free(mp);
> > +		rte_errno = -ret;
> > +		return NULL;
> > +	}
> > +
> > +	init_ctx = (struct rte_pktmbuf_extmem_init_ctx){
> > +		.ext_mem = ext_mem,
> > +		.ext_num = ext_num,
> > +		.ext = 0,
> > +		.off = 0,
> > +	};
> > +	rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx);
> > +
> > +	return mp;
> > +}
> 
> Instead of duplicating some code, would it be possible to do:
> 
> int
> rte_pktmbuf_pool_attach_extbuf(struct rte_mempool *mp,
> 	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num) {
> 	struct rte_pktmbuf_extmem_init_ctx init_ctx = { 0 };
> 	struct rte_pktmbuf_pool_private *priv;
> 
> 	/* XXX assert mempool is fully populated? */
> 
> 	priv = rte_mempool_get_priv(mp);
> 	mbp_priv.flags |= RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
> 
> 	rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx);
> 
> 	return init_ctx.ret;
> }
> 
> The application would have to call:
> 
> 	rte_pktmbuf_pool_create(...);
> 	rte_pktmbuf_pool_attach_extbuf(...);
> 
It seems there are some disadvantages:
- no data_room_size check (we should remove asserts from rte_pktmbuf_pool_init)
- rte_mempool_obj_iter would be called twice, it might involve rte_mempool_virt2iova()
  and it would take some time

The code duplication is not so large as it could be seen from the diff - the part of
the  rte_pktmbuf_pool_create_extbuf() is related to checking extmem, and the main part
of job is done in the rte_pktmbuf_init_extmem().

> 
> > +
> >  /* do some sanity checks on a mbuf: panic if it fails */  void
> > rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header) diff
> > --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h index
> > 8f486af..7bde297 100644
> > --- a/lib/librte_mbuf/rte_mbuf.h
> > +++ b/lib/librte_mbuf/rte_mbuf.h
> > @@ -642,6 +642,34 @@ static inline struct rte_mbuf
> > *rte_mbuf_raw_alloc(struct rte_mempool *mp)  void
> rte_pktmbuf_init(struct rte_mempool *mp, void *opaque_arg,
> >  		      void *m, unsigned i);
> >
> > +/** The context to initialize the mbufs with pinned external buffers.
> > +*/ struct rte_pktmbuf_extmem_init_ctx {
> > +	struct rte_pktmbuf_extmem *ext_mem; /* pointer to descriptor
> array. */
> > +	unsigned int ext_num; /* number of descriptors in array. */
> > +	unsigned int ext; /* loop descriptor index. */
> > +	size_t off; /* loop buffer offset. */ };
> > +
> > +/**
> > + * The packet mbuf constructor for pools with pinned external memory.
> > + *
> > + * This function initializes some fields in the mbuf structure that
> > +are
> > + * not modified by the user once created (origin pool, buffer start
> > + * address, and so on). This function is given as a callback function
> > +to
> > + * rte_mempool_obj_iter() called from rte_mempool_create_extmem().
> > + *
> > + * @param mp
> > + *   The mempool from which mbufs originate.
> > + * @param opaque_arg
> > + *   A pointer to the rte_pktmbuf_extmem_init_ctx - initialization
> > + *   context structure
> > + * @param m
> > + *   The mbuf to initialize.
> > + * @param i
> > + *   The index of the mbuf in the pool table.
> > + */
> > +void rte_pktmbuf_init_extmem(struct rte_mempool *mp, void
> *opaque_arg,
> > +			     void *m, unsigned int i);
> >
> >  /**
> >   * A  packet mbuf pool constructor.
> > @@ -743,6 +771,62 @@ struct rte_mempool *
> >  	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
> >  	int socket_id, const char *ops_name);
> >
> > +/** A structure that describes the pinned external buffer segment. */
> > +struct rte_pktmbuf_extmem {
> > +	void *buf_ptr;		/**< The virtual address of data buffer. */
> > +	rte_iova_t buf_iova;	/**< The IO address of the data buffer. */
> > +	size_t buf_len;		/**< External buffer length in bytes. */
> > +	uint16_t elt_size;	/**< mbuf element size in bytes. */
> > +};
> > +
> > +/**
> > + * Create a mbuf pool with external pinned data buffers.
> > + *
> > + * This function creates and initializes a packet mbuf pool that
> > +contains
> > + * only mbufs with external buffer. It is a wrapper to rte_mempool
> functions.
> > + *
> > + * @param name
> > + *   The name of the mbuf pool.
> > + * @param n
> > + *   The number of elements in the mbuf pool. The optimum size (in terms
> > + *   of memory usage) for a mempool is when n is a power of two minus
> one:
> > + *   n = (2^q - 1).
> > + * @param cache_size
> > + *   Size of the per-core object cache. See rte_mempool_create() for
> > + *   details.
> > + * @param priv_size
> > + *   Size of application private are between the rte_mbuf structure
> > + *   and the data buffer. This value must be aligned to
> RTE_MBUF_PRIV_ALIGN.
> > + * @param data_room_size
> > + *   Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM.
> > + * @param socket_id
> > + *   The socket identifier where the memory should be allocated. The
> > + *   value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the
> > + *   reserved zone.
> > + * @param ext_mem
> > + *   Pointer to the array of structures describing the external memory
> > + *   for data buffers. It is caller responsibility to register this memory
> > + *   with rte_extmem_register() (if needed), map this memory to
> appropriate
> > + *   physical device, etc.
> > + * @param ext_num
> > + *   Number of elements in the ext_mem array.
> > + * @return
> > + *   The pointer to the new allocated mempool, on success. NULL on error
> > + *   with rte_errno set appropriately. Possible rte_errno values include:
> > + *    - E_RTE_NO_CONFIG - function could not get pointer to rte_config
> structure
> > + *    - E_RTE_SECONDARY - function was called from a secondary process
> instance
> > + *    - EINVAL - cache size provided is too large, or priv_size is not aligned.
> > + *    - ENOSPC - the maximum number of memzones has already been
> allocated
> > + *    - EEXIST - a memzone with the same name already exists
> > + *    - ENOMEM - no appropriate memory area found in which to create
> memzone
> > + */
> > +__rte_experimental
> > +struct rte_mempool *
> > +rte_pktmbuf_pool_create_extbuf(const char *name, unsigned int n,
> > +	unsigned int cache_size, uint16_t priv_size,
> > +	uint16_t data_room_size, int socket_id,
> > +	struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num);
> > +
> >  /**
> >   * Get the data room size of mbufs stored in a pktmbuf_pool
> >   *
> > @@ -818,7 +902,7 @@ static inline void rte_pktmbuf_reset(struct rte_mbuf
> *m)
> >  	m->nb_segs = 1;
> >  	m->port = MBUF_INVALID_PORT;
> >
> > -	m->ol_flags = 0;
> > +	m->ol_flags &= EXT_ATTACHED_MBUF;
> >  	m->packet_type = 0;
> >  	rte_pktmbuf_reset_headroom(m);
> >
> 
> I wonder if it should go in previous patch?

Mmm... Definitely - yes 😊
Thanks, will move this line.
> 
> > diff --git a/lib/librte_mbuf/rte_mbuf_version.map
> > b/lib/librte_mbuf/rte_mbuf_version.map
> > index 3bbb476..ab161bc 100644
> > --- a/lib/librte_mbuf/rte_mbuf_version.map
> > +++ b/lib/librte_mbuf/rte_mbuf_version.map
> > @@ -44,5 +44,6 @@ EXPERIMENTAL {
> >  	rte_mbuf_dyn_dump;
> >  	rte_pktmbuf_copy;
> >  	rte_pktmbuf_free_bulk;
> > +	rte_pktmbuf_pool_create_extbuf;
> >
> >  };
> > --
> > 1.8.3.1
> >
With best regards, Slava



More information about the dev mailing list