[dpdk-dev] [PATCH v4] drivers/net:new PMD using tun/tap host interface

Wiles, Keith keith.wiles at intel.com
Tue Oct 11 23:07:35 CEST 2016


Regards,
Keith

> On Oct 11, 2016, at 6:49 AM, Yigit, Ferruh <ferruh.yigit at intel.com> wrote:
> 
> On 10/4/2016 3:45 PM, Keith Wiles wrote:
>> The rte_eth_tap.c PMD creates a device using TUN/TAP interfaces
>> on the local host. The PMD allows for DPDK and the host to
>> communicate using a raw device interface on the host and in
>> the DPDK application. The device created is a Tap device with
>> a L2 packet header.
>> 

Will try to ship out a v5 soon.
>> v4 - merge with latest driver changes
>> v3 - fix includes by removing ifdef for other type besides Linux.
>>     Fix the copyright notice in the Makefile
>> v2 - merge all of the patches into one patch.
>>     Fix a typo on naming the tap device.
>>     Update the maintainers list
>> 
>> Signed-off-by: Keith Wiles <keith.wiles at intel.com>
>> ---
>> MAINTAINERS                             |   5 +
>> config/common_linuxapp                  |   2 +
>> doc/guides/nics/tap.rst                 |  84 ++++
>> drivers/net/Makefile                    |   1 +
>> drivers/net/tap/Makefile                |  57 +++
>> drivers/net/tap/rte_eth_tap.c           | 866 ++++++++++++++++++++++++++++++++
>> drivers/net/tap/rte_pmd_tap_version.map |   4 +
>> mk/rte.app.mk                           |   1 +
>> 8 files changed, 1020 insertions(+)
>> create mode 100644 doc/guides/nics/tap.rst
>> create mode 100644 drivers/net/tap/Makefile
>> create mode 100644 drivers/net/tap/rte_eth_tap.c
>> create mode 100644 drivers/net/tap/rte_pmd_tap_version.map
>> 
> <>
>> diff --git a/config/common_linuxapp b/config/common_linuxapp
>> index 2483dfa..59a2053 100644
>> --- a/config/common_linuxapp
>> +++ b/config/common_linuxapp
>> @@ -44,3 +44,5 @@ CONFIG_RTE_LIBRTE_PMD_VHOST=y
>> CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
>> CONFIG_RTE_LIBRTE_POWER=y
>> CONFIG_RTE_VIRTIO_USER=y
>> +CONFIG_RTE_LIBRTE_PMD_TAP=y
> 
> According existing config items, a default value of a config option
> should go to config/common_base, and environment specific config file
> overwrites it if required.
> So this option needs to be added into config/common_base too as disabled
> by default.

Add the define to common_base as no, plus a comment for Linux only.

> 
>> +CONFIG_RTE_PMD_TAP_MAX_QUEUES=32

Moved this to the .c file as a define.
> 
> Is the number of max queues really needs to be a config option, I assume
> in normal use case user won't need to update this and will use single
> queue, if that is true what about pushing this into source code to not
> make config file more complex?
> 
>> diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
> 
> <...>
> 
>> +.. code-block:: console
>> +
>> +   The interfaced name can be changed by adding the iface=foo0
>> +   e.g. --vedv=eth_tap,iface=foo0 --vdev=eth_tap,iface=foo1, ...
> 
> s/vedv/vdev
> eth_tap needs to be net_tap as part of unifying device names work

Fixed.
> 
> <...>
> 
>> diff --git a/drivers/net/Makefile b/drivers/net/Makefile
>> index bc93230..b4afa98 100644
>> --- a/drivers/net/Makefile
>> +++ b/drivers/net/Makefile
>> @@ -55,6 +55,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD) += thunderx
>> DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio
>> DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += vmxnet3
>> DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += xenvirt
>> +DIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap
> 
> Rest of the PMDs sorted alphabetically, please do same.

Done.
> 
>> 
>> ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
>> DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost
> 
> <...>
> 
>> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
> 
> <...>
> 
>> +
>> +static const char *drivername = "Tap PMD";
>> +static int tap_unit = 0;
> 
> No need to initialize to zero.

Fixed
> 
> <...>
> 
>> +
>> +struct pmd_internals {
>> +	char name[RTE_ETH_NAME_MAX_LEN];	/* Internal Tap device name */
>> +	uint16_t nb_queues;			/* Number of queues supported */
>> +	uint16_t pad0;
> 
> Why this padding? Is it reserved?

Removed pad0. I just like to know about gaps in the structures is the reason.
> 
>> +	struct ether_addr eth_addr;	/* Mac address of the device port */
>> +
>> +	int if_index;			/* IF_INDEX for the port */
>> +	int fds[RTE_PMD_TAP_MAX_QUEUES]; /* List of all file descriptors */
>> +
>> +	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES];	/* List of RX queues */
>> +	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES];	/* List of TX queues */
>> +};
>> +
>> +/*
>> + * Tun/Tap allocation routine
>> + *
>> + * name is the number of the interface to use, unless NULL to take the host
>> + * supplied name.
>> + */
>> +static int
>> +tun_alloc(char * name)
> 
> char *name

Fixed.
> 
> <...>
> 
>> +
>> +	/* Always set the fiile descriptor to non-blocking */
> 
> s/fiile/file
> 
>> +	if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
>> +		RTE_LOG(ERR, PMD, "Unable to set to nonblocking\n");
>> +		perror("F_SETFL, NONBLOCK");
>> +		goto error;
>> +	}
>> +
>> +	/* If the name is different that new name as default */
>> +	if (name && strcmp(name, ifr.ifr_name))
>> +		strcpy(name, ifr.ifr_name);
> What about more secure copy?

Changed to be more secure.
> 
>> +
>> +	return fd;
>> +
>> +error:
>> +	if (fd > 0)
>> +		close(fd);
>> +	return -1;
>> +}
>> +
> 
> <...>
> 
>> +
>> +static void
>> +tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
>> +{
>> +	struct pmd_internals *internals = dev->data->dev_private;
>> +
>> +	dev_info->driver_name = drivername;
>> +	dev_info->if_index = internals->if_index;
>> +	dev_info->max_mac_addrs = 1;
>> +	dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
>> +	dev_info->max_rx_queues = (uint16_t)internals->nb_queues;
>> +	dev_info->max_tx_queues = (uint16_t)internals->nb_queues;
> casting to uint16_t is not requires, it is already uint16_t.

Removed
> 
>> +	dev_info->min_rx_bufsize = 0;
>> +	dev_info->pci_dev = NULL;
>> +}
>> +
>> +static void
>> +tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
> igb_stats?
> 
>> +{
>> +	unsigned i, imax;
>> +	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
>> +	unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
>> +	const struct pmd_internals *internal = dev->data->dev_private;
>> +
>> +	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
>> +		internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
>> +
>> +	for (i = 0; i < imax; i++) {
>> +		igb_stats->q_ipackets[i] = internal->rxq[i].stats.ipackets;
>> +		igb_stats->q_ibytes[i] = internal->rxq[i].stats.ibytes;
>> +		rx_total += igb_stats->q_ipackets[i];
>> +		rx_bytes_total += igb_stats->q_ibytes[i];
>> +	}
>> +
>> +	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
>> +		internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
> Do we need to duplicate imax calculation?

Removed
> 
> 
>> +
>> +	for (i = 0; i < imax; i++) {
>> +		igb_stats->q_opackets[i] = internal->txq[i].stats.opackets;
>> +		igb_stats->q_errors[i] = internal->txq[i].stats.errs;
>> +		igb_stats->q_obytes[i] = internal->txq[i].stats.obytes;
>> +		tx_total += igb_stats->q_opackets[i];
>> +		tx_err_total += igb_stats->q_errors[i];
>> +		tx_bytes_total += igb_stats->q_obytes[i];
>> +	}
>> +
>> +	igb_stats->ipackets = rx_total;
>> +	igb_stats->ibytes = rx_bytes_total;
>> +	igb_stats->opackets = tx_total;
>> +	igb_stats->oerrors = tx_err_total;
>> +	igb_stats->obytes = tx_bytes_total;
>> +}
>> +
> 
> <...>
> 
>> +
>> +static int
>> +rte_eth_dev_create(const char *name,
>> +		   struct rte_eth_dev **eth_dev,
>> +		   const struct eth_dev_ops *dev_ops,
>> +		   void **internals, size_t internal_size,
>> +		   uint16_t flag)
>> +{
>> +	char buff[RTE_ETH_NAME_MAX_LEN];
>> +	int numa_node = rte_socket_id();
>> +	struct rte_eth_dev *dev = NULL;
>> +	struct rte_eth_dev_data *data = NULL;
>> +	void *priv = NULL;
>> +
>> +	if ((name == NULL) || (eth_dev == NULL) || (dev_ops == NULL) ||
>> +	    (internals == NULL) || (internal_size == 0)) {
>> +		RTE_PMD_DEBUG_TRACE("Paramters are invalid\n");
>> +		return -1;
>> +	}
>> +
>> +	dev = rte_eth_dev_allocate(name);
>> +	if (dev == NULL) {
>> +		RTE_PMD_DEBUG_TRACE("%s: rte_eth_dev_allocate failed for %s\n",
>> +				    name, buff);
>> +		goto error;
>> +	}
>> +
>> +	if (flag & RTE_USE_PRIVATE_DATA) {
> 
> You may need to save this flag value somewhere in internals, to decide
> how to free data later.

Let me look into this one more and see if it is required at all.
> 
>> +		/*
>> +		 * now do all data allocation - for eth_dev structure, dummy
>> +		 * pci driver and internal (private) data
>> +		 */
>> +		snprintf(buff, sizeof(buff), "D-%s-%d", name, numa_node);
>> +		data = rte_zmalloc_socket(buff, sizeof(struct rte_eth_dev_data),
>> +					  0, numa_node);
>> +		if (data == NULL) {
>> +			RTE_PMD_DEBUG_TRACE("%s: Unable to allocate memory\n",
>> +					    name);
>> +			goto error;
>> +		}
>> +		/* move the current state of the structure to the new one */
>> +		rte_memcpy(data, dev->data, sizeof(struct rte_eth_dev_data));
> Why do we need to copy, trying to preserve which data?
> 
>> +		dev->data = data;	/* Override the current data pointer */
>> +	} else
>> +		data = dev->data;
>> +
>> +	snprintf(buff, sizeof(buff), "I-%s-%d", name, numa_node);
>> +	priv = rte_zmalloc_socket(buff, internal_size, 0, numa_node);
>> +	if (priv == NULL) {
>> +		RTE_PMD_DEBUG_TRACE("Unable to allocate internal memory %lu\n",
>> +				    internal_size);
>> +		goto error;
>> +	}
>> +
>> +	/* Setup some default values */
>> +	dev->dev_ops = dev_ops;
>> +	data->dev_private = priv;
> 
>> +	data->port_id = dev->data->port_id;
>> +	memmove(data->name, dev->data->name, strlen(dev->data->name));
> These two assignments are useless, needs to be done before "dev->data =
> data" assignment.

Reworked this code area to remove it.
> 
>> +
>> +	dev->driver = NULL;
>> +	data->dev_flags = RTE_ETH_DEV_DETACHABLE;
>> +	data->kdrv = RTE_KDRV_NONE;
>> +	data->numa_node = numa_node;
>> +
>> +	*eth_dev = dev;
>> +	*internals = priv;
>> +
>> +	return 0;
>> +error:
>> +	rte_free(priv);
>> +
>> +	if (flag & RTE_USE_PRIVATE_DATA)
>> +		rte_free(data);
>> +
>> +	rte_eth_dev_release_port(dev);
>> +
>> +	return -1;
>> +}
>> +
>> +static int
>> +pmd_init_internals(const char *name, struct tap_info *tap,
>> +		   struct pmd_internals **internals,
>> +		   struct rte_eth_dev **eth_dev)
>> +{
>> +	struct rte_eth_dev *dev = NULL;
>> +	struct pmd_internals *internal = NULL;
>> +	struct rte_eth_dev_data *data = NULL;
>> +	int ret, i, fd = -1;
>> +
>> +	RTE_LOG(INFO, PMD,
>> +		"%s: Create TUN/TAP Ethernet device with %d queues on numa %u\n",
>> +		name, RTE_PMD_TAP_MAX_QUEUES, rte_socket_id());
>> +
>> +	pmd_link.link_speed = tap->speed;
>> +
>> +	ret = rte_eth_dev_create(tap->name, &dev, &ops,
>> +				 (void **)&internal, sizeof(struct pmd_internals),
> Why rte_eth_dev_create() get "void **internals" which requires casting,
> but not "struct pmd_internals **internals” ?

Fixed.
> 
>> +				 RTE_USE_PRIVATE_DATA);
>> +	if (ret < 0)
>> +		return -1;
>> +
>> +	strncpy(internal->name, tap->name, sizeof(internal->name));
>> +
>> +	internal->nb_queues = RTE_PMD_TAP_MAX_QUEUES;
>> +
>> +	/* Create the first Tap device */
>> +	if ((fd = tun_alloc(dev->data->name)) < 0) {
>> +		RTE_LOG(ERR, PMD, "tun_alloc(%s) failed\n", dev->data->name);
>> +		rte_free(internal);
> rte_free(dev->data); ?
> But needs to check RTE_USE_PRIVATE_DATA ..

See above
> 
>> +		rte_eth_dev_release_port(dev);
>> +		return -1;
>> +	}
>> +
>> +	/* Presetup the fds to -1 as being not working */
>> +	for(i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
>> +		internal->fds[i] = -1;
>> +		internal->rxq[i].fd = -1;
>> +		internal->txq[i].fd = -1;
>> +	}
>> +
>> +	/* Take the TUN/TAP fd and place in the first location */
>> +	internal->rxq[0].fd = fd;
>> +	internal->txq[0].fd = fd;
>> +	internal->fds[0] = fd;
>> +
>> +	if (pmd_mac_address(fd, dev, &internal->eth_addr) < 0) {
>> +		rte_free(internal);
> rte_free(dev->data); ?

Yes Added.
> 
>> +		rte_eth_dev_release_port(dev);
>> +		return -1;
>> +	}
>> +
>> +	data = dev->data;
>> +
>> +	data->dev_link = pmd_link;
>> +	data->mac_addrs = &internal->eth_addr;
>> +
>> +	data->nb_rx_queues = (uint16_t)internal->nb_queues;
>> +	data->nb_tx_queues = (uint16_t)internal->nb_queues;
> no cast required.

Removed
> 
>> +	data->drv_name = drivername;
>> +
>> +	*eth_dev = dev;
>> +	*internals = internal;
>> +
>> +	return 0;
>> +}
>> +
> 
> <...>
> 
>> +
>> +static int
>> +set_interface_speed(const char *key __rte_unused,
>> +		    const char *value,
>> +		    void *extra_args __rte_unused)
> need to drop  __rte_unused for extra_args
> 
>> +{
>> +	struct tap_info *tap = (struct tap_info *)extra_args;
>> +
>> +	pmd_link.link_speed = (value) ? atoi(value) : ETH_SPEED_NUM_10G;
>> +	tap->speed = pmd_link.link_speed;
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Open a TAP interface device.
>> + */
>> +static int
>> +rte_pmd_tap_devinit(const char *name, const char *params)
>> +{
>> +	int ret = 0;
>> +	struct rte_kvargs *kvlist;
>> +	struct tap_info tap_info;
>> +
>> +	/* Setup default values */
>> +	memset(&tap_info, 0, sizeof(tap_info));
>> +
>> +	tap_info.speed = ETH_SPEED_NUM_10G;
>> +	snprintf(tap_info.name, sizeof(tap_info.name), "dtap%d", tap_unit++);
> What about extracting iface name "dtap" into a macro to make it more
> visible.

Created macro for the default name.
> 
>> +
>> +	if ((params == NULL) || (params[0] == '\0')) {
>> +		RTE_LOG(INFO, PMD, "Initializing pmd_tap for %s\n", name);
>> +
>> +		ret = eth_dev_tap_create(name, &tap_info);
> This "name" is not used at all (except from RTE_LOG), instead tap->name
> is used for interface name, so why carying this variable around?

Fixed and changed the API.
> 
>> +		goto leave;
>> +	}
>> +
>> +	RTE_LOG(INFO, PMD, "Initialize %s with params (%s)\n", name, params);
>> +
>> +	kvlist = rte_kvargs_parse(params, valid_arguments);
>> +	if (!kvlist) {
>> +		ret = eth_dev_tap_create(name, &tap_info);
>> +		goto leave;
>> +	}
>> +
>> +	if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) {
>> +		ret = rte_kvargs_process(kvlist, ETH_TAP_SPEED_ARG,
>> +					 &set_interface_speed, &tap_info);
>> +		if (ret < 0)
>> +			goto leave;
>> +	} else
>> +		set_interface_speed(NULL, NULL, &tap_info);
> This call is redundant, tap_info already has default speed value set.

Removed
> 
>> +
>> +	if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
>> +		ret = rte_kvargs_process(kvlist, ETH_TAP_IFACE_ARG,
>> +					 &set_interface_name, &tap_info);
>> +		if (ret < 0)
>> +			goto leave;
>> +	} else
>> +		set_interface_name(NULL, NULL, (void *)&tap_info);
> tap_info->name already set to default value (dtap%d), this call is not
> required.

Removed
> 
>> +
>> +	rte_kvargs_free(kvlist);
>> +
>> +leave:
>> +	if (ret == -1)
>> +		RTE_LOG(INFO, PMD, "Failed to create pmd_tap for %s\n", name);
>> +
>> +	return ret;
>> +}
>> +
>> +/*
>> + * detach a TAP device.
>> + */
>> +static int
>> +rte_pmd_tap_devuninit(const char *name)
>> +{
>> +	struct rte_eth_dev *eth_dev = NULL;
>> +	struct pmd_internals *internals;
>> +	int i;
>> +
>> +	RTE_LOG(INFO, PMD, "Closing TUN/TAP Ethernet device on numa %u\n",
>> +		rte_socket_id());
>> +
>> +	if (name == NULL)
> This check is redundant, eal layer won't call this function with "name
> == NULL”

Removed
> 
>> +		return 0;
>> +
>> +	/* find the ethdev entry */
>> +	eth_dev = rte_eth_dev_allocated(name);
>> +	if (eth_dev == NULL)
>> +		return 0;
>> +
>> +	internals = eth_dev->data->dev_private;
>> +	for (i = 0; i < internals->nb_queues; i++)
>> +		if (internals->fds[i] != -1)
>> +			close(internals->fds[i]);
>> +
>> +	rte_free(eth_dev->data->dev_private);
>> +	rte_free(eth_dev->data);
> data can be shared?
> Don't we need a RTE_USE_PRIVATE_DATA flag check?
> 
>> +
>> +	rte_eth_dev_release_port(eth_dev);
>> +
>> +	return 0;
>> +}
>> +
>> +static struct rte_vdev_driver pmd_tap_drv = {
>> +	.init = rte_pmd_tap_devinit,
>> +	.uninit = rte_pmd_tap_devuninit,
>> +};
>> +
>> +DRIVER_REGISTER_VDEV(eth_tap, pmd_tap_drv);
> name convention is now: “net_tap"

Fixed
> 
>> +DRIVER_REGISTER_PARAM_STRING(eth_tap,
>> +			     "iface=<string>,speed=N");
>> diff --git a/drivers/net/tap/rte_pmd_tap_version.map b/drivers/net/tap/rte_pmd_tap_version.map
>> new file mode 100644
>> index 0000000..61463bf
>> --- /dev/null
>> +++ b/drivers/net/tap/rte_pmd_tap_version.map
>> @@ -0,0 +1,4 @@
>> +DPDK_16.11 {
>> +
>> +	local: *;
>> +};
>> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
>> index 1a0095b..bd1d10f 100644
>> --- a/mk/rte.app.mk
>> +++ b/mk/rte.app.mk
>> @@ -129,6 +129,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST)      += -lrte_pmd_vhost
>> endif # $(CONFIG_RTE_LIBRTE_VHOST)
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD)    += -lrte_pmd_vmxnet3_uio
>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_TAP)        += -lrte_pmd_tap
> please put in alphebetical order

Done
> 
>> 
>> ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
>> _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB)   += -lrte_pmd_aesni_mb




More information about the dev mailing list