[dpdk-dev] [ovs-dev] [PATCH RFC] dpif-netdev: Add support Intel DPDK based ports.

Pravin Shelar pshelar at nicira.com
Wed Jan 29 22:49:22 CET 2014


On Wed, Jan 29, 2014 at 2:01 AM, Thomas Graf <tgraf at redhat.com> wrote:
> On 01/28/2014 02:48 AM, pshelar at nicira.com wrote:
>>
>> From: Pravin B Shelar <pshelar at nicira.com>
>>
>> Following patch adds DPDK netdev-class to userspace datapath.
>> Approach taken in this patch differs from Intel® DPDK vSwitch
>> where DPDK datapath switching is done in saparate process.  This
>> patch adds support for DPDK type port and uses OVS userspace
>> datapath for switching.  Therefore all DPDK processing and flow
>> miss handling is done in single process.  This also avoids code
>> duplication by reusing OVS userspace datapath switching and
>> therefore it supports all flow matching and actions that
>> user-space datapath supports.  Refer to INSTALL.DPDK doc for
>> further info.
>>
>> With this patch I got similar performance for netperf TCP_STREAM
>> tests compared to kernel datapath.
>>
>> This is based a patch from Gerald Rogers.
>>
>> Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
>> CC: "Gerald Rogers" <gerald.rogers at intel.com>
>
>
> Pravin,
>
> Some initial comments below. I will provide more after deeper
> digging.
>

> Do you have any ideas on how to implement the TX batching yet?
>
We can batch packets for some interval, then to do tx-burst. But I did
not see any performance improvements as we have other bottleneck in
userspace datapath. Ben's RCU patch should help there.

>
>> +
>> +static int
>> +netdev_dpdk_rx_drain(struct netdev_rx *rx_)
>> +{
>> +    struct netdev_rx_dpdk *rx = netdev_rx_dpdk_cast(rx_);
>> +    int pending;
>> +    int i;
>> +
>> +    pending = rx->ofpbuf_cnt;
>> +    if (pending) {
>
>
> This conditional seems unneeded.
>
Right.

>
>> +        for (i = 0; i < pending; i++) {
>> +             build_ofpbuf(rx, &rx->ofpbuf[i], NULL);
>> +        }
>> +        rx->ofpbuf_cnt = 0;
>> +        return 0;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/* Tx function. Transmit packets indefinitely */
>> +static int
>> +dpdk_do_tx_copy(struct netdev *netdev, char *buf, int size)
>> +{
>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> +    struct rte_mbuf *pkt;
>> +    uint32_t nb_tx = 0;
>> +
>> +    pkt = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
>> +    if (!pkt) {
>> +        return 0;
>
>
> Silent drop? ;-) Shouldn't these drops be accounted for somehow?
>
ahh, I will keep it in netdev-dpdk.

>
>> +    }
>> +
>> +    /* We have to do a copy for now */
>> +    memcpy(pkt->pkt.data, buf, size);
>> +
>> +    rte_pktmbuf_data_len(pkt) = size;
>> +    rte_pktmbuf_pkt_len(pkt) = size;
>> +
>> +    rte_spinlock_lock(&dev->tx_lock);
>
>
> What is the purpose of tx_lock here? Multiple threads writing to
> the same Q? The lock is not acquired for the zerocopy path below.
>
There are PMD threads which have their own queue. So tx in these
threads is lockless. But vswitchd can send packet from other thread
all other thread send packets from single queue which is locked.

>
>> +    nb_tx = rte_eth_tx_burst(dev->port_id, NR_QUEUE, &pkt, 1);
>> +    rte_spinlock_unlock(&dev->tx_lock);
>> +
>> +    if (nb_tx != 1) {
>> +        /* free buffers if we couldn't transmit packets */
>> +        rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
>> +    }
>> +    return nb_tx;
>> +}
>> +
>> +static int
>> +netdev_dpdk_send(struct netdev *netdev,
>> +                 struct ofpbuf *ofpbuf, bool may_steal)
>> +{
>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> +
>> +    if (ofpbuf->size > dev->max_packet_len) {
>> +        VLOG_ERR("2big size %d max_packet_len %d",
>> +                  (int)ofpbuf->size , dev->max_packet_len);
>
>
> Should probably use VLOG_RATE_LIMIT_INIT
>
ok.

>
>> +        return E2BIG;
>> +    }
>> +
>> +    rte_prefetch0(&ofpbuf->private_p);
>> +    if (!may_steal ||
>> +        !ofpbuf->private_p || ofpbuf->source != OFPBUF_DPDK) {
>> +        dpdk_do_tx_copy(netdev, (char *) ofpbuf->data, ofpbuf->size);
>> +    } else {
>> +        struct rte_mbuf *pkt;
>> +        uint32_t nb_tx;
>> +        int qid;
>> +
>> +        pkt = ofpbuf->private_p;
>> +        ofpbuf->private_p = NULL;
>> +        rte_pktmbuf_data_len(pkt) = ofpbuf->size;
>> +        rte_pktmbuf_pkt_len(pkt) = ofpbuf->size;
>> +
>> +        /* TODO: TX batching. */
>> +        qid = rte_lcore_id() % NR_QUEUE;
>> +        nb_tx = rte_eth_tx_burst(dev->port_id, qid, &pkt, 1);
>> +        if (nb_tx != 1) {
>> +            struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> +
>> +            rte_mempool_put_bulk(dev->dpdk_mp->mp, (void **)&pkt, 1);
>> +            VLOG_ERR("TX error, zero packets sent");
>
>
> Same here
>
ok

>
>> +       }
>> +    }
>> +    return 0;
>> +}
>
>
>> +static int
>> +netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
>> +{
>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> +    int old_mtu, err;
>> +    struct dpdk_mp *old_mp;
>> +    struct dpdk_mp *mp;
>> +
>> +    ovs_mutex_lock(&dpdk_mutex);
>> +    ovs_mutex_lock(&dev->mutex);
>> +    if (dev->mtu == mtu) {
>> +        err = 0;
>> +        goto out;
>> +    }
>> +
>> +    mp = dpdk_mp_get(dev->socket_id, dev->mtu);
>> +    if (!mp) {
>> +        err = ENOMEM;
>> +        goto out;
>> +    }
>> +
>> +    rte_eth_dev_stop(dev->port_id);
>> +
>> +    old_mtu = dev->mtu;
>> +    old_mp = dev->dpdk_mp;
>> +    dev->dpdk_mp = mp;
>> +    dev->mtu = mtu;
>> +    dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
>> +
>> +    err = dpdk_eth_dev_init(dev);
>> +    if (err) {
>> +
>> +        dpdk_mp_put(mp);
>> +        dev->mtu = old_mtu;
>> +        dev->dpdk_mp = old_mp;
>> +        dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
>> +        dpdk_eth_dev_init(dev);
>
>
> Would be nice if we don't need these constructs and DPDK would
> provide an all or nothing init method.
>
right, today we need to do bunch of calls to setup a device.

>
>> +        goto out;
>> +    }
>> +
>> +    dpdk_mp_put(old_mp);
>> +out:
>> +    ovs_mutex_unlock(&dev->mutex);
>> +    ovs_mutex_unlock(&dpdk_mutex);
>> +    return err;
>> +}
>> +
>
>
>> +static int
>>
>> +netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
>> +                           enum netdev_flags off, enum netdev_flags on,
>> +                           enum netdev_flags *old_flagsp)
>> +    OVS_REQUIRES(dev->mutex)
>> +{
>> +    int err;
>> +
>> +    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
>> +        return EINVAL;
>> +    }
>> +
>> +    *old_flagsp = dev->flags;
>> +    dev->flags |= on;
>> +    dev->flags &= ~off;
>> +
>> +    if (dev->flags == *old_flagsp) {
>> +        return 0;
>> +    }
>> +
>> +    rte_eth_dev_stop(dev->port_id);
>> +
>> +    if (dev->flags & NETDEV_UP) {
>> +        err = rte_eth_dev_start(dev->port_id);
>> +        if (err)
>> +            return err;
>> +    }
>
>
> I'm not a DPDK expert but is it required to restart the device
> to change promisc settings or could we conditionally start and
> stop based on the previous flags state?
>
promiscuous-enable does not require device reset, but I was lazy to
write another case for promiscuous flag change :)
I will update code.

>> +
>> +    if (dev->flags & NETDEV_PROMISC) {
>> +        rte_eth_promiscuous_enable(dev->port_id);
>> +        rte_eth_allmulticast_enable(dev->port_id);
>> +    }
>> +
>> +    return 0;
>> +}
>>
>> +
>> +static void
>> +netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
>> +                            const char *argv[], void *aux OVS_UNUSED)
>> +{
>> +    bool up;
>> +
>> +    if (!strcasecmp(argv[argc - 1], "up")) {
>> +        up = true;
>> +    } else if ( !strcasecmp(argv[argc - 1], "down")) {
>> +        up = false;
>> +    } else {
>> +        unixctl_command_reply_error(conn, "Invalid Admin State");
>> +        return;
>> +    }
>> +
>> +    if (argc > 2) {
>> +        struct netdev *netdev = netdev_from_name(argv[1]);
>
>
> For future refinement: Usability would be increased if either a
> strict one interface argument is enforced or multiple interface
> names could be passed in, e.g. set-admin-state dpdk0 dpdk1 up
> or set-admin-state dpdk0 up dpdk1 up
>
> As of now, dpdk1 is silently ignored which is not nice.
>
ok.

>
>> +        if (netdev && is_dpdk_class(netdev->netdev_class)) {
>> +            struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
>> +
>> +            ovs_mutex_lock(&dpdk_dev->mutex);
>> +            netdev_dpdk_set_admin_state__(dpdk_dev, up);
>> +            ovs_mutex_unlock(&dpdk_dev->mutex);
>> +
>> +            netdev_close(netdev);
>> +        } else {
>> +            unixctl_command_reply_error(conn, "Unknown Dummy Interface");
>
>
> I think this should read "Not a DPDK Interface" or something similar.
>
ok.

>
>> +            netdev_close(netdev);
>> +            return;
>> +        }
>> +    } else {
>> +        struct netdev_dpdk *netdev;
>> +
>> +        ovs_mutex_lock(&dpdk_mutex);
>> +        LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
>> +            ovs_mutex_lock(&netdev->mutex);
>> +            netdev_dpdk_set_admin_state__(netdev, up);
>> +            ovs_mutex_unlock(&netdev->mutex);
>> +        }
>> +        ovs_mutex_unlock(&dpdk_mutex);
>> +    }
>> +    unixctl_command_reply(conn, "OK");
>> +}
>> +
>> +
>> -    retval = rx->netdev->netdev_class->rx_recv(rx, buffer);
>> -    if (!retval) {
>> -        COVERAGE_INC(netdev_received);
>
>
> Are you removing the netdev_receive counter on purpose here?

That is mistake. I will add it back.

Thanks a lot for review.

Pravin.


More information about the dev mailing list