[dpdk-dev] [PATCH v2 08/16] Add support for mapping devices through	VFIO.
    Xu, HuilongX 
    huilongx.xu at intel.com
       
    Tue May 27 05:19:15 CEST 2014
    
    
  
VFIO is kernel 3.6+ only, and so is only compiled when DPDK config
option CONFIG_RTE_EAL_VFIO is enabled, and kernel 3.6 or higher is
detected, thus preventing compile failures on older kernels if VFIO is
enabled in config (and it is, by default).
Since VFIO cannot be used to map the same device twice, secondary
processes receive the device/group fd's by means of communicating over a
local socket. Only group and container fd's should be sent, as device
fd's can be obtained via ioctl() calls' on the group fd.
For multiprocess, VFIO distinguishes between existing but unused groups
(e.g. groups that aren't bound to the VFIO driver) and non-existing groups in
order to know if the secondary process requests a valid group, or if
secondary process requests something that doesn't exist.
Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
Tested-by: HuilongX Xu <huilongx.xu at intel.com>
Compile pass
     >>Compile OS: FC20 x86_64
     >>Kernel version: 3.13.6-200
     >>GCC version: 4.8.2
     >>Server: Crownpass
---
lib/librte_eal/linuxapp/eal/Makefile               |    5 +-
lib/librte_eal/linuxapp/eal/eal.c                  |    1 +
lib/librte_eal/linuxapp/eal/eal_pci_vfio.c         |  719 ++++++++++++++++++++
lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c  |  367 ++++++++++
.../linuxapp/eal/include/eal_internal_cfg.h        |    3 +
lib/librte_eal/linuxapp/eal/include/eal_pci_init.h |   55 ++
lib/librte_eal/linuxapp/eal/include/eal_vfio.h     |    6 +
7 files changed, 1155 insertions(+), 1 deletions(-)
create mode 100644 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
create mode 100644 lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 527fa2a..3a39cca 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -58,6 +58,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_thread.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_log.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_uio.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_pci_vfio_socket.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_debug.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_lcore.c
SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_timer.c
@@ -88,12 +90,13 @@ CFLAGS_eal_common_log.o := -D_GNU_SOURCE
CFLAGS_eal_hugepage_info.o := -D_GNU_SOURCE
CFLAGS_eal_pci.o := -D_GNU_SOURCE
CFLAGS_eal_common_whitelist.o := -D_GNU_SOURCE
+CFLAGS_eal_pci_vfio.o := -D_GNU_SOURCE
 # workaround for a gcc bug with noreturn attribute
# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
CFLAGS_eal_thread.o += -Wno-return-type
-CFLAGS_eal_hpet.o += -Wno-return-type
+CFLAGS_eal_pci_vfio_socket.o += -Wno-return-type
endif
 INC := rte_per_lcore.h rte_lcore.h rte_interrupts.h rte_kni_common.h rte_dom0_common.h
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index de182e1..01bfd6c 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -650,6 +650,7 @@ eal_parse_args(int argc, char **argv)
               internal_config.force_sockets = 0;
               internal_config.syslog_facility = LOG_DAEMON;
               internal_config.xen_dom0_support = 0;
+             internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
#ifdef RTE_LIBEAL_USE_HPET
               internal_config.no_hpet = 0;
#else
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
new file mode 100644
index 0000000..0a6f95c
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -0,0 +1,719 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <linux/pci_regs.h>
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_tailq.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "eal_pci_init.h"
+#include "eal_vfio.h"
+
+/**
+ * @file
+ * PCI probing under linux (VFIO version)
+ *
+ * This code tries to determine if the PCI device is bound to VFIO driver,
+ * and initialize it (map BARs, set up interrupts) if that's the case.
+ *
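+ * This file is always listed in the EAL Makefile; the code in it is guarded
+ * by VFIO_PRESENT (per the patch description: CONFIG_RTE_EAL_VFIO set to "y"
+ * and kernel 3.6 or higher).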
+ */
+
+#ifdef VFIO_PRESENT
+/*
+ * Get the PCI BAR number where the MSI-X table lives.
+ *
+ * Walks the PCI capability list through the VFIO config-space region of the
+ * device. If an MSI-X capability is found, the BAR indicator bits of its
+ * table offset register are stored in *msix_bar.
+ *
+ * fd:       VFIO device file descriptor
+ * msix_bar: output; written only when an MSI-X capability is found,
+ *           otherwise left untouched
+ *
+ * Returns 0 on success (whether or not MSI-X was found), -1 if config space
+ * could not be read.
+ */
+static int
+pci_vfio_get_msix_bar(int fd, int *msix_bar)
+{
+	int ret;
+	uint32_t reg;
+	uint8_t cap_id, cap_offset;
+
+	/* read PCI capability pointer from config space */
+	ret = pread64(fd, &reg, sizeof(reg),
+			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+			PCI_CAPABILITY_LIST);
+	if (ret != sizeof(reg)) {
+		RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
+				"config space!\n");
+		return -1;
+	}
+
+	/* we need first byte */
+	cap_offset = reg & 0xFF;
+
+	while (cap_offset) {
+
+		/* read capability header: ID in first byte, next pointer in second */
+		ret = pread64(fd, &reg, sizeof(reg),
+				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+				cap_offset);
+		if (ret != sizeof(reg)) {
+			RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
+					"config space!\n");
+			return -1;
+		}
+
+		/* we need first byte */
+		cap_id = reg & 0xFF;
+
+		/* if we haven't reached MSI-X, check next capability */
+		if (cap_id != PCI_CAP_ID_MSIX) {
+			/*
+			 * the next pointer is the second byte of the dword we
+			 * have just read, so there is no need to read it again
+			 */
+			cap_offset = (reg & 0xFF00) >> 8;
+			continue;
+		}
+
+		/* table offset resides in the next 4 bytes */
+		ret = pread64(fd, &reg, sizeof(reg),
+				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+				cap_offset + 4);
+		if (ret != sizeof(reg)) {
+			RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
+					"space!\n");
+			return -1;
+		}
+
+		*msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
+
+		return 0;
+	}
+
+	/* no MSI-X capability: not an error, *msix_bar is left as it was */
+	return 0;
+}
+
+/*
+ * Set PCI bus mastering for a device.
+ *
+ * Reads the 16-bit command register through the VFIO config-space region,
+ * sets PCI_COMMAND_MASTER and writes the register back.
+ *
+ * dev_fd: VFIO device file descriptor
+ *
+ * Returns 0 on success, -1 if the register could not be read or written.
+ */
+static int
+pci_vfio_set_bus_master(int dev_fd)
+{
+	uint16_t reg;
+	int ret;
+
+	ret = pread64(dev_fd, &reg, sizeof(reg),
+			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+			PCI_COMMAND);
+	if (ret != sizeof(reg)) {
+		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
+		return -1;
+	}
+
+	/* set the master bit */
+	reg |= PCI_COMMAND_MASTER;
+
+	ret = pwrite64(dev_fd, &reg, sizeof(reg),
+			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+			PCI_COMMAND);
+
+	if (ret != sizeof(reg)) {
+		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Program the container's IOMMU for DMA.
+ *
+ * Selects the type 1 IOMMU for the container, then maps each populated DPDK
+ * memory segment for read/write DMA, using the segment's physical address as
+ * its IOVA. The walk stops at the first segment whose addr is NULL.
+ *
+ * Returns 0 on success, -1 if setting the IOMMU type or any mapping fails.
+ */
+static int
+pci_vfio_setup_dma_maps(int vfio_container_fd)
+{
+	const struct rte_memseg *seg = rte_eal_get_physmem_layout();
+	int idx;
+
+	if (ioctl(vfio_container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) != 0) {
+		RTE_LOG(ERR, EAL, "  cannot set IOMMU type!\n");
+		return -1;
+	}
+
+	/* 1:1 PA to IOVA mapping for every segment in use */
+	for (idx = 0; idx < RTE_MAX_MEMSEG && seg[idx].addr != NULL; idx++) {
+		struct vfio_iommu_type1_dma_map map;
+
+		memset(&map, 0, sizeof(map));
+		map.argsz = sizeof(map);
+		map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+		map.vaddr = seg[idx].addr_64;
+		map.iova = seg[idx].phys_addr;
+		map.size = seg[idx].len;
+
+		if (ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &map) != 0) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping!\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Set up interrupt support for a device (interrupts are not enabled here).
+ *
+ * Picks the VFIO IRQ index that matches internal_config.vfio_intr_mode
+ * (MSI-X or legacy), checks that it is below num_irqs and that it can be
+ * signalled through an eventfd, then creates an eventfd and records it in
+ * dev->intr_handle together with the handle type and vfio_dev_fd.
+ *
+ * Returns 0 on success, -1 on any failure.
+ */
+static int
+pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd,
+		int num_irqs)
+{
+	struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
+	enum rte_intr_handle_type type;
+	int irq_index, event_fd;
+
+	/* interrupt mode comes from the internal config */
+	if (internal_config.vfio_intr_mode == RTE_INTR_MODE_MSIX) {
+		irq_index = VFIO_PCI_MSIX_IRQ_INDEX;
+		type = RTE_INTR_HANDLE_VFIO_MSIX;
+	} else if (internal_config.vfio_intr_mode == RTE_INTR_MODE_LEGACY) {
+		irq_index = VFIO_PCI_INTX_IRQ_INDEX;
+		type = RTE_INTR_HANDLE_VFIO_LEGACY;
+	} else {
+		RTE_LOG(ERR, EAL, "  unknown default interrupt type!\n");
+		return -1;
+	}
+
+	/* the wanted index must be one of the IRQ indexes the device has */
+	if (irq_index < 0 || irq_index >= num_irqs)
+		return -1;
+
+	irq_info.index = irq_index;
+
+	if (ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0) {
+		RTE_LOG(ERR, EAL, "  cannot get IRQ info!\n");
+		return -1;
+	}
+
+	/* without eventfd support this vector is of no use to us */
+	if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+		RTE_LOG(ERR, EAL, "  interrupt vector does not support eventfd!\n");
+		return -1;
+	}
+
+	event_fd = eventfd(0, 0);
+	if (event_fd < 0) {
+		RTE_LOG(ERR, EAL, "  cannot set up eventfd!\n");
+		return -1;
+	}
+
+	dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
+	dev->intr_handle.fd = event_fd;
+	dev->intr_handle.type = type;
+
+	return 0;
+}
+
+/*
+ * Obtain a VFIO container fd.
+ *
+ * A primary process opens the container itself and checks that the VFIO API
+ * version matches and that the type 1 IOMMU extension is supported. Any
+ * other process asks the primary process for the container fd over the VFIO
+ * socket.
+ *
+ * Returns the container fd, or -1 on failure.
+ */
+static int
+pci_vfio_get_container_fd(void)
+{
+	int container_fd, sock;
+
+	if (internal_config.process_type != RTE_PROC_PRIMARY) {
+		/* secondary process: request the fd from the primary */
+		sock = vfio_socket_connect_to_primary();
+		if (sock < 0) {
+			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+			return -1;
+		}
+
+		container_fd = -1;
+		if (vfio_socket_send_request(sock, SOCKET_REQ_CONTAINER) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
+		} else {
+			container_fd = vfio_socket_receive_fd(sock);
+			if (container_fd < 0) {
+				RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
+				container_fd = -1;
+			}
+		}
+
+		/* the socket is only needed for this one exchange */
+		close(sock);
+		return container_fd;
+	}
+
+	/* primary process: open and validate the container */
+	container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
+	if (container_fd < 0) {
+		RTE_LOG(ERR, EAL, "  cannot open VFIO container!\n");
+		return -1;
+	}
+
+	if (ioctl(container_fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
+		RTE_LOG(ERR, EAL, "  unknown VFIO API version!\n");
+		close(container_fd);
+		return -1;
+	}
+
+	if (ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) == 0) {
+		RTE_LOG(ERR, EAL, "  unknown IOMMU driver!\n");
+		close(container_fd);
+		return -1;
+	}
+
+	return container_fd;
+}
+
+/*
+ * Get a file descriptor for an IOMMU group, reusing a known one if possible.
+ *
+ * Groups already recorded in vfio_cfg.vfio_groups are answered from there.
+ * Otherwise a primary process opens the group file itself and records it in
+ * the slot at vfio_cfg.vfio_group_idx; the index itself is not advanced
+ * here. Any other process asks the primary process for the fd over the VFIO
+ * socket.
+ *
+ * Returns:
+ *    0  the group is not usable through VFIO (group file does not exist in
+ *       the primary, or the primary answered SOCKET_NO_FD)
+ *   -1  error
+ *   otherwise the group fd
+ */
+static int
+pci_vfio_get_group_fd(int iommu_group_no)
+{
+	int i;
+	int vfio_group_fd;
+	char filename[PATH_MAX];
+
+	/* check if we already have the group descriptor open */
+	for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
+		if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
+			return vfio_cfg.vfio_groups[i].fd;
+
+	/* if primary, try to open the group */
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		rte_snprintf(filename, sizeof(filename),
+				VFIO_GROUP_FMT, iommu_group_no);
+		vfio_group_fd = open(filename, O_RDWR);
+		if (vfio_group_fd < 0) {
+			/* if file not found, it's not an error */
+			if (errno != ENOENT) {
+				RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+						strerror(errno));
+				return -1;
+			}
+			return 0;
+		}
+
+		/* if the fd is valid, create a new group for it */
+		if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
+			RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+			/* nobody will keep track of this fd, so don't leak it */
+			close(vfio_group_fd);
+			return -1;
+		}
+		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
+		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+		return vfio_group_fd;
+	}
+	/* if we're in a secondary process, request group fd from the primary
+	 * process via our socket
+	 */
+	else {
+		int socket_fd, ret;
+		if ((socket_fd = vfio_socket_connect_to_primary()) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+			return -1;
+		}
+		if (vfio_socket_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot request group fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+		if (vfio_socket_send_request(socket_fd, iommu_group_no) < 0) {
+			RTE_LOG(ERR, EAL, "  cannot send group number!\n");
+			close(socket_fd);
+			return -1;
+		}
+		ret = vfio_socket_receive_request(socket_fd);
+		switch (ret) {
+		case SOCKET_NO_FD:
+			close(socket_fd);
+			return 0;
+		case SOCKET_OK:
+			vfio_group_fd = vfio_socket_receive_fd(socket_fd);
+			/* if we got the fd, return it */
+			if (vfio_group_fd > 0) {
+				close(socket_fd);
+				return vfio_group_fd;
+			}
+			/* fall-through on error */
+		default:
+			RTE_LOG(ERR, EAL, "  cannot get group fd!\n");
+			close(socket_fd);
+			return -1;
+		}
+	}
+	return -1;
+}
+
+/*
+ * Parse the IOMMU group number for a PCI device.
+ *
+ * Resolves the "iommu_group" symlink of the device under SYSFS_PCI_DEVICES
+ * and converts the last path component of the link target to a number.
+ *
+ * pci_addr: PCI address string of the device
+ *
+ * Returns the group number, 0 if the device has no iommu_group link, or -1
+ * on error.
+ *
+ * NOTE(review): 0 doubles as the "no group" marker; if the kernel can hand
+ * out group number 0, a device in that group would be treated as having no
+ * group - confirm.
+ */
+static int
+pci_vfio_get_group_no(const char *pci_addr)
+{
+	char linkname[PATH_MAX];
+	char filename[PATH_MAX];
+	char *tok[16], *group_tok, *end;
+	int ret, iommu_group_no;
+
+	memset(linkname, 0, sizeof(linkname));
+	memset(filename, 0, sizeof(filename));
+
+	/* try to find out IOMMU group for this device */
+	rte_snprintf(linkname, sizeof(linkname),
+			SYSFS_PCI_DEVICES "/%s/iommu_group", pci_addr);
+
+	/*
+	 * readlink() does not NUL-terminate; leave the last (zeroed) byte
+	 * alone so that filename is always a valid string
+	 */
+	ret = readlink(linkname, filename, sizeof(filename) - 1);
+
+	/* if the link doesn't exist, no VFIO for us */
+	if (ret < 0)
+		return 0;
+
+	ret = rte_strsplit(filename, sizeof(filename),
+			tok, RTE_DIM(tok), '/');
+
+	if (ret <= 0) {
+		RTE_LOG(ERR, EAL, "  %s cannot get IOMMU group\n", pci_addr);
+		return -1;
+	}
+
+	/* IOMMU group is always the last token */
+	errno = 0;
+	group_tok = tok[ret - 1];
+	end = group_tok;
+	iommu_group_no = strtol(group_tok, &end, 10);
+	/* reject tokens with no digits at all as well as trailing garbage */
+	if (end == group_tok || *end != '\0' || errno != 0) {
+		RTE_LOG(ERR, EAL, "  %s error parsing IOMMU number!\n", pci_addr);
+		return -1;
+	}
+
+	return iommu_group_no;
+}
+
+/*
+ * Reset the vfio_cfg.vfio_groups slot at the current group index: the group
+ * number goes back to 0 and the fd to -1.
+ */
+static void
+clear_current_group(void)
+{
+	const int cur = vfio_cfg.vfio_group_idx;
+
+	vfio_cfg.vfio_groups[cur].fd = -1;
+	vfio_cfg.vfio_groups[cur].group_no = 0;
+}
+
+
+/*
+ * map the PCI resources of a PCI device in virtual memory (VFIO version).
+ * primary and secondary processes follow almost exactly the same path
+ */
+int
+pci_vfio_map_resource(struct rte_pci_device *dev)
+{
+             struct vfio_group_status group_status =
+                                                                             { .argsz = sizeof(group_status) };
+             struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+             int vfio_group_fd, vfio_dev_fd;
+             int iommu_group_no;
+             char pci_addr[PATH_MAX] = {0};
+             struct rte_pci_addr *loc = &dev->addr;
+             int i, ret, msix_bar;
+             struct mapped_pci_resource *vfio_res = NULL;
+             struct pci_map *maps;
+
+             dev->intr_handle.fd = -1;
+             dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+
+             /* store PCI address string */
+             rte_snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+                                             loc->domain, loc->bus, loc->devid, loc->function);
+
+             /* get container fd (needs to be done only once per initialization) */
+             if (vfio_cfg.vfio_container_fd == -1) {
+                             int vfio_container_fd = pci_vfio_get_container_fd();
+                             if (vfio_container_fd < 0) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot open VFIO container!\n", pci_addr);
+                                             return -1;
+                             }
+
+                             vfio_cfg.vfio_container_fd = vfio_container_fd;
+             }
+
+             /* get group number */
+             iommu_group_no = pci_vfio_get_group_no(pci_addr);
+
+             /* if 0, group doesn't exist */
+             if (iommu_group_no == 0) {
+                             RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+                                                             pci_addr);
+                             return 1;
+             }
+             /* if negative, something failed */
+             else if (iommu_group_no < 0)
+                             return -1;
+
+             /* get the actual group fd */
+             vfio_group_fd = pci_vfio_get_group_fd(iommu_group_no);
+             if (vfio_group_fd < 0) {
+                             return -1;
+             }
+
+             /* store group fd */
+             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
+             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+
+             /* if group_fd == 0, that means the device isn't managed by VFIO */
+             if (vfio_group_fd == 0) {
+                             RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+                                                             pci_addr);
+                             /* we store 0 as group fd to distinguish between existing but
+                             * unbound VFIO groups, and groups that don't exist at all.
+                             */
+                             vfio_cfg.vfio_group_idx++;
+                             return 1;
+             }
+
+             /*
+             * at this point, we know at least one port on this device is bound to VFIO,
+             * so we can proceed to try and set this particular port up
+             */
+
+             /* check if the group is viable */
+             ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+             if (ret) {
+                             RTE_LOG(ERR, EAL, "  %s cannot get group status!\n", pci_addr);
+                             close(vfio_group_fd);
+                             clear_current_group();
+                             return -1;
+             }
+             else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+                             RTE_LOG(ERR, EAL, "  %s VFIO group is not viable!\n", pci_addr);
+                             close(vfio_group_fd);
+                             clear_current_group();
+                             return -1;
+             }
+
+             /*
+             * at this point, we know that this group is viable (meaning, all devices
+             * are either bound to VFIO or not bound to anything)
+             */
+
+             /* check if group does not have a container yet */
+             if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+
+                             /* add group to a container */
+                             ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+                                                             &vfio_cfg.vfio_container_fd);
+                             if (ret) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container!\n",
+                                                                             pci_addr);
+                                             close(vfio_group_fd);
+                                             clear_current_group();
+                                             return -1;
+                             }
+                             /*
+                             * at this point we know that this group has been successfully
+                             * initialized, so we increment vfio_group_idx to indicate that we can
+                             * add new groups.
+                             */
+                             vfio_cfg.vfio_group_idx++;
+             }
+
+             /*
+             * set up DMA mappings for container (needs to be done only once, only when
+             * at least one group is assigned to a container and only in primary process)
+             */
+             if (internal_config.process_type == RTE_PROC_PRIMARY &&
+                                             vfio_cfg.vfio_container_has_dma == 0) {
+                             ret = pci_vfio_setup_dma_maps(vfio_cfg.vfio_container_fd);
+                             if (ret) {
+                                             RTE_LOG(ERR, EAL, "  %s DMA remapping failed!\n", pci_addr);
+                                             return -1;
+                             }
+                             vfio_cfg.vfio_container_has_dma = 1;
+             }
+
+             /* get a file descriptor for the device */
+             vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, pci_addr);
+             if (vfio_dev_fd < 0) {
+                             /* if we cannot get a device fd, this simply means that this
+                             * particular port is not bound to VFIO
+                             */
+                             RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+                                                             pci_addr);
+                             return 1;
+             }
+
+             /* test and setup the device */
+             ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, &device_info);
+             if (ret) {
+                             RTE_LOG(ERR, EAL, "  %s cannot get device info!\n", pci_addr);
+                             close(vfio_dev_fd);
+                             return -1;
+             }
+
+             /* get MSI-X BAR, if any (we have to know where it is because we can't
+             * mmap it when using VFIO) */
+             msix_bar = -1;
+             ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar);
+             if (ret < 0) {
+                             RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n", pci_addr);
+                             close(vfio_dev_fd);
+                             return -1;
+             }
+
+             /* if we're in a primary process, allocate vfio_res and get region info */
+             if (internal_config.process_type == RTE_PROC_PRIMARY) {
+                             if ((vfio_res = rte_zmalloc("VFIO_RES", sizeof (*vfio_res), 0)) == NULL) {
+                                             RTE_LOG(ERR, EAL,
+                                                             "%s(): cannot store uio mmap details\n", __func__);
+                                             close(vfio_dev_fd);
+                                             return -1;
+                             }
+                             memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
+
+                             /* get number of registers (up to BAR5) */
+                             vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
+                                                             VFIO_PCI_BAR5_REGION_INDEX + 1);
+             }
+             /* if we're in a secondary process, just find our tailq entry and use that */
+             else {
+                             TAILQ_FOREACH(vfio_res, pci_res_list, next) {
+                                             if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+                                                             continue;
+                                             break;
+                             }
+                             /* if we haven't found our tailq entry, something's wrong */
+                             if (vfio_res == NULL) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI device!\n",
+                                                                             pci_addr);
+                                             close(vfio_dev_fd);
+                                             return -1;
+                             }
+             }
+
+             /* map BARs */
+             maps = vfio_res->maps;
+
+             for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+                             struct vfio_region_info reg = { .argsz = sizeof(reg) };
+                             void * bar_addr;
+
+                             reg.index = i;
+
+                             ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+
+                             if (ret) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot get device region info!\n",
+                                                                             pci_addr);
+                                             close(vfio_dev_fd);
+                                             if (internal_config.process_type == RTE_PROC_PRIMARY)
+                                                             rte_free(vfio_res);
+                                             return -1;
+                             }
+
+                             /* skip non-mmapable BARs */
+                             if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+                                             continue;
+
+                             /* skip MSI-X BAR */
+                             if (i == msix_bar)
+                                             continue;
+
+                             bar_addr = pci_map_resource(maps[i].addr, vfio_dev_fd, reg.offset,
+                                                             reg.size);
+
+                             if (bar_addr == NULL) {
+                                             RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n", pci_addr, i,
+                                                                             strerror(errno));
+                                             close(vfio_dev_fd);
+                                             if (internal_config.process_type == RTE_PROC_PRIMARY)
+                                                             rte_free(vfio_res);
+                                             return -1;
+                             }
+
+                             maps[i].addr = bar_addr;
+                             maps[i].offset = reg.offset;
+                             maps[i].size = reg.size;
+                             dev->mem_resource[i].addr = bar_addr;
+             }
+
+             /* if secondary process, do not set up interrupts */
+             if (internal_config.process_type == RTE_PROC_PRIMARY) {
+                             if (pci_vfio_setup_interrupts(dev, vfio_dev_fd,
+                                                             (int) device_info.num_irqs) != 0) {
+                                             RTE_LOG(ERR, EAL, "  %s error setting up interrupts!\n", pci_addr);
+                                             close(vfio_dev_fd);
+                                             rte_free(vfio_res);
+                                             return -1;
+                             }
+
+                             /* set bus mastering for the device */
+                             if (pci_vfio_set_bus_master(vfio_dev_fd)) {
+                                             RTE_LOG(ERR, EAL, "  %s cannot set up bus mastering!\n", pci_addr);
+                                             close(vfio_dev_fd);
+                                             rte_free(vfio_res);
+                                             return -1;
+                             }
+
+                             /* Reset the device */
+                             ioctl(vfio_dev_fd, VFIO_DEVICE_RESET);
+             }
+
+             if (internal_config.process_type == RTE_PROC_PRIMARY)
+                             TAILQ_INSERT_TAIL(pci_res_list, vfio_res, next);
+
+             return (0);
+}
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c
new file mode 100644
index 0000000..1605fce
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio_socket.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+
+/* sys/un.h with __USE_MISC uses strlen, which is unsafe and should not be used. */
+#ifdef __USE_MISC
+#define REMOVED_USE_MISC
+#undef __USE_MISC
+#endif
+#include <sys/un.h>
+/* restore __USE_MISC only if we undefined it above */
+#ifdef REMOVED_USE_MISC
+#define __USE_MISC
+#undef REMOVED_USE_MISC
+#endif
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_tailq.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+
+#include "eal_filesystem.h"
+#include "eal_pci_init.h"
+
+/**
+ * @file
+ * VFIO socket for communication between primary and secondary processes.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+/* socket path format: <directory>/.<hugefile prefix>_mp_socket */
+#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
+/* length of a control message that carries exactly one file descriptor */
+#define CMSGLEN (CMSG_LEN(sizeof(int)))
+/*
+ * Fill control message header 'chdr' (a struct cmsghdr lvalue) so that it
+ * passes descriptor 'fd' as SCM_RIGHTS ancillary data.
+ *
+ * The payload is reached through CMSG_DATA() rather than the glibc-internal
+ * __cmsg_data member, so the macro also builds against C libraries whose
+ * struct cmsghdr has no such member.
+ */
+#define FD_TO_CMSGHDR(fd,chdr) \
+	do {\
+		(chdr).cmsg_len = CMSGLEN;\
+		(chdr).cmsg_level = SOL_SOCKET;\
+		(chdr).cmsg_type = SCM_RIGHTS;\
+		memcpy(CMSG_DATA(&(chdr)), &(fd), sizeof(fd));\
+	} while(0)
+/* copy the descriptor stored in control message header 'chdr' into 'fd' */
+#define CMSGHDR_TO_FD(chdr,fd) \
+	do {\
+		memcpy(&(fd), CMSG_DATA(&(chdr)), sizeof(fd));\
+	} while (0)
+
+
+/*
+ * Write the path of the multiprocess socket into 'buffer' (at most 'bufsz'
+ * bytes). The socket lives in /var/run when running as root or when $HOME is
+ * not set, and in $HOME otherwise; the file name is derived from the current
+ * hugefile prefix.
+ */
+static void
+get_socket_path(char * buffer, int bufsz)
+{
+	const char *home_dir = getenv("HOME");
+	const char *base;
+
+	if (getuid() == 0 || home_dir == NULL)
+		base = "/var/run";
+	else
+		base = home_dir;
+
+	/* the hugefile prefix keeps sockets of different DPDK instances apart */
+	rte_snprintf(buffer, bufsz, SOCKET_PATH_FMT, base,
+			internal_config.hugefile_prefix);
+}
+
+
+
+/*
+ * data flow for socket comm protocol:
+ * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
+ * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
+ * 2. server receives message
+ * 2a. in case of invalid group, SOCKET_ERR is sent back to client
+ * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
+ * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
+ *
+ * in case of any error, socket is closed.
+ */
+
+/*
+ * Send one integer code (a request, a reply status or a group number) over
+ * 'socket' as the only payload of a message.
+ *
+ * Returns 0 on success, -1 if sendmsg() fails.
+ */
+int
+vfio_socket_send_request(int socket, int req)
+{
+	int payload = req;
+	struct iovec vec = {
+		.iov_base = (char *) &payload,
+		.iov_len = sizeof(payload),
+	};
+	struct msghdr msg;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+
+	return (sendmsg(socket, &msg, 0) < 0) ? -1 : 0;
+}
+
+/*
+ * Receive one integer code from 'socket'.
+ *
+ * Returns the received value, or -1 if recvmsg() fails. The receive buffer is
+ * preset to SOCKET_ERR, so that value is returned when recvmsg() succeeds
+ * without writing anything into it.
+ */
+int
+vfio_socket_receive_request(int socket)
+{
+	int payload = SOCKET_ERR;
+	struct iovec vec = {
+		.iov_base = (char *) &payload,
+		.iov_len = sizeof(payload),
+	};
+	struct msghdr msg;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+
+	if (recvmsg(socket, &msg, 0) < 0)
+		return -1;
+
+	return payload;
+}
+
+/*
+ * Send SOCKET_OK as message payload and 'fd' as SCM_RIGHTS ancillary data
+ * over 'socket'.
+ *
+ * Returns 0 on success, -1 if sendmsg() fails.
+ */
+int
+vfio_socket_send_fd(int socket, int fd)
+{
+	int buf = SOCKET_OK;
+	struct msghdr hdr;
+	struct iovec iov;
+	/*
+	 * A bare char array carries no alignment guarantee, so accessing it as
+	 * a struct cmsghdr is not safe; the union forces suitable alignment.
+	 */
+	union {
+		struct cmsghdr hdr;
+		char buf[CMSG_SPACE(sizeof(int))];
+	} ctl;
+
+	memset(&ctl, 0, sizeof(ctl));
+	memset(&hdr, 0, sizeof(hdr));
+
+	iov.iov_base = (char*) &buf;
+	iov.iov_len = sizeof(buf);
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+	hdr.msg_control = ctl.buf;
+	/* exactly one control message, no trailing padding is transmitted */
+	hdr.msg_controllen = CMSGLEN;
+
+	FD_TO_CMSGHDR(fd, ctl.hdr);
+
+	if (sendmsg(socket, &hdr, 0) < 0)
+		return -1;
+	return 0;
+}
+
+/*
+ * Receive a message that is expected to carry SOCKET_OK as payload and a file
+ * descriptor as SCM_RIGHTS ancillary data.
+ *
+ * Returns the received descriptor, or -1 if recvmsg() fails, the payload is
+ * not SOCKET_OK, or the message does not carry exactly one descriptor.
+ */
+int
+vfio_socket_receive_fd(int socket)
+{
+	int buf = SOCKET_ERR;
+	int fd;
+	struct msghdr hdr;
+	struct cmsghdr * chdr;
+	struct iovec iov;
+	/* union guarantees alignment suitable for struct cmsghdr */
+	union {
+		struct cmsghdr hdr;
+		char buf[CMSG_SPACE(sizeof(int))];
+	} ctl;
+
+	memset(&ctl, 0, sizeof(ctl));
+	memset(&hdr, 0, sizeof(hdr));
+
+	iov.iov_base = (char*) &buf;
+	iov.iov_len = sizeof(buf);
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+	hdr.msg_control = ctl.buf;
+	hdr.msg_controllen = CMSGLEN;
+
+	if (recvmsg(socket, &hdr, 0) < 0)
+		return -1;
+
+	if (buf != SOCKET_OK)
+		return -1;
+
+	/*
+	 * Without this check a SOCKET_OK message that carries no ancillary data
+	 * would leave the zeroed control buffer untouched and descriptor 0
+	 * would be handed back to the caller as if it had been received.
+	 */
+	chdr = CMSG_FIRSTHDR(&hdr);
+	if (chdr == NULL || chdr->cmsg_level != SOL_SOCKET ||
+			chdr->cmsg_type != SCM_RIGHTS ||
+			chdr->cmsg_len != CMSGLEN)
+		return -1;
+
+	CMSGHDR_TO_FD(*chdr, fd);
+
+	return fd;
+}
+
+/*
+ * Open a SOCK_SEQPACKET UNIX socket and connect it to the path the primary
+ * process listens on.
+ *
+ * Returns the connected socket fd, or -1 if the socket cannot be created or
+ * the connection fails.
+ */
+int
+vfio_socket_connect_to_primary(void)
+{
+	struct sockaddr_un primary;
+	socklen_t primary_len = sizeof(struct sockaddr_un);
+	int fd;
+
+	fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+		return -1;
+	}
+
+	get_socket_path(primary.sun_path, sizeof(primary.sun_path));
+	primary.sun_family = AF_UNIX;
+
+	if (connect(fd, (struct sockaddr*) &primary, primary_len) != 0) {
+		/* nothing is listening (or path is wrong): give the fd back */
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+
+
+/*
+ * Socket listening thread for the primary process.
+ *
+ * 'arg' points to the listening socket fd. The thread never returns: it
+ * accepts one connection at a time, reads a single request, answers it and
+ * closes the connection.
+ *
+ * - SOCKET_REQ_CONTAINER: the container fd is sent back.
+ * - SOCKET_REQ_GROUP: a group number is read next; the reply is SOCKET_ERR
+ *   for an unknown group, SOCKET_NO_FD for a known group whose fd is 0, or
+ *   SOCKET_OK followed by the group fd.
+ * - anything else: SOCKET_ERR is sent back.
+ */
+__attribute__((noreturn)) void *
+pci_vfio_socket_thread(void *arg)
+{
+	int listen_fd = *(int*) arg;
+
+	for (;;) {
+		struct sockaddr_un peer;
+		socklen_t peer_len = sizeof(peer);
+		struct linger lng;
+		int client, request, group_no, idx;
+
+		/* blocks until a secondary process connects */
+		client = accept(listen_fd, (struct sockaddr*) &peer, &peer_len);
+		if (client == -1)
+			continue;
+
+		/* set socket to linger after close */
+		lng.l_onoff = 1;
+		lng.l_linger = 60;
+		setsockopt(client, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng));
+
+		request = vfio_socket_receive_request(client);
+
+		if (request == SOCKET_REQ_CONTAINER) {
+			vfio_socket_send_fd(client, vfio_cfg.vfio_container_fd);
+		} else if (request == SOCKET_REQ_GROUP) {
+			/* the group number arrives as a second message */
+			group_no = vfio_socket_receive_request(client);
+			if (group_no < 0) {
+				close(client);
+				continue;
+			}
+
+			/* look the group up among the ones set up so far */
+			idx = 0;
+			while (idx < vfio_cfg.vfio_group_idx &&
+					vfio_cfg.vfio_groups[idx].group_no != group_no)
+				idx++;
+
+			if (idx == vfio_cfg.vfio_group_idx) {
+				/* reached end of the list: the group doesn't exist */
+				vfio_socket_send_request(client, SOCKET_ERR);
+			} else if (vfio_cfg.vfio_groups[idx].fd == 0) {
+				/* group exists but isn't bound to VFIO driver */
+				vfio_socket_send_request(client, SOCKET_NO_FD);
+			} else {
+				/* group exists and is bound to VFIO driver */
+				vfio_socket_send_request(client, SOCKET_OK);
+				vfio_socket_send_fd(client, vfio_cfg.vfio_groups[idx].fd);
+			}
+		} else {
+			vfio_socket_send_request(client, SOCKET_ERR);
+		}
+
+		close(client);
+	}
+}
+
+/*
+ * Create the primary process's local SOCK_SEQPACKET socket, bind it to the
+ * multiprocess socket path and start listening on it.
+ *
+ * Returns the listening socket fd, or -1 on failure (the socket is closed
+ * again if bind() or listen() fails).
+ */
+int
+pci_vfio_socket_setup(void)
+{
+	struct sockaddr_un local;
+	socklen_t local_len = sizeof(struct sockaddr_un);
+	int fd;
+
+	fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+		return -1;
+	}
+
+	get_socket_path(local.sun_path, sizeof(local.sun_path));
+	local.sun_family = AF_UNIX;
+
+	/* remove any existing file at the socket path before binding to it */
+	unlink(local.sun_path);
+
+	if (bind(fd, (struct sockaddr*) &local, local_len) != 0) {
+		RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
+		goto fail;
+	}
+
+	if (listen(fd, 50) != 0) {
+		RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
+		goto fail;
+	}
+
+	return fd;
+
+fail:
+	close(fd);
+	return -1;
+}
+
+#endif
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h b/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
index 92e3065..5468b0a 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
@@ -40,6 +40,7 @@
#define _EAL_LINUXAPP_INTERNAL_CFG
 #include <rte_eal.h>
+#include <rte_pci_dev_feature_defs.h>
 #define MAX_HUGEPAGE_SIZES 3  /**< support up to 3 page sizes */
@@ -76,6 +77,8 @@ struct internal_config {
               volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
               uintptr_t base_virtaddr;          /**< base address to try and reserve memory from */
               volatile int syslog_facility;               /**< facility passed to openlog() */
+             /** default interrupt mode for VFIO */
+             volatile enum rte_intr_mode vfio_intr_mode;
               const char *hugefile_prefix;      /**< the base filename of hugetlbfs files */
               const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
index 699e80d..b163ab5 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_pci_init.h
@@ -34,6 +34,8 @@
#ifndef EAL_PCI_INIT_H_
#define EAL_PCI_INIT_H_
+#include "eal_vfio.h"
+
struct pci_map {
               void *addr;
               uint64_t offset;
@@ -62,4 +64,57 @@ void * pci_map_resource(void *requested_addr, int fd, off_t offset, size_t size)
/* map IGB_UIO resource prototype */
int pci_uio_map_resource(struct rte_pci_device *dev);
+#ifdef VFIO_PRESENT
+
+/* size of the vfio_groups array in struct vfio_config */
+#define VFIO_MAX_GROUPS 64
+#define VFIO_DIR "/dev/vfio"
+#define VFIO_CONTAINER_PATH "/dev/vfio/vfio"
+/* printf-style format of a group device node; the argument is the group number */
+#define VFIO_GROUP_FMT "/dev/vfio/%u"
+/*
+ * Shift region index 'x' left by 40 bits as a 64-bit value. The argument is
+ * parenthesized so that an expression such as (a + b) is converted and
+ * shifted as a whole.
+ */
+#define VFIO_GET_REGION_ADDR(x) ((uint64_t) (x) << 40ULL)
+
+/*
+ * map VFIO resource prototype
+ *
+ * returns 0 on success, -1 on error, and 1 if the device is not managed by
+ * the VFIO driver and was skipped
+ */
+int pci_vfio_map_resource(struct rte_pci_device *dev);
+
+/*
+ * Function prototypes for VFIO socket functions
+ */
+/* send one integer code; returns 0 on success, -1 on error */
+int vfio_socket_send_request(int socket, int req);
+/* receive one integer code and return it; returns -1 if recvmsg() fails */
+int vfio_socket_receive_request(int socket);
+/* send SOCKET_OK with 'fd' attached as a control message; 0 on success, -1 on error */
+int vfio_socket_send_fd(int socket, int fd);
+/* receive SOCKET_OK plus an fd in a control message; returns the fd or -1 */
+int vfio_socket_receive_fd(int socket);
+/* connect to the primary process's socket; returns the socket fd or -1 */
+int vfio_socket_connect_to_primary(void);
+/* create, bind and listen on the primary's socket; returns the socket fd or -1 */
+int pci_vfio_socket_setup(void);
+/* request-serving loop for the primary; 'arg' points to the listening socket fd */
+void * pci_vfio_socket_thread(void *arg);
+
+/* socket comm protocol definitions */
+/* request: client asks for the container fd */
+#define SOCKET_REQ_CONTAINER 0x100
+/* request: client asks for a group fd; the group number follows as a second message */
+#define SOCKET_REQ_GROUP 0x200
+/* reply: request succeeded, an fd is attached or follows */
+#define SOCKET_OK 0x0
+/* reply: the group exists but is not bound to the VFIO driver, so there is no fd */
+#define SOCKET_NO_FD 0x1
+/* reply: invalid group or unrecognized request */
+#define SOCKET_ERR 0xFF
+
+/*
+ * we don't need to store device fd's anywhere since they can be obtained from
+ * the group fd via an ioctl() call.
+ */
+struct vfio_group {
+             int group_no; /* group number; compared with the number a secondary process requests */
+             int fd;       /* group fd; the socket thread treats 0 as "group exists but is not bound to VFIO" */
+};
+
+/* VFIO state of one process: a container fd plus the groups set up so far */
+struct vfio_config {
+             int vfio_enabled; /* presumably non-zero when VFIO is usable; set outside this view - TODO confirm */
+             int vfio_container_fd; /* container fd; sent to secondaries on SOCKET_REQ_CONTAINER */
+             int vfio_container_has_dma; /* set to 1 once DMA mappings were set up for the container */
+             int vfio_group_idx; /* number of vfio_groups entries in use; incremented after a group is initialized */
+             struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
+};
+
+/*
+ * per-process VFIO config
+ *
+ * NOTE(review): this and socket_thread below are object definitions placed in
+ * a header, so every file including it gets its own (tentative) definition;
+ * consider 'extern' declarations here with a single definition in one .c file.
+ */
+struct vfio_config vfio_cfg;
+
+/* presumably the thread running pci_vfio_socket_thread(); created outside this view - confirm */
+pthread_t socket_thread;
+
+#endif
+
#endif /* EAL_PCI_INIT_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_vfio.h b/lib/librte_eal/linuxapp/eal/include/eal_vfio.h
index ca4982b..32953c0 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_vfio.h
@@ -42,6 +42,12 @@
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0)
#include <linux/vfio.h>
+/*
+ * use a local value on kernels older than 3.10 and the kernel's
+ * PCI_MSIX_TABLE_BIR otherwise (presumably the constant is not available from
+ * older kernel headers - TODO confirm)
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
+#define RTE_PCI_MSIX_TABLE_BIR 0x7
+#else
+#define RTE_PCI_MSIX_TABLE_BIR PCI_MSIX_TABLE_BIR
+#endif
+
#define VFIO_PRESENT
#endif /* kernel version */
#endif /* RTE_EAL_VFIO */
--
1.7.0.7
    
    
More information about the dev
mailing list