summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCaz Yokoyama <Caz.Yokoyama@intel.com>2013-09-05 16:42:39 -0700
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2013-09-26 13:52:24 -0700
commit8d49751580db804a02caf6a5b7cebe2ff26c0d7e (patch)
tree69c8a70bbbed2d4b56efe7be87a44b62286d4c14
parent2141c7c5ee677014023cb50c793f91e85f44d2ea (diff)
Sample Implementation of Intel MIC User Space Daemon.
This patch introduces a sample user space daemon which implements the virtio device backends on the host. The daemon creates/removes/configures virtio device backends by communicating with the Intel MIC Host Driver. The virtio devices currently supported are virtio net, virtio console and virtio block. Virtio net supports TSO/GSO. The daemon also monitors card shutdown status and takes appropriate actions like killing the virtio backends and resetting the card upon card shutdown and crashes. Co-author: Ashutosh Dixit <ashutosh.dixit@intel.com> Co-author: Sudeep Dutt <sudeep.dutt@intel.com> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com> Signed-off-by: Caz Yokoyama <Caz.Yokoyama@intel.com> Signed-off-by: Dasaratharaman Chandramouli <dasaratharaman.chandramouli@intel.com> Signed-off-by: Nikhil Rao <nikhil.rao@intel.com> Signed-off-by: Harshavardhan R Kharche <harshavardhan.r.kharche@intel.com> Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com> Acked-by: Yaozu (Eddie) Dong <eddie.dong@intel.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--Documentation/mic/mic_overview.txt49
-rw-r--r--Documentation/mic/mpssd/.gitignore1
-rw-r--r--Documentation/mic/mpssd/Makefile19
-rwxr-xr-xDocumentation/mic/mpssd/micctrl173
-rwxr-xr-xDocumentation/mic/mpssd/mpss202
-rw-r--r--Documentation/mic/mpssd/mpssd.c1701
-rw-r--r--Documentation/mic/mpssd/mpssd.h100
-rw-r--r--Documentation/mic/mpssd/sysfs.c102
8 files changed, 2347 insertions, 0 deletions
diff --git a/Documentation/mic/mic_overview.txt b/Documentation/mic/mic_overview.txt
new file mode 100644
index 000000000000..c4424ed1b746
--- /dev/null
+++ b/Documentation/mic/mic_overview.txt
@@ -0,0 +1,49 @@
+An Intel MIC X100 device is a PCIe form factor add-in coprocessor
+card based on the Intel Many Integrated Core (MIC) architecture
+that runs a Linux OS. It is a PCIe endpoint in a platform and therefore
+implements the three required standard address spaces i.e. configuration,
+memory and I/O. The host OS loads a device driver as is typical for
+PCIe devices. The card itself runs a bootstrap after reset that
+transfers control to the card OS downloaded from the host driver.
+The card OS as shipped by Intel is a Linux kernel with modifications
+for the X100 devices.
+
+Since it is a PCIe card, it does not have the ability to host hardware
+devices for networking, storage and console. We provide these devices
+on X100 coprocessors thus enabling a self-bootable equivalent environment
+for applications. A key benefit of our solution is that it leverages
+the standard virtio framework for network, disk and console devices,
+though in our case the virtio framework is used across a PCIe bus.
+
+Here is a block diagram of the various components described above. The
+virtio backends are situated on the host rather than the card given better
+single threaded performance for the host compared to MIC, the ability of
+the host to initiate DMA's to/from the card using the MIC DMA engine and
+the fact that the virtio block storage backend can only be on the host.
+
+ |
+ +----------+ | +----------+
+ | Card OS | | | Host OS |
+ +----------+ | +----------+
+ |
++-------+ +--------+ +------+ | +---------+ +--------+ +--------+
+| Virtio| |Virtio | |Virtio| | |Virtio | |Virtio | |Virtio |
+| Net | |Console | |Block | | |Net | |Console | |Block |
+| Driver| |Driver | |Driver| | |backend | |backend | |backend |
++-------+ +--------+ +------+ | +---------+ +--------+ +--------+
+ | | | | | | |
+ | | | |User | | |
+ | | | |------|------------|---------|-------
+ +-------------------+ |Kernel +--------------------------+
+ | | | Virtio over PCIe IOCTLs |
+ | | +--------------------------+
+ +--------------+ | |
+ |Intel MIC | | +---------------+
+ |Card Driver | | |Intel MIC |
+ +--------------+ | |Host Driver |
+ | | +---------------+
+ | | |
+ +-------------------------------------------------------------+
+ | |
+ | PCIe Bus |
+ +-------------------------------------------------------------+
diff --git a/Documentation/mic/mpssd/.gitignore b/Documentation/mic/mpssd/.gitignore
new file mode 100644
index 000000000000..8b7c72f07c92
--- /dev/null
+++ b/Documentation/mic/mpssd/.gitignore
@@ -0,0 +1 @@
+mpssd
diff --git a/Documentation/mic/mpssd/Makefile b/Documentation/mic/mpssd/Makefile
new file mode 100644
index 000000000000..eb860a7d152e
--- /dev/null
+++ b/Documentation/mic/mpssd/Makefile
@@ -0,0 +1,19 @@
+#
+# Makefile - Intel MIC User Space Tools.
+# Copyright(c) 2013, Intel Corporation.
+#
+ifdef DEBUG
+CFLAGS += $(USERWARNFLAGS) -I. -g -Wall -DDEBUG=$(DEBUG)
+else
+CFLAGS += $(USERWARNFLAGS) -I. -g -Wall
+endif
+
+mpssd: mpssd.o sysfs.o
+ $(CC) $(CFLAGS) -o $@ $^ -lpthread
+
+install:
+ install mpssd /usr/sbin/mpssd
+ install micctrl /usr/sbin/micctrl
+
+clean:
+ rm -f mpssd *.o
diff --git a/Documentation/mic/mpssd/micctrl b/Documentation/mic/mpssd/micctrl
new file mode 100755
index 000000000000..8f2629b41c5f
--- /dev/null
+++ b/Documentation/mic/mpssd/micctrl
@@ -0,0 +1,173 @@
+#!/bin/bash
+# Intel MIC Platform Software Stack (MPSS)
+#
+# Copyright(c) 2013 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License, version 2, as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# The full GNU General Public License is included in this distribution in
+# the file called "COPYING".
+#
+# Intel MIC User Space Tools.
+#
+# micctrl - Controls MIC boot/start/stop.
+#
+# chkconfig: 2345 95 05
+# description: start MPSS stack processing.
+#
+### BEGIN INIT INFO
+# Provides: micctrl
+### END INIT INFO
+
+# Source function library.
+. /etc/init.d/functions
+
+sysfs="/sys/class/mic"
+
+_status()
+{
+ f=$sysfs/$1
+ echo -e $1 state: "`cat $f/state`" shutdown_status: "`cat $f/shutdown_status`"
+}
+
+status()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _status $1
+ return $?
+ fi
+ for f in $sysfs/*
+ do
+ _status `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+_reset()
+{
+ f=$sysfs/$1
+ echo reset > $f/state
+}
+
+reset()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _reset $1
+ return $?
+ fi
+ for f in $sysfs/*
+ do
+ _reset `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+_boot()
+{
+ f=$sysfs/$1
+ echo "linux" > $f/bootmode
+ echo "mic/uos.img" > $f/firmware
+ echo "mic/$1.image" > $f/ramdisk
+ echo "boot" > $f/state
+}
+
+boot()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _boot $1
+ return $?
+ fi
+ for f in $sysfs/*
+ do
+ _boot `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+_shutdown()
+{
+ f=$sysfs/$1
+ echo shutdown > $f/state
+}
+
+shutdown()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _shutdown $1
+ return $?
+ fi
+ for f in $sysfs/*
+ do
+ _shutdown `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+_wait()
+{
+ f=$sysfs/$1
+ while [ "`cat $f/state`" != "offline" -a "`cat $f/state`" != "online" ]
+ do
+ sleep 1
+ echo -e "Waiting for $1 to go offline"
+ done
+}
+
+wait()
+{
+ if [ "`echo $1 | head -c3`" == "mic" ]; then
+ _wait $1
+ return $?
+ fi
+ # Wait for the cards to go offline
+ for f in $sysfs/*
+ do
+ _wait `basename $f`
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && return $RETVAL
+ done
+ return 0
+}
+
+if [ ! -d "$sysfs" ]; then
+ echo -e $"Module unloaded "
+ exit 3
+fi
+
+case $1 in
+ -s)
+ status $2
+ ;;
+ -r)
+ reset $2
+ ;;
+ -b)
+ boot $2
+ ;;
+ -S)
+ shutdown $2
+ ;;
+ -w)
+ wait $2
+ ;;
+ *)
+ echo $"Usage: $0 {-s (status) |-r (reset) |-b (boot) |-S (shutdown) |-w (wait)}"
+ exit 2
+esac
+
+exit $?
diff --git a/Documentation/mic/mpssd/mpss b/Documentation/mic/mpssd/mpss
new file mode 100755
index 000000000000..3136c68dad0b
--- /dev/null
+++ b/Documentation/mic/mpssd/mpss
@@ -0,0 +1,202 @@
+#!/bin/bash
+# Intel MIC Platform Software Stack (MPSS)
+#
+# Copyright(c) 2013 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License, version 2, as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# The full GNU General Public License is included in this distribution in
+# the file called "COPYING".
+#
+# Intel MIC User Space Tools.
+#
+# mpss Start mpssd.
+#
+# chkconfig: 2345 95 05
+# description: start MPSS stack processing.
+#
+### BEGIN INIT INFO
+# Provides: mpss
+# Required-Start:
+# Required-Stop:
+# Short-Description: MPSS stack control
+# Description: MPSS stack control
+### END INIT INFO
+
+# Source function library.
+. /etc/init.d/functions
+
+exec=/usr/sbin/mpssd
+sysfs="/sys/class/mic"
+
+start()
+{
+ [ -x $exec ] || exit 5
+
+ if [ "`ps -e | awk '{print $4}' | grep mpssd | head -1`" = "mpssd" ]; then
+ echo -e $"MPSSD already running! "
+ success
+ echo
+ return 0
+ fi
+
+ echo -e $"Starting MPSS Stack"
+ echo -e $"Loading MIC_HOST Module"
+
+ # Ensure the driver is loaded
+ if [ ! -d "$sysfs" ]; then
+ modprobe mic_host
+ RETVAL=$?
+ if [ $RETVAL -ne 0 ]; then
+ failure
+ echo
+ return $RETVAL
+ fi
+ fi
+
+ # Start the daemon
+ echo -n $"Starting MPSSD "
+ $exec
+ RETVAL=$?
+ if [ $RETVAL -ne 0 ]; then
+ failure
+ echo
+ return $RETVAL
+ fi
+ success
+ echo
+
+ sleep 5
+
+ # Boot the cards
+ micctrl -b
+
+ # Wait till ping works
+ for f in $sysfs/*
+ do
+ count=100
+ ipaddr=`cat $f/cmdline`
+ ipaddr=${ipaddr#*address,}
+ ipaddr=`echo $ipaddr | cut -d, -f1 | cut -d\; -f1`
+ while [ $count -ge 0 ]
+ do
+ echo -e "Pinging "`basename $f`" "
+ ping -c 1 $ipaddr &> /dev/null
+ RETVAL=$?
+ if [ $RETVAL -eq 0 ]; then
+ success
+ break
+ fi
+ sleep 1
+ count=`expr $count - 1`
+ done
+ [ $RETVAL -ne 0 ] && failure || success
+ echo
+ done
+ return $RETVAL
+}
+
+stop()
+{
+ echo -e $"Shutting down MPSS Stack: "
+
+ # Bail out if module is unloaded
+ if [ ! -d "$sysfs" ]; then
+ echo -n $"Module unloaded "
+ success
+ echo
+ return 0
+ fi
+
+ # Shut down the cards.
+ micctrl -S
+
+ # Wait for the cards to go offline
+ for f in $sysfs/*
+ do
+ while [ "`cat $f/state`" != "offline" ]
+ do
+ sleep 1
+ echo -e "Waiting for "`basename $f`" to go offline"
+ done
+ done
+
+ # Display the status of the cards
+ micctrl -s
+
+ # Kill MPSSD now
+ echo -n $"Killing MPSSD"
+ killall -9 mpssd 2>/dev/null
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && failure || success
+ echo
+ return $RETVAL
+}
+
+restart()
+{
+ stop
+ sleep 5
+ start
+}
+
+status()
+{
+ micctrl -s
+ if [ "`ps -e | awk '{print $4}' | grep mpssd | head -n 1`" = "mpssd" ]; then
+ echo "mpssd is running"
+ else
+ echo "mpssd is stopped"
+ fi
+ return 0
+}
+
+unload()
+{
+ if [ ! -d "$sysfs" ]; then
+ echo -n $"No MIC_HOST Module: "
+ success
+ echo
+ return
+ fi
+
+ stop
+
+ sleep 5
+ echo -n $"Removing MIC_HOST Module: "
+ modprobe -r mic_host
+ RETVAL=$?
+ [ $RETVAL -ne 0 ] && failure || success
+ echo
+ return $RETVAL
+}
+
+case $1 in
+ start)
+ start
+ ;;
+ stop)
+ stop
+ ;;
+ restart)
+ restart
+ ;;
+ status)
+ status
+ ;;
+ unload)
+ unload
+ ;;
+ *)
+ echo $"Usage: $0 {start|stop|restart|status|unload}"
+ exit 2
+esac
+
+exit $?
diff --git a/Documentation/mic/mpssd/mpssd.c b/Documentation/mic/mpssd/mpssd.c
new file mode 100644
index 000000000000..8064804cdac3
--- /dev/null
+++ b/Documentation/mic/mpssd/mpssd.c
@@ -0,0 +1,1701 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel MIC User Space Tools.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <poll.h>
+#include <features.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_console.h>
+#include <linux/virtio_blk.h>
+#include <linux/version.h>
+#include "mpssd.h"
+#include <linux/mic_ioctl.h>
+#include <linux/mic_common.h>
+
+static void init_mic(struct mic_info *mic);
+
+static FILE *logfp;
+static struct mic_info mic_list;
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define min_t(type, x, y) ({ \
+ type __min1 = (x); \
+ type __min2 = (y); \
+ __min1 < __min2 ? __min1 : __min2; })
+
+/* align addr on a size boundary - adjust address up/down if needed */
+#define _ALIGN_DOWN(addr, size) ((addr)&(~((size)-1)))
+#define _ALIGN_UP(addr, size) _ALIGN_DOWN(addr + size - 1, size)
+
+/* align addr on a size boundary - adjust address up if needed */
+#define _ALIGN(addr, size) _ALIGN_UP(addr, size)
+
+/* to align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE)
+
+#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+#define GSO_ENABLED 1
+#define MAX_GSO_SIZE (64 * 1024)
+#define ETH_H_LEN 14
+#define MAX_NET_PKT_SIZE (_ALIGN_UP(MAX_GSO_SIZE + ETH_H_LEN, 64))
+#define MIC_DEVICE_PAGE_END 0x1000
+
+#ifndef VIRTIO_NET_HDR_F_DATA_VALID
+#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */
+#endif
+
+static struct {
+ struct mic_device_desc dd;
+ struct mic_vqconfig vqconfig[2];
+ __u32 host_features, guest_acknowledgements;
+ struct virtio_console_config cons_config;
+} virtcons_dev_page = {
+ .dd = {
+ .type = VIRTIO_ID_CONSOLE,
+ .num_vq = ARRAY_SIZE(virtcons_dev_page.vqconfig),
+ .feature_len = sizeof(virtcons_dev_page.host_features),
+ .config_len = sizeof(virtcons_dev_page.cons_config),
+ },
+ .vqconfig[0] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+ .vqconfig[1] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+};
+
+static struct {
+ struct mic_device_desc dd;
+ struct mic_vqconfig vqconfig[2];
+ __u32 host_features, guest_acknowledgements;
+ struct virtio_net_config net_config;
+} virtnet_dev_page = {
+ .dd = {
+ .type = VIRTIO_ID_NET,
+ .num_vq = ARRAY_SIZE(virtnet_dev_page.vqconfig),
+ .feature_len = sizeof(virtnet_dev_page.host_features),
+ .config_len = sizeof(virtnet_dev_page.net_config),
+ },
+ .vqconfig[0] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+ .vqconfig[1] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+#if GSO_ENABLED
+ .host_features = htole32(
+ 1 << VIRTIO_NET_F_CSUM |
+ 1 << VIRTIO_NET_F_GSO |
+ 1 << VIRTIO_NET_F_GUEST_TSO4 |
+ 1 << VIRTIO_NET_F_GUEST_TSO6 |
+ 1 << VIRTIO_NET_F_GUEST_ECN |
+ 1 << VIRTIO_NET_F_GUEST_UFO),
+#else
+ .host_features = 0,
+#endif
+};
+
+static const char *mic_config_dir = "/etc/sysconfig/mic";
+static const char *virtblk_backend = "VIRTBLK_BACKEND";
+static struct {
+ struct mic_device_desc dd;
+ struct mic_vqconfig vqconfig[1];
+ __u32 host_features, guest_acknowledgements;
+ struct virtio_blk_config blk_config;
+} virtblk_dev_page = {
+ .dd = {
+ .type = VIRTIO_ID_BLOCK,
+ .num_vq = ARRAY_SIZE(virtblk_dev_page.vqconfig),
+ .feature_len = sizeof(virtblk_dev_page.host_features),
+ .config_len = sizeof(virtblk_dev_page.blk_config),
+ },
+ .vqconfig[0] = {
+ .num = htole16(MIC_VRING_ENTRIES),
+ },
+ .host_features =
+ htole32(1<<VIRTIO_BLK_F_SEG_MAX),
+ .blk_config = {
+ .seg_max = htole32(MIC_VRING_ENTRIES - 2),
+ .capacity = htole64(0),
+ }
+};
+
+static char *myname;
+
+static int
+tap_configure(struct mic_info *mic, char *dev)
+{
+ pid_t pid;
+ char *ifargv[7];
+ char ipaddr[IFNAMSIZ];
+ int ret = 0;
+
+ pid = fork();
+ if (pid == 0) {
+ ifargv[0] = "ip";
+ ifargv[1] = "link";
+ ifargv[2] = "set";
+ ifargv[3] = dev;
+ ifargv[4] = "up";
+ ifargv[5] = NULL;
+ mpsslog("Configuring %s\n", dev);
+ ret = execvp("ip", ifargv);
+ if (ret < 0) {
+ mpsslog("%s execvp failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+ }
+ if (pid < 0) {
+ mpsslog("%s fork failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+
+ ret = waitpid(pid, NULL, 0);
+ if (ret < 0) {
+ mpsslog("%s waitpid failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+
+ snprintf(ipaddr, IFNAMSIZ, "172.31.%d.254/24", mic->id);
+
+ pid = fork();
+ if (pid == 0) {
+ ifargv[0] = "ip";
+ ifargv[1] = "addr";
+ ifargv[2] = "add";
+ ifargv[3] = ipaddr;
+ ifargv[4] = "dev";
+ ifargv[5] = dev;
+ ifargv[6] = NULL;
+ mpsslog("Configuring %s ipaddr %s\n", dev, ipaddr);
+ ret = execvp("ip", ifargv);
+ if (ret < 0) {
+ mpsslog("%s execvp failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+ }
+ if (pid < 0) {
+ mpsslog("%s fork failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+
+ ret = waitpid(pid, NULL, 0);
+ if (ret < 0) {
+ mpsslog("%s waitpid failed errno %s\n",
+ mic->name, strerror(errno));
+ return ret;
+ }
+ mpsslog("MIC name %s %s %d DONE!\n",
+ mic->name, __func__, __LINE__);
+ return 0;
+}
+
+static int tun_alloc(struct mic_info *mic, char *dev)
+{
+ struct ifreq ifr;
+ int fd, err;
+#if GSO_ENABLED
+ unsigned offload;
+#endif
+ fd = open("/dev/net/tun", O_RDWR);
+ if (fd < 0) {
+ mpsslog("Could not open /dev/net/tun %s\n", strerror(errno));
+ goto done;
+ }
+
+ memset(&ifr, 0, sizeof(ifr));
+
+ ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+ if (*dev)
+ strncpy(ifr.ifr_name, dev, IFNAMSIZ);
+
+ err = ioctl(fd, TUNSETIFF, (void *) &ifr);
+ if (err < 0) {
+ mpsslog("%s %s %d TUNSETIFF failed %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ close(fd);
+ return err;
+ }
+#if GSO_ENABLED
+ offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+ TUN_F_TSO_ECN | TUN_F_UFO;
+
+ err = ioctl(fd, TUNSETOFFLOAD, offload);
+ if (err < 0) {
+ mpsslog("%s %s %d TUNSETOFFLOAD failed %s\n",
+ mic->name, __func__, __LINE__, strerror(errno));
+ close(fd);
+ return err;
+ }
+#endif
+ strcpy(dev, ifr.ifr_name);
+ mpsslog("Created TAP %s\n", dev);
+done:
+ return fd;
+}
+
+#define NET_FD_VIRTIO_NET 0
+#define NET_FD_TUN 1
+#define MAX_NET_FD 2
+
+static void set_dp(struct mic_info *mic, int type, void *dp)
+{
+ switch (type) {
+ case VIRTIO_ID_CONSOLE:
+ mic->mic_console.console_dp = dp;
+ return;
+ case VIRTIO_ID_NET:
+ mic->mic_net.net_dp = dp;
+ return;
+ case VIRTIO_ID_BLOCK:
+ mic->mic_virtblk.block_dp = dp;
+ return;
+ }
+ mpsslog("%s %s %d not found\n", mic->name, __func__, type);
+ assert(0);
+}
+
+static void *get_dp(struct mic_info *mic, int type)
+{
+ switch (type) {
+ case VIRTIO_ID_CONSOLE:
+ return mic->mic_console.console_dp;
+ case VIRTIO_ID_NET:
+ return mic->mic_net.net_dp;
+ case VIRTIO_ID_BLOCK:
+ return mic->mic_virtblk.block_dp;
+ }
+ mpsslog("%s %s %d not found\n", mic->name, __func__, type);
+ assert(0);
+ return NULL;
+}
+
+static struct mic_device_desc *get_device_desc(struct mic_info *mic, int type)
+{
+ struct mic_device_desc *d;
+ int i;
+ void *dp = get_dp(mic, type);
+
+ for (i = mic_aligned_size(struct mic_bootparam); i < PAGE_SIZE;
+ i += mic_total_desc_size(d)) {
+ d = dp + i;
+
+ /* End of list */
+ if (d->type == 0)
+ break;
+
+ if (d->type == -1)
+ continue;
+
+ mpsslog("%s %s d-> type %d d %p\n",
+ mic->name, __func__, d->type, d);
+
+ if (d->type == (__u8)type)
+ return d;
+ }
+ mpsslog("%s %s %d not found\n", mic->name, __func__, type);
+ assert(0);
+ return NULL;
+}
+
+/* See comments in vhost.c for explanation of next_desc() */
+static unsigned next_desc(struct vring_desc *desc)
+{
+ unsigned int next;
+
+ if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT))
+ return -1U;
+ next = le16toh(desc->next);
+ return next;
+}
+
+/* Sum up all the IOVEC length */
+static ssize_t
+sum_iovec_len(struct mic_copy_desc *copy)
+{
+ ssize_t sum = 0;
+ int i;
+
+ for (i = 0; i < copy->iovcnt; i++)
+ sum += copy->iov[i].iov_len;
+ return sum;
+}
+
+static inline void verify_out_len(struct mic_info *mic,
+ struct mic_copy_desc *copy)
+{
+ if (copy->out_len != sum_iovec_len(copy)) {
+ mpsslog("%s %s %d BUG copy->out_len 0x%x len 0x%x\n",
+ mic->name, __func__, __LINE__,
+ copy->out_len, sum_iovec_len(copy));
+ assert(copy->out_len == sum_iovec_len(copy));
+ }
+}
+
+/* Display an iovec */
+static void
+disp_iovec(struct mic_info *mic, struct mic_copy_desc *copy,
+ const char *s, int line)
+{
+ int i;
+
+ for (i = 0; i < copy->iovcnt; i++)
+ mpsslog("%s %s %d copy->iov[%d] addr %p len 0x%lx\n",
+ mic->name, s, line, i,
+ copy->iov[i].iov_base, copy->iov[i].iov_len);
+}
+
+static inline __u16 read_avail_idx(struct mic_vring *vr)
+{
+ return ACCESS_ONCE(vr->info->avail_idx);
+}
+
+static inline void txrx_prepare(int type, bool tx, struct mic_vring *vr,
+ struct mic_copy_desc *copy, ssize_t len)
+{
+ copy->vr_idx = tx ? 0 : 1;
+ copy->update_used = true;
+ if (type == VIRTIO_ID_NET)
+ copy->iov[1].iov_len = len - sizeof(struct virtio_net_hdr);
+ else
+ copy->iov[0].iov_len = len;
+}
+
+/* Central API which triggers the copies */
+static int
+mic_virtio_copy(struct mic_info *mic, int fd,
+ struct mic_vring *vr, struct mic_copy_desc *copy)
+{
+ int ret;
+
+ ret = ioctl(fd, MIC_VIRTIO_COPY_DESC, copy);
+ if (ret) {
+ mpsslog("%s %s %d errno %s ret %d\n",
+ mic->name, __func__, __LINE__,
+ strerror(errno), ret);
+ }
+ return ret;
+}
+
+/*
+ * This initialization routine requires at least one
+ * vring i.e. vr0. vr1 is optional.
+ */
+static void *
+init_vr(struct mic_info *mic, int fd, int type,
+ struct mic_vring *vr0, struct mic_vring *vr1, int num_vq)
+{
+ int vr_size;
+ char *va;
+
+ vr_size = PAGE_ALIGN(vring_size(MIC_VRING_ENTRIES,
+ MIC_VIRTIO_RING_ALIGN) + sizeof(struct _mic_vring_info));
+ va = mmap(NULL, MIC_DEVICE_PAGE_END + vr_size * num_vq,
+ PROT_READ, MAP_SHARED, fd, 0);
+ if (MAP_FAILED == va) {
+ mpsslog("%s %s %d mmap failed errno %s\n",
+ mic->name, __func__, __LINE__,
+ strerror(errno));
+ goto done;
+ }
+ set_dp(mic, type, va);
+ vr0->va = (struct mic_vring *)&va[MIC_DEVICE_PAGE_END];
+ vr0->info = vr0->va +
+ vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN);
+ vring_init(&vr0->vr,
+ MIC_VRING_ENTRIES, vr0->va, MIC_VIRTIO_RING_ALIGN);
+ mpsslog("%s %s vr0 %p vr0->info %p vr_size 0x%x vring 0x%x ",
+ __func__, mic->name, vr0->va, vr0->info, vr_size,
+ vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN));
+ mpsslog("magic 0x%x expected 0x%x\n",
+ vr0->info->magic, MIC_MAGIC + type);
+ assert(vr0->info->magic == MIC_MAGIC + type);
+ if (vr1) {
+ vr1->va = (struct mic_vring *)
+ &va[MIC_DEVICE_PAGE_END + vr_size];
+ vr1->info = vr1->va + vring_size(MIC_VRING_ENTRIES,
+ MIC_VIRTIO_RING_ALIGN);
+ vring_init(&vr1->vr,
+ MIC_VRING_ENTRIES, vr1->va, MIC_VIRTIO_RING_ALIGN);
+ mpsslog("%s %s vr1 %p vr1->info %p vr_size 0x%x vring 0x%x ",
+ __func__, mic->name, vr1->va, vr1->info, vr_size,
+ vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN));
+ mpsslog("magic 0x%x expected 0x%x\n",
+ vr1->info->magic, MIC_MAGIC + type + 1);
+ assert(vr1->info->magic == MIC_MAGIC + type + 1);
+ }
+done:
+ return va;
+}
+
+static void
+wait_for_card_driver(struct mic_info *mic, int fd, int type)
+{
+ struct pollfd pollfd;
+ int err;
+ struct mic_device_desc *desc = get_device_desc(mic, type);
+
+ pollfd.fd = fd;
+ mpsslog("%s %s Waiting .... desc-> type %d status 0x%x\n",
+ mic->name, __func__, type, desc->status);
+ while (1) {
+ pollfd.events = POLLIN;
+ pollfd.revents = 0;
+ err = poll(&pollfd, 1, -1);
+ if (err < 0) {
+ mpsslog("%s %s poll failed %s\n",
+ mic->name, __func__, strerror(errno));
+ continue;
+ }
+
+ if (pollfd.revents) {
+ mpsslog("%s %s Waiting... desc-> type %d status 0x%x\n",
+ mic->name, __func__, type, desc->status);
+ if (desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ mpsslog("%s %s poll.revents %d\n",
+ mic->name, __func__, pollfd.revents);
+ mpsslog("%s %s desc-> type %d status 0x%x\n",
+ mic->name, __func__, type,
+ desc->status);
+ break;
+ }
+ }
+ }
+}
+
+/* Spin till we have some descriptors */
+static void
+spin_for_descriptors(struct mic_info *mic, struct mic_vring *vr)
+{
+ __u16 avail_idx = read_avail_idx(vr);
+
+ while (avail_idx == le16toh(ACCESS_ONCE(vr->vr.avail->idx))) {
+#ifdef DEBUG
+ mpsslog("%s %s waiting for desc avail %d info_avail %d\n",
+ mic->name, __func__,
+ le16toh(vr->vr.avail->idx), vr->info->avail_idx);
+#endif
+ sched_yield();
+ }
+}
+
+static void *
+virtio_net(void *arg)
+{
+ static __u8 vnet_hdr[2][sizeof(struct virtio_net_hdr)];
+ static __u8 vnet_buf[2][MAX_NET_PKT_SIZE] __aligned(64);
+ struct iovec vnet_iov[2][2] = {
+ { { .iov_base = vnet_hdr[0], .iov_len = sizeof(vnet_hdr[0]) },
+ { .iov_base = vnet_buf[0], .iov_len = sizeof(vnet_buf[0]) } },
+ { { .iov_base = vnet_hdr[1], .iov_len = sizeof(vnet_hdr[1]) },
+ { .iov_base = vnet_buf[1], .iov_len = sizeof(vnet_buf[1]) } },
+ };
+ struct iovec *iov0 = vnet_iov[0], *iov1 = vnet_iov[1];
+ struct mic_info *mic = (struct mic_info *)arg;
+ char if_name[IFNAMSIZ];
+ struct pollfd net_poll[MAX_NET_FD];
+ struct mic_vring tx_vr, rx_vr;
+ struct mic_copy_desc copy;
+ struct mic_device_desc *desc;
+ int err;
+
+ snprintf(if_name, IFNAMSIZ, "mic%d", mic->id);
+ mic->mic_net.tap_fd = tun_alloc(mic, if_name);
+ if (mic->mic_net.tap_fd < 0)
+ goto done;
+
+ if (tap_configure(mic, if_name))
+ goto done;
+ mpsslog("MIC name %s id %d\n", mic->name, mic->id);
+
+ net_poll[NET_FD_VIRTIO_NET].fd = mic->mic_net.virtio_net_fd;
+ net_poll[NET_FD_VIRTIO_NET].events = POLLIN;
+ net_poll[NET_FD_TUN].fd = mic->mic_net.tap_fd;
+ net_poll[NET_FD_TUN].events = POLLIN;
+
+ if (MAP_FAILED == init_vr(mic, mic->mic_net.virtio_net_fd,
+ VIRTIO_ID_NET, &tx_vr, &rx_vr,
+ virtnet_dev_page.dd.num_vq)) {
+ mpsslog("%s init_vr failed %s\n",
+ mic->name, strerror(errno));
+ goto done;
+ }
+
+ copy.iovcnt = 2;
+ desc = get_device_desc(mic, VIRTIO_ID_NET);
+
+ while (1) {
+ ssize_t len;
+
+ net_poll[NET_FD_VIRTIO_NET].revents = 0;
+ net_poll[NET_FD_TUN].revents = 0;
+
+ /* Start polling for data from tap and virtio net */
+ err = poll(net_poll, 2, -1);
+ if (err < 0) {
+ mpsslog("%s poll failed %s\n",
+ __func__, strerror(errno));
+ continue;
+ }
+ if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK))
+ wait_for_card_driver(mic, mic->mic_net.virtio_net_fd,
+ VIRTIO_ID_NET);
+ /*
+ * Check if there is data to be read from TUN and write to
+ * virtio net fd if there is.
+ */
+ if (net_poll[NET_FD_TUN].revents & POLLIN) {
+ copy.iov = iov0;
+ len = readv(net_poll[NET_FD_TUN].fd,
+ copy.iov, copy.iovcnt);
+ if (len > 0) {
+ struct virtio_net_hdr *hdr
+ = (struct virtio_net_hdr *) vnet_hdr[0];
+
+ /* Disable checksums on the card since we are on
+ a reliable PCIe link */
+ hdr->flags |= VIRTIO_NET_HDR_F_DATA_VALID;
+#ifdef DEBUG
+ mpsslog("%s %s %d hdr->flags 0x%x ", mic->name,
+ __func__, __LINE__, hdr->flags);
+ mpsslog("copy.out_len %d hdr->gso_type 0x%x\n",
+ copy.out_len, hdr->gso_type);
+#endif
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__, __LINE__);
+ mpsslog("%s %s %d read from tap 0x%lx\n",
+ mic->name, __func__, __LINE__,
+ len);
+#endif
+ spin_for_descriptors(mic, &tx_vr);
+ txrx_prepare(VIRTIO_ID_NET, 1, &tx_vr, &copy,
+ len);
+
+ err = mic_virtio_copy(mic,
+ mic->mic_net.virtio_net_fd, &tx_vr,
+ &copy);
+ if (err < 0) {
+ mpsslog("%s %s %d mic_virtio_copy %s\n",
+ mic->name, __func__, __LINE__,
+ strerror(errno));
+ }
+ if (!err)
+ verify_out_len(mic, &copy);
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__, __LINE__);
+ mpsslog("%s %s %d wrote to net 0x%lx\n",
+ mic->name, __func__, __LINE__,
+ sum_iovec_len(&copy));
+#endif
+ /* Reinitialize IOV for next run */
+ iov0[1].iov_len = MAX_NET_PKT_SIZE;
+ } else if (len < 0) {
+ disp_iovec(mic, &copy, __func__, __LINE__);
+ mpsslog("%s %s %d read failed %s ", mic->name,
+ __func__, __LINE__, strerror(errno));
+ mpsslog("cnt %d sum %d\n",
+ copy.iovcnt, sum_iovec_len(&copy));
+ }
+ }
+
+ /*
+ * Check if there is data to be read from virtio net and
+ * write to TUN if there is.
+ */
+ if (net_poll[NET_FD_VIRTIO_NET].revents & POLLIN) {
+ while (rx_vr.info->avail_idx !=
+ le16toh(rx_vr.vr.avail->idx)) {
+ copy.iov = iov1;
+ txrx_prepare(VIRTIO_ID_NET, 0, &rx_vr, &copy,
+ MAX_NET_PKT_SIZE
+ + sizeof(struct virtio_net_hdr));
+
+ err = mic_virtio_copy(mic,
+ mic->mic_net.virtio_net_fd, &rx_vr,
+ &copy);
+ if (!err) {
+#ifdef DEBUG
+ struct virtio_net_hdr *hdr
+ = (struct virtio_net_hdr *)
+ vnet_hdr[1];
+
+ mpsslog("%s %s %d hdr->flags 0x%x, ",
+ mic->name, __func__, __LINE__,
+ hdr->flags);
+ mpsslog("out_len %d gso_type 0x%x\n",
+ copy.out_len,
+ hdr->gso_type);
+#endif
+ /* Set the correct output iov_len */
+ iov1[1].iov_len = copy.out_len -
+ sizeof(struct virtio_net_hdr);
+ verify_out_len(mic, &copy);
+#ifdef DEBUG
+ disp_iovec(mic, copy, __func__,
+ __LINE__);
+ mpsslog("%s %s %d ",
+ mic->name, __func__, __LINE__);
+ mpsslog("read from net 0x%lx\n",