This is xnu-11215.1.10. See this file in:
/*
 * Copyright (c) 2016-2024 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/* This file contains useful utility routines, but contrary to skywalk_test_common
 * Do not operate on a single set of static objects
 */

/*
 * Copyright (c) 1988, 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 *	@(#)in_cksum.c	8.1 (Berkeley) 6/10/93
 */


#include <err.h>
#include <assert.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <unistd.h>
#include <poll.h>
#include <sys/event.h>
#include <uuid/uuid.h>
#include <arpa/inet.h>
#include <stddef.h>
#include <sysexits.h>
#include <sys/types.h>
#include <sys/sysctl.h>
#include <net/if_utun.h>
#include <net/if_ipsec.h>
#include <netinet/ip6.h>
#include <sys/kern_control.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/kern_control.h>
#include <sys/sys_domain.h>
#include <ifaddrs.h>
#include <sys/fcntl.h>
#include <sys/kern_control.h>
#include <sys/sys_domain.h>
#include <net/if_utun.h>
#include <os/log.h>

#include <net/pfkeyv2.h>
#include <netinet6/ipsec.h>
#include <darwintest.h>

#include "skywalk_test_driver.h"
#include "skywalk_test_common.h" // XXX remove this
#include "skywalk_test_utils.h"

#define SIN(s)          ((struct sockaddr_in *)(void *)s)
#define SIN6(s)          ((struct sockaddr_in6 *)(void *)s)

void
sktc_build_nexus(nexus_controller_t ncd, struct sktc_nexus_attr *sktc_attr,
    uuid_t *providerp, uuid_t *instancep)
{
	nexus_attr_t attr;
	int error;
	uint64_t scratch;

	attr = os_nexus_attr_create();
	assert(attr);

	if (sktc_attr->anonymous != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_ANONYMOUS,
		    sktc_attr->anonymous);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->userchannel != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_USER_CHANNEL,
		    sktc_attr->userchannel);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->ntxrings != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_RINGS,
		    sktc_attr->ntxrings);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->nrxrings != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_RINGS,
		    sktc_attr->nrxrings);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->ntxslots != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_SLOTS,
		    sktc_attr->ntxslots);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->nrxslots != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_SLOTS,
		    sktc_attr->nrxslots);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->slotsize != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE,
		    sktc_attr->slotsize);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->metasize != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_META_SIZE,
		    sktc_attr->metasize);
		SKTC_ASSERT_ERR(error == ENOTSUP);
	}
	if (sktc_attr->maxfrags != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
		    sktc_attr->maxfrags);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->rejectonclose != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_REJECT_ON_CLOSE,
		    sktc_attr->rejectonclose);
		SKTC_ASSERT_ERR(!error);
	}

	uuid_clear(*providerp);
	error = os_nexus_controller_register_provider(ncd,
	    sktc_attr->name, sktc_attr->type, attr, providerp);
	SKTC_ASSERT_ERR(!error);
	assert(!uuid_is_null(*providerp));

	/* Clear the parameters to make sure they are being read */
	error = os_nexus_attr_set(attr, NEXUS_ATTR_ANONYMOUS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_RINGS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_RINGS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_SLOTS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_SLOTS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_META_SIZE, -1);
	SKTC_ASSERT_ERR(error == ENOTSUP);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_EXTENSIONS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_REJECT_ON_CLOSE, -1);
	SKTC_ASSERT_ERR(!error);

	error = os_nexus_controller_read_provider_attr(ncd,
	    *providerp, attr);
	SKTC_ASSERT_ERR(!error);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_ANONYMOUS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->anonymous == -1 || sktc_attr->anonymous == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_USER_CHANNEL, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->userchannel == -1 ||
	    sktc_attr->userchannel == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_TX_RINGS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->ntxrings == -1 || sktc_attr->ntxrings == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_RX_RINGS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->nrxrings == -1 || sktc_attr->nrxrings == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_TX_SLOTS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->ntxslots == -1 || sktc_attr->ntxslots == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_RX_SLOTS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->nrxslots == -1 || sktc_attr->nrxslots == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_SLOT_BUF_SIZE, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->slotsize == -1 || sktc_attr->slotsize == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_SLOT_META_SIZE, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->metasize == -1 || sktc_attr->metasize == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_MAX_FRAGS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->maxfrags == -1 || sktc_attr->maxfrags == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_REJECT_ON_CLOSE, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->rejectonclose == -1 ||
	    sktc_attr->rejectonclose == scratch);

	os_nexus_attr_destroy(attr);

	if (instancep) {
		uuid_clear(*instancep);
		error = os_nexus_controller_alloc_provider_instance(ncd,
		    *providerp, instancep);
		SKTC_ASSERT_ERR(!error);
		assert(!uuid_is_null(*instancep));
	}
}

/* up to 4 seconds of retries (250ms delay per retry) */
#define SKTU_CHANNEL_CREATE_NOMEM_RETRIES       16

channel_t
sktu_channel_create_extended(const uuid_t uuid,
    const nexus_port_t port, const ring_dir_t dir,
    const ring_id_t rid, const channel_attr_t attr,
    uint64_t exclusive, uint64_t monitor,
    uint64_t txlowatunit, uint64_t txlowatval,
    uint64_t rxlowatunit, uint64_t rxlowatval,
    uint64_t userpacketpool, uint64_t defunctok,
    uint64_t event_ring, uint64_t low_latency)
{
	channel_attr_t tmpattr;
	int error;
	uint64_t scratch;
	static struct timespec delay250ms = { .tv_sec = 0, .tv_nsec = 250000000 };
	uint32_t retries = 0;
	channel_t ret = NULL;

	if (!attr) {
		tmpattr = os_channel_attr_create();
	} else {
		tmpattr = attr;
	}

	if (exclusive != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_EXCLUSIVE, exclusive);
		SKTC_ASSERT_ERR(!error);
	}

	if (monitor != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_MONITOR, monitor);
		SKTC_ASSERT_ERR(!error);
	}

	if (txlowatunit != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_TX_LOWAT_UNIT, txlowatunit);
		SKTC_ASSERT_ERR(!error);
	}

	if (txlowatval != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_TX_LOWAT_VALUE, txlowatval);
		SKTC_ASSERT_ERR(!error);
	}

	if (rxlowatunit != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_RX_LOWAT_UNIT, rxlowatunit);
		SKTC_ASSERT_ERR(!error);
	}

	if (rxlowatval != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_RX_LOWAT_VALUE, rxlowatval);
		SKTC_ASSERT_ERR(!error);
	}

	if (userpacketpool != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_USER_PACKET_POOL, userpacketpool);
		SKTC_ASSERT_ERR(!error);
	}

	if (defunctok != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_NEXUS_DEFUNCT_OK, defunctok);
		SKTC_ASSERT_ERR(!error);
	}

	if (event_ring != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_EVENT_RING, event_ring);
		SKTC_ASSERT_ERR(!error);
	}

	if (low_latency != -1) {
		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_LOW_LATENCY, low_latency);
		SKTC_ASSERT_ERR(!error);
	}

retry:
	ret = os_channel_create_extended(uuid, port, dir, rid, tmpattr);
	if (ret == NULL) {
		if (errno == ENOMEM && ++retries < SKTU_CHANNEL_CREATE_NOMEM_RETRIES) {
			nanosleep(&delay250ms, NULL);
			goto retry;
		}
		goto out;
	}

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_EXCLUSIVE, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != 1);
	assert(exclusive == -1 || exclusive == scratch);

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_MONITOR, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(exclusive == -1 || monitor == scratch);

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_TX_LOWAT_UNIT, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(exclusive == -1 || txlowatunit == scratch);

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_TX_LOWAT_VALUE, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(exclusive == -1 || txlowatval == scratch);

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_RX_LOWAT_UNIT, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(exclusive == -1 || rxlowatunit == scratch);

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_RX_LOWAT_VALUE, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(exclusive == -1 || rxlowatval == scratch);

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_USER_PACKET_POOL, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(exclusive == -1 || userpacketpool == scratch);

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_NEXUS_DEFUNCT_OK, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(exclusive == -1 || defunctok == scratch);

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_EVENT_RING, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(exclusive == -1 || event_ring == scratch);

	scratch = -1;
	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_LOW_LATENCY, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(exclusive == -1 || low_latency == scratch);

out:
	if (!attr) {
		os_channel_attr_destroy(tmpattr);
	}

	return ret;
}

/****************************************************************/

static inline void
swap(int *permute, int i, int j)
{
	int tmp = permute[i];
	permute[i] = permute[j];
	permute[j] = tmp;
}


/* Plain changes, see Knuth (7.2.1.2) "Algorithm P"
 * has advantage of only swapping adjacent pairs
 * This could be cleaned up to be more "C" like, but
 * this literal translation works without fanfare.
 */
void
permutefuncP(int n, int *permute, void (*func)(int, int *permute))
{
	int j, s, q;
	int c[n], o[n];
	/* P1 Initialize. */
	for (j = 0; j < n; j++) {
		c[j] = 0;
		o[j] = 1;
	}
p2:
	/* P2 Visit. */
	func(n, permute);
	/* P3 Prepare for change. */
	j = n;
	s = 0;
p4:
	/* P4 Ready to change? */
	q = c[j - 1] + o[j - 1];
	if (q < 0) {
		goto p7;
	}
	if (q == j) {
		goto p6;
	}
	/* P5 Change. */
	{
		//T_LOG("Swapping %d with %d\n", j-c[j-1]+s-1, j-q+s-1);
		swap(permute, j - c[j - 1] + s - 1, j - q + s - 1);
	}
	c[j - 1] = q;
	goto p2;
p6:     /* P6 Increase s */
	if (j == 1) {
		return;
	}
	s++;
p7:     /* P7 Switch Direction */
	o[j - 1] = -o[j - 1];
	j--;
	goto p4;
}

/* Heap's algorithm */
void
permutefuncH(int n, int *permute, void (*func)(int, int *permute))
{
	time_t start = time(NULL);
	time_t now, then = start;
	int count = 0;
	int total = 1;
	int i = 0;
	int c[n];
	memset(c, 0, sizeof(c));
	for (int f = 2; f <= n; f++) {
		total *= f;
	}
	count++;
	func(n, permute);
	while (i < n) {
		if (c[i] < i) {
			if (!(i & 1)) { /* Even */
				swap(permute, i, 0);
			} else { /* Odd */
				swap(permute, i, c[i]);
			}
			count++;
			{
				now = time(NULL);
				if (now > then) {
					T_LOG("time %ld on %d of %d (%2.2f%%, est %ld secs left)\n",
					    now - start, count, total,
					    (double)count * 100 / total,
					    (long)((double)(now - start) * total / count) - (now - start));
					then = now;
				}
			}
			func(n, permute);
			c[i] += 1;
			i = 0;
		} else {
			c[i] = 0;
			i++;
		}
	}
	now = time(NULL);
	T_LOG("total time %ld for %d permutations (rate %.2f)\n",
	    now - start, total, (double)total / (now - start));
}

/* Random permutations, knuth's shuffle */

void
permutefuncR(int n, int *permute, void (*func)(int, int *permute), int total, unsigned seed)
{
	time_t start = time(NULL);
	time_t now, then = start;
	int count = 0;
	T_LOG("Starting %d random permutations with seed %u\n", total, seed);
	srandom(seed);
	while (count < total) {
		for (int i = n - 1; i > 0; i--) {
			int j = random() % i; // XXX modulo bias.
			swap(permute, i, j);
		}
		count++;
		{
			now = time(NULL);
			if (now > then) {
				T_LOG("time %ld on %d of %d (%2.2f%%, est %ld secs left)\n",
				    now - start, count, total,
				    (double)count * 100 / total,
				    (long)((double)(now - start) * total / count) - (now - start));
				then = now;
			}
		}
		func(n, permute);
	}
	now = time(NULL);
	T_LOG("total time %ld for %d permutations (rate %.2f)\n",
	    now - start, total, (double)total / (now - start));
}


/*
 * rakes each element across all other elements.
 */
void
permutefuncZ(int n, int *permute, void (*func)(int, int *permute))
{
	int save[n];
	memcpy(save, permute, sizeof(save));
	func(n, permute);
	for (int i = 0; i < n; i++) {
		//T_LOG("raking %d left\n", i);
		memcpy(permute, save, sizeof(save));
		for (int j = i; j > 0; j--) {
			swap(permute, j, j - 1);
			func(n, permute);
		}
		//T_LOG("raking %d right\n", i);
		memcpy(permute, save, sizeof(save));
		for (int j = i; j < n - 1; j++) {
			swap(permute, j, j + 1);
			/* The first right is the same as the last left, so skip it */
			if (j != i) {
				func(n, permute);
			}
		}
	}
}

/****************************************************************/

void
sktc_create_flowswitch_no_address(struct sktc_nexus_handles *handles,
    uint64_t ntxslots, uint64_t nrxslots, uint64_t buf_size, uint64_t max_frags,
    uint64_t anonymous)
{
	char buf[256];
	int error;
	struct sktc_nexus_attr attr = SKTC_NEXUS_ATTR_INIT();

	attr.ntxslots = ntxslots;
	attr.nrxslots = nrxslots;
	attr.slotsize = buf_size;
	attr.anonymous = anonymous;
	attr.maxfrags = max_frags;

	if (handles->netif_ifname[0] == '\0') {
		T_LOG("%s: no interface name specified\n",
		    __func__);
		return;
	}
	if (strlen(handles->netif_ifname) >= IFNAMSIZ) {
		T_LOG("%s: invalid interface name specified %s\n",
		    __func__, handles->netif_ifname);
		return;
	}
	handles->controller = os_nexus_controller_create();
	if (handles->controller == NULL) {
		SKT_LOG(
			"%s: os_nexus_controller_create failed, %s (%d)\n",
			__func__, strerror(errno), errno);
		return;
	}

	snprintf(buf, sizeof(buf), "ms_fsw_%s", handles->netif_ifname);
	strncpy((char *)attr.name, buf, sizeof(nexus_name_t) - 1);
	attr.type = NEXUS_TYPE_FLOW_SWITCH;
	sktc_build_nexus(handles->controller, &attr, &handles->fsw_prov_uuid,
	    &handles->fsw_nx_uuid);

	/* if the netif is already present, don't bother creating/attaching */
	if (!sktc_get_netif_nexus(handles->netif_ifname,
	    handles->netif_nx_uuid)) {
		snprintf(buf, sizeof(buf), "netif_%s", handles->netif_ifname);
		strncpy((char *)attr.name, buf, sizeof(nexus_name_t) - 1);
		attr.type = NEXUS_TYPE_NET_IF;
		attr.ntxslots = -1;
		attr.nrxslots = -1;
		sktc_build_nexus(handles->controller, &attr,
		    &handles->netif_prov_uuid, &handles->netif_nx_uuid);
		error = __os_nexus_ifattach(handles->controller,
		    handles->netif_nx_uuid,
		    handles->netif_ifname, NULL,
		    false,
		    &handles->netif_nx_attach_uuid);
		if (error != 0) {
			SKT_LOG(
				"__os_nexus_ifattach(%s) failed, %s (%d)\n",
				buf, strerror(errno), errno);
			return;
		}
	}
	error = __os_nexus_ifattach(handles->controller, handles->fsw_nx_uuid,
	    NULL, handles->netif_nx_uuid, false, &handles->fsw_nx_dev_attach_uuid);
	if (error != 0) {
		SKT_LOG("__os_nexus_ifattach() failed, %s (%d)\n",
		    strerror(errno), errno);
		return;
	}
}


void
sktc_nexus_handles_assign_address(struct sktc_nexus_handles *handles)
{
	int             error;

	error = sktc_ifnet_add_addr(handles->netif_ifname,
	    &handles->netif_addr,
	    &handles->netif_mask, NULL);
	SKTC_ASSERT_ERR(!error);
}

void
sktc_create_flowswitch(struct sktc_nexus_handles *handles, int i)
{
	uint16_t        val;

	/* assign the name */
	snprintf(handles->netif_ifname, sizeof(handles->netif_ifname),
	    FETH_FORMAT, i);

	/* pick/assign a random IPv4LL address */
	val = random() % 0xffff;
	/* avoid subnet broadcast and host address 0 */
	if (((val & 0xff) == 0) || ((val & 0xff) == 0xff)) {
		val = (val & 0xfff0) | 0x2;
	}
	handles->netif_addr = sktc_make_in_addr(IN_LINKLOCALNETNUM | val);
	handles->netif_mask = sktc_make_in_addr(IN_CLASSC_NET);
	sktc_nexus_handles_assign_address(handles);

	/* create the flowswitch */
	sktc_create_flowswitch_no_address(handles, -1, -1, -1, -1, 1);
}

void
sktc_cleanup_flowswitch(struct sktc_nexus_handles *handles)
{
	int error;

	assert(handles->controller);
	assert(!uuid_is_null(handles->fsw_prov_uuid));
	assert(!uuid_is_null(handles->fsw_nx_uuid));

	error = os_nexus_controller_free_provider_instance(handles->controller,
	    handles->fsw_nx_uuid);
	SKTC_ASSERT_ERR(!error);

	error = os_nexus_controller_deregister_provider(handles->controller,
	    handles->fsw_prov_uuid);
	SKTC_ASSERT_ERR(!error);

	os_nexus_controller_destroy(handles->controller);

	error = sktc_ifnet_del_addr(handles->netif_ifname, &handles->netif_addr);
	SKTC_ASSERT_ERR(!error);
}

/****************************************************************/

int
sktc_bind_tcp4_flow(nexus_controller_t ncd, const uuid_t fsw, in_port_t in_port, nexus_port_t nx_port, const uuid_t flow)
{
	struct nx_flow_req nfr;
	int error;

	memset(&nfr, 0, sizeof(nfr));
	nfr.nfr_ip_protocol = IPPROTO_TCP;
	nfr.nfr_nx_port = nx_port;
	nfr.nfr_saddr.sa.sa_len = sizeof(struct sockaddr_in);
	nfr.nfr_saddr.sa.sa_family = AF_INET;
	nfr.nfr_saddr.sin.sin_port = htons(in_port);
	nfr.nfr_saddr.sin.sin_addr.s_addr = htonl(INADDR_ANY);
	uuid_copy(nfr.nfr_flow_uuid, flow);

#if 0
	char buf[31];
	uuid_string_t uuidstr;
	uuid_unparse(nfr.nfr_flow_uuid, uuidstr);
	inet_ntop(AF_INET, &nfr.nfr_saddr.sin.sin_addr.s_addr, buf, sizeof(buf));
	T_LOG("before: nx_port %3d Flow %s %s addr %s port %d\n",
	    nfr.nfr_nx_port, uuidstr, (nfr.nfr_ip_protocol == IPPROTO_TCP) ? "tcp" : "udp",
	    buf, ntohs(nfr.nfr_saddr.sin.sin_port));
#endif

	error = __os_nexus_flow_add(ncd, fsw, &nfr);
#if 0
	if (error) {
		T_LOG("__os_nexus_flow_add returned %d, errno %d\n", error, errno);
	}
#endif

#if 0
	uuid_unparse(nfr.nfr_flow_uuid, uuidstr);
	inet_ntop(AF_INET, &nfr.nfr_saddr.sin.sin_addr.s_addr, buf, sizeof(buf));
	T_LOG("after:  nx_port %3d Flow %s %s addr %s port %d\n",
	    nfr.nfr_nx_port, uuidstr, (nfr.nfr_ip_protocol == IPPROTO_TCP) ? "tcp" : "udp",
	    buf, ntohs(nfr.nfr_saddr.sin.sin_port));
#endif

	// XXX fails, see the fswbind25 for standalone test for this
	assert(nfr.nfr_nx_port == nx_port);
	T_LOG("got ephemeral port %d\n", ntohs(nfr.nfr_saddr.sin.sin_port));

	/* Validate the ephemeral ports */
	if (!error && !in_port) {
		static int first, last;
		if (!first && !last) {
			size_t size;

			size = sizeof(first);
			error = sysctlbyname("net.inet.ip.portrange.first", &first, &size, NULL, 0);
			SKTC_ASSERT_ERR(!error);
			assert(size == sizeof(first));

			size = sizeof(last);
			error = sysctlbyname("net.inet.ip.portrange.last", &last, &size, NULL, 0);
			SKTC_ASSERT_ERR(!error);
			assert(size == sizeof(last));

			T_LOG("ephemeral port range first %d last %d\n", first, last);

			if (last < first) {
				int tmp = first;
				first = last;
				last = tmp;
			}
			assert(first <= last);
		}
		assert(ntohs(nfr.nfr_saddr.sin.sin_port) >= first);
		assert(ntohs(nfr.nfr_saddr.sin.sin_port) <= last);
	}

	return error;
}

int
sktc_unbind_flow(nexus_controller_t ncd, const uuid_t fsw, const uuid_t flow)
{
	struct nx_flow_req nfr;
	int error;

	memset(&nfr, 0, sizeof(nfr));
	uuid_copy(nfr.nfr_flow_uuid, flow);

	error = __os_nexus_flow_del(ncd, fsw, &nfr);
	if (error) {
		SKT_LOG("__os_nexus_flow_add returned %d, errno %d\n", error, errno);
	}
	return error;
}

/****************************************************************/

uint32_t
sktc_chew_random(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint32_t nslots)
{
	uint64_t count = 0;
	int error;
	channel_slot_t slot;

	/* Chew a random number of slots */
	nslots = random() % (nslots + 1);

	slot = NULL;
	while (count < nslots) {
		slot_prop_t prop;

		slot = os_channel_get_next_slot(ring, slot, &prop);
		assert(slot);
		if (mode == CHANNEL_SYNC_TX) {
			packet_t pkt = os_channel_slot_get_packet(ring, slot);
			buflet_t buf = os_packet_get_next_buflet(pkt, NULL);
			assert(buf != NULL);
			uint16_t bdlim = os_buflet_get_data_limit(buf);
			assert(bdlim != 0);
			prop.sp_len = random() % bdlim;
			os_channel_set_slot_properties(ring, slot, &prop);
		}
		count++;
	}

	if (slot) {
		error = os_channel_advance_slot(ring, slot);
		SKTC_ASSERT_ERR(!error);
	}

	if (dosync) {
		error = os_channel_sync(channel, mode);
		if (skywalk_in_driver && error) {
			SKT_LOG("%s: sync fail error %d errno %d: %s\n", __func__, error, errno, strerror(errno));
		} else {
			SKTC_ASSERT_ERR(!error);
		}
	}

	return count;
}

/* This pumps slots on a ring until count slots have been tranferred */
void
sktc_pump_ring_nslots_kq(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint64_t nslots, bool verbose)
{
	uint64_t count = 0;
	int channelfd;
	int kq;
	struct kevent kev;
	int error;
	time_t start, then;

	channelfd = os_channel_get_fd(channel);
	assert(channelfd != -1);

	kq = kqueue();
	assert(kq != -1);
	EV_SET(&kev, channelfd,
	    mode == CHANNEL_SYNC_TX ? EVFILT_WRITE : EVFILT_READ,
	    EV_ADD | EV_ENABLE, 0, 0, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
	SKTC_ASSERT_ERR(!error);

	if (verbose) {
		then = start = time(NULL);
	}

	while (count < nslots) {
		uint32_t avail;

		if (verbose) {
			time_t now = time(NULL);
			if (now > then) {
				T_LOG("time %ld pump %"PRId64" of %"PRId64" (%2.2f%%, est %ld secs left)\n",
				    now - start, count, nslots,
				    (double)count * 100 / nslots,
				    (long)((double)(now - start) * nslots / count) - (now - start));
				then = now;
			}
		}

		avail = os_channel_available_slot_count(ring);

		if (!avail) {
			int error;

			memset(&kev, 0, sizeof(kev));
			error = kevent(kq, NULL, 0, &kev, 1, NULL);
			SKTC_ASSERT_ERR(error != -1);
			SKTC_ASSERT_ERR(error == 1);

			assert(kev.ident == channelfd);
			if (mode == CHANNEL_SYNC_TX) {
				assert(kev.filter == EVFILT_WRITE);
			} else {
				assert(kev.filter == EVFILT_READ);
			}

			avail = os_channel_available_slot_count(ring);
			assert(avail);
		}

		count += sktc_chew_random(channel, ring, mode, dosync, MIN(nslots - count, avail));
	}

	if (verbose) {
		time_t now = time(NULL);
		T_LOG("total time %ld for %"PRId64" slots (rate %.2f)\n",
		    now - start, nslots, (double)nslots / (now - start));
	}

	error = close(kq);
	SKTC_ASSERT_ERR(!error);
}

void
sktc_pump_ring_nslots_select(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint64_t nslots, bool verbose)
{
	uint64_t count = 0;
	int channelfd;
	fd_set readfds, writefds, errorfds, zerofds;
	time_t start, then;

	channelfd = os_channel_get_fd(channel);
	assert(channelfd != -1);

	FD_ZERO(&zerofds);
	FD_ZERO(&readfds);
	FD_ZERO(&writefds);
	FD_ZERO(&errorfds);
	if (mode == CHANNEL_SYNC_TX) {
		FD_SET(channelfd, &writefds);
	} else {
		FD_SET(channelfd, &readfds);
	}

	if (verbose) {
		then = start = time(NULL);
	}

	while (count < nslots) {
		uint32_t avail;

		if (verbose) {
			time_t now = time(NULL);
			if (now > then) {
				T_LOG("time %ld pump %"PRId64" of %"PRId64" (%2.2f%%, est %ld secs left)\n",
				    now - start, count, nslots,
				    (double)count * 100 / nslots,
				    (long)((double)(now - start) * nslots / count) - (now - start));
				then = now;
			}
		}

		avail = os_channel_available_slot_count(ring);

		if (!avail) {
			int error;

			FD_SET(channelfd, &errorfds);
			error = select(channelfd + 1, &readfds, &writefds, &errorfds, NULL);
			SKTC_ASSERT_ERR(error != -1);
			assert(!memcmp(&zerofds, &errorfds, sizeof(zerofds)));
			if (mode == CHANNEL_SYNC_TX) {
				assert(FD_ISSET(channelfd, &writefds));
				assert(!memcmp(&zerofds, &readfds, sizeof(zerofds)));
			} else {
				assert(FD_ISSET(channelfd, &readfds));
				assert(!memcmp(&zerofds, &writefds, sizeof(zerofds)));
			}
			SKTC_ASSERT_ERR(error == 1);

			avail = os_channel_available_slot_count(ring);
			assert(avail);
		}

		count += sktc_chew_random(channel, ring, mode, dosync, MIN(nslots - count, avail));
	}

	if (verbose) {
		time_t now = time(NULL);
		T_LOG("total time %ld for %"PRId64" slots (rate %.2f)\n",
		    now - start, nslots, (double)nslots / (now - start));
	}
}

void
sktc_pump_ring_nslots_poll(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint64_t nslots, bool verbose)
{
	uint64_t count = 0;
	int channelfd;
	struct pollfd fds;
	time_t start, then;

	channelfd = os_channel_get_fd(channel);
	assert(channelfd != -1);

	fds.fd = channelfd;
	if (mode == CHANNEL_SYNC_TX) {
		fds.events = POLLWRNORM;
	} else {
		fds.events = POLLRDNORM;
	}

	if (verbose) {
		then = start = time(NULL);
	}

	while (count < nslots) {
		uint32_t avail;

		if (verbose) {
			time_t now = time(NULL);
			if (now > then) {
				T_LOG("time %ld pump %"PRId64" of %"PRId64" (%2.2f%%, est %ld secs left)\n",
				    now - start, count, nslots,
				    (double)count * 100 / nslots,
				    (long)((double)(now - start) * nslots / count) - (now - start));
				then = now;
			}
		}

		avail = os_channel_available_slot_count(ring);

		if (!avail) {
			int error;

			error = poll(&fds, 1, -1);
			SKTC_ASSERT_ERR(error != -1);
			SKTC_ASSERT_ERR(error == 1);
			assert(fds.fd == channelfd);
			if (mode == CHANNEL_SYNC_TX) {
				assert(fds.events == POLLWRNORM);
				assert(fds.revents == POLLWRNORM);
			} else {
				assert(fds.events == POLLRDNORM);
				assert(fds.revents == POLLRDNORM);
			}

			avail = os_channel_available_slot_count(ring);
			assert(avail);
		}

		count += sktc_chew_random(channel, ring, mode, dosync, MIN(nslots - count, avail));
	}

	if (verbose) {
		time_t now = time(NULL);
		T_LOG("total time %ld for %"PRId64" slots (rate %.2f)\n",
		    now - start, nslots, (double)nslots / (now - start));
	}
}

/****************************************************************/

void
sktc_raise_file_limit(int new)
{
	int error;
	struct rlimit rl;

	error = getrlimit(RLIMIT_NOFILE, &rl);
	SKTC_ASSERT_ERR(!error);

	if (rl.rlim_cur < new) {
		T_LOG("raising file open limit from %llu (max %llu) to %d\n",
		    rl.rlim_cur, rl.rlim_max, new);
		rl.rlim_cur = new;
		rl.rlim_max = new;
		error = setrlimit(RLIMIT_NOFILE, &rl);
		SKTC_ASSERT_ERR(!error);
	}
}


/****************************************************************/

int
sktu_create_interface(sktu_if_type_t type, sktu_if_flag_t flags)
{
	struct ctl_info kernctl_info;
	struct sockaddr_ctl kernctl_addr;
	int error;
	int tunsock;
	const char *CONTROL_NAME;
	int OPT_ENABLE_NETIF, OPT_ATTACH_FSW;
	int enable_netif, attach_fsw;
	int scratch;

	assert(type == SKTU_IFT_UTUN || type == SKTU_IFT_IPSEC);
	if (type == SKTU_IFT_UTUN) {
		CONTROL_NAME = UTUN_CONTROL_NAME;
		OPT_ENABLE_NETIF = UTUN_OPT_ENABLE_NETIF;
		OPT_ATTACH_FSW = UTUN_OPT_ATTACH_FLOWSWITCH;
	} else {
		CONTROL_NAME = IPSEC_CONTROL_NAME;
		OPT_ENABLE_NETIF = IPSEC_OPT_ENABLE_NETIF;
		OPT_ATTACH_FSW = 0;
	}

	enable_netif = ((flags & SKTU_IFF_ENABLE_NETIF) != 0) ? 1 : 0;
	attach_fsw = ((flags & SKTU_IFF_NO_ATTACH_FSW) != 0) ? 0 : 1;

	/* XXX Remove this retry nonsense when this is fixed:
	 * <rdar://problem/37340313> creating an interface without specifying specific interface name should not return EBUSY
	 */

	for (int i = 0; i < 10; i++) {
		if (i > 0) {
			T_LOG("%s: sleeping 1ms before retrying\n", __func__);
			usleep(1000);
		}

		tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
		assert(tunsock != -1);

		memset(&kernctl_info, 0, sizeof(kernctl_info));
		strlcpy(kernctl_info.ctl_name, CONTROL_NAME, sizeof(kernctl_info.ctl_name));
		error = ioctl(tunsock, CTLIOCGINFO, &kernctl_info);
		SKTC_ASSERT_ERR(error == 0);

		memset(&kernctl_addr, 0, sizeof(kernctl_addr));
		kernctl_addr.sc_len = sizeof(kernctl_addr);
		kernctl_addr.sc_family = AF_SYSTEM;
		kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
		kernctl_addr.sc_id = kernctl_info.ctl_id;
		kernctl_addr.sc_unit = 0;

		/* If this is being called to reinstantiate a device that was just detached,
		 * then this may return busy while the asynchronous detach completes.
		 * This only occurs when this is being called in a tight loop
		 * as per the utun27646755 test below
		 */

		error = bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
		if (error == -1 && errno == EBUSY) {
			close(tunsock);
			tunsock = -1;
			T_LOG("%s: i = %d bind returned EBUSY\n", __func__, i);
			continue;
		}

		/* can only be set before connecting */
		error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_NETIF, &enable_netif, sizeof(enable_netif));
		SKTC_ASSERT_ERR(!error);
		socklen_t scratchlen = sizeof(scratch);
		error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_NETIF, &scratch, &scratchlen);
		SKTC_ASSERT_ERR(!error);
		assert(scratchlen == sizeof(scratch));
		assert(enable_netif == scratch);

		/* only applicable for utun */
		if (type == SKTU_IFT_UTUN) {
			error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ATTACH_FSW, &attach_fsw, sizeof(attach_fsw));
			SKTC_ASSERT_ERR(!error);
		}

		error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
		if (error == -1 && errno == EBUSY) {
			T_LOG("%s: i = %d connect returned EBUSY\n", __func__, i);
			close(tunsock);
			tunsock = -1;
			continue;
		}

		error = fcntl(tunsock, F_SETFD, FD_CLOEXEC);
		if (error != 0) {
			warn("FD_CLOEXEC");
		}

		break;
	}

	if (error == -1) {
		warn("Failed to create utun errno %d", errno);
		close(tunsock);
		tunsock = -1;
	}

	return tunsock;
}

channel_t
sktu_create_interface_channel(sktu_if_type_t type, int tunsock)
{
	uuid_t uuid;
	channel_attr_t attr;
	channel_t channel;
	socklen_t uuidlen;
	int error;
	int OPT_ENABLE_CHANNEL;
	int OPT_GET_CHANNEL_UUID;

	if (type == SKTU_IFT_UTUN) {
		OPT_ENABLE_CHANNEL = UTUN_OPT_ENABLE_CHANNEL;
		OPT_GET_CHANNEL_UUID = UTUN_OPT_GET_CHANNEL_UUID;
	} else {
		assert(type == SKTU_IFT_IPSEC);
		OPT_ENABLE_CHANNEL = IPSEC_OPT_ENABLE_CHANNEL;
		OPT_GET_CHANNEL_UUID = IPSEC_OPT_GET_CHANNEL_UUID;
	}

	if (type == SKTU_IFT_UTUN) {
		int enable = 1;
		error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &enable, sizeof(enable));
		if (error != 0) {
			SKT_LOG("setsockopt returned error %d, errno %d\n", error, errno);
		}
		SKTC_ASSERT_ERR(error == 0);
	}

	int scratch;
	socklen_t scratchlen = sizeof(scratch);
	error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &scratch, &scratchlen);
	SKTC_ASSERT_ERR(!error);
	assert(scratchlen == sizeof(scratch));
	assert(1 == scratch);

	uuidlen = sizeof(uuid);
	error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_GET_CHANNEL_UUID, uuid, &uuidlen);
	SKTC_ASSERT_ERR(error == 0);
	assert(uuidlen == sizeof(uuid));

	attr = NULL;
	channel = sktu_channel_create_extended(uuid,
	    NEXUS_PORT_KERNEL_PIPE_CLIENT,
	    CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, attr,
	    -1, -1, -1, -1, -1, -1, -1, 1, -1, -1);
	assert(channel);

	return channel;
}

void
sktu_get_interface_name(sktu_if_type_t type, int s, char name[IFNAMSIZ])
{
	int error;
	socklen_t  optlen = IFNAMSIZ;
	if (type == SKTU_IFT_UTUN) {
		error = getsockopt(s, SYSPROTO_CONTROL, UTUN_OPT_IFNAME, name, &optlen);
	} else {
		error = getsockopt(s, SYSPROTO_CONTROL, IPSEC_OPT_IFNAME, name, &optlen);
	}
	SKTC_ASSERT_ERR(!error);
}

void
sktu_dump_buffer(FILE *f, const char *desc, const void *buf, size_t len)
{
	int i;
	unsigned char buff[17];
	unsigned char *pc = (unsigned char*)buf;

	if (desc != NULL) {
		fprintf(f, "%s:\n", desc);
	}

	if (len == 0) {
		fprintf(f, "  ZERO LENGTH\n");
		return;
	}

	for (i = 0; i < len; i++) {
		if ((i % 16) == 0) {
			if (i != 0) {
				fprintf(f, "  %s\n", buff);
			}

			fprintf(f, "  %04x ", i); // offset
		}

		fprintf(f, " %02x", pc[i]);

		// prepare ascii
		if ((pc[i] < 0x20) || (pc[i] > 0x7e)) {
			buff[i % 16] = '.';
		} else {
			buff[i % 16] = pc[i];
		}
		buff[(i % 16) + 1] = '\0';
	}

	// pad last line to for ascii
	while ((i % 16) != 0) {
		fprintf(f, "   ");
		i++;
	}

	fprintf(f, "  %s\n", buff);
}

int
sysctl_buf(char *oid_name, void **buffer, size_t *len, void *newp,
    size_t newlen)
{
	int ret, err;
	int try = 0;

	*buffer = NULL;
#define RETRY_COUNT 10
try_again:
	ret = sysctlbyname(oid_name, NULL, len, newp, newlen);
	if (ret != 0) {
		if (ret == ENOMEM) {
			try++;
			if (try <= RETRY_COUNT) {
				goto try_again;
			}
		}
		err = errno;
		SKT_LOG("sysctl for len failed, %s\n", strerror(errno));
		return err;
	}
	if (*len == 0) {
		T_LOG("sysctl for len returned zero! No stats?\n");
		*buffer = NULL;
		return 0;
	}
	*buffer = malloc(*len);
	if (*buffer == NULL) {
		T_LOG("sysctl malloc for %ld bytes failed\n", *len);
		return ENOMEM;
	}

	ret = sysctlbyname(oid_name, *buffer, len, newp, newlen);
	if (ret != 0) {
		err = errno;
		if (ret == ENOMEM) {
			free(*buffer);
			*buffer = NULL;
			try++;
			if (try <= RETRY_COUNT) {
				goto try_again;
			}
		}
		SKT_LOG("sysctl for buf failed, %s\n", strerror(errno));
		free(*buffer);
		return err;
	}

	return 0;
}

uint32_t
sktu_set_inject_error_rmask(uint32_t *mask)
{
	uint32_t old_mask;
	size_t size = sizeof(old_mask);
	int error;

	error = sysctlbyname("kern.skywalk.inject_error_rmask",
	    &old_mask, &size, mask, mask ? sizeof(*mask) : 0);

	SKTC_ASSERT_ERR(!error);
	return old_mask;
}

/* returns TRUE if a matching IPv4 address is found */
boolean_t
sktu_check_interface_ipv4_address(char *ifname, uint32_t ipaddr)
{
	struct ifaddrs *ifaddr, *ifa;
	boolean_t match = FALSE;
	int error;

	error = getifaddrs(&ifaddr);
	SKTC_ASSERT_ERR(!error);

	for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
		struct sockaddr_in *sin =
		    (struct sockaddr_in *)(void *)ifa->ifa_addr;
		if (ifa->ifa_addr == NULL) {
			continue;
		}
		if ((strncmp(ifa->ifa_name, ifname, IFNAMSIZ) == 0) &&
		    (ifa->ifa_addr->sa_family == AF_INET) &&
		    (sin->sin_addr.s_addr == ipaddr)) {
			match = TRUE;
		}
	}
	freeifaddrs(ifaddr);
	return match;
}

/****************************************************************/

int
sktu_create_pfkeysock(void)
{
	int keysock = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);
	assert(keysock != -1);
	return keysock;
}

void
sktu_create_sa(int keysock, const char ifname[IFXNAMSIZ], uint32_t spi, struct in_addr *src, struct in_addr *dst)
{
	/*
	 *       <base, SA, (lifetime(HS),) address(SD), (address(P),)
	 *       key(AE), (identity(SD),) (sensitivity)>
	 */

	struct {
		struct sadb_msg msg __attribute((aligned(sizeof(uint64_t))));
		struct sadb_key key      __attribute((aligned(sizeof(uint64_t))));
		struct sadb_sa sa        __attribute((aligned(sizeof(uint64_t))));
		struct sadb_x_sa2 sa2    __attribute((aligned(sizeof(uint64_t))));
		struct sadb_x_ipsecif ipsecif __attribute((aligned(sizeof(uint64_t))));
		struct {
			struct sadb_address addr __attribute((aligned(sizeof(uint64_t))));
			struct sockaddr_in saddr __attribute((aligned(sizeof(uint64_t))));
		} src;
		struct {
			struct sadb_address addr __attribute((aligned(sizeof(uint64_t))));
			struct sockaddr_in saddr __attribute((aligned(sizeof(uint64_t))));
		} dst;
	} addcmd;

	memset(&addcmd, 0, sizeof(addcmd));

	addcmd.msg.sadb_msg_version = PF_KEY_V2;
	addcmd.msg.sadb_msg_type = SADB_ADD;
	addcmd.msg.sadb_msg_errno = 0;
	addcmd.msg.sadb_msg_satype = SADB_SATYPE_ESP;
	addcmd.msg.sadb_msg_len = PFKEY_UNIT64(sizeof(addcmd));
	addcmd.msg.sadb_msg_reserved = 0;
	addcmd.msg.sadb_msg_seq = 0;
	addcmd.msg.sadb_msg_pid = (unsigned)getpid();

	addcmd.key.sadb_key_len = PFKEY_UNIT64(sizeof(addcmd.key));
	addcmd.key.sadb_key_exttype = SADB_EXT_KEY_ENCRYPT;
	addcmd.key.sadb_key_bits = 0;
	addcmd.key.sadb_key_reserved = 0;

	addcmd.sa.sadb_sa_len = PFKEY_UNIT64(sizeof(addcmd.sa));
	addcmd.sa.sadb_sa_exttype = SADB_EXT_SA;
	addcmd.sa.sadb_sa_spi = htonl(spi);
	addcmd.sa.sadb_sa_replay = 0;
	addcmd.sa.sadb_sa_state = 0;
	addcmd.sa.sadb_sa_auth = SADB_AALG_NONE;
	addcmd.sa.sadb_sa_encrypt = SADB_EALG_NULL;
	addcmd.sa.sadb_sa_flags = 0;

	addcmd.sa2.sadb_x_sa2_len = PFKEY_UNIT64(sizeof(addcmd.sa2));
	addcmd.sa2.sadb_x_sa2_exttype = SADB_X_EXT_SA2;
	addcmd.sa2.sadb_x_sa2_mode = IPSEC_MODE_TRANSPORT;
	addcmd.sa2.sadb_x_sa2_alwaysexpire = 1;
	addcmd.sa2.sadb_x_sa2_flags = SADB_X_EXT_SA2_DELETE_ON_DETACH;
	addcmd.sa2.sadb_x_sa2_sequence = 0;
	addcmd.sa2.sadb_x_sa2_reqid = 0;

	addcmd.ipsecif.sadb_x_ipsecif_len = PFKEY_UNIT64(sizeof(addcmd.ipsecif));
	addcmd.ipsecif.sadb_x_ipsecif_exttype = SADB_X_EXT_IPSECIF;
	memset(addcmd.ipsecif.sadb_x_ipsecif_internal_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_internal_if));
	memset(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if));
	strlcpy(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if, ifname, sizeof(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if));
	addcmd.ipsecif.sadb_x_ipsecif_init_disabled = 0;
	addcmd.ipsecif.reserved = 0;

	addcmd.src.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.src));
	addcmd.src.addr.sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
	addcmd.src.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
	addcmd.src.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; //XXX Why?
	addcmd.src.addr.sadb_address_reserved = 0;
	addcmd.src.saddr.sin_len = sizeof(addcmd.src.saddr);
	addcmd.src.saddr.sin_family = AF_INET;
	addcmd.src.saddr.sin_port = htons(0);
	addcmd.src.saddr.sin_addr = *src;

	addcmd.dst.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.dst));
	addcmd.dst.addr.sadb_address_exttype = SADB_EXT_ADDRESS_DST;
	addcmd.dst.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
	addcmd.dst.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; //XXX Why?
	addcmd.dst.addr.sadb_address_reserved = 0;
	addcmd.dst.saddr.sin_len = sizeof(addcmd.dst.saddr);
	addcmd.dst.saddr.sin_family = AF_INET;
	addcmd.dst.saddr.sin_port = htons(0);
	addcmd.dst.saddr.sin_addr = *dst;

	//log_hexdump(&addcmd, sizeof(addcmd));

	ssize_t slen;
	slen = send(keysock, &addcmd, sizeof(addcmd), 0);
	assert(slen == sizeof(addcmd));
}

typedef union {
	char        c[2];
	u_short     s;
} short_union_t;

typedef union {
	u_short     s[2];
	long        l;
} long_union_t;

static __inline__ void
reduce(int * sum)
{
	long_union_t l_util;

	l_util.l = *sum;
	*sum = l_util.s[0] + l_util.s[1];
	if (*sum > 65535) {
		*sum -= 65535;
	}
	return;
}

unsigned short
in_cksum(void * pkt, int len, int sum0)
{
	u_short * w;
	int sum = sum0;

	w = (u_short *)pkt;
	while ((len -= 32) >= 0) {
		sum += w[0]; sum += w[1];
		sum += w[2]; sum += w[3];
		sum += w[4]; sum += w[5];
		sum += w[6]; sum += w[7];
		sum += w[8]; sum += w[9];
		sum += w[10]; sum += w[11];
		sum += w[12]; sum += w[13];
		sum += w[14]; sum += w[15];
		w += 16;
	}
	len += 32;
	while ((len -= 8) >= 0) {
		sum += w[0]; sum += w[1];
		sum += w[2]; sum += w[3];
		w += 4;
	}
	len += 8;
	if (len) {
		reduce(&sum);
		while ((len -= 2) >= 0) {
			sum += *w++;
		}
	}
	if (len == -1) { /* odd-length packet */
		short_union_t s_util;

		s_util.s = 0;
		s_util.c[0] = *((char *)w);
		s_util.c[1] = 0;
		sum += s_util.s;
	}
	reduce(&sum);
	return ~sum & 0xffff;
}

#define ADDCARRY(_x)  do {                                              \
	while (((_x) >> 16) != 0)                                       \
	        (_x) = ((_x) >> 16) + ((_x) & 0xffff);                  \
} while (0)

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 */
#define REDUCE16 {                                                        \
	q_util.q = sum;                                                   \
	l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
	sum = l_util.s[0] + l_util.s[1];                                  \
	ADDCARRY(sum);                                                    \
}

union l_util {
	uint16_t s[2];
	uint32_t l;
};

union q_util {
	uint16_t s[4];
	uint32_t l[2];
	uint64_t q;
};

uint16_t
in_pseudo(uint32_t a, uint32_t b, uint32_t c)
{
	uint64_t sum;
	union q_util q_util;
	union l_util l_util;

	sum = (uint64_t)a + b + c;
	REDUCE16;
	return sum;
}

uint16_t
in6_pseudo(const struct in6_addr *src, const struct in6_addr *dst, uint32_t x)
{
	uint32_t sum = 0;
	const uint16_t *w;

	/*
	 * IPv6 source address
	 */
	w = (const uint16_t *)src;
	sum += w[0]; sum += w[1];
	sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5];
	sum += w[6]; sum += w[7];

	/*
	 * IPv6 destination address
	 */
	w = (const uint16_t *)dst;
	sum += w[0]; sum += w[1];
	sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5];
	sum += w[6]; sum += w[7];

	/*
	 * Caller-supplied value; 'x' could be one of:
	 *
	 *	htonl(proto + length), or
	 *	htonl(proto + length + sum)
	 **/
	sum += x;

	/* fold in carry bits */
	ADDCARRY(sum);

	return sum;
}

uint16_t
sktu_ip_id()
{
	static int sktu_ip_id;
	return sktu_ip_id++;
}

void
sktu_channel_port_init(channel_port_t ch_port, uuid_t instance,
    nexus_port_t nx_port, bool enable_upp, bool enable_event_ring,
    bool low_latency)
{
	channel_t       chan;
	nexus_port_t    port = nx_port;
	ring_id_t       ringid;

	bzero(ch_port, sizeof(*ch_port));
	chan = sktu_channel_create_extended(instance, port,
	    CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL,
	    -1, -1, -1, -1, -1, -1, enable_upp ? 1 : -1, 1,
	    enable_event_ring ? 1 : -1, low_latency ? 1 : -1);
	if (chan == NULL) {
		SKT_LOG("Can't open channel on port %d, %s\n", port,
		    strerror(errno));
		return;
	}

	T_LOG("Opened port %d\n", port);

	ch_port->chan = chan;
	ch_port->fd = os_channel_get_fd(chan);
	ch_port->port = port;
	ch_port->user_packet_pool = enable_upp;

	/* tx ring */
	ringid = os_channel_ring_id(chan, CHANNEL_FIRST_TX_RING);
	ch_port->tx_ring = os_channel_tx_ring(ch_port->chan, ringid);
	assert(ch_port->tx_ring != NULL);
	/* rx ring */
	ringid = os_channel_ring_id(chan, CHANNEL_FIRST_RX_RING);
	ch_port->rx_ring = os_channel_rx_ring(ch_port->chan, ringid);
	assert(ch_port->rx_ring != NULL);
}

static inline uint16_t
sktu_fold_sum_final(uint32_t sum)
{
	sum = (sum >> 16) + (sum & 0xffff);     /* 17-bit */
	sum = (sum >> 16) + (sum & 0xffff);     /* 16-bit + carry */
	sum = (sum >> 16) + (sum & 0xffff);     /* final carry */
	return ~sum & 0xffff;
}

packet_t
sktu_channel_port_frame_to_pkt(channel_port_t port, struct sktu_frame *frame)
{
	int error;
	packet_t pkt;
	void *baddr, *bytes = &frame->bytes[0];
	size_t len = frame->len;
	buflet_t buf, pbuf = NULL;
	uint16_t clen, bdlim, blen, bcnt;

	assert(port->user_packet_pool);

	error = os_channel_packet_alloc(port->chan, &pkt);
	SKTC_ASSERT_ERR(error == 0);
	assert(pkt != 0);

	buf = os_packet_get_next_buflet(pkt, NULL);
	assert(buf != NULL);
	error = os_buflet_set_data_offset(buf, 0);
	SKTC_ASSERT_ERR(error == 0);
	bdlim = blen = os_buflet_get_data_limit(buf);
	assert(bdlim != 0);
	bcnt = os_packet_get_buflet_count(pkt);
	assert(blen * bcnt >= len);
	baddr = os_buflet_get_object_address(buf);
	assert(baddr != NULL);

	error = os_packet_set_link_header_length(pkt, 0);
	SKTC_ASSERT_ERR(error == 0);

	/* copy the frame bytes */
	while (len != 0) {
		if (blen == 0) {
			error = os_buflet_set_data_length(buf, bdlim);
			SKTC_ASSERT_ERR(error == 0);
			pbuf = buf;
			buf = os_packet_get_next_buflet(pkt, pbuf);
			assert(buf != NULL);
			error = os_buflet_set_data_offset(buf, 0);
			SKTC_ASSERT_ERR(error == 0);
			baddr = os_buflet_get_object_address(buf);
			assert(baddr != NULL);
			bdlim = blen = os_buflet_get_data_limit(buf);
		}
		clen = MIN(blen, len);
		memcpy(baddr, bytes, clen);
		len -= clen;
		blen -= clen;
		bytes += clen;
		baddr += clen;
		assert(len == 0 || blen == 0);
	}
	if (frame->csum_flags != 0) {
		os_packet_set_inet_checksum(pkt, frame->csum_flags,
		    frame->csum_start, frame->csum_stuff);
	}
	if (pbuf == NULL) {
		error = os_buflet_set_data_length(buf, frame->len);
	} else {
		error = os_buflet_set_data_length(buf, clen);
	}
	SKTC_ASSERT_ERR(error == 0);

	os_packet_set_flow_uuid(pkt, frame->flow_uuid);
	error = os_packet_finalize(pkt);
	SKTC_ASSERT_ERR(error == 0);
	return pkt;
}

int
sktu_channel_port_tx(channel_port_t port, packet_t pkt)
{
	int error;
	slot_prop_t prop;
	channel_slot_t slot;

	slot = os_channel_get_next_slot(port->tx_ring, NULL, &prop);
	if (slot == NULL) {
		return ENOENT;
	}
	error = os_channel_slot_attach_packet(port->tx_ring, slot, pkt);
	SKTC_ASSERT_ERR(error == 0);
	error = os_channel_advance_slot(port->tx_ring, slot);
	SKTC_ASSERT_ERR(error == 0);
	return 0;
}

/*
 * Burst Tx tries to tx as many it can in one shot.
 *
 * Returns number of actually completed Tx.
 */
uint32_t
sktu_channel_port_tx_burst_pkt(channel_port_t port, packet_t *pkts,
    uint32_t n)
{
	struct timespec timeout = {
		.tv_sec = 10,
		.tv_nsec = 0,
	};
	struct kevent evlist, kev;
	int kq;
	int error;
	uint32_t i;

	kq = kqueue();
	assert(kq != -1);

	EV_SET(&kev, port->fd, EVFILT_WRITE, EV_ADD | EV_ENABLE, 0, 0, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
	SKTC_ASSERT_ERR(error == 0);

	/* wait for Tx to become available */
	error = kevent(kq, NULL, 0, &evlist, 1, &timeout);
	if (error <= 0) {
		if (errno == EAGAIN) {
			return 0;
		}
		SKTC_ASSERT_ERR(error == 0);
	}
	if (error == 0) {
		T_LOG("kevent timeout\n");
		return 0;
	}
	if (evlist.flags & EV_ERROR) {
		int err = evlist.data;
		if (err == EAGAIN) {
			return 0;
		}
		SKTC_ASSERT_ERR(err == 0);
	}

	if (evlist.filter != EVFILT_WRITE) {
		err(EX_OSERR, "%lu event %d?\n", evlist.ident, evlist.filter);
	}

	for (i = 0; i < n; i++) {
		error = sktu_channel_port_tx(port, pkts[i]);
		if (error != 0) {
			break;
		}
	}

	if (i != 0) {
		error = os_channel_sync(port->chan, CHANNEL_SYNC_TX);
		SKTC_ASSERT_ERR(error == 0);
	}

	return i;
}

/*
 * Burst Tx tries to tx as many it can in one shot.
 *
 * Returns number of actually completed Tx.
 */
uint32_t
sktu_channel_port_tx_burst(channel_port_t port, struct sktu_frame **frames,
    uint32_t n)
{
	struct timespec timeout = {
		.tv_sec = 10,
		.tv_nsec = 0,
	};
	struct kevent evlist, kev;
	int kq;
	int error;
	uint32_t i;
	packet_t pkt;

	kq = kqueue();
	assert(kq != -1);

	EV_SET(&kev, port->fd, EVFILT_WRITE, EV_ADD | EV_ENABLE, 0, 0, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
	SKTC_ASSERT_ERR(error == 0);

	/* wait for Tx to become available */
	error = kevent(kq, NULL, 0, &evlist, 1, &timeout);
	if (error <= 0) {
		if (errno == EAGAIN) {
			return 0;
		}
		SKTC_ASSERT_ERR(error == 0);
	}
	if (error == 0) {
		T_LOG("kevent timeout\n");
		return 0;
	}
	if (evlist.flags & EV_ERROR) {
		int err = evlist.data;
		if (err == EAGAIN) {
			return 0;
		}
		SKTC_ASSERT_ERR(err == 0);
	}

	if (evlist.filter != EVFILT_WRITE) {
		err(EX_OSERR, "%lu event %d?\n", evlist.ident, evlist.filter);
	}

	for (i = 0; i < n; i++) {
		pkt = sktu_channel_port_frame_to_pkt(port, frames[i]);
		error = sktu_channel_port_tx(port, pkt);
		if (error != 0) {
			break;
		}
	}

	if (i != 0) {
		error = os_channel_sync(port->chan, CHANNEL_SYNC_TX);
		SKTC_ASSERT_ERR(error == 0);
	}

	return i;
}

/*
 * Bulk Tx makes sure all Tx operations are completed; otherwise fails the test.
 */
void
sktu_channel_port_tx_bulk(channel_port_t port, struct sktu_frame **frames,
    uint32_t n)
{
	uint32_t ret = 0;
	ret = sktu_channel_port_tx_burst(port, frames, n);
	assert(ret < n);
	if (ret != n) {
		errx(EX_OSERR, "tx bulk failed %u/%u", n, ret);
	}
}

int
sktu_parse_ipv4_frame(struct sktu_frame *frame, void *ip_payload,
    uint32_t *ip_payload_len)
{
	size_t pkt_len, payload_len;
	void *buf;
	struct ip *ip;
	uint16_t csum;

	buf = &frame->bytes[0];
	ip = (struct ip*)buf;
	pkt_len = frame->len;
	assert(pkt_len == ntohs(ip->ip_len));
	payload_len = pkt_len - sizeof(*ip);
	assert(payload_len <= SKTU_FRAME_BUF_SIZE);

	/* verify ip header checksum */
	csum = in_cksum(ip, sizeof(*ip), 0);
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		errx(EX_PROTOCOL, "IP header checksum invalid");
	}

	if (ip_payload != NULL) {     /* copy the data */
		memcpy(ip_payload, buf + sizeof(*ip), pkt_len - sizeof(*ip));
	}

	*ip_payload_len = payload_len;
	return 0;
}

int
sktu_parse_tcp4_frame(struct sktu_frame *frame, void *tcp_payload,
    uint32_t *tcp_payload_len)
{
	uint32_t pkt_len, payload_len;
	void *buf;
	struct ip *ip;
	ip_tcp_header_t *ip_tcp;
	uint16_t csum;

	buf = &frame->bytes[0];
	ip = buf;
	ip_tcp = buf;
	pkt_len = frame->len;
	if (ip->ip_p != IPPROTO_TCP) {
		sktu_dump_buffer(stderr, "non-TCP packet", buf, pkt_len);
		return EINVAL;
	}
	assert(pkt_len == ntohs(ip_tcp->ip.ip_len));
	payload_len = pkt_len - sizeof(ip_tcp_header_t);
	assert(payload_len <= SKTU_FRAME_BUF_SIZE);

	csum = in_cksum(ip, sizeof(*ip), 0);
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		errx(EX_PROTOCOL, "IP header checksum invalid");
	}

	csum = os_inet_checksum(&ip_tcp->tcp, pkt_len - sizeof(struct ip), 0);
	csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    csum + htonl(payload_len + sizeof(struct tcphdr) + IPPROTO_TCP));
	csum ^= 0xffff;
	if (csum != 0) {
		sktu_dump_buffer(stderr, "invalid TCP csum", buf, pkt_len);
		return -1;
	}

	if (tcp_payload != NULL) {     /* copy the data */
		memcpy(tcp_payload, buf + sizeof(*ip_tcp), payload_len);
	}

	*tcp_payload_len = payload_len;

	return 0;
}

int
sktu_parse_udp4_frame(struct sktu_frame *frame, void *udp_payload,
    uint32_t *udp_payload_len)
{
	size_t pkt_len, payload_len;
	void *buf;
	struct ip *ip;
	ip_udp_header_t *ip_udp;
	uint16_t csum;

	buf = &frame->bytes[0];
	ip = buf;
	ip_udp = buf;
	pkt_len = frame->len;
	if (ip->ip_p != IPPROTO_UDP) {
		sktu_dump_buffer(stderr,
		    "sktu_parse_udp4_frame: non-UDP packet", buf, pkt_len);
		return EINVAL;
	}
	assert(pkt_len == ntohs(ip_udp->ip.ip_len));
	payload_len = pkt_len - sizeof(ip_udp_header_t);
	assert(payload_len <= SKTU_FRAME_BUF_SIZE);

	csum = in_cksum(ip, sizeof(*ip), 0);
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		errx(EX_PROTOCOL, "IP header checksum invalid");
	}

	if (ip_udp->udp.uh_sum == 0) {
		goto skip_udp_checksum;
	}

	csum = os_inet_checksum(&ip_udp->udp, pkt_len - sizeof(struct ip), 0);
	csum += htons(payload_len + sizeof(struct udphdr) + IPPROTO_UDP);
	csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, csum);
	csum ^= 0xffff;
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		return -1;
	}

skip_udp_checksum:
	if (udp_payload != NULL) {
		memcpy(udp_payload, buf + sizeof(*ip_udp), payload_len);
	}

	*udp_payload_len = payload_len;

	return 0;
}

/*
 * Rx once from an available ring;
 * Return 0, if successful; non-zero, otherwise.
 */
struct sktu_frame *
sktu_channel_port_rx(channel_port_t port)
{
	int error;
	slot_prop_t prop;
	channel_slot_t slot;
	struct sktu_frame *frame;
	packet_t pkt;
	void *addr, *buf;
	size_t buf_len;
	size_t frame_length;
	buflet_t buflet;

	slot = os_channel_get_next_slot(port->rx_ring, NULL, &prop);
	if (slot == NULL) {
		return NULL;
	}
	assert(prop.sp_buf_ptr != 0);

	frame = sktu_frame_alloc();

	pkt = os_channel_slot_get_packet(port->rx_ring, slot);
	assert(pkt != 0);
	if (port->user_packet_pool) {
		error = os_channel_slot_detach_packet(port->rx_ring,
		    slot, pkt);
		SKTC_ASSERT_ERR(error == 0);
	}

	buflet = os_packet_get_next_buflet(pkt, NULL);
	assert(buflet != NULL);
	buf = os_buflet_get_object_address(buflet) +
	    os_buflet_get_data_offset(buflet);
	frame_length = os_packet_get_data_length(pkt);

	buflet = os_packet_get_next_buflet(pkt, NULL);
	assert(buflet != NULL);
	buf = os_buflet_get_object_address(buflet) +
	    os_buflet_get_data_offset(buflet);
	buf_len = os_buflet_get_data_length(buflet);
	assert(buf_len < SKTU_FRAME_BUF_SIZE);

	frame->len = os_packet_get_data_length(pkt);

	addr = &frame->bytes[0];
	memcpy(addr, buf, buf_len);
	frame_length -= buf_len;

	while (frame_length != 0) {
		buflet = os_packet_get_next_buflet(pkt, buflet);
		assert(buflet != NULL);
		buf = os_buflet_get_object_address(buflet) +
		    os_buflet_get_data_offset(buflet);
		assert(buf != 0);
		buf_len = os_buflet_get_data_length(buflet);
		assert(buf_len != 0);
		memcpy(addr, buf, buf_len);
		addr += buf_len;
		frame_length -= buf_len;
	}

	os_packet_get_flow_uuid(pkt, &frame->flow_uuid);
	error = os_channel_packet_free(port->chan, pkt);

	error = os_channel_advance_slot(port->rx_ring, slot);
	SKTC_ASSERT_ERR(error == 0);

	return frame;
}

uint32_t
sktu_channel_port_rx_burst(channel_port_t port, struct sktu_frame **frames,
    uint32_t n)
{
	struct timespec timeout = {
		.tv_sec = 10,
		.tv_nsec = 0,
	};

	int error;
	struct kevent evlist, kev;
	int kq;
	uint32_t i;

	kq = kqueue();
	assert(kq != -1);

	EV_SET(&kev, port->fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
	SKTC_ASSERT_ERR(error == 0);

	/* wait for RX to become available */
	error = kevent(kq, NULL, 0, &evlist, 1, &timeout);
	if (error <= 0) {
		if (errno == EAGAIN) {
			return 0;
		}
		SKTC_ASSERT_ERR(error == 0);
	}
	if (error == 0) {
		T_LOG("kevent timeout\n");
		return 0;
	}
	if (evlist.flags & EV_ERROR) {
		int err = evlist.data;
		if (err == EAGAIN) {
			return 0;
		}
		SKTC_ASSERT_ERR(err == 0);
	}

	if (evlist.filter != EVFILT_READ) {
		err(EX_OSERR, "%lu event %d?\n", evlist.ident, evlist.filter);
	}

	for (i = 0; i < n; i++) {
		frames[i] = sktu_channel_port_rx(port);
		if (frames[i] == NULL) {
			break;
		}
	}

	if (i != 0) {
		error = os_channel_sync(port->chan, CHANNEL_SYNC_RX);
		SKTC_ASSERT_ERR(error == 0);
	}

	close(kq);

	return i;
}

void
sktu_channel_port_rx_bulk(channel_port_t port, struct sktu_frame **frames,
    uint32_t n)
{
	uint32_t ret = 0;
	ret = sktu_channel_port_rx_burst(port, frames, n);
	assert(ret < n);
	if (ret != n) {
		errx(EX_OSERR, "rx bulk failed, %u/%u packets", n, ret);
	}
}

/*
 * Received batch of frames from utun file descriptor.
 *
 * Returns number of frames actually received.
 */
uint32_t
sktu_utun_fd_rx_burst(int utun_fd, struct sktu_frame **frames, uint32_t n)
{
	struct timeval timeout = {
		.tv_sec = 10,
		.tv_usec = 0,
	};

	fd_set readfds, errorfds;
	int retval;

	FD_ZERO(&readfds);
	FD_ZERO(&errorfds);
	FD_SET(utun_fd, &readfds);
	FD_SET(utun_fd, &errorfds);

	retval = select(utun_fd + 1, &readfds, NULL, &errorfds, &timeout);
	if (retval == -1) {
		err(EX_OSERR, "select()");
	}

	if (!FD_ISSET(utun_fd, &readfds) && retval == 0) { // timeout
		T_LOG("recv timeout\n");
		return 0;
	}
	assert(!FD_ISSET(utun_fd, &errorfds));
	assert(retval == 1);

	if (!FD_ISSET(utun_fd, &readfds)) {
		errx(EX_OSERR, "fd selected but no read fd available");
	}

	uint32_t i = 0;
	for (i = 0; i < n; i++) {
		struct {
			uint32_t af;
			char bytes[SKTU_FRAME_BUF_SIZE];
		} utun_packet;
		ssize_t len;
		len = read(utun_fd, &utun_packet, sizeof(utun_packet));
		if (len < 1) {
			errx(EX_OSERR, "utun read 0 len");
		}
		struct sktu_frame *frame = frames[i] = sktu_frame_alloc();
		memcpy(frame->bytes, &utun_packet.bytes, len - sizeof(uint32_t));
		frame->len = len - sizeof(uint32_t);
	}

	return i;
}

void
sktu_utun_fd_tx_burst(int utun_fd, struct sktu_frame **frames, uint32_t n)
{
	struct timeval timeout = {
		.tv_sec = 10,
		.tv_usec = 0,
	};
	fd_set writefds, errorfds;
	int retval;

	FD_ZERO(&writefds);
	FD_ZERO(&errorfds);
	FD_SET(utun_fd, &writefds);
	FD_SET(utun_fd, &errorfds);

	retval = select(utun_fd + 1, NULL, &writefds, &errorfds, &timeout);
	if (retval == -1) {
		err(EX_OSERR, "select()");
	}

	if (!FD_ISSET(utun_fd, &writefds) && retval == 0) { // timeout
		err(EX_OSERR, "recv timeout\n");
	}

	assert(!FD_ISSET(utun_fd, &errorfds));
	assert(retval == 1);

	if (!FD_ISSET(utun_fd, &writefds)) {
		errx(EX_OSERR, "fd selected but no write fd available");
	}

	uint32_t i = 0;
	for (i = 0; i < n; i++) {
		struct sktu_frame *frame = frames[i];
		struct ip *ip = (void *)&frame->bytes[0];
		uint32_t af;
		switch (ip->ip_v) {
		case IPVERSION:
			af = htonl(AF_INET);
			break;
		case IPV6_VERSION:
			af = htonl(AF_INET6);
			break;
		default:
			assert("unrecoginzed IP version");
			__builtin_unreachable();
			break;
		}
		struct {
			uint32_t af;
			char bytes[SKTU_FRAME_BUF_SIZE];
		} utun_packet;
		memcpy(&utun_packet.af, &af, sizeof(af));
		memcpy(&utun_packet.bytes, &frame->bytes[0], frame->len);
		ssize_t write_len = frame->len + sizeof(uint32_t);
		T_LOG("%s writing frame len %zu\n", __func__, write_len);
		ssize_t len = write(utun_fd, &utun_packet, write_len);
		if (len != write_len) {
			err(EX_OSERR, "utun write error\n");
		}
	}
}

struct sktu_frame *
sktu_frame_alloc()
{
	return malloc(sizeof(struct sktu_frame));
}

#define sktu_frame_free(frame) \
do { \
	free(frame); \
	frame = NULL; \
} while (0)

void
sktu_frames_free(struct sktu_frame **frames, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		sktu_frame_free(frames[i]);
		frames[i] = NULL;
	}
}

size_t
sktu_create_ip_frames(struct sktu_frame **frames, size_t n,
    void *src_ip, void *dst_ip, uint8_t proto, const void *sdu, size_t sdu_len,
    size_t mtu, uint16_t csum_flags, uint16_t csum_start, uint16_t csum_stuff)
{
	size_t off = 0, remaining_sdu_len = sdu_len;
	size_t i = 0;
	uint16_t ip_id = sktu_ip_id();
	bool needs_frag = false;

	while (remaining_sdu_len > 0) {
		assert(i < n);

		struct sktu_frame *frame = frames[i] = sktu_frame_alloc();
		char *baddr = &frame->bytes[0];
		struct ip *ip = (struct ip *)baddr;
		size_t dlen;
		bool more_frag = false;

		dlen = mtu - sizeof(*ip);
		if (dlen >= remaining_sdu_len) {
			dlen = remaining_sdu_len;
			needs_frag = false;
			more_frag = false;
		} else {
			dlen = dlen & ~0x7; // round down to 8-byte multiple
			needs_frag = true;
			more_frag = true;
		}

		// can't handle fragmented csum offload
		assert(!(needs_frag && csum_flags != 0));

		memset(ip, 0, sizeof(*ip));
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(struct ip) >> 2;
		ip->ip_ttl = MAXTTL;
		ip->ip_p = proto;
		memcpy(&ip->ip_src, src_ip, sizeof(struct in_addr));
		memcpy(&ip->ip_dst, dst_ip, sizeof(struct in_addr));
		ip->ip_len = htons(sizeof(*ip) + dlen);
		ip->ip_id = htons(ip_id);
		ip->ip_off = ((off >> 3) & IP_OFFMASK);
		if (more_frag) {
			ip->ip_off |= IP_MF;
		}
		ip->ip_off = htons(ip->ip_off);

		/* compute the IP header checksum */
		ip->ip_sum = in_cksum(ip, sizeof(*ip), 0);
		baddr += sizeof(*ip);

		memcpy(baddr, sdu + off, dlen);

		frame->csum_flags = csum_flags;
		frame->csum_start = sizeof(*ip) + csum_start;
		frame->csum_stuff = sizeof(*ip) + csum_stuff;

		frame->len = sizeof(*ip) + dlen;

		off += dlen;
		remaining_sdu_len -= dlen;
		i++;
	}

	return i;
}

size_t
sktu_create_ip6_frames(struct sktu_frame **frames, size_t n,
    void *src_ip, void *dst_ip, uint8_t proto, const void *sdu, size_t sdu_len,
    size_t mtu, uint16_t csum_flags, uint16_t csum_start, uint16_t csum_stuff)
{
	size_t off = 0, remaining_sdu_len = sdu_len;
	size_t i = 0;
	uint16_t ip_id = sktu_ip_id();
	bool needs_frag = false;

	while (remaining_sdu_len > 0) {
		assert(i < n);

		struct sktu_frame *frame = frames[i] = sktu_frame_alloc();
		char *baddr = &frame->bytes[0];
		struct ip6_hdr *ip6 = (struct ip6_hdr *)baddr;
		size_t hlen = sizeof(*ip6);
		size_t plen, dlen;
		bool more_frag = false;

		dlen = mtu - hlen;
		if (dlen >= remaining_sdu_len) {
			// fits in one packet
			dlen = plen = remaining_sdu_len;
			remaining_sdu_len = 0;
			more_frag = false;
		} else {
			// need to fragment
			dlen -= sizeof(struct ip6_frag);
			dlen = dlen & ~0x7; // round down to 8-byte multiple
			plen = sizeof(struct ip6_frag) + dlen;
			remaining_sdu_len -= dlen;
			needs_frag = true;
			more_frag = true;
		}

		// can't handle fragmented csum offload
		assert(!(needs_frag && csum_flags != 0));

		// insert ipv6 header
		memset(ip6, 0, sizeof(*ip6));
		ip6->ip6_vfc = (IPV6_VERSION & IPV6_VERSION_MASK);
		ip6->ip6_plen = htons(plen);
		ip6->ip6_nxt = needs_frag ? IPPROTO_FRAGMENT : proto;
		ip6->ip6_hlim = IPV6_DEFHLIM;
		memcpy(&ip6->ip6_src, src_ip, sizeof(struct in6_addr));
		memcpy(&ip6->ip6_dst, dst_ip, sizeof(struct in6_addr));

		baddr += sizeof(*ip6);

		// insert ipv6 frag header
		if (needs_frag) {
			struct ip6_frag *ip6f = (struct ip6_frag *)baddr;
			ip6f->ip6f_nxt = proto;
			ip6f->ip6f_reserved = 0;
			ip6f->ip6f_offlg = htons(off);
			if (more_frag) {
				ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
			}
			ip6f->ip6f_ident = htonl(ip_id);

			hlen += sizeof(*ip6f);
			baddr += sizeof(*ip6f);
		}

		memcpy(baddr, sdu + off, dlen);

		frame->csum_flags = csum_flags;
		frame->csum_start = sizeof(*ip6) + csum_start;
		frame->csum_stuff = sizeof(*ip6) + csum_stuff;
		frame->len = hlen + dlen;

		off += dlen;
		i++;
	}

	return i;
}

size_t
sktu_create_tcp_frames(struct sktu_frame **frames, size_t n,
    uint8_t ipver, void *src_ip, void *dst_ip, uint16_t sport, uint16_t dport,
    const void *data, size_t data_len, size_t mtu, bool csum_offload)
{
	uint32_t n_frames;
	size_t sdu_len = data_len + sizeof(struct tcphdr);
	void *sdu = malloc(sdu_len);

	// populate header
	struct tcphdr *tcp = (struct tcphdr *)sdu;
	tcp->th_sport = htons(sport);
	tcp->th_dport = htons(dport);
	tcp->th_flags |= 0; //FIXME (connect ? TH_SYN : TH_RST);
	tcp->th_off = (sizeof(struct tcphdr)) >> 2;

	// copy payload
	memcpy(sdu + sizeof(*tcp), data, data_len);

	// compute checksum
	uint16_t sum = 0;

	if (ipver == IPVERSION) {
		sum = in_pseudo(*(uint32_t*)src_ip, *(uint32_t*)dst_ip,
		    htons(data_len + sizeof(struct tcphdr) + IPPROTO_TCP));
	} else {
		sum = in6_pseudo(src_ip, dst_ip,
		    htonl(data_len + sizeof(struct tcphdr) + IPPROTO_TCP));
	}
	tcp->th_sum = sum;

	uint16_t csum_flags = 0, csum_start = 0, csum_stuff = 0;
	if (csum_offload) {
		csum_flags = PACKET_CSUM_PARTIAL;
		csum_start = 0;
		csum_stuff = offsetof(struct tcphdr, th_sum);
	} else {
		sum = os_inet_checksum(sdu, sdu_len, 0);
		tcp->th_sum = sktu_fold_sum_final(sum);
	}

	// IP framing
	if (ipver == IPVERSION) {
		n_frames = sktu_create_ip_frames(frames, n, src_ip, dst_ip,
		    IPPROTO_TCP, sdu, sdu_len, mtu, csum_flags, csum_start,
		    csum_stuff);
	} else {
		n_frames = sktu_create_ip6_frames(frames, n, src_ip, dst_ip,
		    IPPROTO_TCP, sdu, sdu_len, mtu, csum_flags, csum_start,
		    csum_stuff);
	}

	free(sdu);

	return n_frames;
}

size_t
sktu_create_udp_frames(struct sktu_frame **frames, size_t n,
    uint8_t ipver, void *src_ip, void *dst_ip, uint16_t sport, uint16_t dport,
    const void *data, size_t data_len, size_t mtu, bool csum_offload)
{
	uint32_t n_frames;
	size_t sdu_len = data_len + sizeof(struct udphdr);
	void *sdu = malloc(sdu_len);

	// populate header
	struct udphdr *udp = (struct udphdr *)sdu;
	udp->uh_sport = htons(sport);
	udp->uh_dport = htons(dport);
	udp->uh_ulen = htons(sizeof(*udp) + data_len);

	// compute payload checksum
	uint32_t payload_sum = 0, pseudo_sum = 0;
	if (ipver == IPVERSION) {
		struct ipv4_udp_pseudo_hdr udp_pseudo = {};
		memcpy(&udp_pseudo.src_ip, src_ip, sizeof(struct in_addr));
		memcpy(&udp_pseudo.dst_ip, dst_ip, sizeof(struct in_addr));
		udp_pseudo.proto = IPPROTO_UDP;
		udp_pseudo.length = htons(sizeof(struct udphdr) + data_len);
		pseudo_sum = os_inet_checksum(&udp_pseudo, sizeof(udp_pseudo)
		    + sizeof(struct udphdr), 0);
	} else {
		struct ipv6_udp_pseudo_hdr udp_pseudo = {};
		memcpy(&udp_pseudo.src_ip, src_ip, sizeof(struct in6_addr));
		memcpy(&udp_pseudo.dst_ip, dst_ip, sizeof(struct in6_addr));
		udp_pseudo.proto = IPPROTO_UDP;
		udp_pseudo.length = htons(sizeof(struct udphdr) + data_len);
		pseudo_sum = os_inet_checksum(&udp_pseudo, sizeof(udp_pseudo)
		    + sizeof(struct udphdr), 0);
	}

	uint16_t csum_flags = 0, csum_start = 0, csum_stuff = 0;
	if (csum_offload) {
		csum_flags = PACKET_CSUM_PARTIAL | PACKET_CSUM_ZERO_INVERT;
		csum_start = 0;
		csum_stuff = offsetof(struct udphdr, uh_sum);
		udp->uh_sum = sktu_fold_sum_final(pseudo_sum);
	} else {
		payload_sum = os_inet_checksum(data, data_len, 0);
		udp->uh_sum = ~sktu_fold_sum_final(pseudo_sum + payload_sum);
	}

	// copy payload
	memcpy(sdu + sizeof(*udp), data, data_len);

	// IP framing
	if (ipver == IPVERSION) {
		n_frames = sktu_create_ip_frames(frames, n, src_ip, dst_ip,
		    IPPROTO_UDP, sdu, sdu_len, mtu, csum_flags, csum_start,
		    csum_stuff);
	} else {
		n_frames = sktu_create_ip6_frames(frames, n, src_ip, dst_ip,
		    IPPROTO_UDP, sdu, sdu_len, mtu, csum_flags, csum_start,
		    csum_stuff);
	}

	free(sdu);

	return n_frames;
}

void
sktu_attach_flow_metadata_to_frames(struct sktu_flow *flow,
    struct sktu_frame **frames, size_t n_frames)
{
	for (uint32_t i = 0; i < n_frames; i++) {
		struct sktu_frame *frame = frames[i];
		uuid_copy(frame->flow_uuid, flow->uuid);
	}
}

static size_t
_sktu_create_udp_flow_input_frames(struct sktu_flow *flow,
    struct sktu_frame **frames, size_t n, const void *data, size_t data_len)
{
	n = sktu_create_udp_frames(frames, n, flow->ipver, flow->dst_ip,
	    flow->src_ip, flow->dport, flow->sport, data, data_len, flow->mtu,
	    NO_CSUM_OFFLOAD);
	sktu_attach_flow_metadata_to_frames(flow, frames, n);
	return n;
}

static size_t
_sktu_create_udp_flow_output_frames(struct sktu_flow *flow,
    struct sktu_frame **frames, size_t n, const void *data, size_t data_len,
    bool csum_offload)
{
	n = sktu_create_udp_frames(frames, n, flow->ipver, flow->src_ip,
	    flow->dst_ip, flow->sport, flow->dport, data, data_len, flow->mtu,
	    csum_offload);
	sktu_attach_flow_metadata_to_frames(flow, frames, n);
	return n;
}

static size_t
_sktu_create_tcp_flow_input_frames(struct sktu_flow *flow,
    struct sktu_frame **frames, size_t n, const void *data, size_t data_len)
{
	n = sktu_create_tcp_frames(frames, n, flow->ipver, flow->dst_ip,
	    flow->src_ip, flow->dport, flow->sport, data, data_len, flow->mtu,
	    NO_CSUM_OFFLOAD);
	sktu_attach_flow_metadata_to_frames(flow, frames, n);
	return n;
}

static size_t
_sktu_create_tcp_flow_output_frames(struct sktu_flow *flow,
    struct sktu_frame **frames, size_t n, const void *data, size_t data_len,
    bool csum_offload)
{
	n = sktu_create_tcp_frames(frames, n, flow->ipver, flow->src_ip,
	    flow->dst_ip, flow->sport, flow->dport, data, data_len, flow->mtu,
	    csum_offload);
	sktu_attach_flow_metadata_to_frames(flow, frames, n);
	return n;
}

static size_t
_sktu_create_ip_flow_input_frames(struct sktu_flow *flow,
    struct sktu_frame **frames, size_t n, const void *data, size_t data_len)
{
	n = sktu_create_ip_frames(frames, n, flow->dst_ip, flow->src_ip,
	    flow->ip_protocol, data, data_len, flow->mtu, 0, 0, 0);
	sktu_attach_flow_metadata_to_frames(flow, frames, n);
	return n;
}

static size_t
_sktu_create_ip_flow_output_frames(struct sktu_flow *flow,
    struct sktu_frame **frames, size_t n, const void *data,
    size_t data_len, bool csum_offload)
{
	n = sktu_create_ip_frames(frames, n, flow->src_ip, flow->dst_ip,
	    flow->ip_protocol, data, data_len, flow->mtu, 0, 0, 0);
	sktu_attach_flow_metadata_to_frames(flow, frames, n);
	return n;
}

#define SKTU_STRING_BUF_MAX 2048
char *
sktu_nfr_to_string(struct nx_flow_req *nfr)
{
	static char buf[SKTU_STRING_BUF_MAX];
	uuid_string_t uuidstr;
	char sa_buf[31];
	char da_buf[31];

	uuid_unparse(nfr->nfr_flow_uuid, uuidstr);
	if (nfr->nfr_saddr.sa.sa_family == AF_INET) {
		inet_ntop(AF_INET, &nfr->nfr_saddr.sin.sin_addr.s_addr, sa_buf,
		    sizeof(sa_buf));
		inet_ntop(AF_INET, &nfr->nfr_daddr.sin.sin_addr.s_addr, da_buf,
		    sizeof(da_buf));
	} else {
		inet_ntop(AF_INET6, &nfr->nfr_saddr.sin6.sin6_addr, sa_buf,
		    sizeof(sa_buf));
		inet_ntop(AF_INET6, &nfr->nfr_daddr.sin6.sin6_addr, da_buf,
		    sizeof(da_buf));
	}
	snprintf(buf, sizeof(buf),
	    "nx_port[%d] %s src=%s,dst=%s,proto=%d,sport=%d,dport=%d, flags=0x%x",
	    nfr->nfr_nx_port, uuidstr, sa_buf, da_buf, nfr->nfr_ip_protocol,
	    ntohs(nfr->nfr_saddr.sin.sin_port),
	    ntohs(nfr->nfr_daddr.sin.sin_port), nfr->nfr_flags);

	return buf;
}

char *
sktu_flow_to_string(struct sktu_flow *flow)
{
	return sktu_nfr_to_string(&flow->nfr);
}

struct sktu_flow *
_sktu_create_nexus_flow(sktu_nexus_t nexus, nexus_port_t nx_port,
    uint8_t af, void *src, void *dst, uint8_t proto, uint16_t sport,
    uint16_t dport, uint32_t flags)
{
	struct sktu_flow *flow = malloc(sizeof(*flow));

	memset(flow, 0, sizeof(*flow));
	flow->nexus = nexus;
	flow->mtu = 1500;

	flow->nx_port = nx_port;

	struct nx_flow_req *nfr = &flow->nfr;
	union sockaddr_in_4_6 *saddr = &nfr->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &nfr->nfr_daddr;
	nfr->nfr_nx_port = nx_port;
	if (af == AF_INET) {
		// initialize flow
		flow->ipver = IPVERSION;
		// fill in nfr (stuff in network order :)
		SIN(saddr)->sin_len = sizeof(struct sockaddr_in);
		SIN(daddr)->sin_len = sizeof(struct sockaddr_in);
		SIN(saddr)->sin_family = AF_INET;
		SIN(daddr)->sin_family = AF_INET;
		SIN(saddr)->sin_addr = *(struct in_addr *)src;
		SIN(daddr)->sin_addr = *(struct in_addr *)dst;
		nfr->nfr_ip_protocol = proto;
		SIN(saddr)->sin_port = htons(sport);
		SIN(daddr)->sin_port = htons(dport);
	} else {
		flow->ipver = IPV6_VERSION;
		SIN6(saddr)->sin6_len = sizeof(struct sockaddr_in6);
		SIN6(daddr)->sin6_len = sizeof(struct sockaddr_in6);
		SIN6(saddr)->sin6_family = AF_INET6;
		SIN6(daddr)->sin6_family = AF_INET6;
		SIN6(saddr)->sin6_addr = *(struct in6_addr *)src;
		SIN6(daddr)->sin6_addr = *(struct in6_addr *)dst;
		nfr->nfr_ip_protocol = proto;
		SIN6(saddr)->sin6_port = htons(sport);
		SIN6(daddr)->sin6_port = htons(dport);
	}

	uuid_generate_random(nfr->nfr_flow_uuid);
	nfr->nfr_flags = flags;

	errno = 0;
	int error = __os_nexus_flow_add(nexus->controller, nexus->fsw_nx_uuid, nfr);
	if (error) {
		T_LOG("Failed flow %s\n", sktu_nfr_to_string(nfr));
		free(flow);
		return NULL;
	}

	if (af == AF_INET) {
		flow->src_ip = &SIN(saddr)->sin_addr;
		flow->dst_ip = &SIN(daddr)->sin_addr;
		flow->sport = ntohs(SIN(saddr)->sin_port);
		flow->dport = ntohs(SIN(daddr)->sin_port);
	} else {
		flow->src_ip = &SIN6(saddr)->sin6_addr;
		flow->dst_ip = &SIN6(daddr)->sin6_addr;
		flow->sport = ntohs(SIN6(saddr)->sin6_port);
		flow->dport = ntohs(SIN6(daddr)->sin6_port);
	}

	flow->ip_protocol = proto;
	uuid_copy(flow->uuid, nfr->nfr_flow_uuid);

	switch (proto) {
	case IPPROTO_UDP:
		flow->create_input_frames = _sktu_create_udp_flow_input_frames;
		flow->create_output_frames = _sktu_create_udp_flow_output_frames;
		break;
	case IPPROTO_TCP:
		flow->create_input_frames = _sktu_create_tcp_flow_input_frames;
		flow->create_output_frames = _sktu_create_tcp_flow_output_frames;
		break;
	default:
		flow->create_input_frames = _sktu_create_ip_flow_input_frames;
		flow->create_output_frames = _sktu_create_ip_flow_output_frames;
	}

	assert(nfr->nfr_nx_port != NEXUS_PORT_ANY);

	T_LOG("Created flow %s\n", sktu_nfr_to_string(nfr));

	return flow;
}

struct sktu_flow *
sktu_create_nexus_flow(sktu_nexus_t nexus, uint8_t af, void *src, void *dst,
    uint8_t proto, uint16_t sport, uint16_t dport)
{
	return _sktu_create_nexus_flow(nexus, NEXUS_PORT_ANY, af, src, dst, proto, sport, dport, 0);
}

struct sktu_flow *
sktu_create_nexus_flow_with_nx_port(sktu_nexus_t nexus, nexus_port_t nx_port,
    uint8_t af, void *src, void *dst, uint8_t proto, uint16_t sport,
    uint16_t dport)
{
	return _sktu_create_nexus_flow(nexus, nx_port, af, src, dst, proto, sport, dport, 0);
}

struct sktu_flow *
sktu_create_nexus_low_latency_flow(sktu_nexus_t nexus, uint8_t af, void *src, void *dst,
    uint8_t proto, uint16_t sport, uint16_t dport)
{
	return _sktu_create_nexus_flow(nexus, NEXUS_PORT_ANY, af, src, dst, proto, sport, dport, NXFLOWREQF_LOW_LATENCY);
}

void
_sktu_destroy_nexus_flow(struct sktu_flow *flow)
{
	sktu_nexus_t nexus = flow->nexus;
	struct nx_flow_req *nfr = &flow->nfr;

	int error = __os_nexus_flow_del(nexus->controller, nexus->fsw_nx_uuid, nfr);
	SKTC_ASSERT_ERR(!error);
	if (error) {
		T_LOG("failed to deling flow %s", sktu_nfr_to_string(nfr));
	}

	free(flow);
}

int
sktu_get_nexus_flow_stats(uuid_t flow_uuid, struct sk_stats_flow *sf)
{
	size_t length = 0;
	void *buffer = NULL;
	int ret = sysctl_buf(SK_STATS_FLOW, &buffer, &length, NULL, 0);
	assert(ret == 0);
	assert(buffer != NULL && length != 0);

	assert((length % sizeof(*sf)) == 0);

	struct sk_stats_flow *iter;
	for (iter = buffer; (void *)iter < buffer + length; iter++) {
		if (uuid_compare(iter->sf_uuid, flow_uuid) == 0) {
			*sf = *iter;
			return 0;
		}
	}
	return ENOENT;
}

int
sktu_get_nexus_flowswitch_stats(struct sk_stats_flow_switch **sfsw, size_t *len)
{
	int ret;
	void *buffer = NULL;
	size_t length = 0;
	size_t width = sizeof(struct sk_stats_flow_switch);

	ret = sysctl_buf(SK_STATS_FLOW_SWITCH, &buffer, &length, NULL, 0);
	if (ret != 0 || buffer == NULL || length == 0) {
		return ret;
	}
	if ((length % width) != 0) {
		T_LOG("Error, mismatching sk_stats_flow_switch, quit\n");
		exit(EX_OSERR);
	}

	*sfsw = (struct sk_stats_flow_switch *)buffer;
	*len = length;

	return 0;
}

void
__fsw_stats_print(struct fsw_stats *s)
{
	int i;

	for (i = 0; i < __FSW_STATS_MAX; i++) {
		if (STATS_VAL(s, i) == 0) {
			continue;
		}
		os_log(OS_LOG_DEFAULT, "\t%-24s: %llu\n",
		    fsw_stats_str(i), STATS_VAL(s, i));
	}
}