/*
 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * The netif nexus domain has two domain providers: native and compat, with
 * the latter being the default provider of this domain. The compat provider
 * has special handlers for NXCFG_CMD_ATTACH and NXCFG_CMD_DETACH, etc.
 *
 * A netif nexus instance can be in native or compat mode; in either case,
 * it is associated with two instances of a nexus_adapter structure, and allows
 * at most two channels to be opened to the nexus.  The two adapters correspond
 * to the host and device ports, respectively.
 *
 * By itself, a netif nexus isn't associated with a network interface. The
 * association happens by attaching a network interface to the nexus instance.
 * A channel can only be successfully opened to a netif nexus after it has an
 * interface attached to it.
 *
 * During an attach, the interface is marked as Skywalk-capable, and its ifnet
 * structure refers to the attached netif nexus adapter via its if_na field.
 * The nexus also holds a reference to the interface on its na_ifp field. Note
 * that attaching to a netif_compat nexus does not alter the input/output data
 * path, nor does it remove any of the interface's hardware offload flags. It
 * merely associates the interface and netif nexus together.
 *
 * During a detach, the above references are dropped and the fields are cleared;
 * the interface is also marked as non-Skywalk-capable. This detach can happen
 * explicitly via a command down the nexus, or implicitly when the nexus goes
 * away (assuming there's no channel opened to it.)
 *
 * A userland channel can be opened to a netif nexus via the usual ch_open()
 * path, assuming the nexus provider is set up to allow access for the userland
 * process (either by binding the nexus port to a PID, etc., or by creating the
 * nexus in anonymous mode.)
 *
 * Alternatively, a kernel channel can be opened to it by a kernel subsystem
 * (e.g. the flowswitch) via ch_open_special().  Kernel channels don't have
 * any task mapping created, and the CHANF_KERNEL flag is used to indicate
 * that.
 *
 * Opening a channel to the host port of a native or compat netif causes the
 * ifnet output path to be redirected to nx_netif_host_transmit().  We also,
 * at present, disable any hardware offload features.
 *
 * Opening a channel to the device port of a compat netif causes the ifnet
 * input path to be redirected to nx_netif_compat_receive().  This is specific
 * to the compat variant, as the native variant's RX path already goes to
 * the native netif.
 *
 * During channel close, we restore the original I/O callbacks, as well as the
 * interface's offload flags.
 */
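
/*
 * Putting the above together, the typical lifecycle looks roughly as
 * follows (an informal sketch in terms of the handlers in this file;
 * the userland/KPI plumbing that drives each step is elided):
 *
 *   1. A netif (compat) provider/instance is created via nxprov_create().
 *   2. NXCFG_CMD_ATTACH binds an ifnet to the nexus; see
 *      nx_netif_ctl_attach() and nx_netif_attach()/nx_netif_compat_attach().
 *   3. A channel is opened via ch_open() (userland) or ch_open_special()
 *      (kernel, e.g. flowswitch); host-port channels redirect ifnet output
 *      to nx_netif_host_transmit(), and compat dev-port channels redirect
 *      input to nx_netif_compat_receive().
 *   4. Channel close restores the original I/O callbacks and offload flags.
 *   5. NXCFG_CMD_DETACH (or nexus teardown) undoes the attach; see
 *      nx_netif_ctl_detach().
 */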

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/upipe/nx_user_pipe.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>
#include <os/refcnt.h>
#include <libkern/OSDebug.h>

#define NX_NETIF_MAXRINGS       NX_MAX_NUM_RING_PAIR
#define NX_NETIF_MINSLOTS       2       /* XXX same as above */
#define NX_NETIF_MAXSLOTS       NX_MAX_NUM_SLOT_PER_RING /* max # of slots */
#define NX_NETIF_TXRINGSIZE     512     /* default TX ring size */
#define NX_NETIF_RXRINGSIZE     1024    /* default RX ring size */
#define NX_NETIF_BUFSIZE        (2 * 1024)  /* default buffer size */
#define NX_NETIF_MINBUFSIZE     (128)  /* min buffer size */
#define NX_NETIF_MAXBUFSIZE     (32 * 1024) /* max buffer size */

/*
 * TODO: adi@apple.com -- minimum buflets for now; we will need to
 * have a way to adjust this based on the underlying interface's
 * parameters, e.g. jumbo MTU, large segment offload, etc.
 */
#define NX_NETIF_UMD_SIZE       _USER_PACKET_SIZE(BUFLETS_MIN)
#define NX_NETIF_KMD_SIZE       _KERN_PACKET_SIZE(BUFLETS_MIN)

/*
 * Minimum stack space required for IOSkywalkFamily and driver execution.
 */
#if XNU_TARGET_OS_OSX
#define NX_NETIF_MIN_DRIVER_STACK_SIZE    (kernel_stack_size >> 1)
#else /* !XNU_TARGET_OS_OSX */
#define NX_NETIF_MIN_DRIVER_STACK_SIZE    (kernel_stack_size >> 2)
#endif /* XNU_TARGET_OS_OSX */
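
/*
 * This threshold is presumably compared against the caller's remaining
 * stack (e.g. via OSKernelStackRemaining() from <libkern/OSDebug.h>,
 * included below) before invoking the driver inline; when the headroom
 * is smaller than this, the work is deferred to the ifnet starter thread
 * instead (cf. the force_ifnet_start sysctl).  Sketch of intent only;
 * the actual check lives outside this section.
 */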

static void nx_netif_dom_init(struct nxdom *);
static void nx_netif_dom_terminate(struct nxdom *);
static void nx_netif_dom_fini(struct nxdom *);
static int nx_netif_prov_params_adjust(
	const struct kern_nexus_domain_provider *, const struct nxprov_params *,
	struct nxprov_adjusted_params *);

static int nx_netif_dom_bind_port(struct kern_nexus *, nexus_port_t *,
    struct nxbind *, void *);
static int nx_netif_dom_unbind_port(struct kern_nexus *, nexus_port_t);
static int nx_netif_dom_connect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *);
static void nx_netif_dom_disconnect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *);
static void nx_netif_dom_defunct(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct proc *);
static void nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, boolean_t);

static void nx_netif_doorbell(struct ifnet *);
static int nx_netif_na_txsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_rxsync(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static void nx_netif_na_dtor(struct nexus_adapter *na);
static int nx_netif_na_notify_tx(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_notify_rx(struct __kern_channel_ring *, struct proc *,
    uint32_t);
static int nx_netif_na_activate(struct nexus_adapter *, na_activate_mode_t);

static int nx_netif_ctl(struct kern_nexus *, nxcfg_cmd_t, void *,
    struct proc *);
static int nx_netif_ctl_attach(struct kern_nexus *, struct nx_spec_req *,
    struct proc *);
static int nx_netif_ctl_detach(struct kern_nexus *, struct nx_spec_req *);
static int nx_netif_attach(struct kern_nexus *, struct ifnet *);
static void nx_netif_flags_init(struct nx_netif *);
static void nx_netif_flags_fini(struct nx_netif *);
static void nx_netif_callbacks_init(struct nx_netif *);
static void nx_netif_callbacks_fini(struct nx_netif *);
static void nx_netif_capabilities_fini(struct nx_netif *);
static errno_t nx_netif_interface_advisory_notify(void *,
    const struct ifnet_interface_advisory *);

struct nxdom nx_netif_dom_s = {
	.nxdom_prov_head =
    STAILQ_HEAD_INITIALIZER(nx_netif_dom_s.nxdom_prov_head),
	.nxdom_type =           NEXUS_TYPE_NET_IF,
	.nxdom_md_type =        NEXUS_META_TYPE_PACKET,
	.nxdom_md_subtype =     NEXUS_META_SUBTYPE_RAW,
	.nxdom_name =           "netif",
	.nxdom_ports = {
		.nb_def = 2,
		.nb_min = 2,
		.nb_max = NX_NETIF_MAXPORTS,
	},
	.nxdom_tx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_rx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_NETIF_MAXRINGS,
	},
	.nxdom_tx_slots = {
		.nb_def = NX_NETIF_TXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_rx_slots = {
		.nb_def = NX_NETIF_RXRINGSIZE,
		.nb_min = NX_NETIF_MINSLOTS,
		.nb_max = NX_NETIF_MAXSLOTS,
	},
	.nxdom_buf_size = {
		.nb_def = NX_NETIF_BUFSIZE,
		.nb_min = NX_NETIF_MINBUFSIZE,
		.nb_max = NX_NETIF_MAXBUFSIZE,
	},
	.nxdom_large_buf_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = 0,
	},
	.nxdom_meta_size = {
		.nb_def = NX_NETIF_UMD_SIZE,
		.nb_min = NX_NETIF_UMD_SIZE,
		.nb_max = NX_METADATA_USR_MAX_SZ,
	},
	.nxdom_stats_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_STATS_MAX_SZ,
	},
	.nxdom_pipes = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_UPIPE_MAXPIPES,
	},
	.nxdom_flowadv_max = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_FLOWADV_MAX,
	},
	.nxdom_nexusadv_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_NEXUSADV_MAX_SZ,
	},
	.nxdom_capabilities = {
		.nb_def = NXPCAP_USER_CHANNEL,
		.nb_min = 0,
		.nb_max = NXPCAP_USER_CHANNEL,
	},
	.nxdom_qmap = {
		.nb_def = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_min = NEXUS_QMAP_TYPE_DEFAULT,
		.nb_max = NEXUS_QMAP_TYPE_WMM,
	},
	.nxdom_max_frags = {
		.nb_def = NX_PBUF_FRAGS_DEFAULT,
		.nb_min = NX_PBUF_FRAGS_MIN,
		.nb_max = NX_PBUF_FRAGS_MAX,
	},
	.nxdom_init =           nx_netif_dom_init,
	.nxdom_terminate =      nx_netif_dom_terminate,
	.nxdom_fini =           nx_netif_dom_fini,
	.nxdom_find_port =      NULL,
	.nxdom_port_is_reserved = NULL,
	.nxdom_bind_port =      nx_netif_dom_bind_port,
	.nxdom_unbind_port =    nx_netif_dom_unbind_port,
	.nxdom_connect =        nx_netif_dom_connect,
	.nxdom_disconnect =     nx_netif_dom_disconnect,
	.nxdom_defunct =        nx_netif_dom_defunct,
	.nxdom_defunct_finalize = nx_netif_dom_defunct_finalize,
};

struct kern_nexus_domain_provider nx_netif_prov_s = {
	.nxdom_prov_name =              NEXUS_PROVIDER_NET_IF,
	/*
	 * Don't install this as the default domain provider, i.e.
	 * NXDOMPROVF_DEFAULT flag not set; we want netif_compat
	 * provider to be the one handling userland-issued requests
	 * coming down thru nxprov_create() instead.
	 */
	.nxdom_prov_flags =             0,
	.nxdom_prov_cb = {
		.dp_cb_init =           nx_netif_prov_init,
		.dp_cb_fini =           nx_netif_prov_fini,
		.dp_cb_params =         nx_netif_prov_params,
		.dp_cb_mem_new =        nx_netif_prov_mem_new,
		.dp_cb_config =         nx_netif_prov_config,
		.dp_cb_nx_ctor =        nx_netif_prov_nx_ctor,
		.dp_cb_nx_dtor =        nx_netif_prov_nx_dtor,
		.dp_cb_nx_mem_info =    nx_netif_prov_nx_mem_info,
		.dp_cb_nx_mib_get =     nx_netif_prov_nx_mib_get,
		.dp_cb_nx_stop =        nx_netif_prov_nx_stop,
	},
};

struct nexus_ifnet_ops na_netif_ops = {
	.ni_finalize = na_netif_finalize,
	.ni_reap = nx_netif_reap,
	.ni_dequeue = nx_netif_native_tx_dequeue,
	.ni_get_len = nx_netif_native_tx_get_len,
};

#define NX_NETIF_DOORBELL_MAX_DEQUEUE    64
uint32_t nx_netif_doorbell_max_dequeue = NX_NETIF_DOORBELL_MAX_DEQUEUE;

#define NQ_TRANSFER_DECAY       2               /* ilog2 of EWMA decay rate (4) */
static uint32_t nq_transfer_decay = NQ_TRANSFER_DECAY;
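
/*
 * A minimal sketch of the shift-based EWMA this decay implies (the actual
 * update lives in the netif queue stats code, not in this section):
 *
 *     avg += (sample - avg) >> nq_transfer_decay;
 *
 * i.e. each new sample contributes 1/2^NQ_TRANSFER_DECAY (here 1/4) of its
 * value, matching the "decay rate (4)" noted above.
 */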

#define NQ_ACCUMULATE_INTERVAL  2 /* 2 seconds */
static uint32_t nq_accumulate_interval = NQ_ACCUMULATE_INTERVAL;

static uint32_t nq_stat_enable = 0;

SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, netif,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk network interface");
#if (DEVELOPMENT || DEBUG)
SYSCTL_STRING(_kern_skywalk_netif, OID_AUTO, sk_ll_prefix,
    CTLFLAG_RW | CTLFLAG_LOCKED, sk_ll_prefix, sizeof(sk_ll_prefix),
    "ifname prefix for enabling low latency support");
static uint32_t nx_netif_force_ifnet_start = 0;
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, force_ifnet_start,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_force_ifnet_start, 0,
    "always use ifnet starter thread");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, doorbell_max_dequeue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_doorbell_max_dequeue,
    NX_NETIF_DOORBELL_MAX_DEQUEUE,
    "max packets to dequeue in doorbell context");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_transfer_decay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_transfer_decay,
    NQ_TRANSFER_DECAY, "ilog2 of EWMA decay rate of netif queue transfers");
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_accumulate_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_accumulate_interval,
    NQ_ACCUMULATE_INTERVAL, "accumulation interval for netif queue stats");
#endif /* !DEVELOPMENT && !DEBUG */

SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_enable,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nq_stat_enable,
    0, "enable/disable stats collection for netif queue");

static SKMEM_TYPE_DEFINE(na_netif_zone, struct nexus_netif_adapter);

static SKMEM_TYPE_DEFINE(nx_netif_zone, struct nx_netif);

#define SKMEM_TAG_NETIF_MIT          "com.apple.skywalk.netif.mit"
static SKMEM_TAG_DEFINE(skmem_tag_netif_mit, SKMEM_TAG_NETIF_MIT);

#define SKMEM_TAG_NETIF_FILTER       "com.apple.skywalk.netif.filter"
SKMEM_TAG_DEFINE(skmem_tag_netif_filter, SKMEM_TAG_NETIF_FILTER);

#define SKMEM_TAG_NETIF_FLOW         "com.apple.skywalk.netif.flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_flow, SKMEM_TAG_NETIF_FLOW);

#define SKMEM_TAG_NETIF_AGENT_FLOW   "com.apple.skywalk.netif.agent_flow"
SKMEM_TAG_DEFINE(skmem_tag_netif_agent_flow, SKMEM_TAG_NETIF_AGENT_FLOW);

#define SKMEM_TAG_NETIF_LLINK        "com.apple.skywalk.netif.llink"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink, SKMEM_TAG_NETIF_LLINK);

#define SKMEM_TAG_NETIF_QSET         "com.apple.skywalk.netif.qset"
SKMEM_TAG_DEFINE(skmem_tag_netif_qset, SKMEM_TAG_NETIF_QSET);

#define SKMEM_TAG_NETIF_LLINK_INFO   "com.apple.skywalk.netif.llink_info"
SKMEM_TAG_DEFINE(skmem_tag_netif_llink_info, SKMEM_TAG_NETIF_LLINK_INFO);

/* use this for any temporary allocations */
#define SKMEM_TAG_NETIF_TEMP         "com.apple.skywalk.netif.temp"
static SKMEM_TAG_DEFINE(skmem_tag_netif_temp, SKMEM_TAG_NETIF_TEMP);

static void
nx_netif_dom_init(struct nxdom *nxdom)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));

	_CASSERT(NEXUS_PORT_NET_IF_DEV == 0);
	_CASSERT(NEXUS_PORT_NET_IF_HOST == 1);
	_CASSERT(NEXUS_PORT_NET_IF_CLIENT == 2);
	_CASSERT(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE);
	_CASSERT(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED);
	_CASSERT(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO);
	_CASSERT(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX);

	(void) nxdom_prov_add(nxdom, &nx_netif_prov_s);

	nx_netif_compat_init(nxdom);

	ASSERT(nxdom_prov_default[nxdom->nxdom_type] != NULL &&
	    strbufcmp(nxdom_prov_default[nxdom->nxdom_type]->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);

	netif_gso_init();
}

static void
nx_netif_dom_terminate(struct nxdom *nxdom)
{
	struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;

	SK_LOCK_ASSERT_HELD();

	netif_gso_fini();
	nx_netif_compat_fini();

	STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
	    nxdom_prov_link, tnxdp) {
		(void) nxdom_prov_del(nxdom_prov);
	}
}

static void
nx_netif_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}

int
nx_netif_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
	return 0;
}

static int
nx_netif_na_notify_drop(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(kring, p, flags)
	return ENXIO;
}

int
nx_netif_prov_nx_stop(struct kern_nexus *nx)
{
	uint32_t r;
	struct nexus_adapter *na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_netif_adapter *nifna = NIFNA(na);

	SK_LOCK_ASSERT_HELD();
	ASSERT(nx != NULL);

	/* place all rings in drop mode */
	na_kr_drop(na, TRUE);

	/* ensure global visibility */
	os_atomic_thread_fence(seq_cst);

	/* reset all TX notify callbacks */
	for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
		while (!os_atomic_cmpxchg((void * volatile *)&na->na_tx_rings[r].ckr_na_notify,
		    ptrauth_nop_cast(void *__single, na->na_tx_rings[r].ckr_na_notify),
		    ptrauth_nop_cast(void *__single, &nx_netif_na_notify_drop), acq_rel)) {
			;
		}
		os_atomic_thread_fence(seq_cst);
		if (nifna->nifna_tx_mit != NULL) {
			nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
		}
	}
	if (nifna->nifna_tx_mit != NULL) {
		skn_free_type_array_counted_by(tx, struct nx_netif_mit,
		    nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
	}

	/* reset all RX notify callbacks */
	for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
		while (!os_atomic_cmpxchg((void * volatile *)&na->na_rx_rings[r].ckr_na_notify,
		    ptrauth_nop_cast(void *__single, na->na_rx_rings[r].ckr_na_notify),
		    ptrauth_nop_cast(void *__single, &nx_netif_na_notify_drop), acq_rel)) {
			;
		}
		os_atomic_thread_fence(seq_cst);
		if (nifna->nifna_rx_mit != NULL) {
			nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
		}
	}
	if (nifna->nifna_rx_mit != NULL) {
		skn_free_type_array_counted_by(rx, struct nx_netif_mit,
		    nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
	}
	return 0;
}

static inline void
nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params *adj,
    ifnet_t ifp)
{
	const char *ifname;

	ifname = __terminated_by_to_indexable(ifp->if_name);
	if (IFNET_IS_CELLULAR(ifp) && (ifp->if_unit != 0)) {
		*(adj->adj_rx_slots) = sk_netif_compat_aux_cell_rx_ring_sz;
		*(adj->adj_tx_slots) = sk_netif_compat_aux_cell_tx_ring_sz;
	} else if (IFNET_IS_WIFI(ifp)) {
		if (ifname[0] == 'a' && ifname[1] == 'p' &&
		    ifname[2] == '\0') {
			/* Wi-Fi Access Point */
			*(adj->adj_rx_slots) = sk_netif_compat_wap_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_wap_tx_ring_sz;
		} else if (ifp->if_eflags & IFEF_AWDL) {
			/* AWDL */
			*(adj->adj_rx_slots) = sk_netif_compat_awdl_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_awdl_tx_ring_sz;
		} else {
			/* Wi-Fi infrastructure */
			*(adj->adj_rx_slots) = sk_netif_compat_wif_rx_ring_sz;
			*(adj->adj_tx_slots) = sk_netif_compat_wif_tx_ring_sz;
		}
	} else if (IFNET_IS_ETHERNET(ifp)) {
#if !XNU_TARGET_OS_OSX
		/*
		 * On non-macOS platforms, treat all compat Ethernet
		 * interfaces as USB Ethernet with reduced ring sizes.
		 */
		*(adj->adj_rx_slots) = sk_netif_compat_usb_eth_rx_ring_sz;
		*(adj->adj_tx_slots) = sk_netif_compat_usb_eth_tx_ring_sz;
#else /* XNU_TARGET_OS_OSX */
		if (ifp->if_subfamily == IFNET_SUBFAMILY_USB) {
			*(adj->adj_rx_slots) =
			    sk_netif_compat_usb_eth_rx_ring_sz;
			*(adj->adj_tx_slots) =
			    sk_netif_compat_usb_eth_tx_ring_sz;
		}
#endif /* XNU_TARGET_OS_OSX */
	}
}

static int
nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
	/*
	 * For netif compat, adjust the following parameters for memory
	 * optimization:
	 * - change the size of the buffer object to 128 bytes.
	 * - don't allocate an rx ring for the host port or a tx ring for
	 *   the dev port.
	 * - for cellular interfaces other than pdp_ip0, reduce the ring size.
	 *   The assumption here is that pdp_ip0 is always used as the data
	 *   interface.
	 * - reduce the ring size for the AWDL interface.
	 * - reduce the ring size for USB Ethernet interfaces.
	 */
	if (strbufcmp(nxdom_prov->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0) {
		/*
		 * Leave the parameters at their defaults if userspace access
		 * may be needed.  We can't use skywalk_direct_allowed() here
		 * because the drivers have not attached yet.
		 */
		if (skywalk_netif_direct_enabled()) {
			goto done;
		}

		*(adj->adj_buf_size) = NETIF_COMPAT_BUF_SIZE;
		*(adj->adj_tx_rings) = 1;
		if (IF_INDEX_IN_RANGE(nxp->nxp_ifindex)) {
			ifnet_t ifp;
			ifnet_head_lock_shared();
			ifp = ifindex2ifnet[nxp->nxp_ifindex];
			ifnet_head_done();
			VERIFY(ifp != NULL);
			nx_netif_compat_adjust_ring_size(adj, ifp);
		}
	} else { /* netif native */
		if (nxp->nxp_flags & NXPF_NETIF_LLINK) {
			*(adj->adj_tx_slots) = NX_NETIF_MINSLOTS;
			*(adj->adj_rx_slots) = NX_NETIF_MINSLOTS;
		}
		/*
		 * Add an extra ring for the host port.  Note that if the
		 * nexus isn't configured to use the same pbufpool for all of
		 * its ports, we'd end up allocating extra here.
		 * Not a big deal, since that case isn't the default.
		 */
		*(adj->adj_tx_rings) += 1;
		*(adj->adj_rx_rings) += 1;

		if ((*(adj->adj_buf_size) < PKT_MAX_PROTO_HEADER_SIZE)) {
			SK_ERR("buf size too small, min (%d)",
			    PKT_MAX_PROTO_HEADER_SIZE);
			return EINVAL;
		}
		_CASSERT(sizeof(struct __kern_netif_intf_advisory) ==
		    NX_INTF_ADV_SIZE);
		*(adj->adj_nexusadv_size) = sizeof(struct netif_nexus_advisory);
	}
done:
	return 0;
}

int
nx_netif_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
    const uint32_t req, const struct nxprov_params *nxp0,
    struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
    uint32_t pp_region_config_flags)
{
	struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;

	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
	           nxdom, nxdom, nxdom, pp_region_config_flags,
	           nx_netif_prov_params_adjust);
}

int
nx_netif_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nexus_adapter *na)
{
#pragma unused(nxdom_prov)
	int err = 0;
	boolean_t pp_truncated_buf = FALSE;
	boolean_t allow_direct;
	boolean_t kernel_only;

	SK_DF(SK_VERB_NETIF,
	    "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
	    NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
	    SK_KVA(na));

	ASSERT(na->na_arena == NULL);
	if ((na->na_type == NA_NETIF_COMPAT_DEV) ||
	    (na->na_type == NA_NETIF_COMPAT_HOST)) {
		pp_truncated_buf = TRUE;
	}
	/*
	 * We do this check to determine whether to create the extra
	 * regions needed for userspace access. This is per interface.
	 * NX_USER_CHANNEL_PROV() is systemwide so it can't be used.
	 */
	allow_direct = skywalk_netif_direct_allowed(
		__unsafe_null_terminated_from_indexable(na->na_name));

	/*
	 * Both ports (host and dev) share the same packet buffer pool;
	 * the first port to be opened allocates the pp, which is stored
	 * in the nexus and then reused by any subsequent opens.
	 */
	kernel_only = !allow_direct || !NX_USER_CHANNEL_PROV(nx);
	na->na_arena = skmem_arena_create_for_nexus(na,
	    NX_PROV(nx)->nxprov_region_params, &nx->nx_tx_pp,
	    &nx->nx_rx_pp, pp_truncated_buf, kernel_only, &nx->nx_adv, &err);
	ASSERT(na->na_arena != NULL || err != 0);
	ASSERT(nx->nx_tx_pp == NULL || (nx->nx_tx_pp->pp_md_type ==
	    NX_DOM(nx)->nxdom_md_type && nx->nx_tx_pp->pp_md_subtype ==
	    NX_DOM(nx)->nxdom_md_subtype));

	return err;
}

SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_get_llink_info(struct sockopt *sopt, struct kern_nexus *nx)
{
	struct nx_llink_info_req *nlir = NULL;
	struct nx_netif *nif;
	struct netif_llink *llink;
	uint16_t llink_cnt;
	size_t len, user_len;
	int err, i;

	nif = NX_NETIF_PRIVATE(nx);
	if (!NETIF_LLINK_ENABLED(nif)) {
		SK_ERR("llink mode not enabled");
		return ENOTSUP;
	}
	lck_rw_lock_shared(&nif->nif_llink_lock);
	llink_cnt = nif->nif_llink_cnt;
	if (llink_cnt == 0) {
		SK_ERR("zero llink cnt");
		err = ENXIO;
		goto done;
	}
	len = sizeof(*nlir) + (sizeof(struct nx_llink_info) * llink_cnt);
	/* preserve sopt_valsize because it gets overwritten by copyin */
	user_len = sopt->sopt_valsize;
	if (user_len < len) {
		SK_ERR("buffer too small");
		err = ENOBUFS;
		goto done;
	}
	nlir = sk_alloc_data(len, Z_WAITOK, skmem_tag_netif_llink_info);
	if (nlir == NULL) {
		SK_ERR("failed to allocate nlir");
		err = ENOMEM;
		goto done;
	}
	err = sooptcopyin(sopt, nlir, sizeof(*nlir), sizeof(*nlir));
	if (err != 0) {
		SK_ERR("copyin failed: %d", err);
		goto done;
	}
	if (nlir->nlir_version != NETIF_LLINK_INFO_VERSION) {
		SK_ERR("nlir version mismatch: %d != %d",
		    nlir->nlir_version, NETIF_LLINK_INFO_VERSION);
		err = ENOTSUP;
		goto done;
	}
	nlir->nlir_llink_cnt = llink_cnt;
	i = 0;
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		struct nx_llink_info *nli;
		struct netif_qset *qset;
		uint16_t qset_cnt;
		int j;

		nli = &nlir->nlir_llink[i];
		nli->nli_link_id = llink->nll_link_id;
		nli->nli_link_id_internal = llink->nll_link_id_internal;
		nli->nli_state = llink->nll_state;
		nli->nli_flags = llink->nll_flags;

		qset_cnt = llink->nll_qset_cnt;
		ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
		nli->nli_qset_cnt = qset_cnt;

		j = 0;
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			struct nx_qset_info *nqi;

			nqi = &nli->nli_qset[j];
			nqi->nqi_id = qset->nqs_id;
			nqi->nqi_flags = qset->nqs_flags;
			nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
			nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
			j++;
		}
		ASSERT(j == qset_cnt);
		i++;
	}
	ASSERT(i == llink_cnt);
	sopt->sopt_valsize = user_len;
	err = sooptcopyout(sopt, nlir, len);
	if (err != 0) {
		SK_ERR("sooptcopyout failed: %d", err);
	}
done:
	lck_rw_unlock_shared(&nif->nif_llink_lock);
	if (nlir != NULL) {
		sk_free_data(nlir, len);
	}
	return err;
}

int
nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
	struct sockopt sopt;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/* proceed only if the client possesses netif entitlement */
	if ((err = skywalk_priv_check_cred(p, cred,
	    PRIV_SKYWALK_REGISTER_NET_IF)) != 0) {
		goto done;
	}

	if (ncr->nc_req == USER_ADDR_NULL) {
		err = EINVAL;
		goto done;
	}

	/* to make life easier for handling copies */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = sopt_dir;
	sopt.sopt_val = ncr->nc_req;
	sopt.sopt_valsize = ncr->nc_req_len;
	sopt.sopt_p = p;

	switch (ncr->nc_cmd) {
	case NXCFG_CMD_ATTACH:
	case NXCFG_CMD_DETACH: {
		struct nx_spec_req nsr;

		bzero(&nsr, sizeof(nsr));
		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
		if (err != 0) {
			goto done;
		}

		/*
		 * Null-terminate in case this has an interface name;
		 * the union is already large enough for uuid_t.
		 */
		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
		if (p != kernproc) {
			nsr.nsr_flags &= NXSPECREQ_MASK;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nsr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: adi@apple.com -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
		break;
	}
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL: {
		_CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
		    offsetof(struct nx_flow_req, _nfr_common_field_end));
		struct nx_flow_req nfr;

		bzero(&nfr, sizeof(nfr));
		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
		if (err != 0) {
			goto done;
		}

		err = nx_netif_ctl(nx, ncr->nc_cmd, &nfr, p);
		if (err != 0) {
			goto done;
		}

		/* XXX: adi@apple.com -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
		break;
	}
	case NXCFG_CMD_GET_LLINK_INFO: {
		err = nx_netif_get_llink_info(&sopt, nx);
		break;
	}
	default:
		err = EINVAL;
		goto done;
	}
done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus 0x%llx (%s) cmd %d err %d", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
	return err;
}

void
nx_netif_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
}

int
nx_netif_prov_nx_ctor(struct kern_nexus *nx)
{
	struct nx_netif *n;
	char name[64];
	const char *__null_terminated nxadv_name = NULL;
	int error;

	SK_LOCK_ASSERT_HELD();
	ASSERT(nx->nx_arg == NULL);

	SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);

	nx->nx_arg = nx_netif_alloc(Z_WAITOK);
	n = NX_NETIF_PRIVATE(nx);
	if (NX_USER_CHANNEL_PROV(nx) &&
	    NX_PROV(nx)->nxprov_params->nxp_nexusadv_size != 0) {
		nxadv_name = tsnprintf(name, sizeof(name), "netif_%llu", nx->nx_id);
		error = nx_advisory_alloc(nx, nxadv_name,
		    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
		    NEXUS_ADVISORY_TYPE_NETIF);
		if (error != 0) {
			nx_netif_free(n);
			return error;
		}
	}
	n->nif_nx = nx;
	SK_D("create new netif 0x%llx for nexus 0x%llx",
	    SK_KVA(NX_NETIF_PRIVATE(nx)), SK_KVA(nx));
	return 0;
}

void
nx_netif_prov_nx_dtor(struct kern_nexus *nx)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);

	SK_LOCK_ASSERT_HELD();

	SK_D("nexus 0x%llx (%s) netif 0x%llx", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(n));

	/*
	 * XXX
	 * detach should be done separately to be symmetrical with attach.
	 */
	nx_advisory_free(nx);
	if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) != NULL) {
		/* we're called by nx_detach(), so this cannot fail */
		int err = nx_netif_ctl_detach(nx, NULL);
		VERIFY(err == 0);
	}
	if (n->nif_dev_nxb != NULL) {
		nxb_free(n->nif_dev_nxb);
		n->nif_dev_nxb = NULL;
	}
	if (n->nif_host_nxb != NULL) {
		nxb_free(n->nif_host_nxb);
		n->nif_host_nxb = NULL;
	}
	SK_DF(SK_VERB_NETIF, "marking netif 0x%llx as free", SK_KVA(n));
	nx_netif_free(n);
	nx->nx_arg = NULL;
}

int
nx_netif_prov_nx_mem_info(struct kern_nexus *nx, struct kern_pbufpool **tpp,
    struct kern_pbufpool **rpp)
{
	ASSERT(nx->nx_tx_pp != NULL);
	ASSERT(nx->nx_rx_pp != NULL);

	if (tpp != NULL) {
		*tpp = nx->nx_tx_pp;
	}
	if (rpp != NULL) {
		*rpp = nx->nx_rx_pp;
	}

	return 0;
}

static size_t
__netif_mib_get_stats(struct kern_nexus *nx, void *out, size_t len)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct ifnet *ifp = nif->nif_ifp;
	struct sk_stats_net_if *__single sns = out;
	size_t actual_space = sizeof(struct sk_stats_net_if);

	if (out != NULL && actual_space <= len) {
		uuid_copy(sns->sns_nx_uuid, nx->nx_uuid);
		if (ifp != NULL) {
			(void) strlcpy(sns->sns_if_name, if_name(ifp), IFNAMSIZ);
		}
		sns->sns_nifs = nif->nif_stats;
	}

	return actual_space;
}

static size_t
__netif_mib_get_llinks(struct kern_nexus *nx, void *__sized_by(len) out, size_t len)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nx_llink_info *nli_list = out;
	size_t actual_space = 0;
	if (NETIF_LLINK_ENABLED(nif)) {
		lck_rw_lock_shared(&nif->nif_llink_lock);
		actual_space += nif->nif_llink_cnt * sizeof(struct nx_llink_info);

		if (out != NULL && actual_space <= len) {
			struct netif_llink *llink;
			int i = 0;
			STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
				struct nx_llink_info *nli;
				struct netif_qset *qset;
				uint16_t qset_cnt;
				int j;

				nli = &nli_list[i];
				uuid_copy(nli->nli_netif_uuid, nx->nx_uuid);
				nli->nli_link_id = llink->nll_link_id;
				nli->nli_link_id_internal = llink->nll_link_id_internal;
				nli->nli_state = llink->nll_state;
				nli->nli_flags = llink->nll_flags;

				qset_cnt = llink->nll_qset_cnt;
				ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
				nli->nli_qset_cnt = qset_cnt;

				j = 0;
				SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
					struct nx_qset_info *nqi;

					nqi = &nli->nli_qset[j];
					nqi->nqi_id = qset->nqs_id;
					nqi->nqi_flags = qset->nqs_flags;
					nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
					nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
					j++;
				}
				ASSERT(j == qset_cnt);
				i++;
			}
			ASSERT(i == nif->nif_llink_cnt);
		}
		lck_rw_unlock_shared(&nif->nif_llink_lock);
	}

	return actual_space;
}

static size_t
__netif_mib_get_queue_stats(struct kern_nexus *nx, void *__sized_by(len) out, size_t len)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	uint8_t *itr = out;
	size_t actual_space = 0;
	if (!NETIF_LLINK_ENABLED(nif)) {
		return actual_space;
	}

	lck_rw_lock_shared(&nif->nif_llink_lock);
	struct netif_llink *llink;
	struct netif_qset *qset;
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			actual_space += sizeof(struct netif_qstats_info) *
			    (qset->nqs_num_rx_queues + qset->nqs_num_tx_queues);
		}
	}
	if (out == NULL || actual_space > len) {
		lck_rw_unlock_shared(&nif->nif_llink_lock);
		return actual_space;
	}

	llink = NULL;
	qset = NULL;
	uint16_t i = 0, j = 0;
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		uint16_t qset_cnt;
		j = 0;
		qset_cnt = llink->nll_qset_cnt;
		ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
		SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
			int queue_cnt = qset->nqs_num_rx_queues +
			    qset->nqs_num_tx_queues;
			for (uint16_t k = 0; k < queue_cnt; k++) {
				struct netif_qstats_info *nqi =
				    (struct netif_qstats_info *)(void *)itr;
				struct netif_queue *nq = &qset->nqs_driver_queues[k];
				nqi->nqi_qset_id = qset->nqs_id;
				nqi->nqi_queue_idx = k;
				if (KPKT_VALID_SVC(nq->nq_svc)) {
					nqi->nqi_svc = (packet_svc_class_t)nq->nq_svc;
				}
				if (nq->nq_flags & NETIF_QUEUE_IS_RX) {
					nqi->nqi_queue_flag = NQI_QUEUE_FLAG_IS_RX;
				}

				struct netif_qstats *nq_out = &nqi->nqi_stats;
				struct netif_qstats *nq_src = &nq->nq_stats;
				memcpy(nq_out, nq_src, sizeof(struct netif_qstats));

				itr += sizeof(struct netif_qstats_info);
			}
			j++;
		}
		ASSERT(j == qset_cnt);
		i++;
	}
	ASSERT(i == nif->nif_llink_cnt);

	lck_rw_unlock_shared(&nif->nif_llink_lock);
	return actual_space;
}

size_t
nx_netif_prov_nx_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
    void *__sized_by(len) out, size_t len, struct proc *p)
{
#pragma unused(p)
	size_t ret;

	if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
	    (uuid_compare(filter->nmf_nx_uuid, nx->nx_uuid)) != 0) {
		return 0;
	}

	switch (filter->nmf_type) {
	case NXMIB_NETIF_STATS:
		ret = __netif_mib_get_stats(nx, out, len);
		break;
	case NXMIB_LLINK_LIST:
		ret = __netif_mib_get_llinks(nx, out, len);
		break;
	case NXMIB_NETIF_QUEUE_STATS:
		ret = __netif_mib_get_queue_stats(nx, out, len);
		break;
	default:
		ret = 0;
		break;
	}
	return ret;
}

static int
nx_netif_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
    struct nxbind *nxb, void *info)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	nexus_port_t first, last, port;
	int error;

	ASSERT(nx_port != NULL);
	ASSERT(nxb != NULL);

	port = *nx_port;

	/*
	 * If port is:
	 * != NEXUS_PORT_ANY: attempt to bind to the specified port
	 * == NEXUS_PORT_ANY: find an available port, bind to it, and
	 *                    return back the assigned port.
	 */
	first = NEXUS_PORT_NET_IF_CLIENT;
	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
	last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	ASSERT(first <= last);

	NETIF_WLOCK(nif);

	if (__improbable(first == last)) {
		error = ENOMEM;
	} else if (port != NEXUS_PORT_ANY) {
		error = nx_port_bind_info(nx, port, nxb, info);
		SK_DF(SK_VERB_NETIF, "port %d, bind err %d", port, error);
	} else {
		error = nx_port_find(nx, first, last - 1, &port);
		ASSERT(error != 0 || (port >= first && port < last));
		if (error == 0) {
			error = nx_port_bind_info(nx, port, nxb, info);
			SK_DF(SK_VERB_NETIF, "found port %d, bind err %d",
			    port, error);
		}
	}
	NETIF_WUNLOCK(nif);

	ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
	if (error == 0) {
		*nx_port = port;
	}

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "+++ netif 0x%llx nx_port %d, total %u active %u (err %d)",
	    SK_KVA(nif), (int)*nx_port, NX_NETIF_MAXPORTS,
	    nx->nx_active_ports, error);

	return error;
}

static int
nx_netif_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	int error = 0;

	ASSERT(nx_port != NEXUS_PORT_ANY);

	NETIF_WLOCK(nif);
	error = nx_port_unbind(nx, nx_port);
	NETIF_WUNLOCK(nif);

	return error;
}

static int
nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM_PROV(nx) == nxdom_prov);
	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
	ASSERT(!(ch->ch_flags & CHANF_HOST));

	switch (chr->cr_port) {
	case NEXUS_PORT_NET_IF_DEV:
		if (chr->cr_mode & CHMODE_HOST) {
			err = EINVAL;
			goto done;
		}
		break;

	case NEXUS_PORT_NET_IF_HOST:
		if (!(chr->cr_mode & CHMODE_HOST)) {
			if (ch->ch_flags & CHANF_KERNEL) {
				err = EINVAL;
				goto done;
			}
			chr->cr_mode |= CHMODE_HOST;
		}
		/*
		 * This channel is exclusively opened to the host
		 * rings; don't notify the external provider.
		 */
		os_atomic_or(&ch->ch_flags, CHANF_HOST | CHANF_EXT_SKIP, relaxed);
		break;

	default:
		/*
		 * This channel is shared between netif and user process;
		 * don't notify the external provider.
		 */
		os_atomic_or(&ch->ch_flags, CHANF_EXT_SKIP, relaxed);
		break;
	}

	chr->cr_ring_set = RING_SET_DEFAULT;
	chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_NET_IF;
	(void) snprintf(chr->cr_name, sizeof(chr->cr_name), "netif:%llu:%.*s",
	    nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen,
	    nx->nx_prov->nxprov_params->nxp_name);

	if (ch->ch_flags & CHANF_KERNEL) {
		err = na_connect_spec(nx, ch, chr, p);
	} else {
		err = na_connect(nx, ch, chr, ch0, nxb, p);
	}

	if (err == 0) {
		/*
		 * Mark the kernel slot descriptor region as busy; this
		 * prevents it from being torn-down at channel defunct
		 * time, as the (external) nexus owner may be calling
		 * KPIs that require accessing the slots.
		 */
		skmem_arena_nexus_sd_set_noidle(
			skmem_arena_nexus(ch->ch_na->na_arena), 1);
	}

done:
	return err;
}

static void
nx_netif_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch)
{
#pragma unused(nxdom_prov)
	SK_LOCK_ASSERT_HELD();

	SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
	    SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	/*
	 * Release busy assertion held earlier in nx_netif_dom_connect();
	 * this allows for the final arena teardown to succeed.
	 */
	skmem_arena_nexus_sd_set_noidle(
		skmem_arena_nexus(ch->ch_na->na_arena), -1);

	if (ch->ch_flags & CHANF_KERNEL) {
		na_disconnect_spec(nx, ch);
	} else {
		na_disconnect(nx, ch);
	}
}

static void
nx_netif_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
{
#pragma unused(nxdom_prov, nx)
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
	    ch->ch_na->na_type == NA_NETIF_VP);

	na_ch_rings_defunct(ch, p);
}

static void
nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)
	struct ifnet *ifp;

	if (!locked) {
		SK_LOCK_ASSERT_NOTHELD();
		SK_LOCK();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
	} else {
		SK_LOCK_ASSERT_HELD();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	}

	ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
	    ch->ch_na->na_type == NA_NETIF_HOST ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
	    ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
	    ch->ch_na->na_type == NA_NETIF_VP);

	na_defunct(nx, ch, ch->ch_na, locked);
	ifp = ch->ch_na->na_ifp;
	if (ch->ch_na->na_type == NA_NETIF_VP && ifp != NULL &&
	    ifnet_is_low_latency(ifp)) {
		/*
		 * We release the VPNA's ifp here instead of waiting for the
		 * application to close the channel to trigger the release.
		 */
		DTRACE_SKYWALK2(release__vpna__ifp, struct nexus_adapter *,
		    ch->ch_na, struct ifnet *, ifp);
		ifnet_decr_iorefcnt(ifp);
		ch->ch_na->na_ifp = NULL;
	}
	SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)",
	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	if (!locked) {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		SK_UNLOCK();
	} else {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
		SK_LOCK_ASSERT_HELD();
	}
}

struct nexus_netif_adapter *
na_netif_alloc(zalloc_flags_t how)
{
	_CASSERT(offsetof(struct nexus_netif_adapter, nifna_up) == 0);

	return zalloc_flags(na_netif_zone, how | Z_ZERO);
}

void
na_netif_free(struct nexus_adapter *na)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;

	SK_LOCK_ASSERT_HELD();
	SK_DF(SK_VERB_MEM, "nifna 0x%llx FREE", SK_KVA(nifna));

	ASSERT(na->na_refcount == 0);
	ASSERT(nifna->nifna_tx_mit == NULL);
	ASSERT(nifna->nifna_rx_mit == NULL);
	bzero(nifna, sizeof(*nifna));

	zfree(na_netif_zone, nifna);
}

/* Process NXCFG_CMD_ATTACH */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_attach(struct kern_nexus *nx, struct nx_spec_req *nsr,
    struct proc *p)
{
	struct nx_netif *n = NX_NETIF_PRIVATE(nx);
	struct ifnet *ifp = NULL;
	boolean_t compat;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
	compat = (strbufcmp(NX_DOM_PROV(nx)->nxdom_prov_name,
	    NEXUS_PROVIDER_NET_IF_COMPAT) == 0);

	uuid_clear(nsr->nsr_if_uuid);
	/*
	 * The netif accepts either an interface name or a pointer to
	 * an ifnet, but never a UUID.
	 */
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		err = EINVAL;
		goto done;
	}
	if (nsr->nsr_flags & NXSPECREQ_IFP) {
		if (p != kernproc || (ifp = nsr->nsr_ifp) == NULL) {
			err = EINVAL;
			goto done;
		}
	} else if ((ifp = ifunit_ref(__unsafe_null_terminated_from_indexable(
		    nsr->nsr_name))) == NULL) {
		err = ENXIO;
		goto done;
	}

	if ((compat && SKYWALK_NATIVE(ifp)) ||
	    (!compat && !SKYWALK_NATIVE(ifp))) {
		/* native driver for netif; non-native for netif_compat  */
		err = ENODEV;
	} else if (ifp->if_na != NULL || !uuid_is_null(n->nif_uuid)) {
		err = EBUSY;
	} else {
		ASSERT(uuid_is_null(n->nif_uuid));
		/*
		 * Upon success, callee will hold its own ifnet iorefcnt
		 * as well as a retain count on the nexus adapter.
		 */
		if (compat) {
			err = nx_netif_compat_attach(nx, ifp);
		} else {
			err = nx_netif_attach(nx, ifp);
		}

		if (err == 0) {
			/* return the adapter UUID */
			uuid_generate_random(n->nif_uuid);
			uuid_copy(nsr->nsr_if_uuid, n->nif_uuid);
#if (DEVELOPMENT || DEBUG)
			skoid_create(&n->nif_skoid,
			    SKOID_SNODE(_kern_skywalk_netif), if_name(ifp),
			    CTLFLAG_RW);
#endif /* !DEVELOPMENT && !DEBUG */
		}
	}
done:
	/* drop I/O refcnt from ifunit_ref() */
	if (ifp != NULL && !(nsr->nsr_flags & NXSPECREQ_IFP)) {
		ifnet_decr_iorefcnt(ifp);
	}

#if SK_LOG
	uuid_string_t uuidstr, ifuuidstr;
	const char *nustr;
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
		    SK_KVA(nsr->nsr_ifp));
		nustr = uuidstr;
	} else {
		nustr = nsr->nsr_name;
	}
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
#endif /* SK_LOG */

	return err;
}

SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_clean(struct nx_netif *nif, boolean_t quiesce_needed)
{
	struct kern_nexus *nx = nif->nif_nx;
	struct ifnet *ifp;
	boolean_t suspended = FALSE;

	ifp = nif->nif_ifp;
	if (ifp == NULL) {
		return EALREADY;
	}
	/*
	 * For regular kernel-attached interfaces, quiescing is handled by
	 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
	 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
	 * are constructed on the fly and can also be torn down on the fly.
	 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
	 * can be detached while the interface is still attached.
	 */
	if (quiesce_needed && ifnet_datamov_suspend_if_needed(ifp)) {
		SK_UNLOCK();
		suspended = TRUE;
		ifnet_datamov_drain(ifp);
		SK_LOCK();
	}
	nx_netif_callbacks_fini(nif);
	nx_netif_agent_fini(nif);
	nx_netif_capabilities_fini(nif);
	nx_netif_flow_fini(nif);
	nx_netif_filter_fini(nif);
	nx_netif_llink_fini(nif);
	nx_netif_flags_fini(nif);

	uuid_clear(nif->nif_uuid);
	/* nx_netif_{compat_}attach() held both references */
	na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV));
	na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST));
	nx_port_free(nx, NEXUS_PORT_NET_IF_DEV);
	nx_port_free(nx, NEXUS_PORT_NET_IF_HOST);

	ifp->if_na_ops = NULL;
	ifp->if_na = NULL;
	nif->nif_ifp = NULL;
	nif->nif_netif_nxadv = NULL;
	SKYWALK_CLEAR_CAPABLE(ifp);
	if (suspended) {
		ifnet_datamov_resume(ifp);
	}

#if (DEVELOPMENT || DEBUG)
	skoid_destroy(&nif->nif_skoid);
#endif /* !DEVELOPMENT && !DEBUG */
	return 0;
}

/* process NXCFG_CMD_DETACH */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_detach(struct kern_nexus *nx, struct nx_spec_req *nsr)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, in which
	 * case we detach whatever is currently attached.
	 */
	if (nsr != NULL && uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
	} else if (nsr != NULL && uuid_compare(nsr->nsr_if_uuid,
	    nif->nif_uuid) != 0) {
		err = ESRCH;
	} else if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
		/* nx_netif_ctl_attach() not yet done or already detached */
		err = ENXIO;
	} else if (nx->nx_ch_count != 0) {
		/*
		 * There's at least a channel opened; we can't
		 * yank the interface from underneath the nexus
		 * since our dlil input/output handler may be
		 * running now.  Bail out and come back here
		 * again when the nexus detaches.
		 */
		err = EBUSY;
	} else {
		err = nx_netif_clean(nif, TRUE);
	}

#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
		    "nexus 0x%llx (%s) err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}

/*
 * XXX
 * These checks are copied from fsw.c.
 * There are no tests exercising this code.  Do we still need this?
 */
SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_flow_check(struct nx_netif *nif, nxcfg_cmd_t cmd,
    struct proc *p, struct nx_flow_req *req)
{
#pragma unused(nif)
	boolean_t need_check;
	int error;

	if (uuid_is_null(req->nfr_flow_uuid)) {
		return EINVAL;
	}
	req->nfr_flags &= NXFLOWREQF_MASK;
	req->nfr_flowadv_idx = FLOWADV_IDX_NONE;

	if (cmd == NXCFG_CMD_FLOW_DEL) {
		return 0;
	}
	need_check = FALSE;
	if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
		need_check = TRUE;
	} else if (!uuid_is_null(req->nfr_euuid)) {
		uuid_t uuid;

		/* get the UUID of the issuing process */
		proc_getexecutableuuid(p, uuid, sizeof(uuid));

		/*
		 * If this is not issued by a process for its own
		 * executable UUID and if the process does not have
		 * the necessary privilege, reject the request.
		 * The logic is similar to so_set_effective_uuid().
		 */
		if (uuid_compare(req->nfr_euuid, uuid) != 0) {
			need_check = TRUE;
		}
	}
	if (need_check) {
		kauth_cred_t cred = kauth_cred_proc_ref(p);
		error = priv_check_cred(cred,
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
		kauth_cred_unref(&cred);
		if (error != 0) {
			return error;
		}
	}
	return 0;
}

SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_flow_add(struct nx_netif *nif, struct proc *p,
    struct nx_flow_req *req)
{
	int err;

	ASSERT(p != PROC_NULL);
	err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_ADD, p, req);
	if (err != 0) {
		return err;
	}

	/* init kernel only fields */
	nx_flow_req_internalize(req);
	req->nfr_context = NULL;
	req->nfr_flow_stats = NULL;
	req->nfr_port_reservation = NULL;
	req->nfr_pid = proc_pid(p);

	err = nx_netif_netagent_flow_add(nif, req);
	nx_flow_req_externalize(req);
	return err;
}

SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl_flow_del(struct nx_netif *nif, struct proc *p,
    struct nx_flow_req *req)
{
	int err;

	err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_DEL, p, req);
	if (err != 0) {
		return err;
	}

	nx_flow_req_internalize(req);
	req->nfr_pid = proc_pid(p);

	err = nx_netif_netagent_flow_del(nif, req);
	nx_flow_req_externalize(req);
	return err;
}

SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, void *data,
    struct proc *p)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nx_spec_req *__single nsr = data;
	struct nx_flow_req *__single nfr = data;
	int error = 0;

	SK_LOCK_ASSERT_HELD();

	switch (nc_cmd) {
	case NXCFG_CMD_ATTACH:
		error = nx_netif_ctl_attach(nx, nsr, p);
		break;

	case NXCFG_CMD_DETACH:
		error = nx_netif_ctl_detach(nx, nsr);
		break;

	case NXCFG_CMD_FLOW_ADD:
		error = nx_netif_ctl_flow_add(nif, p, nfr);
		break;

	case NXCFG_CMD_FLOW_DEL:
		error = nx_netif_ctl_flow_del(nif, p, nfr);
		break;

	default:
		SK_ERR("invalid cmd %u", nc_cmd);
		error = EINVAL;
		break;
	}
	return error;
}

static void
nx_netif_llink_notify(struct kern_nexus *nx, struct netif_llink *llink,
    uint32_t flags)
{
#pragma unused(flags)
	struct netif_qset *qset;

	SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
		(void) nx_tx_qset_notify(nx, qset->nqs_ctx);
	}
}

static void
nx_netif_llink_notify_all(struct kern_nexus *nx, uint32_t flags)
{
	struct nx_netif *nif;
	struct netif_llink *llink;

	nif = NX_NETIF_PRIVATE(nx);

	lck_rw_lock_shared(&nif->nif_llink_lock);
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		nx_netif_llink_notify(nx, llink, flags);
	}
	lck_rw_unlock_shared(&nif->nif_llink_lock);
}

/*
 * if_start() callback for native Skywalk interfaces, registered
 * at ifnet_allocate_extended() time, and invoked by the ifnet
 * starter thread.
 */
static void
nx_netif_doorbell_internal(struct ifnet *ifp, uint32_t flags)
{
	if (__improbable(ifp->if_na == NULL)) {
		return;
	}

	/*
	 * Do this only if the nexus adapter is active, i.e. a channel
	 * has been opened to it by the module above (flowswitch, etc.)
	 */
	struct nexus_adapter *hwna = &NA(ifp)->nifna_up;
	if (__probable(NA_IS_ACTIVE(hwna))) {
		struct kern_nexus *nx = hwna->na_nx;

		/* update our work timestamp */
		hwna->na_work_ts = _net_uptime;

		if (NX_LLINK_PROV(nx)) {
			nx_netif_llink_notify_all(nx, flags);
		} else {
			struct __kern_channel_ring *kring;

			/* for doorbell purposes, use TX ring 0 */
			kring = &hwna->na_tx_rings[0];

			/* Issue a synchronous TX doorbell on the netif device ring */
			kring->ckr_na_sync(kring, PROC_NULL,
			    (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_IFSTART));
		}
	} else {
		struct netif_stats *nifs =
		    &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
		STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
	}
}

static void
nx_netif_doorbell(struct ifnet *ifp)
{
	nx_netif_doorbell_internal(ifp, NETIF_XMIT_FLAG_HOST);
}

/*
 * TX sync callback, called from nx_netif_doorbell() where we'd expect to
 * perform a synchronous TX doorbell to the driver, by invoking the driver's
 * doorbell callback directly in the same thread context.  It is also called
 * when the layer above performs a TX sync operation, in which case we might
 * need to do an asynchronous doorbell instead, by simply calling
 * ifnet_start().
 */
static int
nx_netif_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p)
	struct ifnet *ifp = KRNA(kring)->na_ifp;
	boolean_t sync_only;
	int ret = 0;

	ASSERT(ifp != NULL);

	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
	    flags);

	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("kr 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(kring), if_name(ifp), SK_KVA(ifp));
		return ENXIO;
	}

	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(kring), if_name(ifp),
		    SK_KVA(ifp));
		return ENXIO;
	}

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	sync_only = ((flags & NA_SYNCF_SYNC_ONLY) != 0) ||
	    !KR_KERNEL_ONLY(kring);
	/* regular sync (reclaim) */
	if ((flags & NA_SYNCF_NETIF) != 0 || __improbable(sync_only)) {
		ret = nx_sync_tx(kring, (flags & NA_SYNCF_FORCE_RECLAIM) ||
		    kring->ckr_pending_intr != 0);
		kring->ckr_pending_intr = 0;

		/* direct user channels do not need to use the doorbell */
		if (__improbable(sync_only)) {
			return ret;
		}
	}

	/*
	 * Doorbell call.  Here we do doorbell explicitly if the flag is
	 * set or implicitly if we're opened directly by a user channel.
	 * Synchronous vs. asynchronous depending on the context.
	 */
	if (__probable((flags & NA_SYNCF_NETIF_DOORBELL) != 0)) {
		if ((flags & NA_SYNCF_NETIF_IFSTART) != 0) {
			ASSERT(!(flags & NA_SYNCF_NETIF_IFSTART) ||
			    !(flags & NA_SYNCF_NETIF_ASYNC));
			nx_tx_doorbell(kring, (flags & NA_SYNCF_NETIF_ASYNC));
		} else {
			ifnet_start(ifp);
		}
	}

	return ret;
}

static int
nx_netif_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p)
	int ret;

	SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
	    "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
	    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
	    flags);

	ASSERT(kring->ckr_rhead <= kring->ckr_lim);

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	ret = nx_sync_rx(kring, (flags & NA_SYNCF_FORCE_READ) ||
	    kring->ckr_pending_intr != 0);
	kring->ckr_pending_intr = 0;

	return ret;
}

static void
nx_netif_na_dtor(struct nexus_adapter *na)
{
	struct ifnet *__single ifp;
	struct nexus_netif_adapter *nifna = NIFNA(na);

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST);

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));

	/*
	 * If the finalizer callback hasn't been called for whatever
	 * reason, pick up the embryonic ifnet stored in na_private.
	 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
	 */
	if ((ifp = na->na_ifp) == NULL) {
		ifp = na->na_private;
		na->na_private = NULL;
	} else {
		ifnet_decr_iorefcnt(ifp);
		na->na_ifp = NULL;
	}

	if (nifna->nifna_netif != NULL) {
		nx_netif_release(nifna->nifna_netif);
		nifna->nifna_netif = NULL;
	}
	ASSERT(SKYWALK_NATIVE(ifp));
}

/*
 * Dispatch rx/tx interrupts to the channel rings.
 *
 * The 'notify' routine depends on what the ring is attached to.
 * - for a channel file descriptor, do an event wakeup on the individual
 *   waitqueue, plus one on the global one if needed (see na_notify)
 * - for a device port connected to a FlowSwitch, call the proper
 *   forwarding routine; see nx_fsw_tx_hwna_notify()
 *   or nx_fsw_rx_hwna_notify().
 */
int
nx_netif_common_intr(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags, uint32_t *work_done)
{
	struct netif_stats *nifs =
	    &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
	int (*notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	int ret;

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_START), SK_KVA(kring));

	SK_DF(SK_VERB_NETIF | SK_VERB_INTR |
	    ((kring->ckr_tx == NR_RX) ? SK_VERB_RX : SK_VERB_TX),
	    "na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b",
	    KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS);

	/* update our work timestamp */
	KRNA(kring)->na_work_ts = _net_uptime;

	kring->ckr_pending_intr++;
	if (work_done != NULL) {
		*work_done = 1; /* do not fire again */
	}
	/*
	 * We can't be calling ckr_na_notify here since we could already be
	 * intercepting it, else we'd end up recursively calling ourselves.
	 * Use the original na_notify callback saved during na_activate, or in
	 * the case when the module above us is the flowswitch, the notify
	 * routine that it has installed in place of our original one.
	 */
	if (__probable(!KR_DROP(kring) &&
	    (notify = kring->ckr_netif_notify) != NULL)) {
		ret = notify(kring, p, flags);
	} else {
		/*
		 * If the ring is in drop mode, pretend as if it's busy.
		 * This allows the mitigation thread to pause for a while
		 * before attempting again.
		 */
		ret = EBUSY;
	}
	if (__improbable(ret != 0)) {
		switch (kring->ckr_tx) {
		case NR_RX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_RX_IRQ_ERR);
			}
			break;

		case NR_TX:
			if (ret == EBUSY) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_BUSY);
			} else if (ret == EAGAIN) {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_AGAIN);
			} else {
				STATS_INC(nifs, NETIF_STATS_TX_IRQ_ERR);
			}
			break;

		default:
			break;
		}
	}

	KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_END), SK_KVA(kring), ret);

	return ret;
}

static int
nx_netif_na_notify_tx(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
	return nx_netif_mit_tx_intr(kring, p, flags, NULL);
}

static int
nx_netif_na_notify_rx(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
	int ret;

	/*
	 * In the event the mitigation thread is disabled, protect
	 * against recursion by detecting if we're already in the
	 * context of an RX notify.  IOSkywalkFamily may invoke the
	 * notify callback as part of its RX sync callback.
	 */
	if (__probable(!sk_is_rx_notify_protected())) {
		sk_protect_t protect;
		uint32_t work_done;

		protect = sk_rx_notify_protect();
		ret = nx_netif_mit_rx_intr(kring, p, flags, &work_done);
		sk_sync_unprotect(protect);
	} else {
		ret = EAGAIN;
	}

	return ret;
}

static int
nx_netif_na_notify_rx_redirect(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
	struct netif_stats *nifs =
	    &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
	uint32_t work_done;

	ASSERT(kring->ckr_tx == NR_RX);
	STATS_INC(nifs, NETIF_STATS_RX_IRQ);
	return nx_netif_common_intr(kring, p, flags, &work_done);
}

void
nx_netif_mit_config(struct nexus_netif_adapter *nifna,
    boolean_t *tx_mit, boolean_t *tx_mit_simple,
    boolean_t *rx_mit, boolean_t *rx_mit_simple)
{
	struct nx_netif *nif = nifna->nifna_netif;

	/*
	 * TX mitigation is disabled by default, but can be
	 * overridden via "sk_netif_tx_mit=N" boot-arg, where
	 * N is one of SK_NETIF_MIT_FORCE_* values.
	 */
	*tx_mit = *tx_mit_simple = FALSE;
	switch (sk_netif_tx_mit) {
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*tx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*tx_mit = TRUE;
		break;
	case SK_NETIF_MIT_FORCE_OFF:
	case SK_NETIF_MIT_AUTO:
		ASSERT(*tx_mit == FALSE);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * RX mitigation is enabled by default only for BSD-style
	 * virtual network interfaces, but can be overridden
	 * via "sk_netif_rx_mit=N" boot-arg, where N is one of
	 * SK_NETIF_MIT_FORCE_* values.
	 */
	*rx_mit = *rx_mit_simple = FALSE;
	switch (sk_netif_rx_mit) {
	case SK_NETIF_MIT_FORCE_OFF:
		ASSERT(*rx_mit == FALSE);
		break;
	case SK_NETIF_MIT_FORCE_SIMPLE:
		*rx_mit_simple = TRUE;
		OS_FALLTHROUGH;
	case SK_NETIF_MIT_FORCE_ADVANCED:
		*rx_mit = TRUE;
		break;
	case SK_NETIF_MIT_AUTO:
		*rx_mit_simple = TRUE;
		/*
		 * Enable RX mitigation thread only for BSD-style virtual (and
		 * regular) interfaces, since otherwise we may run out of stack
		 * when subjected to IPsec processing, etc.
		 */
		*rx_mit = (NX_PROV(nifna->nifna_up.na_nx)->nxprov_flags &
		    NXPROVF_VIRTUAL_DEVICE) && !NETIF_IS_LOW_LATENCY(nif);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}

static int
nx_netif_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
	boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple;
	struct nx_netif *nif = nifna->nifna_netif;
	struct ifnet *ifp = na->na_ifp;
	int error = 0;
	uint32_t r;
	/* TODO -fbounds-safety: Remove tmp and use __counted_by_or_null */
	struct nx_netif_mit *mit_tmp;
	uint32_t nrings;

	ASSERT(na->na_type == NA_NETIF_DEV);
	ASSERT(!(na->na_flags & NAF_HOST_ONLY));

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s [%s]", na->na_name,
	    SK_KVA(na), ifp->if_xname, na_activate_mode2str(mode));

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		ASSERT(SKYWALK_CAPABLE(ifp));

		nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
		    &rx_mit, &rx_mit_simple);

		/*
		 * Init the mitigation support on all the dev TX rings.
		 */
		if (tx_mit) {
			nrings = na_get_nrings(na, NR_TX);
			mit_tmp = skn_alloc_type_array(tx_on, struct nx_netif_mit,
			    nrings, Z_WAITOK, skmem_tag_netif_mit);
			if (mit_tmp == NULL) {
				SK_ERR("TX mitigation allocation failed");
				error = ENOMEM;
				goto out;
			}
			nifna->nifna_tx_mit = mit_tmp;
			nifna->nifna_tx_mit_count = nrings;
		} else {
			ASSERT(nifna->nifna_tx_mit == NULL);
		}

		/*
		 * Init the mitigation support on all the dev RX rings.
		 */
		if (rx_mit) {
			nrings = na_get_nrings(na, NR_RX);
			mit_tmp = skn_alloc_type_array(rx_on, struct nx_netif_mit,
			    nrings, Z_WAITOK, skmem_tag_netif_mit);
			if (mit_tmp == NULL) {
				SK_ERR("RX mitigation allocation failed");
				if (nifna->nifna_tx_mit != NULL) {
					skn_free_type_array_counted_by(rx_fail,
					    struct nx_netif_mit,
					    nifna->nifna_tx_mit_count,
					    nifna->nifna_tx_mit);
				}
				error = ENOMEM;
				goto out;
			}
			nifna->nifna_rx_mit = mit_tmp;
			nifna->nifna_rx_mit_count = nrings;
		} else {
			ASSERT(nifna->nifna_rx_mit == NULL);
		}

		/* intercept na_notify callback on the TX rings */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_netif_notify =
			    na->na_tx_rings[r].ckr_na_notify;
			na->na_tx_rings[r].ckr_na_notify =
			    nx_netif_na_notify_tx;
			if (nifna->nifna_tx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_tx_mit[r],
				    &na->na_tx_rings[r], tx_mit_simple);
			}
		}

		/* intercept na_notify callback on the RX rings */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_netif_notify =
			    na->na_rx_rings[r].ckr_na_notify;
			na->na_rx_rings[r].ckr_na_notify = IFNET_IS_REDIRECT(ifp) ?
			    nx_netif_na_notify_rx_redirect : nx_netif_na_notify_rx;
			if (nifna->nifna_rx_mit != NULL) {
				nx_netif_mit_init(nif, ifp,
				    &nifna->nifna_rx_mit[r],
				    &na->na_rx_rings[r], rx_mit_simple);
			}
		}
		nx_netif_filter_enable(nif);
		nx_netif_flow_enable(nif);
		os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);

		/* steer all start requests to netif; this must not fail */
		lck_mtx_lock(&ifp->if_start_lock);
		error = ifnet_set_start_handler(ifp, nx_netif_doorbell);
		VERIFY(error == 0);
		lck_mtx_unlock(&ifp->if_start_lock);
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		ASSERT(SKYWALK_CAPABLE(ifp));
		break;

	case NA_ACTIVATE_MODE_OFF:
		/*
		 * Note that here we cannot assert SKYWALK_CAPABLE()
		 * as we're called in the destructor path.
		 */
		os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
		nx_netif_flow_disable(nif);
		nx_netif_filter_disable(nif);

		/*
		 * Here we may block while holding sk_lock, but because
		 * we've cleared NAF_ACTIVE above, kern_channel_tx_refill()
		 * should immediately return.  A better approach would be
		 * to drop sk_lock and add a monitor for this routine.
		 */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_active != 0) {
			++ifp->if_start_waiters;
			(void) msleep(&ifp->if_start_waiters,
			    &ifp->if_start_lock, (PZERO - 1),
			    na->na_name, NULL);
		}
		/* steer all start requests to default handler */
		ifnet_reset_start_handler(ifp);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* reset all TX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
			na->na_tx_rings[r].ckr_na_notify =
			    na->na_tx_rings[r].ckr_netif_notify;
			na->na_tx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_tx_mit != NULL) {
				na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
			}
		}

		if (nifna->nifna_tx_mit != NULL) {
			skn_free_type_array_counted_by(tx_off, struct nx_netif_mit,
			    nifna->nifna_tx_mit_count, nifna->nifna_tx_mit);
		}

		/* reset all RX notify callbacks */
		for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
			na->na_rx_rings[r].ckr_na_notify =
			    na->na_rx_rings[r].ckr_netif_notify;
			na->na_rx_rings[r].ckr_netif_notify = NULL;
			if (nifna->nifna_rx_mit != NULL) {
				na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
				nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
			}
		}
		if (nifna->nifna_rx_mit != NULL) {
			skn_free_type_array_counted_by(rx_off, struct nx_netif_mit,
			    nifna->nifna_rx_mit_count, nifna->nifna_rx_mit);
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
out:
	return error;
}

SK_NO_INLINE_ATTRIBUTE
static int
nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
	struct nexus_netif_adapter *devnifna = NULL;
	struct nexus_netif_adapter *hostnifna = NULL;
	struct nexus_adapter *__single devna = NULL;
	struct nexus_adapter *__single hostna = NULL;
	boolean_t embryonic = FALSE;
	int retval = 0;
	uint32_t na_flags;

	SK_LOCK_ASSERT_HELD();
	ASSERT(SKYWALK_NATIVE(ifp));
	ASSERT(!SKYWALK_CAPABLE(ifp));
	ASSERT(ifp->if_na == NULL);
	ASSERT(ifp->if_na_ops == NULL);

	devnifna = na_netif_alloc(Z_WAITOK);
	hostnifna = na_netif_alloc(Z_WAITOK);

	/*
	 * We can be called for two different interface states:
	 *
	 * Fully attached: get an io ref count; upon success, this
	 * holds a reference to the ifnet for the ifp pointer stored
	 * in 'na_ifp' down below for both adapters.
	 *
	 * Embryonic: temporarily hold the ifnet in na_private; upon a
	 * successful ifnet_attach(), it will be moved over to 'na_ifp'
	 * with an I/O ref count held.
	 *
	 * The ifnet in 'na_ifp' will be released by na_release_locked().
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
			ifp = NULL;
			retval = ENXIO;
			goto err;
		}
		embryonic = TRUE;
	}

	/* initialize the device netif adapter */
	devnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	devna = &devnifna->nifna_up;
	devna->na_type = NA_NETIF_DEV;
	devna->na_free = na_netif_free;
	(void) strlcpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name));
	uuid_generate_random(devna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		devna->na_private = ifp;
		ASSERT(devna->na_ifp == NULL);
	} else {
		ASSERT(devna->na_private == NULL);
		/* use I/O refcnt from ifnet_is_attached() */
		devna->na_ifp = ifp;
	}
	devna->na_activate = nx_netif_na_activate;
	devna->na_txsync = nx_netif_na_txsync;
	devna->na_rxsync = nx_netif_na_rxsync;
	devna->na_dtor = nx_netif_na_dtor;
	devna->na_krings_create = nx_netif_dev_krings_create;
	devna->na_krings_delete = nx_netif_dev_krings_delete;
	devna->na_special = nx_netif_na_special;

	na_flags = NAF_NATIVE;
	if (NX_PROV(nx)->nxprov_flags & NXPROVF_VIRTUAL_DEVICE) {
		na_flags |= NAF_VIRTUAL_DEVICE;
	}
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	os_atomic_or(&devna->na_flags, na_flags, relaxed);
	*(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	na_set_nrings(devna, NR_TX, nxp->nxp_tx_rings);
	na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
	na_set_nslots(devna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
	/*
	 * Verify upper bounds; the parameters must have already been
	 * validated by nxdom_prov_params() by the time we get here.
	 */
	ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
	ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
	ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
	ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);

	na_attach_common(devna, nx, &nx_netif_prov_s);

	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, devna)) != 0) {
		ASSERT(devna->na_arena == NULL);
		goto err;
	}
	ASSERT(devna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(devna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);

	/* setup packet copy routines */
	if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
	} else {
		nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
		nif->nif_pkt_copy_from_pkt = pkt_copy_from_pkt;
	}

	/* initialize the host netif adapter */
	hostnifna->nifna_netif = nif;
	nx_netif_retain(nif);
	hostna = &hostnifna->nifna_up;
	(void) snprintf(hostna->na_name, sizeof(hostna->na_name),
	    "%s^", devna->na_name);
	uuid_generate_random(hostna->na_uuid);
	if (embryonic) {
		/*
		 * We will move this over to na_ifp once
		 * the interface is fully attached.
		 */
		hostna->na_private = ifp;
		ASSERT(hostna->na_ifp == NULL);
	} else {
		ASSERT(hostna->na_private == NULL);
		hostna->na_ifp = devna->na_ifp;
		ifnet_incr_iorefcnt(hostna->na_ifp);
	}
	hostna->na_type = NA_NETIF_HOST;
	hostna->na_free = na_netif_free;
	hostna->na_activate = nx_netif_host_na_activate;
	hostna->na_txsync = nx_netif_host_na_txsync;
	hostna->na_rxsync = nx_netif_host_na_rxsync;
	hostna->na_dtor = nx_netif_na_dtor;
	hostna->na_krings_create = nx_netif_host_krings_create;
	hostna->na_krings_delete = nx_netif_host_krings_delete;
	hostna->na_special = nx_netif_host_na_special;

	na_flags = NAF_HOST_ONLY | NAF_NATIVE;
	if (NX_LLINK_PROV(nx)) {
		/*
		 * while operating in logical link mode, we don't need to
		 * create backing memory regions for the rings as they are
		 * not used.
		 */
		na_flags |= NAF_MEM_NO_INIT;
	}
	os_atomic_or(&hostna->na_flags, na_flags, relaxed);
	*(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
	    NEXUS_STATS_TYPE_INVALID;

	na_set_nrings(hostna, NR_TX, 1);
	na_set_nrings(hostna, NR_RX, 1);
	na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
	na_set_nslots(hostna, NR_RX, nxp->nxp_rx_slots);

	na_attach_common(hostna, nx, &nx_netif_prov_s);

	if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
	    nx, hostna)) != 0) {
		ASSERT(hostna->na_arena == NULL);
		goto err;
	}
	ASSERT(hostna->na_arena != NULL);

	*(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
	ASSERT(hostna->na_flowadv_max == 0 ||
	    skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);

	/* adjust the classq packet drop limit */
	if (embryonic) {
		uint32_t drop_lim;
		struct kern_pbufpool_memory_info pp_info;

		retval = kern_pbufpool_get_memory_info(nx->nx_tx_pp, &pp_info);
		VERIFY(retval == 0);

		/* set the drop limit to 80% of the packet pool size */
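		/* e.g., kpm_packets == 1024 yields drop_lim == 819 */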
		drop_lim = (pp_info.kpm_packets * 4) / 5;
		VERIFY(drop_lim != 0);
		IFCQ_PKT_DROP_LIMIT(ifp->if_snd) = drop_lim;
	}

	/* these will be undone by the destructor */
	ifp->if_na_ops = &na_netif_ops;
	ifp->if_na = devnifna;
	na_retain_locked(devna);
	na_retain_locked(hostna);

	SKYWALK_SET_CAPABLE(ifp);

	NETIF_WLOCK(nif);
	nif->nif_ifp = ifp;
	nif->nif_netif_nxadv = nx->nx_adv.netif_nxv_adv;
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna,
	    kernproc);
	ASSERT(retval == 0);
	retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna,
	    kernproc);
	ASSERT(retval == 0);
	NETIF_WUNLOCK(nif);

#if SK_LOG
	uuid_string_t uuidstr;
	SK_DF(SK_VERB_NETIF, "devna: \"%s\"", devna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(devna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
	    SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
	    NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%b", devna->na_flags, NAF_BITS);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", devna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", devna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", devna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         0x%llx %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
	SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
	SK_DF(SK_VERB_NETIF, "  UUID:        %s",
	    sk_uuid_unparse(hostna->na_uuid, uuidstr));
	SK_DF(SK_VERB_NETIF, "  nx:          0x%llx (\"%s\":\"%s\")",
	    SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
	    NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
	SK_DF(SK_VERB_NETIF, "  flags:       0x%b",
	    hostna->na_flags, NAF_BITS);
	SK_DF(SK_VERB_NETIF, "  flowadv_max: %u", hostna->na_flowadv_max);
	SK_DF(SK_VERB_NETIF, "  rings:       tx %u rx %u",
	    na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
	SK_DF(SK_VERB_NETIF, "  slots:       tx %u rx %u",
	    na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
#if CONFIG_NEXUS_USER_PIPE
	SK_DF(SK_VERB_NETIF, "  next_pipe:   %u", hostna->na_next_pipe);
	SK_DF(SK_VERB_NETIF, "  max_pipes:   %u", hostna->na_max_pipes);
#endif /* CONFIG_NEXUS_USER_PIPE */
	SK_DF(SK_VERB_NETIF, "  ifp:         0x%llx %s [ioref %u]",
	    SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
#endif /* SK_LOG */

err:
	if (retval != 0) {
		if (ifp != NULL) {
			if (!embryonic) {
				ifnet_decr_iorefcnt(ifp);
			}
			ifp = NULL;
		}
		if (devna != NULL) {
			if (devna->na_arena != NULL) {
				skmem_arena_release(devna->na_arena);
				devna->na_arena = NULL;
			}
			if (devna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(devna->na_ifp);
				devna->na_ifp = NULL;
			}
			devna->na_private = NULL;
		}
		if (hostna != NULL) {
			if (hostna->na_arena != NULL) {
				skmem_arena_release(hostna->na_arena);
				hostna->na_arena = NULL;
			}
			if (hostna->na_ifp != NULL) {
				ifnet_decr_iorefcnt(hostna->na_ifp);
				hostna->na_ifp = NULL;
			}
			hostna->na_private = NULL;
		}
		if (devnifna != NULL) {
			if (devnifna->nifna_netif != NULL) {
				nx_netif_release(devnifna->nifna_netif);
				devnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)devnifna);
		}
		if (hostnifna != NULL) {
			if (hostnifna->nifna_netif != NULL) {
				nx_netif_release(hostnifna->nifna_netif);
				hostnifna->nifna_netif = NULL;
			}
			na_netif_free((struct nexus_adapter *)hostnifna);
		}
	}
	return retval;
}

/*
 * Any per-netif state that can be discovered at attach time should be
 * initialized here.
 */
static void
nx_netif_flags_init(struct nx_netif *nif)
{
	ifnet_t ifp = nif->nif_ifp;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);

	switch (devna->na_type) {
	case NA_NETIF_DEV:
		if (strlcmp(sk_ll_prefix, ifp->if_name, sizeof(sk_ll_prefix)) == 0) {
			nif->nif_flags |= NETIF_FLAG_LOW_LATENCY;
			if_set_xflags(ifp, IFXF_LOW_LATENCY);
		}
		break;
	case NA_NETIF_COMPAT_DEV:
		nif->nif_flags |= NETIF_FLAG_COMPAT;
		break;
	default:
		break;
	}
}

/*
 * This is also supposed to check for any inconsistent state at detach time.
 */
static void
nx_netif_flags_fini(struct nx_netif *nif)
{
	ifnet_t ifp = nif->nif_ifp;

	if (ifp != NULL) {
		if_clear_xflags(ifp, IFXF_LOW_LATENCY);
	}
	nif->nif_flags = 0;
}

SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_callbacks_init(struct nx_netif *nif)
{
	ifnet_t ifp = nif->nif_ifp;

	/*
	 * XXX
	 * This function is meant to be called by na_netif_finalize(), which is
	 * called by ifnet_attach() while holding if_lock exclusively.
	 */
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	if (ifnet_is_low_latency(ifp)) {
		ifnet_set_detach_notify_locked(ifp,
		    nx_netif_llw_detach_notify, ifp->if_na);
	}
}

SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_callbacks_fini(struct nx_netif *nif)
{
	ifnet_t ifp = nif->nif_ifp;

	if (ifnet_is_low_latency(ifp)) {
		ifnet_set_detach_notify(ifp, NULL, NULL);
	}
}

static void
configure_capab_interface_advisory(struct nx_netif *nif,
    nxprov_capab_config_fn_t capab_fn)
{
	struct kern_nexus_capab_interface_advisory capab;
	struct kern_nexus *nx = nif->nif_nx;
	uint32_t capab_len;
	int error;

	/* check/configure interface advisory notifications */
	if ((nif->nif_ifp->if_eflags & IFEF_ADV_REPORT) == 0) {
		return;
	}
	bzero(&capab, sizeof(capab));
	capab.kncia_version =
	    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY_VERSION_1;
	*__DECONST(kern_nexus_capab_interface_advisory_notify_fn_t *,
	    &(capab.kncia_notify)) = nx_netif_interface_advisory_notify;
	*__DECONST(void **, &(capab.kncia_kern_context)) = nx;
	capab_len = sizeof(capab);
	error = capab_fn(NX_PROV(nx), nx,
	    KERN_NEXUS_CAPAB_INTERFACE_ADVISORY, &capab, &capab_len);
	if (error != 0) {
		DTRACE_SKYWALK2(interface__advisory__capab__error,
		    struct nx_netif *, nif, int, error);
		return;
	}
	VERIFY(capab.kncia_config != NULL);
	VERIFY(capab.kncia_provider_context != NULL);
	nif->nif_intf_adv_config = capab.kncia_config;
	nif->nif_intf_adv_prov_ctx = capab.kncia_provider_context;
	nif->nif_extended_capabilities |= NETIF_CAPAB_INTERFACE_ADVISORY;
}

static void
unconfigure_capab_interface_advisory(struct nx_netif *nif)
{
	if ((nif->nif_extended_capabilities & NETIF_CAPAB_INTERFACE_ADVISORY) == 0) {
		return;
	}
	nif->nif_intf_adv_config = NULL;
	nif->nif_intf_adv_prov_ctx = NULL;
	nif->nif_extended_capabilities &= ~NETIF_CAPAB_INTERFACE_ADVISORY;
}

static void
configure_capab_qset_extensions(struct nx_netif *nif,
    nxprov_capab_config_fn_t capab_fn)
{
	struct kern_nexus_capab_qset_extensions capab;
	struct kern_nexus *nx = nif->nif_nx;
	uint32_t capab_len;
	int error;

	if (!NX_LLINK_PROV(nx)) {
		DTRACE_SKYWALK1(not__llink__prov, struct nx_netif *, nif);
		return;
	}
	bzero(&capab, sizeof(capab));
	capab.cqe_version = KERN_NEXUS_CAPAB_QSET_EXTENSIONS_VERSION_1;
	capab_len = sizeof(capab);
	error = capab_fn(NX_PROV(nx), nx,
	    KERN_NEXUS_CAPAB_QSET_EXTENSIONS, &capab, &capab_len);
	if (error != 0) {
		DTRACE_SKYWALK2(qset__extensions__capab__error,
		    struct nx_netif *, nif, int, error);
		return;
	}
	VERIFY(capab.cqe_notify_steering_info != NULL);
	VERIFY(capab.cqe_prov_ctx != NULL);
	nif->nif_qset_extensions.qe_notify_steering_info =
	    capab.cqe_notify_steering_info;
	nif->nif_qset_extensions.qe_prov_ctx = capab.cqe_prov_ctx;
	nif->nif_extended_capabilities |= NETIF_CAPAB_QSET_EXTENSIONS;
}

static void
unconfigure_capab_qset_extensions(struct nx_netif *nif)
{
	if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
		return;
	}
	bzero(&nif->nif_qset_extensions, sizeof(nif->nif_qset_extensions));
	nif->nif_extended_capabilities &= ~NETIF_CAPAB_QSET_EXTENSIONS;
}

int
nx_netif_notify_steering_info(struct nx_netif *nif, struct netif_qset *qset,
    struct ifnet_traffic_descriptor_common *td, bool add)
{
	struct netif_qset_extensions *qset_ext;
	int err;

	if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
		return ENOTSUP;
	}
	qset_ext = &nif->nif_qset_extensions;
	VERIFY(qset_ext->qe_prov_ctx != NULL);
	VERIFY(qset_ext->qe_notify_steering_info != NULL);
	err = qset_ext->qe_notify_steering_info(qset_ext->qe_prov_ctx,
	    qset->nqs_ctx, td, add);
	return err;
}

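/*
 * Probe the nexus provider for optional capabilities.  The capability
 * configuration callback comes from the netif-specific provider extension
 * when the provider registered with KERN_NEXUS_PROVIDER_VERSION_NETIF, and
 * from the generic provider extension otherwise; a provider without such a
 * callback simply exposes no extended capabilities.
 */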
static void
nx_netif_capabilities_init(struct nx_netif *nif)
{
	struct kern_nexus *nx = nif->nif_nx;
	nxprov_capab_config_fn_t capab_fn;

	if ((NX_PROV(nx)->nxprov_netif_ext.nxnpi_version) ==
	    KERN_NEXUS_PROVIDER_VERSION_NETIF) {
		capab_fn = NX_PROV(nx)->nxprov_netif_ext.nxnpi_config_capab;
		ASSERT(capab_fn != NULL);
	} else {
		capab_fn = NX_PROV(nx)->nxprov_ext.nxpi_config_capab;
	}
	if (capab_fn == NULL) {
		return;
	}
	configure_capab_interface_advisory(nif, capab_fn);
	configure_capab_qset_extensions(nif, capab_fn);
}

static void
nx_netif_capabilities_fini(struct nx_netif *nif)
{
	unconfigure_capab_interface_advisory(nif);
	unconfigure_capab_qset_extensions(nif);
}

static void
nx_netif_verify_tso_config(struct nx_netif *nif)
{
	ifnet_t ifp = nif->nif_ifp;
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	/*
	 * Compat interfaces always use 128-byte buffers on the device packet
	 * pool side (for holding headers for classification), so there is no
	 * need to check the size here.
	 */
	if (!SKYWALK_NATIVE(ifp)) {
		return;
	}

	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	VERIFY(PP_BUF_SIZE_DEF(nif->nif_nx->nx_tx_pp) >=
	    max(tso_v4_mtu, tso_v6_mtu));
}

void
na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_adapter *hostna = nx_port_get_na(nx,
	    NEXUS_PORT_NET_IF_HOST);

	ASSERT(devna != NULL);
	ASSERT(hostna != NULL);

	if (!ifnet_is_attached(ifp, 1)) {
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(devna->na_private == ifp);
	ASSERT(devna->na_ifp == NULL);
	/* use I/O refcnt held by ifnet_is_attached() above */
	devna->na_ifp = devna->na_private;
	devna->na_private = NULL;

	ASSERT(hostna->na_private == ifp);
	ASSERT(hostna->na_ifp == NULL);
	hostna->na_ifp = hostna->na_private;
	hostna->na_private = NULL;
	ifnet_incr_iorefcnt(hostna->na_ifp);

	nx_netif_flags_init(nif);
	nx_netif_llink_init(nif);
	nx_netif_filter_init(nif);
	nx_netif_flow_init(nif);
	nx_netif_capabilities_init(nif);
	nx_netif_agent_init(nif);
	(void) nxctl_inet_traffic_rule_get_count(ifp->if_xname,
	    &ifp->if_traffic_rule_count);
	nx_netif_verify_tso_config(nif);
	nx_netif_callbacks_init(nif);
}

void
nx_netif_reap(struct nexus_netif_adapter *nifna, struct ifnet *ifp,
    uint32_t thres, boolean_t low)
{
#pragma unused(ifp)
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	uint64_t now = _net_uptime;
	boolean_t purge;

	ASSERT(thres != 0);

	if (devna->na_work_ts == 0) {
		return;
	}

	/*
	 * Purge if it has been inactive for some time (twice the drain
	 * threshold), and clear the work timestamp to temporarily skip this
	 * adapter until it's active again.  Purging cached objects can be
	 * expensive since we'd need to allocate and construct them again,
	 * so we do it only when necessary.
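	 * For example, with a drain threshold of 30 seconds, an adapter idle
	 * for at least 60 seconds is purged rather than merely pruned.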
	 */
	if (low || (now - devna->na_work_ts) >= (thres << 1)) {
		devna->na_work_ts = 0;
		purge = TRUE;
	} else {
		purge = FALSE;
	}

	SK_DF(SK_VERB_NETIF, "%s: %s na %s", ifp->if_xname,
	    (purge ? "purging" : "pruning"), devna->na_name);

	/*
	 * Device and host adapters share the same packet buffer pool,
	 * so just reap the arena belonging to the device instance.
	 */
	skmem_arena_reap(devna->na_arena, purge);
}

/*
 * The purpose of this callback is to forcibly remove resources held by VPNAs
 * in the event of an interface detach. Without this callback an application
 * could prevent the detach from completing indefinitely. Note that this is
 * only needed for low-latency VPNAs. Userspace does get notified about
 * interface detach events for other NA types (custom ether and filter) and
 * will do the necessary cleanup.
 * The cleanup is done in two phases:
 * 1) VPNA channels are defuncted. This releases the resources held by VPNAs
 *    and causes the device channel to be closed. All ifnet references held
 *    by VPNAs are also released.
 * 2) This cleans up the netif nexus and releases the two remaining ifnet
 *    references held by the device and host ports (nx_netif_clean()).
 */
void
nx_netif_llw_detach_notify(void *arg)
{
	struct nexus_netif_adapter *__single nifna = arg;
	struct nx_netif *nif = nifna->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct kern_channel **ch_list = NULL;
	struct kern_channel *ch;
	int err, i, all_ch_cnt = 0, vp_ch_cnt = 0;
	struct proc *p;

	ASSERT(NETIF_IS_LOW_LATENCY(nif));
	/*
	 * kern_channel_defunct() requires that sk_lock not be held. We
	 * will first find the list of channels we want to defunct and
	 * then call kern_channel_defunct() on each of them. The number
	 * of channels cannot increase after sk_lock is released since
	 * this interface is being detached.
	 */
	SK_LOCK();
	all_ch_cnt = nx->nx_ch_count;
	if (all_ch_cnt == 0) {
		DTRACE_SKYWALK1(no__channel, struct kern_nexus *, nx);
		SK_UNLOCK();
		return;
	}
	ch_list = sk_alloc_type_array(struct kern_channel *, all_ch_cnt,
	    Z_WAITOK | Z_NOFAIL, skmem_tag_netif_temp);

	STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
		struct nexus_adapter *na = ch->ch_na;

		if (na != NULL && na->na_type == NA_NETIF_VP) {
			ASSERT(vp_ch_cnt < all_ch_cnt);

			/* retain channel to prevent it from being freed */
			ch_retain_locked(ch);
			ch_list[vp_ch_cnt] = ch;
			DTRACE_SKYWALK3(vp__ch__found, struct kern_nexus *, nx,
			    struct kern_channel *, ch, struct nexus_adapter *, na);
			vp_ch_cnt++;
		}
	}
	if (vp_ch_cnt == 0) {
		DTRACE_SKYWALK1(vp__ch__not__found, struct kern_nexus *, nx);
		sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);
		SK_UNLOCK();
		return;
	}
	/* prevents the netif from being freed */
	nx_netif_retain(nif);
	SK_UNLOCK();

	for (i = 0; i < vp_ch_cnt; i++) {
		ch = ch_list[i];
		p = proc_find(ch->ch_pid);
		if (p == NULL) {
			SK_ERR("ch 0x%llx pid %d not found", SK_KVA(ch), ch->ch_pid);
			DTRACE_SKYWALK3(ch__pid__not__found, struct kern_nexus *, nx,
			    struct kern_channel *, ch, pid_t, ch->ch_pid);
			ch_release(ch);
			continue;
		}
		/*
		 * It is possible for the channel to be closed before defunct gets
		 * called. We need to get the fd lock here to ensure that the check
		 * for the closed state and the calling of channel defunct are done
		 * atomically.
		 */
		proc_fdlock(p);
		if ((ch->ch_flags & CHANF_ATTACHED) != 0) {
			kern_channel_defunct(p, ch);
		}
		proc_fdunlock(p);
		proc_rele(p);
		ch_release(ch);
	}
	sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);

	SK_LOCK();
	/*
	 * Quiescing is not needed because:
	 * - The defuncting above ensures that no more TX syncs can enter.
	 * - The driver layer ensures that ifnet_detach() (this path) does not
	 *   get called until RX upcalls have returned.
	 *
	 * Before sk_lock is reacquired above, userspace could close its channels
	 * and cause the nexus's destructor to be called. This is fine because we
	 * have retained the nif so it can't disappear.
	 */
	err = nx_netif_clean(nif, FALSE);
	if (err != 0) {
		SK_ERR("netif clean failed: err %d", err);
		DTRACE_SKYWALK2(nif__clean__failed, struct nx_netif *, nif, int, err);
	}
	nx_netif_release(nif);
	SK_UNLOCK();
}

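/*
 * Export the current RX mitigation configuration and the packet/byte
 * moving averages of a netif adapter into the caller-supplied
 * if_netif_stats structure; this is a no-op if RX mitigation hasn't
 * been initialized on the adapter.
 */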
void
nx_netif_copy_stats(struct nexus_netif_adapter *nifna,
    struct if_netif_stats *if_ns)
{
	struct nx_netif_mit *mit;
	struct mit_cfg_tbl *mit_cfg;

	if ((mit = nifna->nifna_rx_mit) == NULL) {
		return;
	}

	if ((mit->mit_flags & NETIF_MITF_INITIALIZED) == 0) {
		return;
	}

	if_ns->ifn_rx_mit_interval = mit->mit_interval;
	if_ns->ifn_rx_mit_mode = mit->mit_mode;
	if_ns->ifn_rx_mit_packets_avg = mit->mit_packets_avg;
	if_ns->ifn_rx_mit_packets_min = mit->mit_packets_min;
	if_ns->ifn_rx_mit_packets_max = mit->mit_packets_max;
	if_ns->ifn_rx_mit_bytes_avg = mit->mit_bytes_avg;
	if_ns->ifn_rx_mit_bytes_min = mit->mit_bytes_min;
	if_ns->ifn_rx_mit_bytes_max = mit->mit_bytes_max;
	if_ns->ifn_rx_mit_cfg_idx = mit->mit_cfg_idx;

	VERIFY(if_ns->ifn_rx_mit_cfg_idx < mit->mit_cfg_idx_max);
	mit_cfg = &mit->mit_tbl[if_ns->ifn_rx_mit_cfg_idx];
	if_ns->ifn_rx_mit_cfg_packets_lowat = mit_cfg->cfg_plowat;
	if_ns->ifn_rx_mit_cfg_packets_hiwat = mit_cfg->cfg_phiwat;
	if_ns->ifn_rx_mit_cfg_bytes_lowat = mit_cfg->cfg_blowat;
	if_ns->ifn_rx_mit_cfg_bytes_hiwat = mit_cfg->cfg_bhiwat;
	if_ns->ifn_rx_mit_cfg_interval = mit_cfg->cfg_ival;
}

int
nx_netif_na_special(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr, nxspec_cmd_t spec_cmd)
{
	ASSERT(na->na_type == NA_NETIF_DEV ||
	    na->na_type == NA_NETIF_COMPAT_DEV);
	return nx_netif_na_special_common(na, ch, chr, spec_cmd);
}

int
nx_netif_na_special_common(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr, nxspec_cmd_t spec_cmd)
{
	int error = 0;

	ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_DEV ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	SK_LOCK_ASSERT_HELD();

	switch (spec_cmd) {
	case NXSPEC_CMD_CONNECT:
		/*
		 * The netif adapter isn't created exclusively for the kernel.
		 * We mark (and clear) the NAF_KERNEL_ONLY flag upon a
		 * successful na_special() connect and disconnect.
		 */
		if (NA_KERNEL_ONLY(na)) {
			error = EBUSY;
			goto done;
		}
		ASSERT(!(na->na_flags & NAF_SPEC_INIT));

		os_atomic_or(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
		error = na_bind_channel(na, ch, chr);
		if (error != 0) {
			os_atomic_andnot(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
			goto done;
		}
		os_atomic_or(&na->na_flags, NAF_SPEC_INIT, relaxed);
		break;

	case NXSPEC_CMD_DISCONNECT:
		ASSERT(NA_KERNEL_ONLY(na));
		ASSERT(na->na_channels > 0);
		ASSERT(na->na_flags & NAF_SPEC_INIT);
		na_unbind_channel(ch);
		os_atomic_andnot(&na->na_flags, (NAF_SPEC_INIT | NAF_KERNEL_ONLY), relaxed);
		break;

	case NXSPEC_CMD_START:
		na_kr_drop(na, FALSE);
		break;

	case NXSPEC_CMD_STOP:
		na_kr_drop(na, TRUE);
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		lck_mtx_lock(&ch->ch_lock);
		nxprov_advise_disconnect(na->na_nx, ch);
		lck_mtx_unlock(&ch->ch_lock);
		break;

	default:
		error = EINVAL;
		break;
	}

done:
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
	    "ch 0x%llx from na \"%s\" (0x%llx) naflags %b nx 0x%llx "
	    "spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na),
	    na->na_flags, NAF_BITS, SK_KVA(ch->ch_nexus), spec_cmd, error);

	return error;
}

/*
 * Get a skywalk netif adapter for the port.
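 *
 * The device port (NEXUS_PORT_NET_IF_DEV) and host port (NEXUS_PORT_NET_IF_HOST)
 * hand back the adapters created at attach time; the host port is restricted
 * to kernel callers, while the device port also permits explicitly allowed
 * user opens.  Any other port gets a filter or virtual-port adapter created
 * on demand, depending on whether CHMODE_FILTER is requested.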
 */
int
nx_netif_na_find(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_adapter **nap, boolean_t create)
{
#pragma unused(ch)
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	boolean_t anon = NX_ANONYMOUS_PROV(nx);
	ch_endpoint_t ep = chr->cr_endpoint;
	nexus_port_t nx_port = chr->cr_port;
	struct nexus_adapter *__single na = NULL;
	struct ifnet *ifp;
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	*nap = NULL; /* default */

#if SK_LOG
	uuid_string_t uuidstr;
	SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u "
	    "ring_id %d ring_set %u ep_type %u:%u create %u%s",
	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
	    (int)chr->cr_port, chr->cr_mode, CHMODE_BITS,
	    chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set,
	    chr->cr_real_endpoint, chr->cr_endpoint, create,
	    (ep != CH_ENDPOINT_NET_IF) ? " (skipped)" : "");
#endif /* SK_LOG */

	if (!create || ep != CH_ENDPOINT_NET_IF) {
		err = ENODEV;
		goto done;
	}

	ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
	if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
		err = ENXIO;
		goto done;
	}
	ifp = nif->nif_ifp;
	if (!(SKYWALK_CAPABLE(ifp))) {
		SK_ERR("interface %s is no longer usable", if_name(ifp));
		err = ENOTSUP;
		goto done;
	}

	if (chr->cr_mode & CHMODE_LOW_LATENCY) {
		SK_ERR("low latency is not supported for netif channel");
		err = ENOTSUP;
		goto done;
	}

	switch (nx_port) {
	case NEXUS_PORT_NET_IF_DEV:
		/*
		 * We have to reject a direct user open that isn't explicitly
		 * allowed, because netif nexuses do not have user memory
		 * regions by default.
		 */
		if (p != kernproc &&
		    (!skywalk_netif_direct_allowed(ifp->if_xname) ||
		    (kauth_cred_issuser(kauth_cred_get()) == 0 &&
		    (anon || nif->nif_dev_nxb == NULL || nxb == NULL ||
		    !nxb_is_equal(nif->nif_dev_nxb, nxb))))) {
			DTRACE_SKYWALK2(direct__not__allowed, struct ifnet *,
			    ifp, struct chreq *, chr);
			err = ENOTSUP;
			goto done;
		}
		if (chr->cr_mode & CHMODE_EVENT_RING) {
			SK_ERR("event ring is not supported for netif dev port channel");
			err = ENOTSUP;
			goto done;
		}
		na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
		break;

	case NEXUS_PORT_NET_IF_HOST:
		if (p != kernproc) {
			err = ENOTSUP;
			goto done;
		}
		if (chr->cr_mode & CHMODE_EVENT_RING) {
			SK_ERR("event ring is not supported for netif host port channel");
			err = ENOTSUP;
			goto done;
		}
		na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
		break;

	default:
		ASSERT(!(chr->cr_mode & CHMODE_CONFIG));

		NETIF_WLOCK(nif);
		err = nx_port_alloc(nx, nx_port, nxb, &na, p);
		if (err != 0) {
			NETIF_WUNLOCK(nif);
			goto done;
		}

		if (na == NULL) {
			if (chr->cr_mode & CHMODE_FILTER) {
				err = netif_filter_na_create(nx, chr, &na);
			} else {
				err = netif_vp_na_create(nx, chr, &na);
			}
			if (err != 0) {
				NETIF_WUNLOCK(nif);
				goto done;
			}
			err = nx_port_alloc(nx, nx_port, nxb, &na, p);
			if (err != 0) {
				NETIF_WUNLOCK(nif);
				goto done;
			}
		}
		NETIF_WUNLOCK(nif);

		break;
	}

	ASSERT(err == 0);
	ASSERT(na != NULL);

#if CONFIG_NEXUS_USER_PIPE
	if (NA_OWNED_BY_ANY(na) || na->na_next_pipe > 0) {
#else /* !CONFIG_NEXUS_USER_PIPE */
	if (NA_OWNED_BY_ANY(na)) {
#endif /* !CONFIG_NEXUS_USER_PIPE */
		err = EBUSY;
		na = NULL;
		goto done;
	}

	*nap = na;
	na_retain_locked(na);

done:
	ASSERT(err != 0 || na != NULL);
	if (err) {
		SK_ERR("na not found, err(%d)", err);
	} else {
		SK_DF(SK_VERB_NETIF, "found na 0x%llu", na);
	}
	return err;
}

/* na_krings_create callback for all netif device adapters */
int
nx_netif_dev_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
{
	int ret;

	ASSERT(na->na_type == NA_NETIF_DEV ||
	    na->na_type == NA_NETIF_COMPAT_DEV);
	/*
	 * Allocate context structures for native netif only, for
	 * IOSkywalkFamily to store its object references.
	 */
	ret = na_rings_mem_setup(na, (na->na_flags & NAF_NATIVE), ch);

	/*
	 * We mark CKRF_DROP for kernel-only rings (kernel channel
	 * opened by the flowswitch, etc.) to prevent packets from
	 * going through until after the client of the kernel channel
	 * has fully plumbed things on its side.  For userland-facing
	 * rings (regular channel opened to netif), this is not
	 * required, so we don't mark CKRF_DROP there.
	 */
	if (ret == 0 && NA_KERNEL_ONLY(na)) {
		na_kr_drop(na, TRUE);
	}

	return ret;
}

/* call with SK_LOCK held */
void
nx_netif_dev_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
    boolean_t defunct)
{
	ASSERT(na->na_type == NA_NETIF_DEV ||
	    na->na_type == NA_NETIF_COMPAT_DEV);

	/* see comments in nx_netif_dev_krings_create() */
	if (NA_KERNEL_ONLY(na)) {
		na_kr_drop(na, TRUE);
	}

	na_rings_mem_teardown(na, ch, defunct);
}

struct nx_netif *
nx_netif_alloc(zalloc_flags_t how)
{
	struct nx_netif *n;

	SK_LOCK_ASSERT_HELD();

	n = zalloc_flags(nx_netif_zone, how | Z_ZERO);
	if (n == NULL) {
		return NULL;
	}

	NETIF_RWINIT(n);
	os_ref_init(&n->nif_refcnt, NULL);
	SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));

	return n;
}

static void
nx_netif_destroy(struct nx_netif *n)
{
	ASSERT(n->nif_dev_nxb == NULL);
	ASSERT(n->nif_host_nxb == NULL);
	ASSERT(os_ref_get_count(&n->nif_refcnt) == 0);
	nx_netif_llink_config_free(n);
	SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
	NETIF_RWDESTROY(n);
	zfree(nx_netif_zone, n);
}

void
nx_netif_release(struct nx_netif *n)
{
	SK_LOCK_ASSERT_HELD();

	SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
	    os_ref_get_count(&n->nif_refcnt));
	if (os_ref_release(&n->nif_refcnt) == 0) {
		nx_netif_destroy(n);
	}
}

void
nx_netif_retain(struct nx_netif *n)
{
	SK_LOCK_ASSERT_HELD();

	/* retaining an object with a zero refcount is not allowed */
	ASSERT(os_ref_get_count(&n->nif_refcnt) >= 1);
	os_ref_retain(&n->nif_refcnt);
	SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
	    os_ref_get_count(&n->nif_refcnt));
}

void
nx_netif_free(struct nx_netif *n)
{
	nx_netif_release(n);
}

static int
nx_netif_interface_advisory_report(struct kern_nexus *nx,
    const struct ifnet_interface_advisory *advisory)
{
	struct kern_nexus *notify_nx;
	struct __kern_netif_intf_advisory *intf_adv;
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	ifnet_t difp = nif->nif_ifp;
	ifnet_t __single parent = NULL;

	/* If we are a delegate, notify the parent instead */
	if (ifnet_get_delegate_parent(difp, &parent) == 0) {
		nif = parent->if_na->nifna_netif;
	}
	if (nif->nif_fsw_nxadv != NULL) {
		ASSERT(nif->nif_fsw != NULL);
		intf_adv = &nif->nif_fsw_nxadv->_nxadv_intf_adv;
		notify_nx = nif->nif_fsw->fsw_nx;
	} else {
		intf_adv = &nif->nif_netif_nxadv->__kern_intf_adv;
		notify_nx = nif->nif_nx;
	}
	/*
	 * copy the advisory report into shared memory
	 */
	intf_adv->cksum = os_cpu_copy_in_cksum(advisory, &intf_adv->adv,
	    sizeof(*advisory), 0);
	STATS_INC(&nif->nif_stats, NETIF_STATS_IF_ADV_UPD_RECV);
	/*
	 * notify user channels on advisory report availability
	 */
	nx_interface_advisory_notify(notify_nx);
	if (parent != NULL) {
		ifnet_release_delegate_parent(difp);
	}
	return 0;
}

static errno_t
nx_netif_interface_advisory_notify(void *kern_ctx,
    const struct ifnet_interface_advisory *advisory)
{
	_CASSERT(offsetof(struct ifnet_interface_advisory, version) ==
	    offsetof(struct ifnet_interface_advisory, header.version));
	_CASSERT(offsetof(struct ifnet_interface_advisory, direction) ==
	    offsetof(struct ifnet_interface_advisory, header.direction));
	_CASSERT(offsetof(struct ifnet_interface_advisory, _reserved) ==
	    offsetof(struct ifnet_interface_advisory, header.interface_type));

	if (__improbable(kern_ctx == NULL || advisory == NULL)) {
		return EINVAL;
	}
	if (__improbable((advisory->header.version <
	    IF_INTERFACE_ADVISORY_VERSION_MIN) ||
	    (advisory->header.version > IF_INTERFACE_ADVISORY_VERSION_MAX))) {
		SK_ERR("Invalid advisory version %d", advisory->header.version);
		return EINVAL;
	}
	if (__improbable((advisory->header.direction !=
	    IF_INTERFACE_ADVISORY_DIRECTION_TX) &&
	    (advisory->header.direction !=
	    IF_INTERFACE_ADVISORY_DIRECTION_RX))) {
		SK_ERR("Invalid advisory direction %d",
		    advisory->header.direction);
		return EINVAL;
	}
	if (__improbable(((advisory->header.interface_type <
	    IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MIN) ||
	    (advisory->header.interface_type >
	    IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MAX)) &&
	    (advisory->header.version >= IF_INTERFACE_ADVISORY_VERSION_2))) {
		SK_ERR("Invalid advisory interface type %d",
		    advisory->header.interface_type);
		return EINVAL;
	}
	return nx_netif_interface_advisory_report(kern_ctx, advisory);
}

void
nx_netif_config_interface_advisory(struct kern_nexus *nx, bool enable)
{
	struct kern_nexus *nx_netif;
	struct nx_netif *nif;

	if (NX_REJECT_ACT(nx) || (nx->nx_flags & NXF_CLOSED) != 0) {
		return;
	}
	if (NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH) {
		struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
		nx_netif = fsw->fsw_nifna->na_nx;
	} else {
		nx_netif = nx;
	}
	ASSERT(NX_PROV(nx_netif)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
	nif = NX_NETIF_PRIVATE(nx_netif);
	if (nif->nif_intf_adv_config != NULL) {
		nif->nif_intf_adv_config(nif->nif_intf_adv_prov_ctx, enable);
	}
}

/*
 * This function is no longer needed since we now pass truncated packets to
 * filters.  We keep the logic in place in case we ever need to prevent
 * certain packets from being passed to filters.
 */
static boolean_t
packet_is_filterable(struct nexus_netif_adapter *nifna,
    struct __kern_packet *pkt)
{
#pragma unused (nifna, pkt)
	return TRUE;
}

/*
 * This function is only meant to support the RX path; the TX path will not
 * send packets larger than the MTU because TSO is disabled while filters
 * are enabled.
 */
static void
get_filterable_packets(struct nexus_netif_adapter *nifna,
    struct __kern_packet *pkt_chain, struct __kern_packet **fpkt_chain,
    struct __kern_packet **passthrough_chain)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct __kern_packet *pkt = pkt_chain, *next, *fpkt;
	struct __kern_packet *__single fpkt_head = NULL;
	struct __kern_packet *__single passthrough_head = NULL;
	struct __kern_packet **fpkt_tailp = &fpkt_head;
	struct __kern_packet **passthrough_tailp = &passthrough_head;
	int fcnt = 0, pcnt = 0, dcnt = 0;

	while (pkt != NULL) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		if (!packet_is_filterable(nifna, pkt)) {
			pcnt++;
			*passthrough_tailp = pkt;
			passthrough_tailp = &pkt->pkt_nextpkt;
			pkt = next;
			continue;
		}
		fpkt = nx_netif_pkt_to_filter_pkt(nifna, pkt, NETIF_CONVERT_RX);
		if (fpkt != NULL) {
			fcnt++;
			*fpkt_tailp = fpkt;
			fpkt_tailp = &fpkt->pkt_nextpkt;
		} else {
			dcnt++;
		}
		pkt = next;
	}
	*fpkt_chain = fpkt_head;
	*passthrough_chain = passthrough_head;

	/*
	 * No need to increment drop stats because that's already
	 * done in nx_netif_pkt_to_filter_pkt.
	 */
	STATS_ADD(nifs, NETIF_STATS_FILTER_RX_NOT_FILTERABLE, pcnt);
	DTRACE_SKYWALK6(filterable, struct nexus_netif_adapter *, nifna,
	    int, fcnt, int, pcnt, int, dcnt, struct __kern_packet *,
	    fpkt_head, struct __kern_packet *, passthrough_head);
}

/*
 * This is only used by ring-based notify functions for now.
 * When a qset-based notify becomes available, this function can be used
 * unmodified.
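 *
 * Processing happens in stages: the chain is dropped outright if input is
 * disabled on the interface, diverted to any attached filters, demuxed
 * against netif flows when any are registered, and finally handed to the
 * adapter's na_rx callback (or dropped if none is installed).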
 */
void
netif_receive(struct nexus_netif_adapter *nifna,
    struct __kern_packet *pkt_chain, struct nexus_pkt_stats *stats)
{
	struct nx_netif *nif = nifna->nifna_netif;
	struct nexus_adapter *na = &nifna->nifna_up;
	struct netif_stats *nifs = &nif->nif_stats;
	int err, dropcnt, dropstat = -1;

	if ((nif->nif_ifp->if_xflags & IFXF_DISABLE_INPUT) != 0) {
		uint64_t byte_cnt = 0;
		struct __kern_packet *pkt;
		struct ifnet *ifp = nif->nif_ifp;

		dropcnt = 0;
		for (pkt = pkt_chain; pkt != NULL; pkt = pkt->pkt_nextpkt) {
			dropcnt++;
			byte_cnt += ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) ?
			    m_pktlen(pkt->pkt_mbuf) : pkt->pkt_length;
		}
		os_atomic_add(&ifp->if_data.ifi_ipackets, dropcnt, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, byte_cnt, relaxed);

		dropstat = NETIF_STATS_DROP_INPUT_DISABLED;
		goto drop;
	}

	/* update our work timestamp */
	na->na_work_ts = _net_uptime;

	if (nif->nif_filter_cnt > 0) {
		struct __kern_packet *__single fpkt_chain = NULL;
		struct __kern_packet *__single passthrough_chain = NULL;

		get_filterable_packets(nifna, pkt_chain, &fpkt_chain,
		    &passthrough_chain);
		if (fpkt_chain != NULL) {
			(void) nx_netif_filter_inject(nifna, NULL, fpkt_chain,
			    NETIF_FILTER_RX | NETIF_FILTER_SOURCE);
		}
		if (passthrough_chain != NULL) {
			pkt_chain = passthrough_chain;
		} else {
			return;
		}
	} else if (!NETIF_IS_LOW_LATENCY(nif) && nx_netif_filter_default_drop != 0) {
		/*
		 * The default-drop policy drops packets on interfaces that have
		 * no filters attached.  It can be skipped for an LLW because an
		 * LLW has no network stack path.
		 */
		DTRACE_SKYWALK2(rx__default__drop, struct nx_netif *, nif,
		    struct __kern_packet *, pkt_chain);
		dropstat = NETIF_STATS_FILTER_DROP_DEFAULT;
		goto drop;
	}

	if (nif->nif_flow_cnt > 0) {
		struct __kern_packet *__single remain = NULL;

		err = nx_netif_demux(nifna, pkt_chain, &remain, stats, NETIF_FLOW_SOURCE);
		if (remain == NULL) {
			return;
		}
		pkt_chain = remain;
	}

	if (na->na_rx != NULL) {
		na->na_rx(na, pkt_chain, stats);
	} else {
		DTRACE_SKYWALK2(no__rx__cb, struct nx_netif *, nif,
		    struct __kern_packet *, pkt_chain);
		dropstat = NETIF_STATS_DROP_NO_RX_CB;
		goto drop;
	}

	return;

drop:
	dropcnt = 0;
	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
	if (dropstat != -1) {
		STATS_ADD(nifs, dropstat, dropcnt);
	}
	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
}

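/*
 * Apply an optional token-bucket rate limit to the slots in [begin, end).
 * The token units are bits: each packet drains pkt_length * 8 tokens, and
 * the bucket is replenished at 'rate' tokens per second up to a depth of
 * 'rate'.  The scan stops at the first slot for which no tokens remain,
 * returning the (possibly reduced) end index and setting *rate_limited.
 * For example, at rate == 100,000,000 (100 Mbit/s), 10ms of elapsed time
 * replenishes about 1,000,000 tokens, enough for roughly 83 1500-byte
 * packets.
 */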
static slot_idx_t
netif_rate_limit(struct __kern_channel_ring *r, uint64_t rate,
    slot_idx_t begin, slot_idx_t end, boolean_t *rate_limited)
{
	uint64_t elapsed;
	uint64_t now;
	struct __kern_packet *pkt;
	clock_sec_t sec;
	clock_usec_t usec;
	slot_idx_t i;

	if (__probable(rate == 0)) {
		return end;
	}

	/* initialize the token bucket (TBR) state if not done yet */
	if (__improbable(r->ckr_tbr_token == CKR_TBR_TOKEN_INVALID)) {
		r->ckr_tbr_token = rate;
		r->ckr_tbr_depth = rate;
		r->ckr_tbr_last = mach_absolute_time();
	} else {
		now = mach_absolute_time();
		elapsed = now - r->ckr_tbr_last;
		absolutetime_to_microtime(elapsed, &sec, &usec);
		r->ckr_tbr_token +=
		    ((sec * USEC_PER_SEC + usec) * rate / USEC_PER_SEC);
		if (__improbable(r->ckr_tbr_token > r->ckr_tbr_depth)) {
			r->ckr_tbr_token = r->ckr_tbr_depth;
		}
		r->ckr_tbr_last = now;
	}

	*rate_limited = FALSE;
	for (i = begin; i != end; i = SLOT_NEXT(i, r->ckr_lim)) {
		pkt = KR_KSD(r, i)->sd_pkt;
		if (__improbable(pkt == NULL)) {
			continue;
		}
		if (__improbable(r->ckr_tbr_token <= 0)) {
			end = i;
			*rate_limited = TRUE;
			break;
		}
		r->ckr_tbr_token -= pkt->pkt_length * 8;
	}

	SK_DF(SK_VERB_FSW | SK_VERB_RX, "ckr %p %s rate limited at %d",
	    r, r->ckr_name, i);

	return end;
}

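/*
 * Detach the packets from the slots between the ring's reader head and
 * 'end', link them into a single chain, and advance the reader head and
 * tail past the consumed slots.  Returns the head of the chain, or NULL
 * if the range is empty.
 */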
SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
consume_pkts(struct __kern_channel_ring *ring, slot_idx_t end)
{
	struct __kern_packet *__single pkt_chain = NULL;
	struct __kern_packet **tailp = &pkt_chain;
	slot_idx_t idx = ring->ckr_rhead;

	while (idx != end) {
		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(ring, ksd);
		*tailp = pkt;
		tailp = &pkt->pkt_nextpkt;
		idx = SLOT_NEXT(idx, ring->ckr_lim);
	}
	ring->ckr_rhead = end;
	ring->ckr_rtail = ring->ckr_ktail;
	return pkt_chain;
}

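/*
 * Default RX notify for a netif device ring: sync the ring to pick up
 * newly completed packets, apply the optional input rate limiter, detach
 * the packets from the ring and hand the chain to netif_receive(), feeding
 * packet/byte counts to the mitigation statistics callback when one is
 * installed.
 */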
int
netif_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
	struct nexus_adapter *hwna;
	struct nexus_netif_adapter *nifna;
	struct nx_netif *nif;
	struct __kern_packet *pkt_chain;
	struct nexus_pkt_stats stats = {0};
	sk_protect_t protect;
	slot_idx_t ktail;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_START),
	    SK_KVA(ring));

	ASSERT(ring->ckr_tx == NR_RX);
	ASSERT(!NA_KERNEL_ONLY(KRNA(ring)) || KR_KERNEL_ONLY(ring));

	err = kr_enter(ring, ((flags & NA_NOTEF_CAN_SLEEP) != 0));
	if (err != 0) {
		/* not a serious error, so no need to be chatty here */
		SK_DF(SK_VERB_FSW,
		    "hwna \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b "
		    "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)),
		    ring->ckr_name, SK_KVA(ring), ring->ckr_flags,
		    CKRF_BITS, err);
		goto out;
	}
	if (__improbable(KR_DROP(ring))) {
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	hwna = KRNA(ring);
	nifna = NIFNA(hwna);
	nif = nifna->nifna_netif;
	if (__improbable(hwna->na_ifp == NULL)) {
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	protect = sk_sync_protect();
	err = ring->ckr_na_sync(ring, p, 0);
	if (err != 0 && err != EAGAIN) {
		goto put_out;
	}

	/* read the tail pointer once */
	ktail = ring->ckr_ktail;
	if (__improbable(ring->ckr_khead == ktail)) {
		SK_DF(SK_VERB_FSW | SK_VERB_NOTIFY | SK_VERB_RX,
		    "how strange, interrupt with no packets on hwna "
		    "\"%s\" (0x%llx)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)));
		goto put_out;
	}
	ktail = netif_rate_limit(ring, nif->nif_input_rate, ring->ckr_rhead,
	    ktail, &ring->ckr_rate_limited);

	pkt_chain = consume_pkts(ring, ktail);
	if (pkt_chain != NULL) {
		netif_receive(nifna, pkt_chain, &stats);

		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}

put_out:
	sk_sync_unprotect(protect);
	kr_exit(ring);

out:
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}

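/*
 * "Fast" RX notify used when the driver supports rx sync packets: packet
 * chains are obtained directly via nx_rx_sync_packets() into the ring's
 * scratch area, so the dev ring slots themselves are not consumed here.
 */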
int
netif_rx_notify_fast(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p, flags)
	sk_protect_t protect;
	struct nexus_adapter *hwna;
	struct nexus_pkt_stats stats = {0};
	uint32_t i, count;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_START),
	    SK_KVA(ring));

	/* XXX
	 * sk_sync_protect() is not needed for this case because
	 * we are not using the dev ring. Unfortunately lots of
	 * macros used by fsw still require this.
	 */
	protect = sk_sync_protect();
	hwna = KRNA(ring);
	count = na_get_nslots(hwna, NR_RX);
	err = nx_rx_sync_packets(ring, ring->ckr_scratch, &count);
	if (__improbable(err != 0)) {
		SK_ERR("nx_rx_sync_packets failed: %d", err);
		DTRACE_SKYWALK2(rx__sync__packets__failed,
		    struct __kern_channel_ring *, ring, int, err);
		goto out;
	}
	DTRACE_SKYWALK1(chain__count, uint32_t, count);
	for (i = 0; i < count; i++) {
		struct __kern_packet *pkt_chain;

		pkt_chain = SK_PTR_ADDR_KPKT(ring->ckr_scratch[i]);
		ASSERT(pkt_chain != NULL);
		netif_receive(NIFNA(KRNA(ring)), pkt_chain, &stats);

		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}
out:
	sk_sync_unprotect(protect);
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}


/*
 * Configure the NA to operate in a particular mode (flowswitch or
 * low-latency link).  The helpers below pick the per-ring RX notify
 * callback and the NAF_MODE_* flag that correspond to the mode.
 */
static channel_ring_notify_t
netif_hwna_get_notify(struct __kern_channel_ring *ring, netif_mode_t mode)
{
	channel_ring_notify_t notify = NULL;
	boolean_t has_sync_pkts = (sk_rx_sync_packets != 0 &&
	    nx_has_rx_sync_packets(ring));

	if (mode == NETIF_MODE_FSW) {
		notify = (has_sync_pkts ? netif_rx_notify_fast :
		    netif_rx_notify_default);
	} else if (mode == NETIF_MODE_LLW) {
		notify = (has_sync_pkts ? netif_llw_rx_notify_fast :
		    netif_llw_rx_notify_default);
	}
	return notify;
}


static uint32_t
netif_mode_to_flag(netif_mode_t mode)
{
	uint32_t flag = 0;

	if (mode == NETIF_MODE_FSW) {
		flag = NAF_MODE_FSW;
	} else if (mode == NETIF_MODE_LLW) {
		flag = NAF_MODE_LLW;
	}
	return flag;
}

static void
netif_hwna_config_mode(struct nexus_adapter *hwna, netif_mode_t mode,
    void (*rx)(struct nexus_adapter *, struct __kern_packet *,
    struct nexus_pkt_stats *), boolean_t set)
{
	uint32_t i;
	uint32_t flag;

	ASSERT(hwna->na_type == NA_NETIF_DEV ||
	    hwna->na_type == NA_NETIF_COMPAT_DEV);

	for (i = 0; i < na_get_nrings(hwna, NR_RX); i++) {
		struct __kern_channel_ring *kr = &NAKR(hwna, NR_RX)[i];
		channel_ring_notify_t notify = netif_hwna_get_notify(kr, mode);

		if (set) {
			kr->ckr_save_notify = kr->ckr_netif_notify;
			kr->ckr_netif_notify = notify;
		} else {
			kr->ckr_netif_notify = kr->ckr_save_notify;
			kr->ckr_save_notify = NULL;
		}
	}
	if (set) {
		hwna->na_rx = rx;
		flag = netif_mode_to_flag(mode);
		os_atomic_or(&hwna->na_flags, flag, relaxed);
	} else {
		hwna->na_rx = NULL;
		os_atomic_andnot(&hwna->na_flags, (NAF_MODE_FSW | NAF_MODE_LLW), relaxed);
	}
}

void
netif_hwna_set_mode(struct nexus_adapter *hwna, netif_mode_t mode,
    void (*rx)(struct nexus_adapter *, struct __kern_packet *,
    struct nexus_pkt_stats *))
{
	return netif_hwna_config_mode(hwna, mode, rx, TRUE);
}

void
netif_hwna_clear_mode(struct nexus_adapter *hwna)
{
	return netif_hwna_config_mode(hwna, NETIF_MODE_NONE, NULL, FALSE);
}

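/*
 * Deliver a chain that a filter has injected back into the RX path.  The
 * chain is passed to the adapter's na_rx callback via RX ring 0; it is
 * dropped (with statistics updated) if the adapter is no longer owned by
 * a flowswitch or if the ring is in drop mode.
 */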
static void
netif_inject_rx(struct nexus_adapter *na, struct __kern_packet *pkt_chain)
{
	struct nexus_netif_adapter *nifna = NIFNA(na);
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct __kern_channel_ring *r;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;
	boolean_t ring_drop = FALSE;
	int err, dropcnt;

	if (!NA_OWNED_BY_FSW(na)) {
		DTRACE_SKYWALK1(fsw__disabled, struct nexus_adapter *, na);
		goto fail;
	}
	ASSERT(na->na_rx != NULL);

	/*
	 * XXX
	 * This function is called when a filter injects a packet back to the
	 * regular RX path. We can assume the ring is 0 for now because RSS
	 * is not supported. This needs to be revisited when we add support for
	 * RSS.
	 */
	r = &na->na_rx_rings[0];
	ASSERT(r->ckr_tx == NR_RX);
	err = kr_enter(r, TRUE);
	VERIFY(err == 0);

	if (__improbable(KR_DROP(r))) {
		kr_exit(r);
		DTRACE_SKYWALK2(ring__drop, struct nexus_adapter *, na,
		    struct __kern_channel_ring *, r);
		ring_drop = TRUE;
		goto fail;
	}
	protect = sk_sync_protect();
	na->na_rx(na, pkt_chain, &stats);

	if (r->ckr_netif_mit_stats != NULL &&
	    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
		r->ckr_netif_mit_stats(r, stats.nps_pkts, stats.nps_bytes);
	}
	sk_sync_unprotect(protect);

	kr_exit(r);
	return;

fail:
	dropcnt = 0;
	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
	if (ring_drop) {
		STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE, dropcnt);
	}
	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
}

/*
 * This is called when an inbound packet chain has traversed all
 * filters.  The chain is first demuxed against any registered netif
 * flows; whatever remains is injected back into the regular RX path.
 */
errno_t
nx_netif_filter_rx_cb(struct nexus_netif_adapter *nifna,
    struct __kern_packet *fpkt_chain, uint32_t flags)
{
#pragma unused (flags)
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct nexus_adapter *na = &nifna->nifna_up;
	struct __kern_packet *pkt_chain;
	int err;

	pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
	    fpkt_chain, NETIF_CONVERT_RX);
	if (pkt_chain == NULL) {
		return ENOMEM;
	}
	if (nif->nif_flow_cnt > 0) {
		struct __kern_packet *__single remain = NULL;

		err = nx_netif_demux(nifna, pkt_chain, &remain,
		    NULL, NETIF_FLOW_INJECT);
		if (remain == NULL) {
			return err;
		}
		pkt_chain = remain;
	}
	if (na->na_rx != NULL) {
		netif_inject_rx(na, pkt_chain);
	} else {
		int dropcnt = 0;
		nx_netif_free_packet_chain(pkt_chain, &dropcnt);
		STATS_ADD(nifs,
		    NETIF_STATS_FILTER_DROP_NO_RX_CB, dropcnt);
		STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
	}
	return 0;
}

/*
 * This is called when an outbound packet chain has traversed all
 * filters.  The chain is converted to the form expected by the driver
 * (mbufs for compat, packets for native) and enqueued for transmit.
 */
errno_t
nx_netif_filter_tx_cb(struct nexus_netif_adapter *nifna,
    struct __kern_packet *fpkt_chain, uint32_t flags)
{
#pragma unused (flags)
	struct nx_netif *nif = nifna->nifna_netif;
	struct nexus_adapter *na = &nifna->nifna_up;
	int err;

	if (NETIF_IS_COMPAT(nif)) {
		struct mbuf *m_chain;
		mbuf_svc_class_t sc;

		m_chain = nx_netif_filter_pkt_to_mbuf_chain(nifna,
		    fpkt_chain, NETIF_CONVERT_TX);
		if (m_chain == NULL) {
			return ENOMEM;
		}
		/*
		 * All packets in the chain have the same service class.
		 * If the sc is missing or invalid, a valid value will be
		 * returned.
		 */
		sc = mbuf_get_service_class(m_chain);
		err = nx_netif_filter_tx_processed_mbuf_enqueue(nifna,
		    sc, m_chain);
	} else {
		struct __kern_packet *pkt_chain;
		kern_packet_svc_class_t sc;

		pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
		    fpkt_chain, NETIF_CONVERT_TX);
		if (pkt_chain == NULL) {
			return ENOMEM;
		}
		/*
		 * All packets in the chain have the same service class.
		 * If the sc is missing or invalid, a valid value will be
		 * returned.
		 */
		sc = kern_packet_get_service_class(SK_PKT2PH(pkt_chain));
		err = nx_netif_filter_tx_processed_pkt_enqueue(nifna,
		    sc, pkt_chain);
	}
	/* Tell driver to resume dequeuing */
	ifnet_start(na->na_ifp);
	return err;
}

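/*
 * Intentionally a no-op: netif virtual-port adapters require no
 * region parameter adjustments.
 */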
void
nx_netif_vp_region_params_adjust(struct nexus_adapter *na,
    struct skmem_region_params *srp)
{
#pragma unused(na, srp)
	return;
}

/* Returns true if the starter thread is utilized. */
static bool
netif_use_starter_thread(struct ifnet *ifp, uint32_t flags)
{
#if (DEVELOPMENT || DEBUG)
	if (__improbable(nx_netif_force_ifnet_start != 0)) {
		ifnet_start(ifp);
		return true;
	}
#endif /* DEVELOPMENT || DEBUG */
	/*
	 * Use the starter thread under any of the following conditions:
	 * - the interface is not Skywalk-native
	 * - the interface is attached to a virtual driver (ipsec, utun)
	 * - TBR is enabled
	 * - the delayed start mechanism is in use
	 * - remaining stack space on this thread is not enough for the driver
	 * - the caller is in the RX workloop context
	 * - the caller is from the flowswitch path doing ARP resolution
	 * - the caller requires the use of the starter thread (stack usage)
	 * - the caller requires the starter thread for pacing
	 */
	if (!SKYWALK_NATIVE(ifp) || NA(ifp) == NULL ||
	    !NA_IS_ACTIVE(&NA(ifp)->nifna_up) ||
	    ((NA(ifp)->nifna_up.na_flags & NAF_VIRTUAL_DEVICE) != 0) ||
	    IFCQ_TBR_IS_ENABLED(ifp->if_snd) ||
	    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    (flags & NETIF_XMIT_FLAG_PACING) != 0 ||
	    sk_is_rx_notify_protected() ||
	    sk_is_async_transmit_protected() ||
	    (sk_is_sync_protected() && (flags & NETIF_XMIT_FLAG_HOST) != 0)) {
		DTRACE_SKYWALK2(use__starter__thread, struct ifnet *, ifp,
		    uint32_t, flags);
		ifnet_start(ifp);
		return true;
	}
	lck_mtx_lock_spin(&ifp->if_start_lock);
	/* interface is flow controlled */
	if (__improbable(ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	/* if starter thread is active, utilize it */
	if (ifp->if_start_active) {
		ifp->if_start_req++;
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	lck_mtx_unlock(&ifp->if_start_lock);
	/* Check remaining stack space */
	if ((OSKernelStackRemaining() < NX_NETIF_MIN_DRIVER_STACK_SIZE)) {
		ifnet_start(ifp);
		return true;
	}
	return false;
}

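/*
 * TX entry point from the ifnet output path.  Falls back to the ifnet
 * starter thread when netif_use_starter_thread() says so; otherwise
 * issues the TX doorbell directly in the caller's context.
 */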
void
netif_transmit(struct ifnet *ifp, uint32_t flags)
{
	if (netif_use_starter_thread(ifp, flags)) {
		return;
	}
	nx_netif_doorbell_internal(ifp, flags);
}

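/*
 * Return the ifclassq to dequeue from: the default qset's ifcq when
 * logical links are enabled, otherwise the interface's if_snd.
 */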
static struct ifclassq *
netif_get_default_ifcq(struct nexus_adapter *hwna)
{
	struct nx_netif *nif;
	struct ifclassq *ifcq;

	nif = NX_NETIF_PRIVATE(hwna->na_nx);
	if (NETIF_LLINK_ENABLED(nif)) {
		struct netif_qset *qset;

		/*
		 * Use the default ifcq for now.
		 * In the future this could be chosen by the caller.
		 */
		qset = nx_netif_get_default_qset_noref(nif);
		ASSERT(qset != NULL);
		ifcq = qset->nqs_ifcq;
	} else {
		ifcq = nif->nif_ifp->if_snd;
	}
	return ifcq;
}

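/*
 * Dequeue up to pkt_limit/byte_limit worth of packets from the given
 * ifclassq (or the default one if ifcq is NULL).  For driver-managed
 * scheduling models the dequeue is done per service class.  On return,
 * *pkts_pending indicates whether packets remain queued.
 */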
static errno_t
netif_deq_packets(struct nexus_adapter *hwna, struct ifclassq *ifcq,
    uint32_t pkt_limit, uint32_t byte_limit, struct __kern_packet **head,
    boolean_t *pkts_pending, kern_packet_svc_class_t sc,
    uint32_t *pkt_cnt, uint32_t *bytes, uint8_t qset_idx)
{
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	struct ifnet *ifp = hwna->na_ifp;
	uint32_t pkts_cnt;
	uint32_t bytes_cnt;
	errno_t rc;

	ASSERT(ifp != NULL);
	ASSERT(ifp->if_output_sched_model < IFNET_SCHED_MODEL_MAX);
	ASSERT((pkt_limit != 0) && (byte_limit != 0));

	if (ifcq == NULL) {
		ifcq = netif_get_default_ifcq(hwna);
	}
	if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) {
		rc = ifclassq_dequeue_sc(ifcq, (mbuf_svc_class_t)sc,
		    pkt_limit, byte_limit, &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
	} else {
		rc = ifclassq_dequeue(ifcq, pkt_limit, byte_limit,
		    &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
	}
	ASSERT((rc == 0) || (rc == EAGAIN));
	ASSERT((pkt_head.cp_ptype == QP_PACKET) || (pkt_head.cp_kpkt == NULL));

	ifclassq_get_len(ifcq, (mbuf_svc_class_t)sc, qset_idx,
	    &pkts_cnt, &bytes_cnt);
	*pkts_pending = pkts_cnt > 0;

	*head = pkt_head.cp_kpkt;
	return rc;
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
netif_no_ring_space_log(const struct nexus_adapter *na,
    const kern_channel_ring_t ring)
{
	SK_DF(SK_VERB_SYNC | SK_VERB_TX,
	    "no ring space: na \"%s\" [%u] "
	    "\"%s\"(kh %u kt %u | rh %u rt %u)",
	    na->na_name, ring->ckr_ring_id,
	    ring->ckr_name, ring->ckr_khead,
	    ring->ckr_ktail, ring->ckr_rhead,
	    ring->ckr_rtail);
}
#endif /* SK_LOG */

/*
 * netif TX refill function for rings: dequeues packets from the
 * interface's AQM queues, attaches them to the TX ring slots, and
 * syncs the ring with the driver.
 */
errno_t
netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit,
    uint32_t byte_limit, boolean_t tx_doorbell_ctxt, boolean_t *pkts_pending,
    boolean_t canblock)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;
	struct __kern_packet *__single head = NULL;
	sk_protect_t protect;
	errno_t rc = 0;
	errno_t sync_err = 0;
	uint32_t npkts = 0, consumed = 0;
	uint32_t flags;
	slot_idx_t idx, ktail;
	int ring_space = 0;

	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_START), SK_KVA(ring));

	VERIFY(ring != NULL);
	hwna = KRNA(ring);
	ifp = hwna->na_ifp;

	ASSERT(hwna->na_type == NA_NETIF_DEV);
	ASSERT(ring->ckr_tx == NR_TX);
	*pkts_pending = FALSE;

	if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
		SK_ERR("invalid limits plim %d, blim %d",
		    pkt_limit, byte_limit);
		rc = EINVAL;
		goto out;
	}

	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	/*
	 * if the ring is busy, it means another dequeue is in
	 * progress, so ignore this request and return success.
	 */
	if (kr_enter(ring, canblock) != 0) {
		rc = 0;
		goto out;
	}
	/* mark thread with sync-in-progress flag */
	protect = sk_sync_protect();

	if (__improbable(KR_DROP(ring) ||
	    !NA_IS_ACTIVE(ring->ckr_na))) {
		SK_ERR("hw-kr 0x%llx stopped", SK_KVA(ring));
		rc = ENXIO;
		goto done;
	}

	idx = ring->ckr_rhead;
	ktail = ring->ckr_ktail;
	/* calculate available space on tx ring */
	ring_space = ktail - idx;
	if (ring_space < 0) {
		ring_space += ring->ckr_num_slots;
	}
	if (ring_space == 0) {
		struct ifclassq *ifcq;

		/* no space in ring, driver should retry */
#if SK_LOG
		if (__improbable((sk_verbose &
		    (SK_VERB_SYNC | SK_VERB_TX)) != 0)) {
			netif_no_ring_space_log(hwna, ring);
		}
#endif /* SK_LOG */
		ifcq = netif_get_default_ifcq(hwna);
		if (IFCQ_LEN(ifcq) != 0) {
			*pkts_pending = TRUE;
		}
		/*
		 * We ran out of space in the ring, most probably
		 * because the driver is slow to drain its TX queue.
		 * We want another doorbell to be generated as soon
		 * as the TX notify completion happens; mark this via
		 * the ckr_pending_doorbell counter.  Do this
		 * regardless of whether there are any pending packets.
		 */
		ring->ckr_pending_doorbell++;
		rc = EAGAIN;
		goto sync_ring;
	}

	if ((uint32_t)ring_space < pkt_limit) {
		pkt_limit = ring_space;
	}

	if (tx_doorbell_ctxt &&
	    ((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0)) {
		pkt_limit = MIN(pkt_limit,
		    nx_netif_doorbell_max_dequeue);
	}

	rc = netif_deq_packets(hwna, NULL, pkt_limit, byte_limit,
	    &head, pkts_pending, ring->ckr_svc, NULL, NULL, 0);

	/*
	 * There's room in the ring; if we haven't dequeued everything,
	 * mark ckr_pending_doorbell so the next TX notify issues a TX
	 * doorbell; otherwise, clear it.  The next packet that gets
	 * enqueued will trigger a doorbell again.
	 */
	if (*pkts_pending) {
		ring->ckr_pending_doorbell++;
	} else if (ring->ckr_pending_doorbell != 0) {
		ring->ckr_pending_doorbell = 0;
	}

	if (rc != 0) {
		/*
		 * This is expected occasionally, as IOSkywalkFamily
		 * errs on the side of caution and performs an extra
		 * dequeue when multiple doorbells are pending.  With
		 * nothing to dequeue, do a sync if there are slots to
		 * reclaim; otherwise just return.
		 */
		SK_DF(SK_VERB_SYNC | SK_VERB_TX,
		    "nothing to dequeue, err %d", rc);

		if ((uint32_t)ring_space == ring->ckr_lim) {
			goto done;
		} else {
			goto sync_ring;
		}
	}
	/* move the dequeued packets to tx ring */
	while (head != NULL && idx != ktail) {
		ASSERT(npkts <= pkt_limit);
		struct __kern_packet *pkt = head;
		KR_SLOT_ATTACH_METADATA(ring, KR_KSD(ring, idx),
		    (struct __kern_quantum *)pkt);
		npkts++;
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_TX_DRV | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, ring->ckr_lim);
		head = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;
	}

	/*
	 * We checked for ring space earlier so the ring should have enough
	 * space for the entire chain.
	 */
	ASSERT(head == NULL);
	ring->ckr_rhead = idx;

sync_ring:
	flags = NA_SYNCF_NETIF;
	if (ring->ckr_pending_doorbell != 0) {
		flags |= (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_ASYNC);
	}

	ring->ckr_khead_pre = ring->ckr_khead;
	sync_err = ring->ckr_na_sync(ring, kernproc, flags);
	if (sync_err != 0 && sync_err != EAGAIN) {
		SK_ERR("unexpected sync err %d", sync_err);
		if (rc == 0) {
			rc = sync_err;
		}
		goto done;
	}
	/*
	 * Verify that the driver has detached packets from the consumed slots.
	 */
	idx = ring->ckr_khead_pre;
	consumed = 0;
	while (idx != ring->ckr_khead) {
		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);

		consumed++;
		VERIFY(!KSD_VALID_METADATA(ksd));
		idx = SLOT_NEXT(idx, ring->ckr_lim);
	}
	ring->ckr_khead_pre = ring->ckr_khead;

done:
	sk_sync_unprotect(protect);
	kr_exit(ring);
out:
	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_END),
	    SK_KVA(ring), rc, 0, npkts);

	return rc;
}

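/*
 * Exponentially weighted moving average of a queue statistic:
 *
 *	avg' = ((avg << decay) - avg + new) >> decay
 *	     = ((2^decay - 1) * avg + new) / 2^decay
 *
 * i.e. the new sample is given a weight of 1/2^decay.  For example,
 * with decay == 3, avg' = (7 * avg + new) / 8.  If there is no prior
 * average, the new sample is taken as-is.
 */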
#define NQ_EWMA(old, new, decay) do {                               \
	u_int64_t _avg;                                                 \
	if (__probable((_avg = (old)) > 0))                             \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

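/*
 * Update the interface-level packet/byte counters for a logical-link
 * queue and, when nq_stat_enable is set, accumulate per-queue transfer
 * statistics (min/max burst sizes plus EWMA packet and byte rates
 * computed over nq_accumulate_interval seconds).
 */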
void
kern_netif_increment_queue_stats(kern_netif_queue_t queue,
    uint32_t pkt_count, uint32_t byte_count)
{
	struct netif_llink *llink = queue->nq_qset->nqs_llink;
	struct ifnet *ifp = llink->nll_nif->nif_ifp;
	if ((queue->nq_flags & NETIF_QUEUE_IS_RX) == 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, pkt_count, relaxed);
		os_atomic_add(&ifp->if_data.ifi_obytes, byte_count, relaxed);
	} else {
		os_atomic_add(&ifp->if_data.ifi_ipackets, pkt_count, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, byte_count, relaxed);
	}

	if (ifp->if_data_threshold != 0) {
		ifnet_notify_data_threshold(ifp);
	}

	uint64_t now;
	uint64_t diff_secs;
	struct netif_qstats *stats = &queue->nq_stats;

	if (nq_stat_enable == 0) {
		return;
	}

	if (__improbable(pkt_count == 0)) {
		return;
	}

	stats->nq_num_xfers++;
	stats->nq_total_bytes += byte_count;
	stats->nq_total_pkts += pkt_count;
	if (pkt_count > stats->nq_max_pkts) {
		stats->nq_max_pkts = pkt_count;
	}
	if (stats->nq_min_pkts == 0 ||
	    pkt_count < stats->nq_min_pkts) {
		stats->nq_min_pkts = pkt_count;
	}

	now = net_uptime();
	if (__probable(queue->nq_accumulate_start != 0)) {
		diff_secs = now - queue->nq_accumulate_start;
		if (diff_secs >= nq_accumulate_interval) {
			uint64_t        bps;
			uint64_t        pps;
			uint64_t        pps_ma;

			/* bytes per second */
			bps = queue->nq_accumulated_bytes / diff_secs;
			NQ_EWMA(stats->nq_bytes_ps_ma,
			    bps, nq_transfer_decay);
			stats->nq_bytes_ps = bps;

			/* pkts per second */
			pps = queue->nq_accumulated_pkts / diff_secs;
			pps_ma = stats->nq_pkts_ps_ma;
			NQ_EWMA(pps_ma, pps, nq_transfer_decay);
			stats->nq_pkts_ps_ma = (uint32_t)pps_ma;
			stats->nq_pkts_ps = (uint32_t)pps;

			/* start over */
			queue->nq_accumulate_start = now;
			queue->nq_accumulated_bytes = 0;
			queue->nq_accumulated_pkts = 0;

			stats->nq_min_pkts = 0;
			stats->nq_max_pkts = 0;
		}
	} else {
		queue->nq_accumulate_start = now;
	}
	queue->nq_accumulated_bytes += byte_count;
	queue->nq_accumulated_pkts += pkt_count;
}

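/*
 * KPI: enqueue an inbound packet chain on a logical-link RX queue.
 * The chain is staged on the queue's pktq and, when the FLUSH flag is
 * set, handed up via netif_receive() and accounted in the queue stats.
 * A minimal driver-side sketch (illustrative only; "q" and "ph" are
 * hypothetical locals):
 *
 *	kern_netif_queue_rx_enqueue(q, ph, 1,
 *	    KERN_NETIF_QUEUE_RX_ENQUEUE_FLAG_FLUSH);
 */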
void
kern_netif_queue_rx_enqueue(kern_netif_queue_t queue, kern_packet_t ph_chain,
    uint32_t count, uint32_t flags)
{
#pragma unused (count)
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct __kern_packet *pkt_chain = SK_PTR_ADDR_KPKT(ph_chain);
	bool flush = ((flags & KERN_NETIF_QUEUE_RX_ENQUEUE_FLAG_FLUSH) != 0);
	struct pktq *pktq = &q->nq_pktq;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_pkt_stats stats = {0};
	sk_protect_t protect;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) != 0);
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		int drop_cnt = 0;

		pp_free_packet_chain(pkt_chain, &drop_cnt);
		STATS_ADD(nifs, NETIF_STATS_LLINK_RX_DROP_BAD_STATE, drop_cnt);
		return;
	}
	KPKTQ_ENQUEUE_LIST(pktq, pkt_chain);
	if (flush) {
		pkt_chain = KPKTQ_FIRST(pktq);
		KPKTQ_INIT(pktq);

		protect = sk_sync_protect();
		netif_receive(NA(llink->nll_nif->nif_ifp), pkt_chain, &stats);
		sk_sync_unprotect(protect);
		kern_netif_increment_queue_stats(queue, (uint32_t)stats.nps_pkts,
		    (uint32_t)stats.nps_bytes);
	}
}

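/*
 * KPI: dequeue a chain of outbound packets from a logical-link TX
 * queue, bounded by pkt_limit and byte_limit.  *pending is set if
 * packets remain queued; *ph_chain receives the dequeued chain, if
 * any.  Dequeues performed from the TX notify context on non-virtual
 * devices are additionally capped by nx_netif_doorbell_max_dequeue.
 */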
errno_t
kern_netif_queue_tx_dequeue(kern_netif_queue_t queue, uint32_t pkt_limit,
    uint32_t byte_limit, boolean_t *pending, kern_packet_t *ph_chain)
{
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_adapter *hwna;
	struct __kern_packet *__single pkt_chain = NULL;
	uint32_t bytes = 0, pkt_cnt = 0;
	errno_t rc;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) == 0);
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		STATS_INC(nifs, NETIF_STATS_LLINK_AQM_DEQ_BAD_STATE);
		return ENXIO;
	}
	hwna = &NA(llink->nll_nif->nif_ifp)->nifna_up;

	if (((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0) &&
	    sk_is_tx_notify_protected()) {
		pkt_limit = MIN(pkt_limit, nx_netif_doorbell_max_dequeue);
	}
	rc = netif_deq_packets(hwna, q->nq_qset->nqs_ifcq, pkt_limit,
	    byte_limit, &pkt_chain, pending, q->nq_svc, &pkt_cnt, &bytes,
	    q->nq_qset->nqs_idx);

	if (pkt_cnt > 0) {
		kern_netif_increment_queue_stats(queue, pkt_cnt, bytes);
	}
	if (pkt_chain != NULL) {
		*ph_chain = SK_PKT2PH(pkt_chain);
	}
	return rc;
}

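/*
 * KPI: return the number of packets and bytes currently queued for the
 * given service class on this qset's ifclassq.
 */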
errno_t
kern_netif_qset_tx_queue_len(kern_netif_qset_t qset, uint32_t svc,
    uint32_t * pkts_cnt, uint32_t * bytes_cnt)
{
	VERIFY(qset != NULL);
	VERIFY(pkts_cnt != NULL);
	VERIFY(bytes_cnt != NULL);

	return ifclassq_get_len(qset->nqs_ifcq, svc, qset->nqs_idx, pkts_cnt,
	           bytes_cnt);
}

void
kern_netif_set_qset_combined(kern_netif_qset_t qset)
{
	VERIFY(qset != NULL);
	VERIFY(qset->nqs_ifcq != NULL);

	ifclassq_set_grp_combined(qset->nqs_ifcq, qset->nqs_idx);
}

void
kern_netif_set_qset_separate(kern_netif_qset_t qset)
{
	VERIFY(qset != NULL);
	VERIFY(qset->nqs_ifcq != NULL);

	ifclassq_set_grp_separated(qset->nqs_ifcq, qset->nqs_idx);
}

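/*
 * KPI: add a logical link to an attached netif nexus.  The llink_init
 * parameters are validated before the logical link is created.
 */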
errno_t
kern_nexus_netif_llink_add(struct kern_nexus *nx,
    struct kern_nexus_netif_llink_init *llink_init)
{
	errno_t err;
	struct nx_netif *nif;
	struct netif_llink *__single llink;
	struct netif_stats *nifs;

	VERIFY(nx != NULL);
	VERIFY(llink_init != NULL);
	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);

	nif = NX_NETIF_PRIVATE(nx);
	nifs = &nif->nif_stats;

	err = nx_netif_validate_llink_config(llink_init, false);
	if (err != 0) {
		SK_ERR("Invalid llink init params");
		STATS_INC(nifs, NETIF_STATS_LLINK_ADD_BAD_PARAMS);
		return err;
	}

	err = nx_netif_llink_add(nif, llink_init, &llink);
	return err;
}

errno_t
kern_nexus_netif_llink_remove(struct kern_nexus *nx,
    kern_nexus_netif_llink_id_t llink_id)
{
	struct nx_netif *nif;

	VERIFY(nx != NULL);
	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);

	nif = NX_NETIF_PRIVATE(nx);
	return nx_netif_llink_remove(nif, llink_id);
}

errno_t
kern_netif_queue_get_service_class(kern_netif_queue_t queue,
    kern_packet_svc_class_t *svc)
{
	*svc = queue->nq_svc;
	return 0;
}