/*
 * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
#include <os/refcnt.h>
#include <sys/sdt.h>

#define NX_NETIF_TAG_QSET   "com.apple.skywalk.netif.qset"
static SKMEM_TAG_DEFINE(nx_netif_tag_qset, NX_NETIF_TAG_QSET);

#define NX_NETIF_TAG_LLINK_CFG   "com.apple.skywalk.netif.llink.cfg"
static SKMEM_TAG_DEFINE(nx_netif_tag_llink_cfg, NX_NETIF_TAG_LLINK_CFG);

LCK_ATTR_DECLARE(netif_llink_lock_attr, 0, 0);
static LCK_GRP_DECLARE(netif_llink_lock_group, "netif llink locks");

#if (DEVELOPMENT || DEBUG)
static TUNABLE(uint32_t, nx_netif_disable_llink, "sk_disable_llink", 0);
#endif /* (DEVELOPMENT || DEBUG) */

static struct netif_llink *nx_netif_llink_alloc(void);
static void nx_netif_llink_free(struct netif_llink **);
static struct netif_qset *nx_netif_qset_alloc(uint8_t, uint8_t);
static void nx_netif_qset_free(struct netif_qset **);
static void nx_netif_qset_setup_ifclassq(struct netif_llink *,
    struct netif_qset *);
static void nx_netif_qset_teardown_ifclassq(struct netif_qset *);
static void nx_netif_qset_init(struct netif_qset *, struct netif_llink *,
    uint8_t idx, struct kern_nexus_netif_llink_qset_init *);
static struct netif_qset *nx_netif_qset_create(struct netif_llink *,
    uint8_t, struct kern_nexus_netif_llink_qset_init *);
static void nx_netif_qset_destroy(struct netif_qset *);
static void nx_netif_llink_initialize(struct netif_llink *, struct nx_netif *,
    struct kern_nexus_netif_llink_init *);
static void nx_netif_driver_queue_destroy(struct netif_queue *);
static void nx_netif_driver_queue_init(struct netif_qset *,
    struct netif_queue *, kern_packet_svc_class_t, bool);
static struct netif_llink *nx_netif_llink_create_locked(struct nx_netif *,
    struct kern_nexus_netif_llink_init *);
static void nx_netif_default_llink_add(struct nx_netif *);
static int netif_qset_enqueue_single(struct netif_qset *,
    struct __kern_packet *, uint32_t *, uint32_t *);
static int nx_netif_llink_ext_init_queues(struct kern_nexus *,
    struct netif_llink *);
static void nx_netif_llink_ext_fini_queues(struct kern_nexus *,
    struct netif_llink *);

static uint32_t nx_netif_random_qset = 0;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, random_qset,
    CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_random_qset, 0,
    "pick a random qset");
#endif /* DEVELOPMENT || DEBUG */

/* retains a reference for the caller */
static struct netif_llink *
nx_netif_llink_alloc(void)
{
	struct netif_llink *llink;

	llink = sk_alloc_type(struct netif_llink, Z_WAITOK | Z_NOFAIL,
	    skmem_tag_netif_llink);
	os_ref_init(&llink->nll_refcnt, NULL);
	return llink;
}

SK_NO_INLINE_ATTRIBUTE
void
nx_netif_llink_retain(struct netif_llink *llink)
{
	os_ref_retain(&llink->nll_refcnt);
}

SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_llink_free(struct netif_llink **pllink)
{
	struct netif_llink *llink = *pllink;
	struct netif_qset *qset, *tqset;

	VERIFY(llink->nll_state == NETIF_LLINK_STATE_DESTROYED);
	*pllink = NULL;
	SLIST_FOREACH_SAFE(qset, &llink->nll_qset_list, nqs_list, tqset) {
		SLIST_REMOVE(&llink->nll_qset_list, qset, netif_qset,
		    nqs_list);
		nx_netif_qset_destroy(qset);
	}
	if (llink->nll_ifcq != NULL) {
		ifclassq_release(&llink->nll_ifcq);
	}

	sk_free_type(struct netif_llink, llink);
}

SK_NO_INLINE_ATTRIBUTE
void
nx_netif_llink_release(struct netif_llink **pllink)
{
	struct netif_llink *__single llink = *pllink;

	*pllink = NULL;
	if (os_ref_release(&llink->nll_refcnt) == 0) {
		nx_netif_llink_free(&llink);
	}
}
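
/*
 * Illustrative sketch (not a prescribed call sequence) of how the refcount
 * helpers above pair up.  The final release must come from
 * nx_netif_llink_destroy_locked(), which first marks the llink
 * NETIF_LLINK_STATE_DESTROYED; nx_netif_llink_free() verifies that state
 * before tearing down the qsets.
 *
 *	llink = nx_netif_llink_alloc();		nll_refcnt == 1 (list reference)
 *	nx_netif_llink_retain(llink);		nll_refcnt == 2 (e.g. the reference
 *						returned by nx_netif_llink_add())
 *	nx_netif_llink_release(&ref);		nll_refcnt == 1, ref set to NULL
 *	nx_netif_llink_destroy_locked(...);	drops the list reference; the llink
 *						is freed once nll_refcnt hits 0
 */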

/* retains a reference for the caller */
static struct netif_qset *
nx_netif_qset_alloc(uint8_t nrxqs, uint8_t ntxqs)
{
	struct netif_qset *qset;

	_CASSERT(sizeof(struct netif_queue) % sizeof(uint64_t) == 0);

	qset = sk_alloc_type_header_array(struct netif_qset, struct netif_queue,
	    nrxqs + ntxqs, Z_WAITOK | Z_NOFAIL, nx_netif_tag_qset);

	qset->nqs_num_queues = nrxqs + ntxqs;
	qset->nqs_num_rx_queues = nrxqs;
	qset->nqs_num_tx_queues = ntxqs;
	return qset;
}

SK_NO_INLINE_ATTRIBUTE
void
nx_netif_qset_retain(struct netif_qset *qset)
{
	/*
	 * A logical link is immutable, i.e. queue sets can't be added to or
	 * removed from it. We rely on this property to simply acquire a
	 * refcnt on the logical link, which is the parent structure of a
	 * qset.
	 */
	nx_netif_llink_retain(qset->nqs_llink);
}

SK_NO_INLINE_ATTRIBUTE
void
nx_netif_qset_release(struct netif_qset **pqset)
{
	struct netif_qset *qset = *pqset;
	struct netif_llink *__single llink = qset->nqs_llink;

	*pqset = NULL;
	nx_netif_llink_release(&llink);
}

SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_qset_free(struct netif_qset **pqset)
{
	struct netif_qset *qset = *pqset;
	uint8_t i;

	VERIFY(qset->nqs_llink->nll_state == NETIF_LLINK_STATE_DESTROYED);

	for (i = 0; i < qset->nqs_num_rx_queues; i++) {
		nx_netif_driver_queue_destroy(NETIF_QSET_RX_QUEUE(qset, i));
	}
	for (i = 0; i < qset->nqs_num_tx_queues; i++) {
		nx_netif_driver_queue_destroy(NETIF_QSET_TX_QUEUE(qset, i));
	}
	if (qset->nqs_flags & NETIF_QSET_FLAG_AQM) {
		nx_netif_qset_teardown_ifclassq(qset);
	}
	qset->nqs_llink = NULL;
	sk_free_type_header_array(struct netif_qset, struct netif_queue,
	    qset->nqs_num_rx_queues + qset->nqs_num_tx_queues, qset);
}

SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_qset_destroy(struct netif_qset *qset)
{
	VERIFY(qset->nqs_llink->nll_state == NETIF_LLINK_STATE_DESTROYED);
	nx_netif_qset_free(&qset);
}

SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_qset_setup_ifclassq(struct netif_llink *llink,
    struct netif_qset *qset)
{
	uint8_t flags = 0;

	ASSERT((qset->nqs_flags & NETIF_QSET_FLAG_AQM) != 0);
	ASSERT(llink->nll_ifcq != NULL);

	ifclassq_retain(llink->nll_ifcq);
	qset->nqs_ifcq = llink->nll_ifcq;

	if ((qset->nqs_flags & NETIF_QSET_FLAG_LOW_LATENCY) != 0) {
		flags |= IF_CLASSQ_LOW_LATENCY;
	}
	if ((qset->nqs_flags & NETIF_QSET_FLAG_DEFAULT) != 0) {
		flags |= IF_DEFAULT_GRP;
	}

	ifclassq_setup_group(qset->nqs_ifcq, qset->nqs_idx, flags);
}

SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_qset_teardown_ifclassq(struct netif_qset *qset)
{
	ASSERT((qset->nqs_flags & NETIF_QSET_FLAG_AQM) != 0);
	ASSERT(qset->nqs_ifcq != NULL);

	qset->nqs_flags &= ~NETIF_QSET_FLAG_AQM;
	ifclassq_release(&qset->nqs_ifcq);
}

SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_qset_init(struct netif_qset *qset, struct netif_llink *llink,
    uint8_t idx, struct kern_nexus_netif_llink_qset_init *qset_init)
{
#define _NETIF_QSET_MAX_TXQS    4
	kern_packet_svc_class_t svc[_NETIF_QSET_MAX_TXQS] =
	{KPKT_SC_BE, KPKT_SC_BK, KPKT_SC_VI, KPKT_SC_VO};
	struct ifnet *ifp = llink->nll_nif->nif_ifp;
	uint8_t i;

	/*
	 * no need to retain a reference for llink, as the logical link is
	 * immutable and qsets are created and destroyed along with the
	 * logical link.
	 */
	qset->nqs_llink = llink;
	qset->nqs_id = NETIF_QSET_ID_ENCODE(llink->nll_link_id_internal, idx);
	qset->nqs_idx = idx;

	if (qset_init->nlqi_flags & KERN_NEXUS_NET_LLINK_QSET_DEFAULT) {
		qset->nqs_flags |= NETIF_QSET_FLAG_DEFAULT;
	}
	if (qset_init->nlqi_flags & KERN_NEXUS_NET_LLINK_QSET_LOW_LATENCY) {
		qset->nqs_flags |= NETIF_QSET_FLAG_LOW_LATENCY;
	}
	if (qset_init->nlqi_flags & KERN_NEXUS_NET_LLINK_QSET_AQM) {
		qset->nqs_flags |= NETIF_QSET_FLAG_AQM;
		nx_netif_qset_setup_ifclassq(llink, qset);
	}

	for (i = 0; i < qset->nqs_num_rx_queues; i++) {
		nx_netif_driver_queue_init(qset, NETIF_QSET_RX_QUEUE(qset, i),
		    KPKT_SC_UNSPEC, true);
	}

	if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) {
		VERIFY(qset->nqs_num_tx_queues == _NETIF_QSET_MAX_TXQS);
		for (i = 0; i < qset->nqs_num_tx_queues; i++) {
			nx_netif_driver_queue_init(qset,
			    NETIF_QSET_TX_QUEUE(qset, i), svc[i], false);
		}
	} else {
		for (i = 0; i < qset->nqs_num_tx_queues; i++) {
			nx_netif_driver_queue_init(qset,
			    NETIF_QSET_TX_QUEUE(qset, i), KPKT_SC_UNSPEC, false);
		}
	}
}

SK_NO_INLINE_ATTRIBUTE
static struct netif_qset *
nx_netif_qset_create(struct netif_llink *llink, uint8_t idx,
    struct kern_nexus_netif_llink_qset_init *qset_init)
{
	struct netif_qset *qset;

	qset = nx_netif_qset_alloc(qset_init->nlqi_num_rxqs,
	    qset_init->nlqi_num_txqs);
	nx_netif_qset_init(qset, llink, idx, qset_init);
	return qset;
}

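/*
 * Picks a random, non-zero 16-bit internal link id that does not collide
 * with any llink already on this nif (retrying on collision).  The internal
 * id is what nx_netif_qset_init() packs into nqs_id and what qset hints are
 * matched against in nx_netif_find_qset().
 */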
static uint16_t
nx_netif_generate_internal_llink_id(struct nx_netif *nif)
{
	struct netif_llink *llink;
	struct netif_stats *nifs = &nif->nif_stats;
	uint16_t id;

again:
	id = (uint16_t)(random() % 65536);
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		if (__improbable(llink->nll_link_id_internal == id)) {
			break;
		}
	}
	if (__probable(llink == NULL && id != 0)) {
		return id;
	} else {
		STATS_INC(nifs, NETIF_STATS_LLINK_DUP_INT_ID_GENERATED);
		DTRACE_SKYWALK1(dup__llink__id__internal, uint16_t, id);
		goto again;
	}
}

static void
nx_netif_llink_initialize(struct netif_llink *llink, struct nx_netif *nif,
    struct kern_nexus_netif_llink_init *llink_init)
{
	uint8_t i;
	struct ifnet *ifp = nif->nif_ifp;

	LCK_RW_ASSERT(&nif->nif_llink_lock, LCK_RW_ASSERT_EXCLUSIVE);

	llink->nll_nif = nif;
	llink->nll_link_id = llink_init->nli_link_id;
	if (llink_init->nli_flags & KERN_NEXUS_NET_LLINK_DEFAULT) {
		llink->nll_flags |= NETIF_LLINK_FLAG_DEFAULT;
	}
	llink->nll_link_id_internal = nx_netif_generate_internal_llink_id(nif);
	llink->nll_ctx = llink_init->nli_ctx;
	SLIST_INIT(&llink->nll_qset_list);

	for (i = 0; i < llink_init->nli_num_qsets; i++) {
		if (llink->nll_ifcq == NULL &&
		    (llink_init->nli_qsets[i].nlqi_flags &
		    KERN_NEXUS_NET_LLINK_QSET_AQM)) {
			if (NETIF_DEFAULT_LLINK(llink)) {
				/* use the default AQM queues from ifnet */
				ifclassq_retain(ifp->if_snd);
				llink->nll_ifcq = ifp->if_snd;
			} else {
				llink->nll_ifcq = ifclassq_alloc();
				dlil_ifclassq_setup(ifp, llink->nll_ifcq);
			}
		}

		struct netif_qset *qset = nx_netif_qset_create(llink, i,
		    &llink_init->nli_qsets[i]);
		/* nx_netif_qset_create retains a reference for the caller */
		SLIST_INSERT_HEAD(&llink->nll_qset_list, qset, nqs_list);
		if (NETIF_DEFAULT_QSET(qset)) {
			/* there can only be one default queue set */
			VERIFY(llink->nll_default_qset == NULL);
			llink->nll_default_qset = qset;
		}
	}
	llink->nll_qset_cnt = llink_init->nli_num_qsets;
	/* there should be a default queue set */
	VERIFY(llink->nll_default_qset != NULL);
	llink->nll_state = NETIF_LLINK_STATE_INIT;
}

static void
nx_netif_driver_queue_destroy(struct netif_queue *drvq)
{
	VERIFY(drvq->nq_qset->nqs_llink->nll_state ==
	    NETIF_LLINK_STATE_DESTROYED);

	lck_mtx_lock(&drvq->nq_lock);
	VERIFY(KPKTQ_EMPTY(&drvq->nq_pktq));
	lck_mtx_unlock(&drvq->nq_lock);

	drvq->nq_qset = NULL;
	lck_mtx_destroy(&drvq->nq_lock, &netif_llink_lock_group);
}

static void
nx_netif_driver_queue_init(struct netif_qset *qset,
    struct netif_queue *drvq, kern_packet_svc_class_t svc, bool is_rx)
{
	lck_mtx_init(&drvq->nq_lock, &netif_llink_lock_group,
	    &netif_llink_lock_attr);

	lck_mtx_lock(&drvq->nq_lock);
	KPKTQ_INIT(&drvq->nq_pktq);
	lck_mtx_unlock(&drvq->nq_lock);

	/*
	 * no need to retain a reference for qset, as the queue set is
	 * immutable and the driver queue is part of the queue set data
	 * structure.
	 */
	drvq->nq_qset = qset;
	drvq->nq_svc = svc;
	if (is_rx) {
		drvq->nq_flags |= NETIF_QUEUE_IS_RX;
	}
}

SK_NO_INLINE_ATTRIBUTE
static struct netif_llink *
nx_netif_llink_create_locked(struct nx_netif *nif,
    struct kern_nexus_netif_llink_init *llink_init)
{
	struct netif_llink *llink;
	struct netif_stats *nifs = &nif->nif_stats;

	LCK_RW_ASSERT(&nif->nif_llink_lock, LCK_RW_ASSERT_EXCLUSIVE);
	llink = nx_netif_llink_alloc();
	nx_netif_llink_initialize(llink, nif, llink_init);
	/* nx_netif_llink_alloc retains a reference for the caller */
	STAILQ_INSERT_TAIL(&nif->nif_llink_list, llink, nll_link);
	nif->nif_llink_cnt++;
	STATS_INC(nifs, NETIF_STATS_LLINK_ADD);
	if (NETIF_DEFAULT_LLINK(llink)) {
		/* there can only be one default logical link */
		VERIFY(nif->nif_default_llink == NULL);
	}
	return llink;
}

SK_NO_INLINE_ATTRIBUTE
static void
nx_netif_llink_destroy_locked(struct nx_netif *nif, struct netif_llink **pllink)
{
	struct netif_stats *nifs = &nif->nif_stats;

	LCK_RW_ASSERT(&nif->nif_llink_lock, LCK_RW_ASSERT_EXCLUSIVE);
	(*pllink)->nll_state = NETIF_LLINK_STATE_DESTROYED;
	STAILQ_REMOVE(&nif->nif_llink_list, *pllink, netif_llink, nll_link);
	nif->nif_llink_cnt--;
	STATS_INC(nifs, NETIF_STATS_LLINK_REMOVE);
	nx_netif_llink_release(pllink);
}

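/*
 * Adds a driver-supplied logical link to the netif.  The new llink must have
 * a unique nli_link_id; the provider's qset/queue init callbacks are invoked
 * via nx_netif_llink_ext_init_queues(), and on success *pllink is returned
 * with a reference held for the caller.
 *
 * Hypothetical caller sketch (names are illustrative only):
 *
 *	struct netif_llink *llink = NULL;
 *	err = nx_netif_llink_add(nif, &llink_init, &llink);
 *	if (err == 0) {
 *		... use llink ...
 *		nx_netif_llink_release(&llink);
 *		err = nx_netif_llink_remove(nif, llink_init.nli_link_id);
 *	}
 */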
int
nx_netif_llink_add(struct nx_netif *nif,
    struct kern_nexus_netif_llink_init *llink_init, struct netif_llink **pllink)
{
	int err;
	struct netif_llink *__single llink;
	struct netif_stats *nifs = &nif->nif_stats;

	*pllink = NULL;
	lck_rw_lock_exclusive(&nif->nif_llink_lock);
	/* ensure logical_link_id is unique */
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		if (llink->nll_link_id == llink_init->nli_link_id) {
			SK_ERR("duplicate llink_id 0x%llu",
			    llink_init->nli_link_id);
			STATS_INC(nifs, NETIF_STATS_LLINK_DUP_ID_GIVEN);
			DTRACE_SKYWALK1(dup__id__given, uint64_t,
			    llink_init->nli_link_id);
			lck_rw_unlock_exclusive(&nif->nif_llink_lock);
			return EINVAL;
		}
	}
	llink = nx_netif_llink_create_locked(nif, llink_init);
	lck_rw_unlock_exclusive(&nif->nif_llink_lock);
	VERIFY(llink != NULL);
	err = nx_netif_llink_ext_init_queues(nif->nif_nx, llink);
	if (err != 0) {
		lck_rw_lock_exclusive(&nif->nif_llink_lock);
		nx_netif_llink_destroy_locked(nif, &llink);
		lck_rw_unlock_exclusive(&nif->nif_llink_lock);
	} else {
		/* increment reference for the caller */
		nx_netif_llink_retain(llink);
		*pllink = llink;
	}
	return err;
}

int
nx_netif_llink_remove(struct nx_netif *nif,
    kern_nexus_netif_llink_id_t llink_id)
{
	bool llink_found = false;
	struct netif_llink *__single llink;
	struct netif_stats *nifs = &nif->nif_stats;

	lck_rw_lock_exclusive(&nif->nif_llink_lock);
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		if (llink->nll_link_id == llink_id) {
			llink_found = true;
			break;
		}
	}
	lck_rw_unlock_exclusive(&nif->nif_llink_lock);
	if (!llink_found) {
		STATS_INC(nifs, NETIF_STATS_LLINK_NOT_FOUND_REMOVE);
		DTRACE_SKYWALK1(not__found, uint64_t, llink_id);
		return ENOENT;
	}
	nx_netif_llink_ext_fini_queues(nif->nif_nx, llink);
	lck_rw_lock_exclusive(&nif->nif_llink_lock);
	nx_netif_llink_destroy_locked(nif, &llink);
	lck_rw_unlock_exclusive(&nif->nif_llink_lock);
	return 0;
}

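/*
 * Creates the default logical link for a native interface.  If the nexus is
 * an llink provider, the parameters saved by nx_netif_default_llink_config()
 * are used; otherwise a single-qset configuration is synthesized from the
 * dev adapter's ring counts, with AQM enabled unless the interface is
 * low-latency.
 */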
static void
nx_netif_default_llink_add(struct nx_netif *nif)
{
	struct kern_nexus_netif_llink_init llink_init, *pllink_init;
	struct kern_nexus_netif_llink_qset_init qset;
	struct ifnet *ifp = nif->nif_ifp;
	struct netif_llink *llink;

	LCK_RW_ASSERT(&nif->nif_llink_lock, LCK_RW_ASSERT_EXCLUSIVE);
	VERIFY(SKYWALK_NATIVE(ifp));

	llink_init.nli_flags = KERN_NEXUS_NET_LLINK_DEFAULT;

	if (NX_LLINK_PROV(nif->nif_nx)) {
		VERIFY(nif->nif_default_llink_params != NULL);
		pllink_init = nif->nif_default_llink_params;
	} else {
		struct nexus_adapter *devna =
		    nx_port_get_na(nif->nif_nx, NEXUS_PORT_NET_IF_DEV);

		llink_init.nli_link_id = NETIF_LLINK_ID_DEFAULT;
		qset.nlqi_flags = KERN_NEXUS_NET_LLINK_QSET_DEFAULT;
		/*
		 * For the legacy mode of operation we assume that AQM is
		 * not needed on a low-latency interface.
		 */
		if (NETIF_IS_LOW_LATENCY(nif)) {
			qset.nlqi_flags |=
			    KERN_NEXUS_NET_LLINK_QSET_LOW_LATENCY;
		} else {
			qset.nlqi_flags |= KERN_NEXUS_NET_LLINK_QSET_AQM;
		}
		qset.nlqi_num_rxqs =
		    (uint8_t)na_get_nrings(devna, NR_RX);
		qset.nlqi_num_txqs =
		    (uint8_t)na_get_nrings(devna, NR_TX);
		llink_init.nli_num_qsets = 1;
		llink_init.nli_qsets = &qset;
		llink_init.nli_ctx = NULL;
		pllink_init = &llink_init;
	}
	llink = nx_netif_llink_create_locked(nif, pllink_init);
	/* there can only be one default logical link */
	VERIFY(nif->nif_default_llink == NULL);
	/* obtain a reference for the default logical link pointer */
	nx_netif_llink_retain(llink);
	nif->nif_default_llink = llink;
}

static void
nx_netif_default_llink_remove(struct nx_netif *nif)
{
	struct netif_llink *__single llink;

	LCK_RW_ASSERT(&nif->nif_llink_lock, LCK_RW_ASSERT_EXCLUSIVE);
	ASSERT(nif->nif_default_llink != NULL);
	ASSERT(nif->nif_llink_cnt == 1);
	llink = nif->nif_default_llink;
	nx_netif_llink_release(&nif->nif_default_llink);
	ASSERT(nif->nif_default_llink == NULL);
	nx_netif_llink_destroy_locked(nif, &llink);
}

static int
netif_qset_enqueue_single(struct netif_qset *qset, struct __kern_packet *pkt,
    uint32_t *flowctl, uint32_t *dropped)
{
	struct ifnet *ifp = qset->nqs_ifcq->ifcq_ifp;
	boolean_t pkt_drop = FALSE;
	int err;

	/*
	 * we are using the first 4 bytes of flow_id as the AQM flow
	 * identifier.
	 */
	ASSERT(!uuid_is_null(pkt->pkt_flow_id));
	netif_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
	    1, pkt->pkt_length);

	if (__improbable(pkt->pkt_trace_id != 0)) {
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
		KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
	}

	/* Only native path is supported */
	ASSERT((pkt->pkt_pflags & PKT_F_MBUF_DATA) == 0);
	ASSERT(pkt->pkt_mbuf == NULL);

	err = ifnet_enqueue_ifcq_pkt(ifp, qset->nqs_ifcq, pkt, false,
	    &pkt_drop);
	if (__improbable(err != 0)) {
		if ((err == EQFULL || err == EQSUSPENDED) && flowctl != NULL) {
			(*flowctl)++;
		}
		if (pkt_drop && dropped != NULL) {
			(*dropped)++;
		}
	}
	return err;
}

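/*
 * Enqueues a packet chain onto the qset's AQM ifclassq, one packet at a
 * time (chains are not passed down to AQM yet).  The whole chain is dropped
 * if the parent llink has been destroyed.  Flow-control and drop counts are
 * reported through *flowctl and *dropped, and EIO is returned if either is
 * non-zero.
 */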
int
netif_qset_enqueue(struct netif_qset *qset, struct __kern_packet *pkt_chain,
    struct __kern_packet *tail, uint32_t cnt, uint32_t bytes, uint32_t *flowctl,
    uint32_t *dropped)
{
#pragma unused(tail)
	struct __kern_packet *pkt = pkt_chain;
	struct __kern_packet *next;
	struct netif_stats *nifs = &qset->nqs_llink->nll_nif->nif_stats;
	uint32_t c = 0, b = 0, drop_cnt = 0, flowctl_cnt = 0;
	int err = 0;

	/* drop packets if logical link state is destroyed */
	if (qset->nqs_llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		pp_free_packet_chain(pkt_chain, (int *)&drop_cnt);
		STATS_ADD(nifs, NETIF_STATS_LLINK_TX_DROP_BAD_STATE, drop_cnt);
		if (dropped != NULL) {
			*dropped = drop_cnt;
		}
		return ENXIO;
	}

	/* We don't support chains for now */
	while (pkt != NULL) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;
		c++;
		b += pkt->pkt_length;

		(void) netif_qset_enqueue_single(qset, pkt, &flowctl_cnt,
		    &drop_cnt);
		pkt = next;
	}
	VERIFY(c == cnt);
	VERIFY(b == bytes);
	if (flowctl != NULL && flowctl_cnt > 0) {
		*flowctl = flowctl_cnt;
		STATS_ADD(nifs, NETIF_STATS_LLINK_AQM_QFULL, flowctl_cnt);
		err = EIO;
	}
	if (dropped != NULL && drop_cnt > 0) {
		*dropped = drop_cnt;
		STATS_ADD(nifs, NETIF_STATS_LLINK_AQM_DROPPED, drop_cnt);
		err = EIO;
	}
	return err;
}

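/*
 * Returns the default qset of the default llink without taking a reference.
 * This relies on nif_default_llink holding a reference that is only dropped
 * in nx_netif_llink_fini(); callers that need to hold on to the qset should
 * take their own reference (e.g. via nx_netif_qset_retain() or
 * nx_netif_find_qset()).
 */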
struct netif_qset *
nx_netif_get_default_qset_noref(struct nx_netif *nif)
{
	struct netif_qset *qset;
	struct netif_stats *nifs = &nif->nif_stats;

	ASSERT(NETIF_LLINK_ENABLED(nif));
	if (__improbable(nif->nif_default_llink->nll_state !=
	    NETIF_LLINK_STATE_INIT)) {
		STATS_INC(nifs, NETIF_STATS_LLINK_QSET_BAD_STATE);
		DTRACE_SKYWALK1(llink__bad__state, struct nx_netif *, nif);
		return NULL;
	}
	qset = nif->nif_default_llink->nll_default_qset;
	return qset;
}

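/*
 * A qset hint packs the llink's internal id and the qset index into the low
 * 32 bits (the layout NETIF_QSET_ID_ENCODE() presumably mirrors when
 * constructing nqs_id).  Worked example:
 *
 *	hint = 0x0003000a
 *	link_id_internal = (hint & 0xffff0000) >> 16 = 0x0003
 *	qset_idx         =  hint & 0x0000ffff        = 0x000a
 */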
static void
nx_netif_qset_hint_decode(uint64_t hint,
    uint16_t *link_id_internal, uint16_t *qset_idx)
{
	/* The top 32 bits are unused for now */
	*link_id_internal = (uint16_t)((0xffff0000 & hint) >> 16);
	*qset_idx = (uint16_t)((0x0000ffff & hint));
}

/* retains a reference for the caller */
static struct netif_qset *
nx_netif_get_default_qset(struct nx_netif *nif)
{
	struct netif_qset *qset;

	qset = nif->nif_default_llink->nll_default_qset;
	nx_netif_qset_retain(qset);
	return qset;
}

/*
 * Find the qset based on the qset hint. Fall back to the default qset
 * if not found. The random qset is used for experimentation.
 */
struct netif_qset *
nx_netif_find_qset(struct nx_netif *nif, uint64_t hint)
{
	uint16_t ll_id_internal, qset_idx;
	struct netif_llink *llink;
	struct netif_qset *qset;
	struct netif_stats *nifs = &nif->nif_stats;
	int i, j, random_id;

	ASSERT(NETIF_LLINK_ENABLED(nif));
	if (__improbable(nif->nif_default_llink->nll_state !=
	    NETIF_LLINK_STATE_INIT)) {
		STATS_INC(nifs, NETIF_STATS_LLINK_QSET_BAD_STATE);
		DTRACE_SKYWALK1(llink__bad__state, struct nx_netif *, nif);
		return NULL;
	}
	if (!NX_LLINK_PROV(nif->nif_nx) ||
	    (nx_netif_random_qset == 0 && hint == 0)) {
		goto def_qset;
	}
	if (nx_netif_random_qset == 0) {
		nx_netif_qset_hint_decode(hint, &ll_id_internal, &qset_idx);
	} else {
		ll_id_internal = 0;
		qset_idx = 0;
	}
	lck_rw_lock_shared(&nif->nif_llink_lock);
	i = 0;
	random_id = random();
	STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
		if (nx_netif_random_qset != 0 &&
		    (random_id % nif->nif_llink_cnt) == i) {
			break;
		} else if (llink->nll_link_id_internal == ll_id_internal) {
			break;
		}
		i++;
	}
	if (llink == NULL) {
		STATS_INC(nifs, NETIF_STATS_LLINK_HINT_NOT_USEFUL);
		lck_rw_unlock_shared(&nif->nif_llink_lock);
		goto def_qset;
	}
	j = 0;
	random_id = random();
	SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
		if (nx_netif_random_qset != 0 &&
		    (random_id % llink->nll_qset_cnt) == j) {
			break;
		} else if (qset->nqs_idx == qset_idx) {
			break;
		}
		j++;
	}
	if (qset == NULL) {
		STATS_INC(nifs, NETIF_STATS_LLINK_HINT_NOT_USEFUL);
		lck_rw_unlock_shared(&nif->nif_llink_lock);
		goto def_qset;
	}
	nx_netif_qset_retain(qset);
	STATS_INC(nifs, NETIF_STATS_LLINK_NONDEF_QSET_USED);
	lck_rw_unlock_shared(&nif->nif_llink_lock);
	if (nx_netif_random_qset != 0) {
		SK_DF(SK_VERB_LLINK, "%s: random qset: qset %p, ifcq %p, "
		    "llink_idx %d, qset_idx %d", if_name(nif->nif_ifp),
		    qset, qset->nqs_ifcq, i, j);

		DTRACE_SKYWALK5(random__qset, struct nx_netif *, nif,
		    struct netif_qset *, qset, struct ifclassq *,
		    qset->nqs_ifcq, int, i, int, j);
	} else {
		SK_DF(SK_VERB_LLINK, "%s: non-default qset: qset %p, ifcq %p, "
		    "ll_id_internal 0x%x, qset_idx %d", if_name(nif->nif_ifp),
		    qset, qset->nqs_ifcq, ll_id_internal, qset_idx);

		DTRACE_SKYWALK5(nondef__qset, struct nx_netif *, nif,
		    struct netif_qset *, qset, struct ifclassq *,
		    qset->nqs_ifcq, uint16_t, ll_id_internal,
		    uint16_t, qset_idx);
	}
	return qset;

def_qset:
	STATS_INC(nifs, NETIF_STATS_LLINK_DEF_QSET_USED);
	qset = nx_netif_get_default_qset(nif);
	ASSERT(qset != NULL);

	SK_DF(SK_VERB_LLINK, "%s: default qset: qset %p, ifcq %p, hint %llx",
	    if_name(nif->nif_ifp), qset, qset->nqs_ifcq, hint);

	DTRACE_SKYWALK4(def__qset, struct nx_netif *, nif, struct netif_qset *,
	    qset, struct ifclassq *, qset->nqs_ifcq, uint64_t, hint);
	return qset;
}

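/*
 * One-time logical link setup for a netif instance.  Logical links are
 * supported on native devices only (and can be disabled on DEVELOPMENT/DEBUG
 * kernels with the sk_disable_llink boot-arg); on success the default llink
 * is created and NETIF_FLAG_LLINK_INITIALIZED is set.
 */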
void
nx_netif_llink_init(struct nx_netif *nif)
{
	ifnet_t ifp = nif->nif_ifp;

#if (DEVELOPMENT || DEBUG)
	if (__improbable(nx_netif_disable_llink != 0)) {
		SK_DF(SK_VERB_LLINK, "%s: llink is disabled",
		    if_name(nif->nif_ifp));
		return;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	if (!SKYWALK_NATIVE(ifp)) {
		SK_DF(SK_VERB_LLINK,
		    "%s: llink is supported on native devices only",
		    if_name(ifp));
		return;
	}
	ASSERT(!NETIF_LLINK_ENABLED(nif));
	lck_rw_init(&nif->nif_llink_lock, &netif_llink_lock_group,
	    &netif_llink_lock_attr);

	lck_rw_lock_exclusive(&nif->nif_llink_lock);

	STAILQ_INIT(&nif->nif_llink_list);
	nif->nif_llink_cnt = 0;
	nx_netif_default_llink_add(nif);
	nif->nif_flags |= NETIF_FLAG_LLINK_INITIALIZED;

	lck_rw_unlock_exclusive(&nif->nif_llink_lock);

	SK_DF(SK_VERB_LLINK, "%s: llink initialized", if_name(ifp));
}

void
nx_netif_llink_fini(struct nx_netif *nif)
{
	if (!NETIF_LLINK_ENABLED(nif)) {
		SK_DF(SK_VERB_LLINK, "%s: llink not initialized",
		    if_name(nif->nif_ifp));
		return;
	}

	lck_rw_lock_exclusive(&nif->nif_llink_lock);

	nif->nif_flags &= ~NETIF_FLAG_LLINK_INITIALIZED;
	nx_netif_default_llink_remove(nif);
	ASSERT(nif->nif_llink_cnt == 0);
	ASSERT(STAILQ_EMPTY(&nif->nif_llink_list));

	lck_rw_unlock_exclusive(&nif->nif_llink_lock);

	nx_netif_llink_config_free(nif);
	lck_rw_destroy(&nif->nif_llink_lock, &netif_llink_lock_group);
	SK_DF(SK_VERB_LLINK, "%s: llink uninitialization done",
	    if_name(nif->nif_ifp));
}

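/*
 * Sanity-checks a driver-supplied llink configuration: the default-llink
 * flag must match what the caller expects, at least one qset must be
 * provided, at most one qset may be marked default (and it must have RX
 * queues), every qset needs TX queues, and a WMM-mode qset must have exactly
 * NEXUS_NUM_WMM_QUEUES TX queues.
 */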
int
nx_netif_validate_llink_config(struct kern_nexus_netif_llink_init *init,
    bool default_llink)
{
	struct kern_nexus_netif_llink_qset_init *qsinit;
	bool has_default_qset = false;
	bool default_llink_flag;
	uint8_t i;

	default_llink_flag =
	    ((init->nli_flags & KERN_NEXUS_NET_LLINK_DEFAULT) != 0);

	if (default_llink != default_llink_flag) {
		SK_ERR("default llink flag incompatible: default_llink(%s), "
		    "default_llink_flag(%s)",
		    default_llink ? "true" : "false",
		    default_llink_flag ? "true" : "false");
		return EINVAL;
	}
	if (init->nli_num_qsets == 0) {
		SK_ERR("num qsets is zero");
		return EINVAL;
	}
	if ((qsinit = init->nli_qsets) == NULL) {
		SK_ERR("qsets is NULL");
		return EINVAL;
	}
	for (i = 0; i < init->nli_num_qsets; i++) {
		if (qsinit[i].nlqi_flags &
		    KERN_NEXUS_NET_LLINK_QSET_DEFAULT) {
			if (has_default_qset) {
				SK_ERR("has more than one default qset");
				return EINVAL;
			}
			if (qsinit[i].nlqi_num_rxqs == 0) {
				SK_ERR("num_rxqs == 0");
				return EINVAL;
			}
			has_default_qset = true;
		}
		if (qsinit[i].nlqi_num_txqs == 0) {
			SK_ERR("num_txqs == 0");
			return EINVAL;
		}
		if ((qsinit[i].nlqi_flags &
		    KERN_NEXUS_NET_LLINK_QSET_WMM_MODE) &&
		    (qsinit[i].nlqi_num_txqs != NEXUS_NUM_WMM_QUEUES)) {
			SK_ERR("invalid wmm mode");
			return EINVAL;
		}
	}
	return 0;
}

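/*
 * Validates and deep-copies the caller's default llink parameters (the
 * header plus the nli_qsets array) so that they can be replayed later by
 * nx_netif_default_llink_add(); the copy is released via
 * nx_netif_llink_config_free().
 */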
int
nx_netif_default_llink_config(struct nx_netif *nif,
    struct kern_nexus_netif_llink_init *init)
{
	struct kern_nexus_netif_llink_qset_init *qsinit;
	int i, err;

	err = nx_netif_validate_llink_config(init, true);
	if (err != 0) {
		return err;
	}
	nif->nif_default_llink_params = sk_alloc_type(
		struct kern_nexus_netif_llink_init,
		Z_WAITOK | Z_NOFAIL, nx_netif_tag_llink_cfg);

	qsinit = sk_alloc_type_array(struct kern_nexus_netif_llink_qset_init,
	    init->nli_num_qsets, Z_WAITOK, nx_netif_tag_llink_cfg);
	if (qsinit == NULL) {
		SK_ERR("failed to alloc kern_nexus_netif_llink_qset_init");
		sk_free_type(struct kern_nexus_netif_llink_init,
		    nif->nif_default_llink_params);
		nif->nif_default_llink_params = NULL;
		return ENOMEM;
	}
	memcpy(nif->nif_default_llink_params, init,
	    __builtin_offsetof(struct kern_nexus_netif_llink_init,
	    nli_qsets));
	for (i = 0; i < init->nli_num_qsets; i++) {
		*(&qsinit[i]) = *(&init->nli_qsets[i]);
	}
	nif->nif_default_llink_params->nli_qsets = qsinit;
	nif->nif_default_llink_params->nli_num_qsets = init->nli_num_qsets;
	return 0;
}

void
nx_netif_llink_config_free(struct nx_netif *nif)
{
	if (nif->nif_default_llink_params == NULL) {
		return;
	}
	sk_free_type_array_counted_by(struct kern_nexus_netif_llink_qset_init,
	    nif->nif_default_llink_params->nli_num_qsets,
	    nif->nif_default_llink_params->nli_qsets);

	sk_free_type(struct kern_nexus_netif_llink_init,
	    nif->nif_default_llink_params);
	nif->nif_default_llink_params = NULL;
}

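/*
 * Invokes the nexus provider's qset_init and queue_init callbacks for every
 * qset (and each of its RX/TX driver queues) on the llink, tracking progress
 * with NETIF_QSET_FLAG_EXT_INITED / NETIF_QUEUE_EXT_INITED so that a failure
 * at any point can be unwound by nx_netif_llink_ext_fini_queues().
 */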
static int
nx_netif_llink_ext_init_queues(struct kern_nexus *nx, struct netif_llink *llink)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct kern_nexus_netif_provider_init *nxnpi;
	struct netif_qset *qset;
	struct netif_stats *nifs = &NX_NETIF_PRIVATE(nx)->nif_stats;
	int err = 0;
	uint8_t i;

	nxnpi = &nxprov->nxprov_netif_ext;
	ASSERT(nxprov->nxprov_netif_ext.nxnpi_qset_init != NULL);
	ASSERT(nxprov->nxprov_netif_ext.nxnpi_queue_init != NULL);

	SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
		struct netif_queue *drvq;

		ASSERT((qset->nqs_flags & NETIF_QSET_FLAG_EXT_INITED) == 0);
		err = nxnpi->nxnpi_qset_init(nxprov, nx, llink->nll_ctx,
		    qset->nqs_idx, qset->nqs_id, qset, &qset->nqs_ctx);
		if (err != 0) {
			STATS_INC(nifs, NETIF_STATS_LLINK_QSET_INIT_FAIL);
			SK_ERR("nx: 0x%llx, qset: %d, qset init err %d",
			    SK_KVA(nx), qset->nqs_idx, err);
			goto out;
		}
		qset->nqs_flags |= NETIF_QSET_FLAG_EXT_INITED;

		for (i = 0; i < qset->nqs_num_rx_queues; i++) {
			drvq = NETIF_QSET_RX_QUEUE(qset, i);

			ASSERT((drvq->nq_flags & NETIF_QUEUE_EXT_INITED) == 0);
			err = nxnpi->nxnpi_queue_init(nxprov, nx, qset->nqs_ctx,
			    i, false, drvq, &drvq->nq_ctx);
			if (err != 0) {
				STATS_INC(nifs, NETIF_STATS_LLINK_RXQ_INIT_FAIL);
				SK_ERR("nx: 0x%llx qset: %d queue_init err %d",
				    SK_KVA(nx), qset->nqs_idx, err);
				goto out;
			}
			drvq->nq_flags |= NETIF_QUEUE_EXT_INITED;
		}
		for (i = 0; i < qset->nqs_num_tx_queues; i++) {
			drvq = NETIF_QSET_TX_QUEUE(qset, i);

			ASSERT((drvq->nq_flags & NETIF_QUEUE_EXT_INITED) == 0);
			err = nxnpi->nxnpi_queue_init(nxprov, nx, qset->nqs_ctx,
			    i, true, drvq, &drvq->nq_ctx);
			if (err != 0) {
				STATS_INC(nifs, NETIF_STATS_LLINK_TXQ_INIT_FAIL);
				SK_ERR("nx: 0x%llx qset: %d queue_init err %d",
				    SK_KVA(nx), qset->nqs_idx, err);
				goto out;
			}
			drvq->nq_flags |= NETIF_QUEUE_EXT_INITED;
		}
	}
out:
	if (err != 0) {
		nx_netif_llink_ext_fini_queues(nx, llink);
	}
	return err;
}

static void
nx_netif_llink_ext_fini_queues(struct kern_nexus *nx, struct netif_llink *llink)
{
	struct kern_nexus_provider *nxprov = NX_PROV(nx);
	struct kern_nexus_netif_provider_init *nxnpi;
	struct netif_qset *qset;
	uint8_t i;

	nxnpi = &nxprov->nxprov_netif_ext;
	ASSERT(nxprov->nxprov_netif_ext.nxnpi_qset_fini != NULL);
	ASSERT(nxprov->nxprov_netif_ext.nxnpi_queue_fini != NULL);

	SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
		struct netif_queue *drvq;

		for (i = 0; i < qset->nqs_num_rx_queues; i++) {
			drvq = NETIF_QSET_RX_QUEUE(qset, i);
			if ((drvq->nq_flags & NETIF_QUEUE_EXT_INITED) == 0) {
				continue;
			}
			nxnpi->nxnpi_queue_fini(nxprov, nx, drvq->nq_ctx);
			drvq->nq_flags &= ~NETIF_QUEUE_EXT_INITED;
		}
		for (i = 0; i < qset->nqs_num_tx_queues; i++) {
			drvq = NETIF_QSET_TX_QUEUE(qset, i);
			if ((drvq->nq_flags & NETIF_QUEUE_EXT_INITED) == 0) {
				continue;
			}
			nxnpi->nxnpi_queue_fini(nxprov, nx, drvq->nq_ctx);
			drvq->nq_flags &= ~NETIF_QUEUE_EXT_INITED;
		}
		if ((qset->nqs_flags & NETIF_QSET_FLAG_EXT_INITED) == 0) {
			continue;
		}
		nxnpi->nxnpi_qset_fini(nxprov, nx, qset->nqs_ctx);
		qset->nqs_flags &= ~NETIF_QSET_FLAG_EXT_INITED;
	}
}

int
nx_netif_llink_ext_init_default_queues(struct kern_nexus *nx)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	return nx_netif_llink_ext_init_queues(nx, nif->nif_default_llink);
}

void
nx_netif_llink_ext_fini_default_queues(struct kern_nexus *nx)
{
	struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
	nx_netif_llink_ext_fini_queues(nx, nif->nif_default_llink);
}