/*
 * Copyright (c) 2019-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <sys/kdebug.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <net/dlil_sysctl.h>

extern kern_return_t thread_terminate(thread_t);

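/*
 * Exponentially weighted moving average with a per-sample weight of
 * 1/2^decay, i.e. avg' = ((2^decay - 1) * avg + new) >> decay; a zero
 * average is seeded directly with the new sample.  With decay == 2,
 * for instance, each new sample contributes a quarter of the average.
 */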
#define NETIF_POLL_EWMA(old, new, decay) do {                                 \
	uint32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

/* rate limit debug messages */
struct timespec netif_poll_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };

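/*
 * Ask the driver (via the IFNET_CTL_SET_INPUT_MODEL downcall) to switch
 * to the given input model, then update the poll cycle and the per-mode
 * request/error counters to match.
 */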
static inline void
nx_netif_rxpoll_set_mode(struct ifnet *ifp, ifnet_model_t mode)
{
	errno_t err;
	uint64_t ival;
	struct timespec ts;
	struct ifnet_model_params p = { .model = mode, .reserved = { 0 } };

	if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
		ival = IF_RXPOLL_INTERVALTIME_MIN;
	}

	if ((err = ((*ifp->if_input_ctl)(ifp, IFNET_CTL_SET_INPUT_MODEL,
	    sizeof(p), &p))) != 0) {
		SK_ERR("%s: error setting polling mode to %s (%d)",
		    if_name(ifp), (mode == IFNET_MODEL_INPUT_POLL_ON) ?
		    "ON" : "OFF", err);
	}

	switch (mode) {
	case IFNET_MODEL_INPUT_POLL_OFF:
		ifnet_set_poll_cycle(ifp, NULL);
		ifp->if_rxpoll_offreq++;
		if (err != 0) {
			ifp->if_rxpoll_offerr++;
		}
		break;

	case IFNET_MODEL_INPUT_POLL_ON:
		net_nsectimer(&ival, &ts);
		ifnet_set_poll_cycle(ifp, &ts);
		ifp->if_rxpoll_onreq++;
		if (err != 0) {
			ifp->if_rxpoll_onerr++;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}

/*
 * Updates the input poll statistics and determines the next mode based
 * on the configured thresholds.
 */
static inline void
netif_rxpoll_compat_update_rxpoll_stats(struct ifnet *ifp,
    struct ifnet_stat_increment_param *s)
{
	uint32_t poll_thresh = 0, poll_ival = 0;
	uint32_t m_cnt, m_size, poll_req = 0;
	struct timespec now, delta;
	ifnet_model_t mode;
	uint64_t ival;

	ASSERT(net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL));
	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_NOTOWNED);

	/* total packets and bytes passed in by driver */
	m_cnt = s->packets_in;
	m_size = s->bytes_in;

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
		ival = IF_RXPOLL_INTERVALTIME_MIN;
	}
	/* Link parameters changed? */
	if (ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
		(void) netif_rxpoll_set_params(ifp, NULL, TRUE);
	}

	/* Current operating mode */
	mode = ifp->if_poll_mode;

	nanouptime(&now);
	if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
		*(&ifp->if_poll_sample_lasttime) = now;
	}

	net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
	if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
		uint32_t ptot, btot;

		/* Accumulate statistics for current sampling */
		PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

		if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
			goto skip;
		}
		*(&ifp->if_poll_sample_lasttime) = now;

		/* Calculate min/max of inbound bytes */
		btot = (uint32_t)ifp->if_poll_sstats.bytes;
		if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
			ifp->if_rxpoll_bmin = btot;
		}
		if (btot > ifp->if_rxpoll_bmax) {
			ifp->if_rxpoll_bmax = btot;
		}

		/* Calculate EWMA of inbound bytes */
		NETIF_POLL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

		/* Calculate min/max of inbound packets */
		ptot = (uint32_t)ifp->if_poll_sstats.packets;
		if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
			ifp->if_rxpoll_pmin = ptot;
		}
		if (ptot > ifp->if_rxpoll_pmax) {
			ifp->if_rxpoll_pmax = ptot;
		}

		/* Calculate EWMA of inbound packets */
		NETIF_POLL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

		/* Reset sampling statistics */
		PKTCNTR_CLEAR(&ifp->if_poll_sstats);

#if (SK_LOG && (DEVELOPMENT || DEBUG))
		if (__improbable(sk_verbose & SK_VERB_NETIF_POLL)) {
			if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
				*(&ifp->if_poll_dbg_lasttime) = *(&now);
			}
			net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
			if (net_timercmp(&delta, &netif_poll_dbgrate, >=)) {
				*(&ifp->if_poll_dbg_lasttime) = *(&now);
				SK_DF(SK_VERB_NETIF_POLL,
				    "%s: [%s] pkts avg %d max %d "
				    "limits [%d/%d], bytes avg %d "
				    "limits [%d/%d]", if_name(ifp),
				    (ifp->if_poll_mode ==
				    IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat,
				    ifp->if_rxpoll_bavg,
				    ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}
		}
#endif /* (SK_LOG && (DEVELOPMENT || DEBUG)) */

		/* Perform mode transition, if necessary */
		if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
			*(&ifp->if_poll_mode_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
		if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
			goto skip;
		}

		if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
		    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
		    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
			mode = IFNET_MODEL_INPUT_POLL_OFF;
		} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
		    ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat &&
		    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
			mode = IFNET_MODEL_INPUT_POLL_ON;
		}

		if (mode != ifp->if_poll_mode) {
			ifp->if_poll_mode = mode;
			*(&ifp->if_poll_mode_lasttime) = *(&now);
			poll_req++;
		}
	}
skip:
	/* update rxpoll stats */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets +=
		    ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes +=
		    ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	lck_mtx_unlock(&ifp->if_poll_lock);
	/*
	 * If there's a mode change, perform a downcall to the driver
	 * for the new mode. This function is called from the poller thread
	 * which holds a reference on the ifnet.
	 */
	if (poll_req != 0) {
		nx_netif_rxpoll_set_mode(ifp, mode);
	}

	/* Signal the poller thread to do work if required */
	if (mode == IFNET_MODEL_INPUT_POLL_ON && m_cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = m_cnt;
	}
	if (poll_thresh != 0 && poll_ival > 0 &&
	    (--poll_thresh % poll_ival) == 0) {
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		ifp->if_poll_req++;
		lck_mtx_unlock(&ifp->if_poll_lock);
	}
}

/*
 * Must be called on an attached ifnet (caller is expected to check.)
 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
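 *
 * For example, a driver could revert to the auto-tuned values with:
 *
 *	(void) netif_rxpoll_set_params(ifp, NULL, FALSE);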
 */
errno_t
netif_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;

	VERIFY(ifp != NULL);
	if ((ifp->if_eflags & IFEF_RXPOLL) == 0) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&ifp->if_poll_lock);
	}
	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the poller thread detects a change in link rate.  If the
	 * driver provides its own parameters right after the link rate
	 * changes, but before the poller thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&ifp->if_poll_lock);
	}
	return 0;
}

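/*
 * Poll up to m_lim packets from the driver via its if_input_poll
 * routine, feed the resulting chain to ifnet_input_poll(), and record
 * how long the poll took so the caller can decide whether another
 * pass is needed before the next poll cycle.
 */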
static inline void
netif_rxpoll_poll_driver(struct ifnet *ifp, uint32_t m_lim,
    struct ifnet_stat_increment_param *s, struct timespec *start_time,
    struct timespec *poll_duration)
{
	struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
	uint32_t m_cnt = 0, m_totlen = 0;
	struct timespec now;

	/* invoke the driver's input poll routine */
	((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail, &m_cnt,
	    &m_totlen));
	VERIFY((m_cnt > 0) || ((m_head == NULL) && (m_tail == NULL)));

	s->packets_in = m_cnt;
	s->bytes_in = m_totlen;
	/*
	 * Bracket the work done with timestamps to compute the effective
	 * poll interval.
	 */
	nanouptime(start_time);
	(void) ifnet_input_poll(ifp, m_head, m_tail,
	    (m_head != NULL) ? s : NULL);
	nanouptime(&now);
	net_timersub(&now, start_time, poll_duration);

	SK_DF(SK_VERB_NETIF_POLL, "%s: polled %d pkts, pkts avg %d max %d, "
	    "wreq avg %d, bytes avg %d", if_name(ifp), m_cnt,
	    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
	    ifp->if_rxpoll_bavg);
}

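/*
 * Interrupt (non-polling) mode: snapshot the receive queue counters
 * for the stats update and kick the netif RX mitigation path to drain
 * the queue.
 */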
static inline void
netif_rxpoll_process_interrupt(struct ifnet *ifp, proc_t p,
    struct ifnet_stat_increment_param *s, struct nx_mbq *rcvq)
{
	struct nexus_adapter *na = &NA(ifp)->nifna_up;

	nx_mbq_lock_spin(rcvq);
	s->packets_in = nx_mbq_len(rcvq);
	s->bytes_in = (uint32_t)nx_mbq_size(rcvq);
	nx_mbq_unlock(rcvq);
	(void) nx_netif_mit_rx_intr((NAKR(na, NR_RX)), p, 0, NULL);
}

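/*
 * Continuation routine of the poller thread.  Services requests until
 * none are pending: in polling mode it drives the driver's poll routine
 * and re-arms a wakeup for the remainder of the poll cycle; in
 * interrupt mode it drains the receive queue and then sleeps until
 * ifnet_poll() wakes it up again.
 */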
__attribute__((noreturn))
static void
netif_rxpoll_compat_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *__single ifp = v;
	struct timespec *ts = NULL;
	struct timespec start_time, poll_intvl, poll_duration;
	struct ifnet_stat_increment_param s;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;
	/*
	 * Keep servicing requests until there are none pending.
	 */

	for (;;) {
		uint16_t req = ifp->if_poll_req;
		struct nexus_adapter *na = &NA(ifp)->nifna_up;
		struct __kern_channel_ring *kring = &na->na_rx_rings[0];
		struct nx_mbq *rxq = &kring->ckr_rx_queue;
		uint32_t m_lim;
		boolean_t poll, poll_again = false;

		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((nx_mbq_limit(rxq)), (ifp->if_rxpoll_phiwat << 2));
		poll = (ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_ON);
		lck_mtx_unlock(&ifp->if_poll_lock);

		net_timerclear(&poll_duration);

		/*
		 * If no longer attached, there's nothing to do; else hold
		 * an IO refcnt to prevent the interface from being detached
		 * (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (poll) {
			netif_rxpoll_poll_driver(ifp, m_lim, &s, &start_time,
			    &poll_duration);
			/*
			 * if the polled duration is more than the poll
			 * interval, then poll again to catch up.
			 */
			ASSERT(net_timerisset(&ifp->if_poll_cycle));
			if (net_timercmp(&poll_duration, &ifp->if_poll_cycle,
			    >=)) {
				poll_again = true;
			}
		} else {
			netif_rxpoll_process_interrupt(ifp, kernproc, &s, rxq);
			net_timerclear(&start_time);
		}

		netif_rxpoll_compat_update_rxpoll_stats(ifp, &s);
		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if signalled to terminate */
		if (__improbable((ifp->if_poll_flags & IF_POLLF_TERMINATING)
		    != 0)) {
			break;
		}
		/* if there's no pending request, we're done. */
		if (!poll_again && (req == ifp->if_poll_req)) {
			break;
		}
	}

	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;
	/*
	 * Wake up N ns from now, or sleep indefinitely (ts = NULL)
	 * until ifnet_poll() is called again.
	 */
	/* compute the time remaining in the current poll cycle */
	if (ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_ON) {
		ASSERT(net_timerisset(&ifp->if_poll_cycle));
		ASSERT(net_timercmp(&poll_duration, &ifp->if_poll_cycle, <));
		net_timersub(&ifp->if_poll_cycle, &poll_duration, &poll_intvl);
		ASSERT(net_timerisset(&poll_intvl));
		ts = &poll_intvl;
	} else {
		ts = NULL;
	}

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;

		if (ts != NULL) {
			uint64_t interval;

			_CASSERT(IF_RXPOLL_INTERVALTIME_MIN >= (1ULL * 1000));
			net_timerusec(ts, &interval);
			ASSERT(interval <= UINT32_MAX);
			clock_interval_to_deadline((uint32_t)interval, NSEC_PER_USEC,
			    &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(netif_rxpoll_compat_thread_cont,
		    ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* terminating; interface may have been detached while asleep */
		ifnet_set_poll_cycle(ifp, NULL);
		ifp->if_poll_flags &= ~IF_POLLF_READY;

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);
		SK_DF(SK_VERB_NETIF_POLL, "%s: poller thread terminated",
		    if_name(ifp));
		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}

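/*
 * Entry point of the per-interface poller thread: name the thread,
 * signal nx_netif_compat_na_activate() that the poller is ready, and
 * park in the continuation routine above.
 */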
__attribute__((noreturn))
void
netif_rxpoll_compat_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name_buf[MAXTHREADNAMESIZE];
	const char *__null_terminated thread_name = NULL;
	struct ifnet *__single ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name_buf, sizeof(thread_name_buf));
	thread_name = tsnprintf(thread_name_buf, sizeof(thread_name_buf),
	    "skywalk_netif_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_READY | IF_POLLF_RUNNING)));
	/* tell nx_netif_compat_na_activate() to proceed */
	ifp->if_poll_flags |= IF_POLLF_READY;
	wakeup((caddr_t)&ifp->if_poll_flags);
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(netif_rxpoll_compat_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}