/*
* Copyright (c) 1999-2024 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <net/if_var.h>
#include <net/if_var_private.h>
#include <net/dlil_var_private.h>
#include <net/dlil.h>
#include <net/dlil_sysctl.h>
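/*
 * DLIL_EWMA computes an exponentially weighted moving average using only
 * integer shifts: the previous average is weighted by (1 - 1/2^decay) and
 * the new sample by 1/2^decay.  With illustrative values decay = 2,
 * old = 100 and new = 20:
 *   _avg = ((100 << 2) - 100 + 20) >> 2 = 320 >> 2 = 80
 * which matches 100 * 3/4 + 20 * 1/4.  A zero `old' value simply adopts
 * the new sample.
 */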
#define DLIL_EWMA(old, new, decay) do { \
u_int32_t _avg; \
if ((_avg = (old)) > 0) \
_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
else \
_avg = (new); \
(old) = _avg; \
} while (0)
/*
* Detect whether a queue contains a burst that needs to be trimmed.
*/
#define MBUF_QUEUE_IS_OVERCOMMITTED(q) \
__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) && \
qtype(q) == QP_MBUF)
/* rate limit debug messages */
struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
extern void proto_input_run(void);
static errno_t dlil_input_async(struct dlil_threading_info *inp, struct ifnet *ifp,
    struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
static errno_t dlil_input_sync(struct dlil_threading_info *inp, struct ifnet *ifp,
    struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll, struct thread *tp);
static void dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m,
    char *frame_header, protocol_family_t pf);
static void dlil_input_packet_list_common(struct ifnet *, mbuf_ref_t, u_int32_t,
    ifnet_model_t, boolean_t);
static void dlil_input_thread_func(void *, wait_result_t);
static void dlil_input_thread_cont(void *, wait_result_t);
static inline void dlil_input_wakeup(struct dlil_threading_info *inp);
static int dlil_interface_filters_input(struct ifnet *, mbuf_ref_ref_t, char **,
    protocol_family_t, boolean_t);
static void dlil_main_input_thread_func(void *, wait_result_t);
static void dlil_main_input_thread_cont(void *, wait_result_t);
static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta);
static inline mbuf_t handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt);
/*
* Publicly visible functions.
*/
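/*
* Set up the DLIL input thread state for an interface (or for the main
* input thread when `ifp' is NULL): pick the thread continuation and the
* input strategy (async by default, sync for netif-backed interfaces,
* polling for legacy RXPOLL-capable ones), size the receive mbuf queue,
* start the kernel thread and optionally place it in an affinity set.
* Returns ENODEV when no dedicated thread is needed for this interface.
*/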
int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
thread_continue_t *thfunc)
{
boolean_t dlil_rxpoll_input;
thread_continue_t func = NULL;
u_int32_t limit;
int error = 0;
dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
(ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));
/* default strategy utilizes the DLIL worker thread */
inp->dlth_strategy = dlil_input_async;
/* NULL ifp indicates the main input thread, called at dlil_init time */
if (ifp == NULL) {
/*
* Main input thread only.
*/
func = dlil_main_input_thread_func;
VERIFY(inp == dlil_main_input_thread);
inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
"main_input");
} else if (dlil_rxpoll_input) {
/*
* Legacy (non-netif) hybrid polling.
*/
func = dlil_rxpoll_input_thread_func;
VERIFY(inp != dlil_main_input_thread);
inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
"%s_input_poll", if_name(ifp));
} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
/*
* Asynchronous strategy.
*/
func = dlil_input_thread_func;
VERIFY(inp != dlil_main_input_thread);
inp->dlth_name = tsnprintf(inp->dlth_name_storage, sizeof(inp->dlth_name_storage),
"%s_input", if_name(ifp));
} else {
/*
* Synchronous strategy if there's a netif below and
* the device isn't capable of hybrid polling.
*/
ASSERT(func == NULL);
ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
VERIFY(inp != dlil_main_input_thread);
ASSERT(!inp->dlth_affinity);
inp->dlth_strategy = dlil_input_sync;
inp->dlth_name = __unsafe_null_terminated_from_indexable(inp->dlth_name_storage);
}
VERIFY(inp->dlth_thread == THREAD_NULL);
/* let caller know */
if (thfunc != NULL) {
*thfunc = func;
}
inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);
inp->dlth_ifp = ifp; /* NULL for main input thread */
/*
* For interfaces that support opportunistic polling, set the
* low and high watermarks for outstanding inbound packets/bytes.
* Also define freeze times for transitioning between modes
* and updating the average.
*/
if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
if (ifp->if_xflags & IFXF_LEGACY) {
(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
}
} else {
/*
* For interfaces that don't support opportunistic
* polling, set the burst limit to prevent memory exhaustion.
* The value of `if_rcvq_burst_limit' is safeguarded
* on customer builds by `sysctl_rcvq_burst_limit'.
*/
limit = if_rcvq_burst_limit;
}
_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
if (inp == dlil_main_input_thread) {
dlil_main_threading_info_ref_t inpm =
__container_of(inp, struct dlil_main_threading_info, inp);
_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
}
if (func == NULL) {
ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
ASSERT(error == 0);
error = ENODEV;
goto done;
}
error = kernel_thread_start(func, inp, &inp->dlth_thread);
if (error == KERN_SUCCESS) {
thread_precedence_policy_data_t info;
__unused kern_return_t kret;
bzero(&info, sizeof(info));
info.importance = 0;
kret = thread_policy_set(inp->dlth_thread,
THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
THREAD_PRECEDENCE_POLICY_COUNT);
ASSERT(kret == KERN_SUCCESS);
/*
* We create an affinity set so that the matching workloop
* thread or the starter thread (for loopback) can be
* scheduled on the same processor set as the input thread.
*/
if (net_affinity) {
struct thread *tp __single = inp->dlth_thread;
u_int32_t tag;
/*
* Randomize to reduce the probability
* of affinity tag namespace collision.
*/
read_frandom(&tag, sizeof(tag));
if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
thread_reference(tp);
inp->dlth_affinity_tag = tag;
inp->dlth_affinity = TRUE;
}
}
} else if (inp == dlil_main_input_thread) {
panic_plain("%s: couldn't create main input thread", __func__);
/* NOTREACHED */
} else {
panic_plain("%s: couldn't create %s input thread", __func__,
if_name(ifp));
/* NOTREACHED */
}
OSAddAtomic(1, &cur_dlil_input_threads);
done:
return error;
}
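/*
* Tear down a dedicated input thread.  Must be called from the thread
* itself: it drains any packets still queued on dlth_pkts, signals
* DLIL_INPUT_TERMINATE_COMPLETE to the waiter, drops the reference taken
* by kernel_thread_start() and then terminates the calling thread.
*/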
void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
ifnet_ref_t ifp = inp->dlth_ifp;
classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
VERIFY(current_thread() == inp->dlth_thread);
VERIFY(inp != dlil_main_input_thread);
OSAddAtomic(-1, &cur_dlil_input_threads);
#if TEST_INPUT_THREAD_TERMINATION
{ /* do something useless that won't get optimized away */
uint32_t v = 1;
for (uint32_t i = 0;
i < if_input_thread_termination_spin;
i++) {
v = (i + 1) * v;
}
DLIL_PRINTF("the value is %d\n", v);
}
#endif /* TEST_INPUT_THREAD_TERMINATION */
lck_mtx_lock_spin(&inp->dlth_lock);
_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
wakeup_one((caddr_t)&inp->dlth_flags);
lck_mtx_unlock(&inp->dlth_lock);
/* free up pending packets */
if (pkt.cp_mbuf != NULL) {
mbuf_freem_list(pkt.cp_mbuf);
}
/* for the extra refcnt from kernel_thread_start() */
thread_deallocate(current_thread());
if (dlil_verbose) {
DLIL_PRINTF("%s: input thread terminated\n",
if_name(ifp));
}
/* this is the end */
thread_terminate(current_thread());
/* NOTREACHED */
}
boolean_t
dlil_is_rxpoll_input(thread_continue_t func)
{
return func == dlil_rxpoll_input_thread_func;
}
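/*
* Entry point for inbound packet chains handed up by drivers.  Packets
* are dispatched through the input strategy of the interface's input
* thread (falling back to the main input thread when there is none);
* on DEVELOPMENT/DEBUG kernels a thread marked NET_THREAD_SYNC_RX is
* forced onto the synchronous path.
*/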
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
boolean_t poll, struct thread *tp)
{
dlil_threading_info_ref_t inp = ifp->if_inp;
if (__improbable(inp == NULL)) {
inp = dlil_main_input_thread;
}
#if (DEVELOPMENT || DEBUG)
if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
} else
#endif /* (DEVELOPMENT || DEBUG) */
{
return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
}
}
__private_extern__ void
dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
{
return dlil_input_packet_list_common(ifp, m, 0,
IFNET_MODEL_INPUT_POLL_OFF, FALSE);
}
__private_extern__ void
dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
u_int32_t cnt, ifnet_model_t mode)
{
return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
}
/*
* Static function implementations.
*/
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_ref_t m)
{
int error;
if (ifproto->proto_kpi == kProtoKPI_v1) {
/* Version 1 protocols get one packet at a time */
while (m != NULL) {
/*
* Version 1 KPI does not accept header len,
* hence the pointer to the frame header must be `__single'.
*/
char *frame_header_ptr __single;
mbuf_t next_packet;
next_packet = m->m_nextpkt;
m->m_nextpkt = NULL;
frame_header_ptr = m->m_pkthdr.pkt_hdr;
m->m_pkthdr.pkt_hdr = NULL;
error = (*ifproto->kpi.v1.input)(ifproto->ifp,
ifproto->protocol_family, m, frame_header_ptr);
if (error != 0 && error != EJUSTRETURN) {
m_drop_if(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
}
m = next_packet;
}
} else if (ifproto->proto_kpi == kProtoKPI_v2) {
/* Version 2 protocols support packet lists */
error = (*ifproto->kpi.v2.input)(ifproto->ifp,
ifproto->protocol_family, m);
if (error != 0 && error != EJUSTRETURN) {
m_drop_list(m, ifproto->ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
}
}
}
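/*
* Asynchronous input strategy: enqueue the chain on the input thread's
* receive queue (or on the dedicated lo0 queue for loopback traffic),
* trim the queue if the burst limit was exceeded, update the input
* statistics and wake up the input thread.  Excess packets are freed
* only after dlth_lock has been dropped.
*/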
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
const struct ifnet_stat_increment_param *s, boolean_t poll,
struct thread *tp)
{
u_int32_t m_cnt = s->packets_in;
u_int32_t m_size = s->bytes_in;
boolean_t notify = FALSE;
struct ifnet_stat_increment_param s_adj = *s;
dlil_freeq_t freeq;
MBUFQ_INIT(&freeq);
/*
* If there is a matching DLIL input thread associated with an
* affinity set, associate this thread with the same set. We
* will only do this once.
*/
lck_mtx_lock_spin(&inp->dlth_lock);
if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
(poll && inp->dlth_poller_thread == THREAD_NULL))) {
u_int32_t tag = inp->dlth_affinity_tag;
if (poll) {
VERIFY(inp->dlth_poller_thread == THREAD_NULL);
inp->dlth_poller_thread = tp;
} else {
VERIFY(inp->dlth_driver_thread == THREAD_NULL);
inp->dlth_driver_thread = tp;
}
lck_mtx_unlock(&inp->dlth_lock);
/* Associate the current thread with the new affinity tag */
(void) dlil_affinity_set(tp, tag);
/*
* Take a reference on the current thread; during detach,
* we will need to refer to it in order to tear down its
* affinity.
*/
thread_reference(tp);
lck_mtx_lock_spin(&inp->dlth_lock);
}
VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));
/*
* Because of looped-back multicast we cannot stuff the ifp in
* the rcvif of the packet header: loopback (lo0) packets use a
* dedicated list so that we can later associate them with lo_ifp
* on their way up the stack. Packets for other interfaces without
* dedicated input threads go to the regular list.
*/
if (m_head != NULL) {
classq_pkt_t head, tail;
class_queue_t *input_queue;
CLASSQ_PKT_INIT_MBUF(&head, m_head);
CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
if (inp == dlil_main_input_thread && ifp == lo_ifp) {
dlil_main_threading_info_ref_t inpm =
__container_of(inp, struct dlil_main_threading_info, inp);
input_queue = &inpm->lo_rcvq_pkts;
} else {
input_queue = &inp->dlth_pkts;
}
_addq_multi(input_queue, &head, &tail, m_cnt, m_size);
if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
inp->dlth_trim_pkts_dropped += s_adj.dropped;
inp->dlth_trim_cnt += 1;
os_log_error(OS_LOG_DEFAULT,
"%s %s burst limit %u (sysctl: %u) exceeded. "
"%u packets dropped [%u total in %u events]. new qlen %u ",
__func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
qlen(input_queue));
}
}
#if IFNET_INPUT_SANITY_CHK
/*
* Verify that the original stat increment parameter
* accurately describes the input chain `m_head`.
* This is not affected by the trimming of the input queue.
*/
if (__improbable(dlil_input_sanity_check != 0)) {
u_int32_t count = 0, size = 0;
struct mbuf *m0;
for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
size += m_length(m0);
count++;
}
if (count != m_cnt) {
panic_plain("%s: invalid total packet count %u "
"(expected %u)\n", if_name(ifp), count, m_cnt);
/* NOTREACHED */
__builtin_unreachable();
} else if (size != m_size) {
panic_plain("%s: invalid total packet size %u "
"(expected %u)\n", if_name(ifp), size, m_size);
/* NOTREACHED */
__builtin_unreachable();
}
inp->dlth_pkts_cnt += m_cnt;
}
#else
m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
#endif /* IFNET_INPUT_SANITY_CHK */
/* NOTE: use the adjusted parameter, vs the original one */
dlil_input_stats_add(&s_adj, inp, ifp, poll);
/*
* If we're using the main input thread, synchronize the
* stats now since we have the interface context. All
* other cases involving dedicated input threads will
* have their stats synchronized there.
*/
if (inp == dlil_main_input_thread) {
notify = dlil_input_stats_sync(ifp, inp);
}
dlil_input_wakeup(inp);
lck_mtx_unlock(&inp->dlth_lock);
/*
* Actual freeing of the excess packets must happen
* after the dlth_lock has been released.
*/
if (!MBUFQ_EMPTY(&freeq)) {
m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
}
if (notify) {
ifnet_notify_data_threshold(ifp);
}
return 0;
}
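/*
* Synchronous input strategy: the chain still passes through the input
* thread's queue (so that burst trimming and stats accounting are shared
* with the async path), but it is dequeued immediately and processed in
* the calling thread via dlil_input_packet_list_extended().
*/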
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
const struct ifnet_stat_increment_param *s, boolean_t poll,
struct thread *tp)
{
#pragma unused(tp)
u_int32_t m_cnt = s->packets_in;
u_int32_t m_size = s->bytes_in;
boolean_t notify = FALSE;
classq_pkt_t head, tail;
struct ifnet_stat_increment_param s_adj = *s;
dlil_freeq_t freeq;
MBUFQ_INIT(&freeq);
ASSERT(inp != dlil_main_input_thread);
/* XXX: should we just assert instead? */
if (__improbable(m_head == NULL)) {
return 0;
}
CLASSQ_PKT_INIT_MBUF(&head, m_head);
CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
lck_mtx_lock_spin(&inp->dlth_lock);
_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);
if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
inp->dlth_trim_pkts_dropped += s_adj.dropped;
inp->dlth_trim_cnt += 1;
os_log_error(OS_LOG_DEFAULT,
"%s %s burst limit %u (sysctl: %u) exceeded. "
"%u packets dropped [%u total in %u events]. new qlen %u \n",
__func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
qlen(&inp->dlth_pkts));
}
#if IFNET_INPUT_SANITY_CHK
if (__improbable(dlil_input_sanity_check != 0)) {
u_int32_t count = 0, size = 0;
struct mbuf *m0;
for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
m_add_hdr_crumb_interface_input(m0, ifp->if_index, false);
size += m_length(m0);
count++;
}
if (count != m_cnt) {
panic_plain("%s: invalid total packet count %u "
"(expected %u)\n", if_name(ifp), count, m_cnt);
/* NOTREACHED */
__builtin_unreachable();
} else if (size != m_size) {
panic_plain("%s: invalid total packet size %u "
"(expected %u)\n", if_name(ifp), size, m_size);
/* NOTREACHED */
__builtin_unreachable();
}
inp->dlth_pkts_cnt += m_cnt;
}
#else
m_add_hdr_crumb_interface_input(m_head, ifp->if_index, true);
#endif /* IFNET_INPUT_SANITY_CHK */
/* NOTE: use the adjusted parameter, vs the original one */
dlil_input_stats_add(&s_adj, inp, ifp, poll);
m_cnt = qlen(&inp->dlth_pkts);
_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);
#if SKYWALK
/*
* If this interface is attached to a netif nexus,
* the stats are already incremented there; otherwise
* do it here.
*/
if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
notify = dlil_input_stats_sync(ifp, inp);
lck_mtx_unlock(&inp->dlth_lock);
/*
* Actual freeing of the excess packets must happen
* after the dlth_lock has been released.
*/
if (!MBUFQ_EMPTY(&freeq)) {
m_drop_list(MBUFQ_FIRST(&freeq), ifp, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_BURST_LIMIT, NULL, 0);
}
if (notify) {
ifnet_notify_data_threshold(ifp);
}
/*
* NOTE warning %%% attention !!!!
* We should think about adding thread starvation
* safeguards when dealing with long chains of packets.
*/
if (head.cp_mbuf != NULL) {
dlil_input_packet_list_extended(ifp, head.cp_mbuf,
m_cnt, ifp->if_poll_mode);
}
return 0;
}
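/*
* Receive checksum debugging (hwcksum_dbg).  Depending on the mode this
* either forces partial checksum offload by computing the 16-bit 1's
* complement sum in software, verifies a driver-supplied partial checksum
* against a recomputed one, or re-adjusts the sum to emulate hardware
* that starts summing at a different receive offset.
*/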
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
protocol_family_t pf)
{
uint16_t sum = 0;
uint32_t hlen;
if (frame_header == NULL ||
frame_header < (char *)mbuf_datastart(m) ||
frame_header > (char *)m->m_data) {
DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
"[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
(uint64_t)VM_KERNEL_ADDRPERM(frame_header),
(uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
(uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
(uint64_t)VM_KERNEL_ADDRPERM(m));
return;
}
hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);
switch (pf) {
case PF_INET:
case PF_INET6:
break;
default:
return;
}
/*
* Force partial checksum offload; useful to simulate cases
* where the hardware does not support partial checksum offload,
* in order to validate correctness throughout the layers above.
*/
if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
uint32_t foff = hwcksum_dbg_partial_rxoff_forced;
if (foff > (uint32_t)m->m_pkthdr.len) {
return;
}
m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
/* Compute 16-bit 1's complement sum from forced offset */
sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));
m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
m->m_pkthdr.csum_rx_val = sum;
m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);
hwcksum_dbg_partial_forced++;
hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
}
/*
* Partial checksum offload verification (and adjustment);
* useful to validate and test cases where the hardware
* supports partial checksum offload.
*/
if ((m->m_pkthdr.csum_flags &
(CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
(CSUM_DATA_VALID | CSUM_PARTIAL)) {
uint32_t rxoff;
/* Start offset must begin after frame header */
rxoff = m->m_pkthdr.csum_rx_start;
if (hlen > rxoff) {
hwcksum_dbg_bad_rxoff++;
if (dlil_verbose) {
DLIL_PRINTF("%s: partial cksum start offset %d "
"is less than frame header length %d for "
"mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
(uint64_t)VM_KERNEL_ADDRPERM(m));
}
return;
}
rxoff -= hlen;
if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
/*
* Compute the expected 16-bit 1's complement sum;
* skip this if we've already computed it above
* when partial checksum offload is forced.
*/
sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));
/* Hardware or driver is buggy */
if (sum != m->m_pkthdr.csum_rx_val) {
hwcksum_dbg_bad_cksum++;
if (dlil_verbose) {
DLIL_PRINTF("%s: bad partial cksum value "
"0x%x (expected 0x%x) for mbuf "
"0x%llx [rx_start %d]\n",
if_name(ifp),
m->m_pkthdr.csum_rx_val, sum,
(uint64_t)VM_KERNEL_ADDRPERM(m),
m->m_pkthdr.csum_rx_start);
}
return;
}
}
hwcksum_dbg_verified++;
/*
* This code allows us to emulate various hardware implementations
* that perform the 16-bit 1's complement sum beginning at various
* start offset values.
*/
if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;
if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
return;
}
sum = m_adj_sum16(m, rxoff, aoff,
m_pktlen(m) - aoff, sum);
m->m_pkthdr.csum_rx_val = sum;
m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);
hwcksum_dbg_adjusted++;
}
}
}
#if (DEVELOPMENT || DEBUG)
static void
dlil_input_process_wake_packet(ifnet_t ifp, protocol_family_t protocol_family, mbuf_ref_t m)
{
/*
* For testing, we do not care about broadcast and multicast packets as
* they are not as controllable as unicast traffic.
*/
if (check_wake_mbuf(ifp, protocol_family, m) == false) {
return;
}
if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
(m->m_flags & (M_BCAST | M_MCAST)) == 0) {
/*
* This is a one-shot command
*/
ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
}
}
}
#endif /* (DEVELOPMENT || DEBUG) */
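/*
* Core inbound processing loop.  For each packet in the chain: demux to a
* protocol family, optionally perform CLAT46/64 translation, run the
* interface filters, and batch consecutive packets destined to the same
* protocol attachment so that they can be handed to dlil_ifproto_input()
* as a single list.  An IO (datamov) reference is held on non-loopback
* interfaces while their packets are being processed.
*/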
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, mbuf_ref_t m,
u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
int error = 0;
protocol_family_t protocol_family;
mbuf_t next_packet;
ifnet_t ifp = ifp_param;
char *__single frame_header = NULL;
if_proto_ref_t last_ifproto = NULL;
mbuf_t pkt_first = NULL;
mbuf_t *pkt_next = NULL;
u_int32_t poll_thresh = 0, poll_ival = 0;
int iorefcnt = 0;
boolean_t skip_bridge_filter = FALSE;
KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
(poll_ival = if_rxpoll_interval_pkts) > 0) {
poll_thresh = cnt;
}
if (bridge_enable_early_input != 0 &&
ifp != NULL && ifp->if_bridge != NULL) {
m = handle_bridge_early_input(ifp, m, cnt);
skip_bridge_filter = TRUE;
}
while (m != NULL) {
if_proto_ref_t ifproto = NULL;
uint32_t pktf_mask; /* pkt flags to preserve */
m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
m_add_hdr_crumb_interface_input(m, ifp->if_index, false);
if (ifp_param == NULL) {
ifp = m->m_pkthdr.rcvif;
}
if ((ifp->if_eflags & IFEF_RXPOLL) &&
(ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
ifnet_poll(ifp);
}
/* Check if this mbuf looks valid */
MBUF_INPUT_CHECK(m, ifp);
next_packet = m->m_nextpkt;
m->m_nextpkt = NULL;
frame_header = m->m_pkthdr.pkt_hdr;
m->m_pkthdr.pkt_hdr = NULL;
/*
* Get an IO reference count if the interface is not
* loopback (lo0) and it is attached; lo0 never goes
* away, so optimize for that.
*/
if (ifp != lo_ifp) {
/* iorefcnt is 0 if it hasn't been taken yet */
if (iorefcnt == 0) {
if (!ifnet_datamov_begin(ifp)) {
m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_DATAMOV_BEGIN, NULL, 0);
goto next;
}
}
iorefcnt = 1;
/*
* Preserve the timestamp-valid and skip-pktap flags.
*/
pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
} else {
/*
* If this arrived on lo0, preserve interface addr
* info to allow for connectivity between loopback
* and local interface addresses.
*/
pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
}
pktf_mask |= PKTF_WAKE_PKT;
/* make sure packet comes in clean */
m_classifier_init(m, pktf_mask);
ifp_inc_traffic_class_in(ifp, m);
/* find which protocol family this packet is for */
ifnet_lock_shared(ifp);
error = (*ifp->if_demux)(ifp, m, frame_header,
&protocol_family);
ifnet_lock_done(ifp);
if (error != 0) {
if (error == EJUSTRETURN) {
goto next;
}
protocol_family = 0;
}
/* check for an updated frame header */
if (m->m_pkthdr.pkt_hdr != NULL) {
frame_header = m->m_pkthdr.pkt_hdr;
m->m_pkthdr.pkt_hdr = NULL;
}
#if (DEVELOPMENT || DEBUG)
/* For testing only */
dlil_input_process_wake_packet(ifp, protocol_family, m);
#endif /* (DEVELOPMENT || DEBUG) */
pktap_input(ifp, protocol_family, m, frame_header);
/* Drop v4 packets received on CLAT46 enabled cell interface */
if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
ifp->if_type == IFT_CELLULAR) {
m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
ip6stat.ip6s_clat464_in_v4_drop++;
goto next;
}
/* Translate the packet if it is received on CLAT interface */
if ((m->m_flags & M_PROMISC) == 0 &&
protocol_family == PF_INET6 &&
IS_INTF_CLAT46(ifp) &&
dlil_is_clat_needed(protocol_family, m)) {
char *data = NULL;
struct ether_header eh;
struct ether_header *ehp = NULL;
if (ifp->if_type == IFT_ETHER) {
ehp = (struct ether_header *)(void *)frame_header;
/* Skip RX Ethernet packets if they are not IPv6 */
if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
goto skip_clat;
}
/* Keep a copy of frame_header for Ethernet packets */
char *fh = __unsafe_forge_bidi_indexable(char *, m->m_pkthdr.pkt_hdr, ifnet_hdrlen(ifp));
if (fh) {
bcopy(fh, (caddr_t)&eh, ETHER_HDR_LEN);
}
}
error = dlil_clat64(ifp, &protocol_family, &m);
data = mtod(m, char*);
if (error != 0) {
m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
ip6stat.ip6s_clat464_in_drop++;
goto next;
}
/* Native v6 should be a no-op */
if (protocol_family != PF_INET) {
goto skip_clat;
}
/* Do this only for translated v4 packets. */
switch (ifp->if_type) {
case IFT_CELLULAR:
frame_header = data;
break;
case IFT_ETHER:
/*
* Drop if the mbuf doesn't have enough
* space for the Ethernet header
*/
if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_CLAT64, NULL, 0);
ip6stat.ip6s_clat464_in_drop++;
goto next;
}
/*
* Point frame_header at the ETHER_HDR_LEN bytes
* preceding the data pointer, and change
* the ether_type too.
* N.B. The variable `fh' is needed because
* the `frame_header' variable is `__single',
* and hence would not be appropriate for use with `bcopy'.
*/
char *fh = data - ETHER_HDR_LEN;
frame_header = fh;
eh.ether_type = htons(ETHERTYPE_IP);
bcopy((caddr_t)&eh, fh, ETHER_HDR_LEN);
break;
}
}
skip_clat:
/*
* Match the wake packet against the list of ports that have
* been queried by the driver before the device went to sleep
*/
if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
if (protocol_family != PF_INET && protocol_family != PF_INET6) {
if_ports_used_match_mbuf(ifp, protocol_family, m);
}
}
if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
dlil_input_cksum_dbg(ifp, m, frame_header,
protocol_family);
}
/*
* For partial checksum offload, we expect the driver to
* set the start offset indicating the start of the span
* that is covered by the hardware-computed checksum;
* adjust this start offset accordingly because the data
* pointer has been advanced beyond the link-layer header.
*
* Virtual lan types (bridge, vlan, bond) can call
* dlil_input_packet_list() with the same packet with the
* checksum flags set. Set a flag indicating that the
* adjustment has already been done.
*/
if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
/* adjustment has already been done */
} else if ((m->m_pkthdr.csum_flags &
(CSUM_DATA_VALID | CSUM_PARTIAL)) ==
(CSUM_DATA_VALID | CSUM_PARTIAL)) {
int adj;
if (frame_header == NULL ||
frame_header < (char *)mbuf_datastart(m) ||
frame_header > (char *)m->m_data ||
(adj = (int)(m->m_data - (uintptr_t)frame_header)) >
m->m_pkthdr.csum_rx_start) {
m->m_pkthdr.csum_data = 0;
m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
hwcksum_in_invalidated++;
} else {
m->m_pkthdr.csum_rx_start -= adj;
}
/* make sure we don't adjust more than once */
m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
}
if (clat_debug) {
pktap_input(ifp, protocol_family, m, frame_header);
}
if (m->m_flags & (M_BCAST | M_MCAST)) {
os_atomic_inc(&ifp->if_imcasts, relaxed);
}
/* run interface filters */
error = dlil_interface_filters_input(ifp, &m,
&frame_header, protocol_family, skip_bridge_filter);
if (error != 0) {
if (error != EJUSTRETURN) {
m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_IF_FILTER, NULL, 0);
}
goto next;
}
/*
* VLAN and Bond interfaces receive packets by attaching
* a "protocol" to the underlying interface.
* A promiscuous packet needs to be delivered to the
* VLAN or Bond interface since:
* - a Bond interface member may not support setting the
* MAC address, so packets are inherently "promiscuous"
* - a VLAN or Bond interface could be a member of a bridge,
* where promiscuous packets correspond to other
* devices that the bridge forwards packets to/from
*/
if ((m->m_flags & M_PROMISC) != 0) {
switch (protocol_family) {
case PF_VLAN:
case PF_BOND:
/* VLAN and Bond get promiscuous packets */
break;
default:
if (droptap_verbose > 0) {
m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_PROMISC, NULL, 0);
} else {
m_freem(m);
}
goto next;
}
}
/* Lookup the protocol attachment to this interface */
if (protocol_family == 0) {
ifproto = NULL;
} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
(last_ifproto->protocol_family == protocol_family)) {
VERIFY(ifproto == NULL);
ifproto = last_ifproto;
if_proto_ref(last_ifproto);
} else {
VERIFY(ifproto == NULL);
ifnet_lock_shared(ifp);
/* callee holds a proto refcnt upon success */
ifproto = find_attached_proto(ifp, protocol_family);
ifnet_lock_done(ifp);
}
if (ifproto == NULL) {
/* no protocol for this packet, discard */
m_drop_extended(m, ifp, frame_header, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_NO_PROTO, NULL, 0);
goto next;
}
if (ifproto != last_ifproto) {
if (last_ifproto != NULL) {
/* pass up the list for the previous protocol */
dlil_ifproto_input(last_ifproto, pkt_first);
pkt_first = NULL;
if_proto_free(last_ifproto);
}
last_ifproto = ifproto;
if_proto_ref(ifproto);
}
/* extend the list */
m->m_pkthdr.pkt_hdr = frame_header;
if (pkt_first == NULL) {
pkt_first = m;
} else {
*pkt_next = m;
}
pkt_next = &m->m_nextpkt;
next:
if (next_packet == NULL && last_ifproto != NULL) {
/* pass up the last list of packets */
dlil_ifproto_input(last_ifproto, pkt_first);
if_proto_free(last_ifproto);
last_ifproto = NULL;
}
if (ifproto != NULL) {
if_proto_free(ifproto);
ifproto = NULL;
}
m = next_packet;
/* update the driver's multicast filter, if needed */
if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
ifp->if_updatemcasts = 0;
}
if (iorefcnt == 1) {
/* If the next mbuf is on a different interface, unlock data-mov */
if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
ifnet_datamov_end(ifp);
iorefcnt = 0;
}
}
}
KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
/*
* Input thread for interfaces with legacy input model.
*/
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
char thread_name_storage[MAXTHREADNAMESIZE];
const char *__null_terminated thread_name;
dlil_threading_info_ref_t inp = v;
ifnet_ref_t ifp = inp->dlth_ifp;
VERIFY(inp != dlil_main_input_thread);
VERIFY(ifp != NULL);
VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
!(ifp->if_xflags & IFXF_LEGACY));
VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
!(ifp->if_xflags & IFXF_LEGACY));
VERIFY(current_thread() == inp->dlth_thread);
/* construct the name for this thread, and then apply it */
bzero(thread_name_storage, sizeof(thread_name_storage));
thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
"dlil_input_%s", ifp->if_xname);
thread_set_thread_name(inp->dlth_thread, thread_name);
#if CONFIG_THREAD_GROUPS
if (IFNET_REQUIRES_CELL_GROUP(ifp)) {
thread_group_join_cellular();
}
#endif /* CONFIG_THREAD_GROUPS */
lck_mtx_lock(&inp->dlth_lock);
VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
/* wake up once to get out of embryonic state */
dlil_input_wakeup(inp);
lck_mtx_unlock(&inp->dlth_lock);
(void) thread_block_parameter(dlil_input_thread_cont, inp);
/* NOTREACHED */
__builtin_unreachable();
}
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
dlil_threading_info_ref_t inp = v;
ifnet_ref_t ifp = inp->dlth_ifp;
lck_mtx_lock_spin(&inp->dlth_lock);
if (__improbable(wres == THREAD_INTERRUPTED ||
(inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
goto terminate;
}
VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
inp->dlth_flags |= DLIL_INPUT_RUNNING;
while (1) {
struct mbuf *m = NULL;
classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
boolean_t notify = FALSE;
boolean_t embryonic;
u_int32_t m_cnt;
inp->dlth_flags &= ~DLIL_INPUT_WAITING;
if (__improbable(embryonic =
(inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
}
/*
* Protocol registration and injection must always use
* the main input thread; in theory the latter could utilize
* the input thread corresponding to the interface the packet
* arrived on, but that requires knowing the interface in advance
* (and the benefits might not be worth the trouble.)
*/
VERIFY(!(inp->dlth_flags &
(DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
/* Packets for this interface */
m_cnt = qlen(&inp->dlth_pkts);
_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
m = pkt.cp_mbuf;
inp->dlth_wtot = 0;
#if SKYWALK
/*
* If this interface is attached to a netif nexus,
* the stats are already incremented there; otherwise
* do it here.
*/
if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
notify = dlil_input_stats_sync(ifp, inp);
lck_mtx_unlock(&inp->dlth_lock);
if (__improbable(embryonic)) {
ifnet_decr_pending_thread_count(ifp);
}
if (__improbable(notify)) {
ifnet_notify_data_threshold(ifp);
}
/*
* NOTE warning %%% attention !!!!
* We should think about adding thread starvation
* safeguards when dealing with long chains of packets.
*/
if (__probable(m != NULL)) {
dlil_input_packet_list_extended(ifp, m,
m_cnt, ifp->if_poll_mode);
}
lck_mtx_lock_spin(&inp->dlth_lock);
VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
DLIL_INPUT_TERMINATE))) {
break;
}
}
inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
lck_mtx_unlock(&inp->dlth_lock);
dlil_terminate_input_thread(inp);
/* NOTREACHED */
} else {
(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
lck_mtx_unlock(&inp->dlth_lock);
(void) thread_block_parameter(dlil_input_thread_cont, inp);
/* NOTREACHED */
}
VERIFY(0); /* we should never get here */
/* NOTREACHED */
__builtin_unreachable();
}
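/*
* Mark work as pending and wake the input thread if it is not already
* running.  Caller must hold dlth_lock.
*/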
static inline void
dlil_input_wakeup(struct dlil_threading_info *inp)
{
LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
inp->dlth_flags |= DLIL_INPUT_WAITING;
if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
inp->dlth_wtot++;
wakeup_one((caddr_t)&inp->dlth_flags);
}
}
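/*
* Run the inbound interface filters over a packet.  The filter list is
* marked busy so that it cannot change while if_flt_lock is dropped
* around each filt_input callback; VLAN-tagged packets are only shown to
* internal filters, and bridge filters are skipped when the bridge has
* already seen the packet via early input.
*/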
static int
dlil_interface_filters_input(struct ifnet *ifp, mbuf_ref_ref_t m_p,
char **frame_header_p, protocol_family_t protocol_family,
boolean_t skip_bridge)
{
boolean_t is_vlan_packet = FALSE;
struct ifnet_filter *filter;
struct mbuf *m = *m_p;
is_vlan_packet = packet_has_vlan_tag(m);
if (TAILQ_EMPTY(&ifp->if_flt_head)) {
return 0;
}
/*
* Pass the inbound packet to the interface filters
*/
lck_mtx_lock_spin(&ifp->if_flt_lock);
/* prevent filter list from changing in case we drop the lock */
if_flt_monitor_busy(ifp);
TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
int result;
/* exclude VLAN packets from external filters PR-3586856 */
if (is_vlan_packet &&
(filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
continue;
}
/* the bridge has already seen the packet */
if (skip_bridge &&
(filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
continue;
}
if (!filter->filt_skip && filter->filt_input != NULL &&
(filter->filt_protocol == 0 ||
filter->filt_protocol == protocol_family)) {
lck_mtx_unlock(&ifp->if_flt_lock);
result = (*filter->filt_input)(filter->filt_cookie,
ifp, protocol_family, m_p, frame_header_p);
lck_mtx_lock_spin(&ifp->if_flt_lock);
if (result != 0) {
/* we're done with the filter list */
if_flt_monitor_unbusy(ifp);
lck_mtx_unlock(&ifp->if_flt_lock);
return result;
}
}
}
/* we're done with the filter list */
if_flt_monitor_unbusy(ifp);
lck_mtx_unlock(&ifp->if_flt_lock);
/*
* Strip away the M_PROTO1 bit prior to sending the packet up the stack,
* as it is meant to be local to a subsystem -- if_bridge for M_PROTO1
*/
if (*m_p != NULL) {
(*m_p)->m_flags &= ~M_PROTO1;
}
return 0;
}
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
dlil_threading_info_ref_t inp = v;
VERIFY(inp == dlil_main_input_thread);
VERIFY(inp->dlth_ifp == NULL);
VERIFY(current_thread() == inp->dlth_thread);
lck_mtx_lock(&inp->dlth_lock);
VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
/* wake up once to get out of embryonic state */
dlil_input_wakeup(inp);
lck_mtx_unlock(&inp->dlth_lock);
(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
/* NOTREACHED */
__builtin_unreachable();
}
/*
* Main input thread:
*
* a) handles all inbound packets for lo0
* b) handles all inbound packets for interfaces with no dedicated
* input thread (e.g. anything but Ethernet/PDP or those that support
* opportunistic polling.)
* c) protocol registrations
* d) packet injections
*/
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
dlil_main_threading_info_ref_t inpm = v;
dlil_threading_info_ref_t inp = v;
/* main input thread is uninterruptible */
VERIFY(wres != THREAD_INTERRUPTED);
lck_mtx_lock_spin(&inp->dlth_lock);
VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
DLIL_INPUT_RUNNING)));
inp->dlth_flags |= DLIL_INPUT_RUNNING;
while (1) {
struct mbuf *m = NULL, *m_loop = NULL;
u_int32_t m_cnt, m_cnt_loop;
classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
boolean_t proto_req;
boolean_t embryonic;
inp->dlth_flags &= ~DLIL_INPUT_WAITING;
if (__improbable(embryonic =
(inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
}
proto_req = (inp->dlth_flags &
(DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
/* Packets for non-dedicated interfaces other than lo0 */
m_cnt = qlen(&inp->dlth_pkts);
_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
m = pkt.cp_mbuf;
/* Packets exclusive to lo0 */
m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
m_loop = pkt.cp_mbuf;
inp->dlth_wtot = 0;
lck_mtx_unlock(&inp->dlth_lock);
if (__improbable(embryonic)) {
dlil_decr_pending_thread_count();
}
/*
* NOTE warning %%% attention !!!!
* We should think about adding thread starvation
* safeguards when dealing with long chains of packets.
*/
if (__probable(m_loop != NULL)) {
dlil_input_packet_list_extended(lo_ifp, m_loop,
m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
}
if (__probable(m != NULL)) {
dlil_input_packet_list_extended(NULL, m,
m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
}
if (__improbable(proto_req)) {
proto_input_run();
}
lck_mtx_lock_spin(&inp->dlth_lock);
VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
/* main input thread cannot be terminated */
VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
break;
}
}
inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
lck_mtx_unlock(&inp->dlth_lock);
(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
VERIFY(0); /* we should never get here */
/* NOTREACHED */
__builtin_unreachable();
}
/*
* Input thread for interfaces with opportunistic polling input model.
*/
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
char thread_name_storage[MAXTHREADNAMESIZE];
const char *__null_terminated thread_name;
dlil_threading_info_ref_t inp = v;
ifnet_ref_t ifp = inp->dlth_ifp;
VERIFY(inp != dlil_main_input_thread);
VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
(ifp->if_xflags & IFXF_LEGACY));
VERIFY(current_thread() == inp->dlth_thread);
/* construct the name for this thread, and then apply it */
bzero(thread_name_storage, sizeof(thread_name_storage));
thread_name = tsnprintf(thread_name_storage, sizeof(thread_name_storage),
"dlil_input_poll_%s", ifp->if_xname);
thread_set_thread_name(inp->dlth_thread, thread_name);
lck_mtx_lock(&inp->dlth_lock);
VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
/* wake up once to get out of embryonic state */
dlil_input_wakeup(inp);
lck_mtx_unlock(&inp->dlth_lock);
(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
/* NOTREACHED */
__builtin_unreachable();
}
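/*
* Continuation for the opportunistic-polling input thread.  On each pass
* it accumulates packet/byte counts over the sampling hold time, folds
* them into EWMAs with DLIL_EWMA, and compares the averages against the
* low/high watermarks to decide whether to switch the interface between
* IFNET_MODEL_INPUT_POLL_ON and _OFF; a mode change is pushed down to
* the driver through if_input_ctl before the dequeued chain is processed.
*/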
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
dlil_threading_info_ref_t inp = v;
ifnet_ref_t ifp = inp->dlth_ifp;
struct timespec ts;
lck_mtx_lock_spin(&inp->dlth_lock);
if (__improbable(wres == THREAD_INTERRUPTED ||
(inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
goto terminate;
}
VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
inp->dlth_flags |= DLIL_INPUT_RUNNING;
while (1) {
struct mbuf *m = NULL;
uint32_t m_cnt, poll_req = 0;
uint64_t m_size = 0;
ifnet_model_t mode;
struct timespec now, delta;
classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
boolean_t notify;
boolean_t embryonic;
uint64_t ival;
inp->dlth_flags &= ~DLIL_INPUT_WAITING;
if (__improbable(embryonic =
(inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
goto skip;
}
if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
ival = IF_RXPOLL_INTERVALTIME_MIN;
}
/* Link parameters changed? */
if (ifp->if_poll_update != 0) {
ifp->if_poll_update = 0;
(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
}
/* Current operating mode */
mode = ifp->if_poll_mode;
/*
* Protocol registration and injection must always use
* the main input thread; in theory the latter could utilize
* the input thread corresponding to the interface the packet
* arrived on, but that requires knowing the interface in advance
* (and the benefits might not be worth the trouble.)
*/
VERIFY(!(inp->dlth_flags &
(DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
/* Total count of all packets */
m_cnt = qlen(&inp->dlth_pkts);
/* Total bytes of all packets */
m_size = qsize(&inp->dlth_pkts);
/* Packets for this interface */
_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
m = pkt.cp_mbuf;
VERIFY(m != NULL || m_cnt == 0);
nanouptime(&now);
if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
*(&ifp->if_poll_sample_lasttime) = *(&now);
}
net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
u_int32_t ptot, btot;
/* Accumulate statistics for current sampling */
PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
goto skip;
}
*(&ifp->if_poll_sample_lasttime) = *(&now);
/* Calculate min/max of inbound bytes */
btot = (u_int32_t)ifp->if_poll_sstats.bytes;
if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
ifp->if_rxpoll_bmin = btot;
}
if (btot > ifp->if_rxpoll_bmax) {
ifp->if_rxpoll_bmax = btot;
}
/* Calculate EWMA of inbound bytes */
DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
/* Calculate min/max of inbound packets */
ptot = (u_int32_t)ifp->if_poll_sstats.packets;
if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
ifp->if_rxpoll_pmin = ptot;
}
if (ptot > ifp->if_rxpoll_pmax) {
ifp->if_rxpoll_pmax = ptot;
}
/* Calculate EWMA of inbound packets */
DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
/* Reset sampling statistics */
PKTCNTR_CLEAR(&ifp->if_poll_sstats);
/* Calculate EWMA of wakeup requests */
DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
if_rxpoll_decay);
inp->dlth_wtot = 0;
if (dlil_verbose) {
if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
*(&ifp->if_poll_dbg_lasttime) = *(&now);
}
net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
*(&ifp->if_poll_dbg_lasttime) = *(&now);
DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
"limits [%d/%d], wreq avg %d "
"limits [%d/%d], bytes avg %d "
"limits [%d/%d]\n", if_name(ifp),
(ifp->if_poll_mode ==
IFNET_MODEL_INPUT_POLL_ON) ?
"ON" : "OFF", ifp->if_rxpoll_pavg,
ifp->if_rxpoll_pmax,
ifp->if_rxpoll_plowat,
ifp->if_rxpoll_phiwat,
ifp->if_rxpoll_wavg,
ifp->if_rxpoll_wlowat,
ifp->if_rxpoll_whiwat,
ifp->if_rxpoll_bavg,
ifp->if_rxpoll_blowat,
ifp->if_rxpoll_bhiwat);
}
}
/* Perform mode transition, if necessary */
if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
*(&ifp->if_poll_mode_lasttime) = *(&now);
}
net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
goto skip;
}
if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
mode = IFNET_MODEL_INPUT_POLL_OFF;
} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
(ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
mode = IFNET_MODEL_INPUT_POLL_ON;
}
if (mode != ifp->if_poll_mode) {
ifp->if_poll_mode = mode;
*(&ifp->if_poll_mode_lasttime) = *(&now);
poll_req++;
}
}
skip:
notify = dlil_input_stats_sync(ifp, inp);
lck_mtx_unlock(&inp->dlth_lock);
if (__improbable(embryonic)) {
ifnet_decr_pending_thread_count(ifp);
}
if (__improbable(notify)) {
ifnet_notify_data_threshold(ifp);
}
/*
* If there's a mode change and the interface is still attached,
* perform a downcall to the driver for the new mode. Also
* hold an IO refcnt on the interface to prevent it from
* being detached (it will be released below.)
*/
if (poll_req != 0 && ifnet_get_ioref(ifp)) {
struct ifnet_model_params p = {
.model = mode, .reserved = { 0 }
};
errno_t err;
if (dlil_verbose) {
DLIL_PRINTF("%s: polling is now %s, "
"pkts avg %d max %d limits [%d/%d], "
"wreq avg %d limits [%d/%d], "
"bytes avg %d limits [%d/%d]\n",
if_name(ifp),
(mode == IFNET_MODEL_INPUT_POLL_ON) ?
"ON" : "OFF", ifp->if_rxpoll_pavg,
ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
ifp->if_rxpoll_bhiwat);
}
if ((err = ((*ifp->if_input_ctl)(ifp,
IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
DLIL_PRINTF("%s: error setting polling mode "
"to %s (%d)\n", if_name(ifp),
(mode == IFNET_MODEL_INPUT_POLL_ON) ?
"ON" : "OFF", err);
}
switch (mode) {
case IFNET_MODEL_INPUT_POLL_OFF:
ifnet_set_poll_cycle(ifp, NULL);
ifp->if_rxpoll_offreq++;
if (err != 0) {
ifp->if_rxpoll_offerr++;
}
break;
case IFNET_MODEL_INPUT_POLL_ON:
net_nsectimer(&ival, &ts);
ifnet_set_poll_cycle(ifp, &ts);
ifnet_poll(ifp);
ifp->if_rxpoll_onreq++;
if (err != 0) {
ifp->if_rxpoll_onerr++;
}
break;
default:
VERIFY(0);
/* NOTREACHED */
}
/* Release the IO refcnt */
ifnet_decr_iorefcnt(ifp);
}
/*
* NOTE warning %%% attention !!!!
* We should think about adding thread starvation
* safeguards when dealing with long chains of packets.
*/
if (__probable(m != NULL)) {
dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
}
lck_mtx_lock_spin(&inp->dlth_lock);
VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
DLIL_INPUT_TERMINATE))) {
break;
}
}
inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
lck_mtx_unlock(&inp->dlth_lock);
dlil_terminate_input_thread(inp);
/* NOTREACHED */
} else {
(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
lck_mtx_unlock(&inp->dlth_lock);
(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
inp);
/* NOTREACHED */
}
VERIFY(0); /* we should never get here */
/* NOTREACHED */
__builtin_unreachable();
}
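/*
* Trim an overcommitted input queue down to a fraction of its limit.
* Packets are removed from the head (oldest first) and moved onto `freeq'
* so that the caller can free them once dlth_lock has been dropped; the
* queue length/size and the pending stat increments are adjusted to
* match.  The target length is qlimit * if_rcvq_trim_pct / 100 -- with
* illustrative values qlimit = 1000 and if_rcvq_trim_pct = 80, the queue
* is trimmed back to 800 packets.
*/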
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
uint32_t overcommitted_qlen; /* Length in packets. */
uint64_t overcommitted_qsize; /* Size in bytes. */
uint32_t target_qlen; /* The desired queue length after trimming. */
uint32_t pkts_to_drop = 0; /* Number of packets to drop. */
uint32_t dropped_pkts = 0; /* Number of packets that were dropped. */
uint32_t dropped_bytes = 0; /* Number of dropped bytes. */
struct mbuf *m = NULL, *m_tmp = NULL;
overcommitted_qlen = qlen(input_queue);
overcommitted_qsize = qsize(input_queue);
target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;
if (overcommitted_qlen <= target_qlen) {
/*
* The queue is already within the target limits.
*/
dropped_pkts = 0;
goto out;
}
pkts_to_drop = overcommitted_qlen - target_qlen;
/*
* Proceed to removing packets from the head of the queue,
* starting from the oldest, until the desired number of packets
* has been dropped.
*/
MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
if (pkts_to_drop <= dropped_pkts) {
break;
}
MBUFQ_REMOVE(&qmbufq(input_queue), m);
MBUFQ_NEXT(m) = NULL;
MBUFQ_ENQUEUE(freeq, m);
dropped_pkts += 1;
dropped_bytes += m_length(m);
}
/*
* Adjust the length and the estimated size of the queue
* after trimming.
*/
VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
qlen(input_queue) = target_qlen;
/* qsize() is an approximation. */
if (dropped_bytes < qsize(input_queue)) {
qsize(input_queue) -= dropped_bytes;
} else {
qsize(input_queue) = 0;
}
/*
* Adjust the ifnet statistics increments, if needed.
*/
stat_delta->dropped += dropped_pkts;
if (dropped_pkts < stat_delta->packets_in) {
stat_delta->packets_in -= dropped_pkts;
} else {
stat_delta->packets_in = 0;
}
if (dropped_bytes < stat_delta->bytes_in) {
stat_delta->bytes_in -= dropped_bytes;
} else {
stat_delta->bytes_in = 0;
}
out:
if (dlil_verbose) {
/*
* The basic information about the drop is logged
* by the invoking function (dlil_input_{,a}sync).
* If the `dlil_verbose' flag is set, provide more information
* that can be useful for debugging.
*/
DLIL_PRINTF("%s: "
"qlen: %u -> %u, "
"qsize: %llu -> %llu "
"qlimit: %u (sysctl: %u) "
"target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
"dropped_pkts: %u dropped_bytes %u\n",
__func__,
overcommitted_qlen, qlen(input_queue),
overcommitted_qsize, qsize(input_queue),
qlimit(input_queue), if_rcvq_burst_limit,
target_qlen, if_rcvq_trim_pct, pkts_to_drop,
dropped_pkts, dropped_bytes);
}
return dropped_pkts;
}
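/*
* Hand a freshly received chain to the bridge for early input.  The
* interface filter list is marked busy around the call so that it cannot
* be torn down while bridge_early_input() runs without if_flt_lock held.
*/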
static inline mbuf_t
handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
{
lck_mtx_lock_spin(&ifp->if_flt_lock);
if_flt_monitor_busy(ifp);
lck_mtx_unlock(&ifp->if_flt_lock);
if (ifp->if_bridge != NULL) {
m = bridge_early_input(ifp, m, cnt);
}
lck_mtx_lock_spin(&ifp->if_flt_lock);
if_flt_monitor_unbusy(ifp);
lck_mtx_unlock(&ifp->if_flt_lock);
return m;
}