/*
 * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <netinet/in_arp.h>
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#include <net/ethernet.h>
#include <net/route.h>
#include <sys/eventhandler.h>
#include <net/sockaddr_utils.h>

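/*
 * The 14-byte Ethernet header is stored behind 2 bytes of padding so that
 * framing can prepend it with a single 16-byte copy (see
 * fsw_ethernet_frame()); packets are expected to reserve at least
 * FSW_ETHER_FRAME_HEADROOM bytes of headroom for this.
 */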
#define FSW_ETHER_LEN_PADDED     16
#define FSW_ETHER_PADDING        (FSW_ETHER_LEN_PADDED - ETHER_HDR_LEN)
#define FSW_ETHER_FRAME_HEADROOM FSW_ETHER_LEN_PADDED

static void fsw_ethernet_ctor(struct nx_flowswitch *, struct flow_route *);
static int fsw_ethernet_resolve(struct nx_flowswitch *, struct flow_route *,
    struct __kern_packet *);
static void fsw_ethernet_frame(struct nx_flowswitch *, struct flow_route *,
    struct __kern_packet *);
static sa_family_t fsw_ethernet_demux(struct nx_flowswitch *,
    struct __kern_packet *);

extern struct rtstat_64 rtstat;

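/*
 * Install the Ethernet-specific callbacks (ctor, resolve, frame and demux)
 * on the flowswitch and cache the interface's source MAC address.
 * Returns ENOTSUP if the interface link-layer address is not a 6-byte
 * IFT_ETHER address.
 */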
int
fsw_ethernet_setup(struct nx_flowswitch *fsw, struct ifnet *ifp)
{
	struct ifaddr *lladdr = ifp->if_lladdr;

	if (SDL(lladdr->ifa_addr)->sdl_alen != ETHER_ADDR_LEN ||
	    SDL(lladdr->ifa_addr)->sdl_type != IFT_ETHER) {
		return ENOTSUP;
	}

	ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost, ETHER_ADDR_LEN);
	fsw->fsw_ctor = fsw_ethernet_ctor;
	fsw->fsw_resolve = fsw_ethernet_resolve;
	fsw->fsw_frame = fsw_ethernet_frame;
	fsw->fsw_frame_headroom = FSW_ETHER_FRAME_HEADROOM;
	fsw->fsw_demux = fsw_ethernet_demux;

	return 0;
}

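/*
 * Flow route constructor: stamp the cached source MAC address, the
 * EtherType matching the flow's address family, and the link-layer
 * header offset/length into the flow route.  Also called from
 * fsw_ethernet_frame() when the source MAC generation count changes.
 */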
static void
fsw_ethernet_ctor(struct nx_flowswitch *fsw, struct flow_route *fr)
{
	ASSERT(fr->fr_af == AF_INET || fr->fr_af == AF_INET6);

	fr->fr_llhdr.flh_gencnt = fsw->fsw_src_lla_gencnt;
	bcopy(fsw->fsw_ether_shost, fr->fr_eth.ether_shost, ETHER_ADDR_LEN);
	fr->fr_eth.ether_type = ((fr->fr_af == AF_INET) ?
	    htons(ETHERTYPE_IP) : htons(ETHERTYPE_IPV6));

	/* const override */
	_CASSERT(sizeof(fr->fr_llhdr.flh_off) == sizeof(uint8_t));
	_CASSERT(sizeof(fr->fr_llhdr.flh_len) == sizeof(uint8_t));
	*(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_off = 2;
	*(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_len = ETHER_HDR_LEN;

	SK_DF(SK_VERB_FLOW_ROUTE,
	    "fr 0x%llx eth_type 0x%x eth_src %x:%x:%x:%x:%x:%x",
	    SK_KVA(fr), ntohs(fr->fr_eth.ether_type),
	    fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
	    fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
	    fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
}

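/*
 * Resolve the flow route's link-layer destination address via the cached
 * route entry, ARP or ND.  Returns 0 once the flow route is resolved,
 * EJUSTRETURN if the packet (converted to an mbuf) was queued pending
 * resolution, or another errno on failure.
 */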
static int
fsw_ethernet_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	struct sockaddr *tgt_sa = NULL;
	struct mbuf *m = NULL;
	boolean_t reattach_mbuf = FALSE;
	boolean_t probing;
	int err = 0;
	uint64_t pkt_mflags_restore;  /* Save old mbuf flags to restore in error cases */

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_sa = SA(&fr->fr_faddr);
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_sa = SA(&fr->fr_gaddr);
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_sa = SA(&fr->fr_faddr);
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_sa = SA(&fr->fr_gaddr);
			tgt_rt = fr->fr_rt_gw;
		}
	}

	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_sa != NULL);
	ASSERT(tgt_rt != NULL);

	/*
	 * Attempt to convert kpkt to mbuf before acquiring the
	 * rt lock so that the lock won't be held if we need to do
	 * a blocking mbuf allocation.
	 */
	if (!(fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
		/*
		 * We need to resolve; if the caller passes in a kpkt,
		 * convert it to an mbuf here.  The caller is then
		 * responsible for freeing the kpkt.  In the future, we could
		 * optimize this by having the ARP/ND lookup routines
		 * understand kpkt and perform the conversion only
		 * when it is needed.
		 */
		if (__probable(pkt != NULL)) {
			if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
				reattach_mbuf = TRUE;
				m = pkt->pkt_mbuf;
				pkt_mflags_restore = (pkt->pkt_pflags & PKT_F_MBUF_MASK);
				KPKT_CLEAR_MBUF_DATA(pkt);
			} else {
				m = fsw_classq_kpkt_to_mbuf(fsw, pkt);
			}
			if (m == NULL) {
				/* not a fatal error; move on */
				SK_ERR("failed to allocate mbuf while "
				    "resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
		} else {
			m = NULL;
		}
	}

	RT_LOCK(tgt_rt);

	if (__improbable(!IS_DIRECT_HOSTROUTE(tgt_rt) ||
	    tgt_rt->rt_gateway->sa_family != AF_LINK ||
	    SDL(tgt_rt->rt_gateway)->sdl_type != IFT_ETHER)) {
		rtstat.rts_badrtgwroute++;
		err = ENETUNREACH;
		RT_UNLOCK(tgt_rt);
		SK_ERR("bad gateway route %s on %s (err %d)",
		    sk_sa_ntop(tgt_sa, dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		goto done;
	}

	/*
	 * If already resolved, grab the link-layer address and mark the
	 * flow route accordingly.  Given that we will use the cached
	 * link-layer info, there's no need to convert and enqueue the
	 * packet to ARP/ND (i.e. no need to return EJUSTRETURN).
	 */
	if (__probable((fr->fr_flags & FLOWRTF_HAS_LLINFO) &&
	    SDL(tgt_rt->rt_gateway)->sdl_alen == ETHER_ADDR_LEN)) {
		VERIFY(m == NULL);
		/* XXX Remove explicit __bidi_indexable once rdar://119193012 lands */
		struct sockaddr_dl *__bidi_indexable sdl =
		    (struct sockaddr_dl *__bidi_indexable) SDL(tgt_rt->rt_gateway);
		FLOWRT_UPD_ETH_DST(fr, LLADDR(sdl));
		os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
		/* if we're not probing, then we're done */
		if (!(probing = (fr->fr_want_probe != 0))) {
			VERIFY(err == 0);
			RT_UNLOCK(tgt_rt);
			goto done;
		}
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		probing = FALSE;
		os_atomic_andnot(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "%s %s on %s", (probing ?
	    "probing" : "resolving"), sk_sa_ntop(tgt_sa, dst_s,
	    sizeof(dst_s)), ifp->if_xname);

	/*
	 * Trigger ARP/NDP resolution or probing.
	 */
	switch (tgt_sa->sa_family) {
	case AF_INET: {
		struct sockaddr_dl sdl;

		RT_UNLOCK(tgt_rt);
		/*
		 * Note that we pass NULL as the "hint" parameter, as
		 * tgt_sa already refers to the target address.
		 */
		SOCKADDR_ZERO(&sdl, sizeof(sdl));
		err = arp_lookup_ip(ifp, SIN(tgt_sa), &sdl, sizeof(sdl),
		    NULL, m);

		/*
		 * If we're resolving (not probing), and it's now resolved,
		 * grab the link-layer address and update the flow route.
		 * If we get EJUSTRETURN, the mbuf (if any) would have
		 * been added to the hold queue.  Any other return value,
		 * including 0, means that we need to free it.
		 *
		 * If we're probing, we won't have any mbuf to deal with,
		 * and since we already have the cached llinfo we'll just
		 * return success even if we get EJUSTRETURN.
		 */
		if (!probing) {
			if (err == 0 && sdl.sdl_alen == ETHER_ADDR_LEN) {
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "fast-resolve %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				FLOWRT_UPD_ETH_DST(fr, LLADDR(&sdl));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			}
			if (err == EJUSTRETURN && m != NULL) {
				SK_DF(SK_VERB_FLOW_ROUTE, "packet queued "
				    "while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				m = NULL;
			}
		} else {
			VERIFY(m == NULL);
			if (err == EJUSTRETURN) {
				err = 0;
			}
		}
		break;
	}

	case AF_INET6: {
		struct llinfo_nd6 *__single ln = tgt_rt->rt_llinfo;

		/*
		 * Check if the route is down.  RTF_LLINFO is set during
		 * RTM_{ADD,RESOLVE}, and is never cleared until the route
		 * is deleted from the routing table.
		 */
		if ((tgt_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
		    (RTF_UP | RTF_LLINFO) || ln == NULL) {
			err = EHOSTUNREACH;
			SK_ERR("route unavailable for %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			RT_UNLOCK(tgt_rt);
			break;
		}

		/*
		 * If we're probing and the IPv6 ND cache entry is STALE,
		 * use it anyway but also mark it for delayed probe
		 * and update the expiry.
		 */
		if (probing) {
			VERIFY(m == NULL);
			VERIFY(ln->ln_state > ND6_LLINFO_INCOMPLETE);
			if (ln->ln_state == ND6_LLINFO_STALE) {
				ln->ln_asked = 0;
				ND6_CACHE_STATE_TRANSITION(ln,
				    ND6_LLINFO_DELAY);
				ln_setexpire(ln, net_uptime() + nd6_delay);
				RT_UNLOCK(tgt_rt);

				lck_mtx_lock(rnh_lock);
				nd6_sched_timeout(NULL, NULL);
				lck_mtx_unlock(rnh_lock);

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "NUD probe scheduled for %s on %s",
				    sk_sa_ntop(tgt_sa, dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			} else {
				RT_UNLOCK(tgt_rt);
			}
			VERIFY(err == 0);
			break;
		}

		/*
		 * If this is a permanent ND entry, we're done.
		 */
		if (ln->ln_expire == 0 &&
		    ln->ln_state == ND6_LLINFO_REACHABLE) {
			if (SDL(tgt_rt->rt_gateway)->sdl_alen !=
			    ETHER_ADDR_LEN) {
				err = EHOSTUNREACH;
				SK_ERR("invalid permanent route %s on %s "
				    "ln 0x%llx (err %d)",
				    sk_sa_ntop(rt_key(tgt_rt), dst_s,
				    sizeof(dst_s)), ifp->if_xname,
				    SK_KVA(ln), err);
			} else {
				SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve "
				    "permanent route %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				/* copy permanent address into the flow route */
				/*
				 * XXX Remove explicit __bidi_indexable once
				 * rdar://119193012 lands
				 */
				struct sockaddr_dl *__bidi_indexable sdl =
				    (struct sockaddr_dl *__bidi_indexable) SDL(tgt_rt->rt_gateway);
				FLOWRT_UPD_ETH_DST(fr, LLADDR(sdl));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
				VERIFY(err == 0);
			}
			RT_UNLOCK(tgt_rt);
			break;
		}

		if (ln->ln_state == ND6_LLINFO_NOSTATE) {
			ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_INCOMPLETE);
		}

		if (ln->ln_state == ND6_LLINFO_INCOMPLETE && (!ln->ln_asked ||
		    !(fr->fr_flags & FLOWRTF_HAS_LLINFO))) {
			struct nd_ifinfo *ndi = ND_IFINFO(tgt_rt->rt_ifp);
			/*
			 * There is a neighbor cache entry, but no Ethernet
			 * address response yet.  Replace the held mbuf
			 * (if any) with the one we have (if any);
			 * otherwise leave it alone.
			 *
			 * This code conforms to the rate-limiting rule
			 * described in Section 7.2.2 of RFC 4861, because
			 * the timer is set correctly after sending an
			 * NS below.
			 */
			if (m != NULL) {
				if (ln->ln_hold != NULL) {
					m_freem_list(ln->ln_hold);
				}
				ln->ln_hold = m;
				m = NULL;

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "packet queued while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
			VERIFY(ndi != NULL && ndi->initialized);
			ln->ln_asked++;
			ln_setexpire(ln, net_uptime() + ndi->retrans / 1000);
			RT_UNLOCK(tgt_rt);

			SK_DF(SK_VERB_FLOW_ROUTE, "soliciting for %s on %s "
			    "ln 0x%llx state %u", sk_sa_ntop(rt_key(tgt_rt),
			    dst_s, sizeof(dst_s)), ifp->if_xname, SK_KVA(ln),
			    ln->ln_state);

			/* XXX Refactor this to use same src ip */
			nd6_ns_output(tgt_rt->rt_ifp, NULL,
			    &SIN6(rt_key(tgt_rt))->sin6_addr, NULL, NULL, 0);

			lck_mtx_lock(rnh_lock);
			nd6_sched_timeout(NULL, NULL);
			lck_mtx_unlock(rnh_lock);
			err = EJUSTRETURN;
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve %s on %s",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname);
			/*
			 * The neighbor cache entry has been resolved;
			 * copy the address into the flow route.
			 */
			/*
			 * XXX Remove explicit __bidi_indexable once
			 * rdar://119193012 lands
			 */
			struct sockaddr_dl *__bidi_indexable sdl =
			    (struct sockaddr_dl *__bidi_indexable) SDL(tgt_rt->rt_gateway);
			FLOWRT_UPD_ETH_DST(fr, LLADDR(sdl));
			os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			RT_UNLOCK(tgt_rt);
			VERIFY(err == 0);
		}
		/*
		 * XXX Need to optimize for the NDP garbage
		 * collection.  It would be even better to unify
		 * BSD/SK NDP management through the completion
		 * of L2/L3 split.
		 */
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	RT_LOCK_ASSERT_NOTHELD(tgt_rt);

done:
	if (m != NULL) {
		if (reattach_mbuf) {
			pkt->pkt_mbuf = m;
			pkt->pkt_pflags |= pkt_mflags_restore;
		} else {
			m_freem_list(m);
		}
		m = NULL;
	}

	if (__improbable(err != 0 && err != EJUSTRETURN)) {
		SK_ERR("route to %s on %s can't be resolved (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		/* keep FLOWRTF_HAS_LLINFO as llinfo is still useful */
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}

	FR_UNLOCK(fr);

	return err;
}

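/*
 * Prepend the cached (padded) Ethernet header to the packet, and to its
 * attached mbuf if one is present, adjusting headroom and lengths.
 * Refreshes the cached source MAC address first if it has changed.
 */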
static void
fsw_ethernet_frame(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
	/* in the event the source MAC address changed, update our copy */
	if (__improbable(fr->fr_llhdr.flh_gencnt != fsw->fsw_src_lla_gencnt)) {
		uint8_t old_shost[ETHER_ADDR_LEN];

		bcopy(&fr->fr_eth.ether_shost, &old_shost, ETHER_ADDR_LEN);
		fsw_ethernet_ctor(fsw, fr);

		SK_ERR("fr 0x%llx source MAC address updated on %s, "
		    "was %x:%x:%x:%x:%x:%x now %x:%x:%x:%x:%x:%x",
		    SK_KVA(fr), fsw->fsw_ifp->if_xname,
		    old_shost[0], old_shost[1],
		    old_shost[2], old_shost[3],
		    old_shost[4], old_shost[5],
		    fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
		    fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
		    fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
	}

	_CASSERT(sizeof(fr->fr_eth_padded) == FSW_ETHER_LEN_PADDED);

	if ((fr->fr_flags & FLOWRTF_DST_LL_MCAST) != 0) {
		pkt->pkt_link_flags |= PKT_LINKF_MCAST;
	} else if ((fr->fr_flags & FLOWRTF_DST_LL_BCAST) != 0) {
		pkt->pkt_link_flags |= PKT_LINKF_BCAST;
	}

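	/*
	 * Copy the 16-byte padded header into the headroom; the Ethernet
	 * header itself occupies the last ETHER_HDR_LEN bytes of it, so
	 * only that much headroom is consumed below.
	 */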
	ASSERT(pkt->pkt_headroom >= FSW_ETHER_LEN_PADDED);

	char *pkt_buf;
	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded,
	    (uint64_t *)(void *)(pkt_buf + pkt->pkt_headroom - FSW_ETHER_LEN_PADDED));

	pkt->pkt_headroom -= ETHER_HDR_LEN;
	pkt->pkt_l2_len = ETHER_HDR_LEN;

	if ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
		/* frame and fix up mbuf */
		struct mbuf *m = pkt->pkt_mbuf;
		void *buf = m_mtod_current(m) - FSW_ETHER_LEN_PADDED;

		sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded, buf);
		ASSERT((uintptr_t)m->m_data ==
		    (uintptr_t)mbuf_datastart(m) + FSW_ETHER_FRAME_HEADROOM);
		m->m_data -= ETHER_HDR_LEN;
		m->m_len += ETHER_HDR_LEN;
		m_pktlen(m) += ETHER_HDR_LEN;
		ASSERT(m->m_len == m_pktlen(m));
		pkt->pkt_length = m_pktlen(m);
	} else {
		METADATA_ADJUST_LEN(pkt, ETHER_HDR_LEN, pkt->pkt_headroom);
	}
}

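/*
 * Classify an inbound Ethernet frame: strip the trailing FCS if present,
 * mark unicast frames not addressed to this interface as promiscuous, and
 * return the address family (AF_INET or AF_INET6) derived from the
 * EtherType, or AF_UNSPEC if the frame cannot be classified.
 */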
static sa_family_t
fsw_ethernet_demux(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
#pragma unused(fsw)
	const struct ether_header *eh;
	sa_family_t af = AF_UNSPEC;
	uint32_t bdlen, bdlim, bdoff;
	uint8_t *baddr;

	MD_BUFLET_ADDR_ABS_DLEN(pkt, baddr, bdlen, bdlim, bdoff);
	baddr += pkt->pkt_headroom;
	eh = (struct ether_header *)(void *)baddr;

	if (__improbable(sizeof(*eh) > pkt->pkt_length)) {
		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
		SK_ERR("unrecognized pkt, len %u", pkt->pkt_length);
		return AF_UNSPEC;
	}

	if (__improbable(pkt->pkt_headroom + sizeof(*eh) > bdlim)) {
		SK_ERR("ethernet header overrun 1st buflet");
		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
		return AF_UNSPEC;
	}

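	/*
	 * If the driver left the Ethernet FCS on the frame, trim it from
	 * the packet and from any attached mbuf before classification.
	 */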
	if (__improbable((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0)) {
		pkt->pkt_length -= ETHER_CRC_LEN;
		pkt->pkt_link_flags &= ~PKT_LINKF_ETHFCS;
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			ASSERT((pkt->pkt_mbuf->m_flags & M_HASFCS) != 0);
			m_adj(pkt->pkt_mbuf, -ETHER_CRC_LEN);
			pkt->pkt_mbuf->m_flags &= ~M_HASFCS;
		}
	}
	pkt->pkt_l2_len = ETHER_HDR_LEN;
	if ((eh->ether_dhost[0] & 1) == 0) {
		/*
		 * When the driver is put into promiscuous mode we may receive
		 * unicast frames that are not intended for our interfaces.
		 * They are marked here as being promiscuous so the caller may
		 * dispose of them after passing the packets to any interface
		 * filters.
		 */
		if (_ether_cmp(eh->ether_dhost, IF_LLADDR(fsw->fsw_ifp))) {
			pkt->pkt_pflags |= PKT_F_PROMISC;
			STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_PROMISC);
			return AF_UNSPEC;
		}
	}
	uint16_t ether_type = ntohs(eh->ether_type);
	switch (ether_type) {
	case ETHERTYPE_IP:
		af = AF_INET;
		break;
	case ETHERTYPE_IPV6:
		af = AF_INET6;
		break;
	default:
		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_UNSPEC);
		break;
	}

	return af;
}