/*
 * Copyright (c) 2019-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/netif/nx_netif_compat.h>
#include <netinet/tcp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <net/pktap.h>
#include <sys/sdt.h>

#define MAX_AGG_IP_LEN()        MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET)
#define MAX_BUFLET_COUNT        (32)
#define TCP_FLAGS_IGNORE        (TH_FIN|TH_SYN|TH_RST|TH_URG)
#define PKT_IS_MBUF(_pkt)       (_pkt->pkt_pflags & PKT_F_MBUF_DATA)
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) &&           \
	                        (_pkt->pkt_pflags & PKT_F_TRUNCATED))
#define PKT_IS_WAKE_PKT(_pkt)   ((PKT_IS_MBUF(_pkt) &&                                  \
	                        (_pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) || \
	                        (!PKT_IS_MBUF(_pkt) &&                                  \
	                        (_pkt->pkt_pflags & PKT_F_WAKE_PKT)))
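
/*
 * sk_fsw_rx_agg_tcp is assumed to be the tunable upper bound on the size of
 * an aggregated datagram; MAX_AGG_IP_LEN() caps it at IP_MAXPACKET so the
 * aggregated length always fits the 16-bit IP length fields.
 */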


typedef uint16_t (* flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t);

static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new);

static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new);

/*
 * This structure holds per-super-object (mbuf/packet) flow aggregation state.
 */
struct flow_agg {
	union {
		struct {
			union {
				void *          _fa_sobj;
				struct mbuf *   _fa_smbuf;      /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *__indexable _fa_sptr;        /* ptr to super IP header */
			bool     _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool     _fa_sobj_is_short;
			uint32_t _fa_tcp_seq;     /* expected next sequence # */
			uint32_t _fa_ulen;        /* expected next ulen */
			uint32_t _fa_total;       /* total aggregated bytes */
			/* function that fixes the packet checksum */
			flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum;
		} __flow_agg;
		uint64_t __flow_agg_data[5];
	};
#define fa_sobj           __flow_agg._fa_sobj
#define fa_smbuf          __flow_agg._fa_smbuf
#define fa_spkt           __flow_agg._fa_spkt
#define fa_sptr           __flow_agg._fa_sptr
#define fa_sobj_is_pkt    __flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short  __flow_agg._fa_sobj_is_short
#define fa_tcp_seq        __flow_agg._fa_tcp_seq
#define fa_ulen           __flow_agg._fa_ulen
#define fa_total          __flow_agg._fa_total
#define fa_fix_pkt_sum   __flow_agg._fa_fix_pkt_sum
};
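
/*
 * The anonymous union overlays the aggregation state with an array of five
 * 64-bit words so that FLOW_AGG_CLEAR() below can reset it with a fixed-size
 * zeroing helper plus an explicit clear of the trailing function pointer;
 * the _CASSERTs there pin down the layout this depends on.
 */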

#if __has_ptrcheck
#define FLOW_AGG_CLEAR(_fa) do {                                        \
	_CASSERT(sizeof(struct flow_agg) == 48);                        \
	_CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 40);      \
	sk_zero_48(_fa);                                                \
	(_fa)->fa_fix_pkt_sum = 0;                                      \
} while (0)
#else
#define FLOW_AGG_CLEAR(_fa) do {                                        \
	_CASSERT(sizeof(struct flow_agg) == 40);                        \
	_CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32);      \
	sk_zero_32(_fa);                                                \
	(_fa)->fa_fix_pkt_sum = 0;                                      \
} while (0)
#endif

#define MASK_SIZE       80      /* size of struct {ip,ip6}_tcp_mask */

struct ip_tcp_mask {
	struct ip       ip_m;
	struct tcphdr   tcp_m;
	uint32_t        tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};

static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
	},
};
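
/*
 * Sketch of the masked-compare semantics these templates are built for,
 * assuming sk_memcmp_mask_{64,80}B behave like a vectorized version of the
 * loop below, returning 0 iff all masked bytes match: a header byte
 * participates in the comparison only where the corresponding mask byte is
 * non-zero, so all-ones fields must match exactly while all-zeroes fields
 * (e.g. ip_len, ip_id, ip_sum, th_seq, th_sum) are free to differ between
 * aggregatable segments.
 *
 *	static int
 *	memcmp_mask(const uint8_t *h1, const uint8_t *h2,
 *	    const uint8_t *mask, size_t len)
 *	{
 *		uint8_t diff = 0;
 *		size_t i;
 *
 *		for (i = 0; i < len; i++) {
 *			diff |= (h1[i] ^ h2[i]) & mask[i];
 *		}
 *		return diff;
 *	}
 */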

struct ip6_tcp_mask {
	struct ip6_hdr  ip6_m;
	struct tcphdr   tcp_m;
	uint32_t        tcp_option_m[5]; /* 5 words (20 bytes) to fill up to MASK_SIZE */
};

static const struct ip6_tcp_mask ip6_tcp_mask
__sk_aligned(16) =
{
	.ip6_m = {
		.ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
		.ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
		.ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
		.ip6_src.__u6_addr.__u6_addr32[0] = 0xffffffff,
		.ip6_src.__u6_addr.__u6_addr32[1] = 0xffffffff,
		.ip6_src.__u6_addr.__u6_addr32[2] = 0xffffffff,
		.ip6_src.__u6_addr.__u6_addr32[3] = 0xffffffff,
		.ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffffff,
		.ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffffff,
		.ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffffff,
		.ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0,          /* Filling up to MASK_SIZE */
		0,          /* Filling up to MASK_SIZE */
	},
};

#if SK_LOG
SK_LOG_ATTRIBUTE
static void
_pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
{
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	kern_packet_t ph = SK_PKT2PH(pkt);
	uint64_t bufcnt = 1;
	if (!is_input) {
		bufcnt = kern_packet_get_buflet_count(ph);
	}

	SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
	    SK_KVA(pkt), pkt->pkt_length);

	SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    is_input ? "s":"d", pkt->pkt_csum_flags,
	    (uint32_t)pkt->pkt_csum_rx_start_off,
	    (uint32_t)pkt->pkt_csum_rx_value);

	if (!is_input) {
		kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);

		/* Individual buflets */
		for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
			SK_DF(logflags | SK_VERB_DUMP, "%s",
			    sk_dump("buf", __buflet_get_data_address(buf),
			    __buflet_get_data_length(buf), 128, NULL, 0));
			buf = kern_packet_get_next_buflet(ph, buf);
		}
	}
}

#define pkt_agg_log(_pkt, _p, _is_input) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _pkt_agg_log(_pkt, _p, _is_input);                      \
	}                                                               \
} while (0)

SK_LOG_ATTRIBUTE
static void
_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
	    m->m_pkthdr.len);

	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
	    (uint32_t)m->m_pkthdr.csum_rx_val);

	/* Dump the first mbuf */
	ASSERT(m_mtod_current(m) != NULL);
	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
	    (uint8_t *)m_mtod_current(m), m->m_len, 128, NULL, 0));
}

#define mbuf_agg_log(_m, _p, _is_mbuf) do {                             \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mbuf_agg_log(_m, _p, _is_mbuf);                        \
	}                                                               \
} while (0)

SK_LOG_ATTRIBUTE
static void
_mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	while (m != NULL) {
		SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
		    m->m_pkthdr.len);

		SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
		    (uint32_t)m->m_pkthdr.csum_rx_val);

		m = m->m_nextpkt;
	}
}

#define mchain_agg_log(_m, _p, _is_mbuf) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mchain_agg_log(_m, _p, _is_mbuf);                      \
	}                                                               \
} while (0)
#else
#define pkt_agg_log(...)
#define mbuf_agg_log(...)
#define mchain_agg_log(...)
#endif /* SK_LOG */
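
/*
 * The validity checks throughout this file take the form
 * ((csum ^ 0xffff) == 0) because a span that already contains its correct
 * Internet checksum folds to 0xffff in ones' complement arithmetic. A
 * minimal illustrative sketch of the idea, with the carries folded back in
 * until the sum fits 16 bits:
 *
 *	static bool
 *	cksum_region_valid(const uint16_t *words, size_t nwords)
 *	{
 *		uint32_t sum = 0;
 *
 *		while (nwords-- > 0) {
 *			sum += *words++;
 *		}
 *		while (sum > 0xffff) {
 *			sum = (sum >> 16) + (sum & 0xffff);
 *		}
 *		return (uint16_t)sum == 0xffff;
 *	}
 */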

/*
 * Checksum handling for a packet backed by an mbuf.
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	uint16_t start = pkt->pkt_l2_len;
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/* Compute the data_csum */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;

		/* pkt metadata will be transferred to the super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
		    0, m->m_pkthdr.csum_rx_val, false);

		return (m->m_pkthdr.csum_rx_val ^ 0xffff) == 0;
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	/* Set start to 0 for full checksum */
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);

	/* pkt metadata will be transferred to the super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);

	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}

/* structure to pass an array of data buffers */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;
	bool dba_is_buflet;
} _dbuf_array_t;

static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;
	uint32_t buflet_dlim, buflet_dlen, buf_off = 0;

	ASSERT(plen > 0);
	while (plen > 0) {
		ASSERT(i < dbuf->dba_num_dbufs);
		uint32_t dbuf_lim, tmplen;
		uint8_t *dbuf_addr;

		if (dbuf->dba_is_buflet) {
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0);
			/* XXX -fbounds-safety: use the inline variant to return an __indexable */
			dbuf_addr = __buflet_get_data_address(dbuf->dba_buflet[i]);

			buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]);
			buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]);
			buf_off = buflet_dlen;
			dbuf_lim = buflet_dlim - buf_off;
			dbuf_addr += buf_off;
		} else {
			dbuf_lim = (uint32_t) M_TRAILINGSPACE(dbuf->dba_mbuf[i]);
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			buf_off = dbuf->dba_mbuf[i]->m_len;
			dbuf_addr += buf_off;
		}

		tmplen = min(plen, dbuf_lim);
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;
		i++;
	}
	ASSERT(plen == 0);
}
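
/*
 * Note on odd_start: the Internet checksum is position-sensitive at byte
 * granularity, so when a copy-and-sum resumes at an odd byte offset the
 * running sum has to be byte-swapped before the next chunk is folded in
 * (RFC 1071, section 2). m_copydata_sum() and pkt_copyaddr_sum() are
 * assumed to track this across successive destination buffers via the
 * odd_start flag threaded through the loop above.
 */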

/*
 * Copy (fill) and checksum for packet.
 * spkt: source IP packet.
 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
 * verify_l3: verify IPv4 header checksum.
 * currm: destination mbuf.
 * currp: destination skywalk packet.
 * dbuf: additional destination data buffer(s), used when current destination
 * packet is out of space.
 * added: amount of data copied from spkt to the additional buffer.
 * data_csum: 16-bit folded partial checksum of the copied TCP payload.
 */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;
	uint32_t curr_trailing;
	char *curr_ptr;
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;
	bool verify_l4;

	/* One of them must be != NULL, but they can't both be set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		/* XXX -fbounds-safety: use the inline variant to return an __indexable */
		curr_ptr = (char *)__buflet_get_data_address(currp) + currp->buf_doff +
		    currp->buf_dlen;
		curr_len = currp->buf_dlen;
	}

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checksum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	/* Copy & verify TCP checksum */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = pkt_sum(SK_PKT2PH(spkt), (soff +
			    spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = soff + start + tmplen;
	}

	/* copy & sum remaining payload in additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold data checksum to 16 bit */
	*data_csum = __packet_fold_sum(data_partial);

	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	if (verify_l4) {
		/* Fold in the data checksum to TCP checksum */
		partial += *data_csum;
		partial += htons(l4len + IPPROTO_TCP);
		if (spkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
			    spkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
			    &spkt->pkt_flow_ipv6_dst, partial);
		}
		/* pkt metadata will be transferred to the super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
	} else {
		/* grab csum value from offload */
		csum = spkt->pkt_csum_rx_value;
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to their previous values.
		 * dbuf (for additional payload) should be restored to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}

/*
 * Copy and checksum for a packet or a packet with an attached mbuf.
 * data_csum is only supported for BSD flows.
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	_CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok;
	uint8_t *daddr;

	if (dbuf->dba_is_buflet) {
		/* XXX -fbounds-safety: use the inline variant to return an __indexable */
		daddr = __buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		/*
		 * available space check for payload is done later
		 * in _copy_data_sum_dbuf
		 */
		ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >=
		    pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	}

	if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) {
		/* copy only */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
		    plen, &partial, &odd_start, dbuf, false);
		if (PKT_IS_MBUF(pkt)) {
			csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_mbuf->m_pkthdr.csum_flags,
			    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		} else {
			csum = pkt->pkt_csum_rx_value;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_csum_flags,
			    pkt->pkt_csum_rx_start_off, csum);
		}

		/* pkt metadata will be transferred to the super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
		return (csum ^ 0xffff) == 0;
	}

	/* Copy l3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
	l3_csum_ok = !verify_l3;
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* Next, copy and sum payload (if any) */
	} else {
		/* First, copy and sum TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		start += soff;
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum payload (if any) */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold data sum to 16 bit and then into the partial */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);

	/* pkt metadata will be transferred to the super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
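
/*
 * Pseudo-header note: in_pseudo()/in6_pseudo() are assumed to fold the
 * source and destination addresses into the supplied partial sum, which at
 * the call sites above already covers the TCP segment plus
 * htons(l4len + IPPROTO_TCP), i.e. the remaining pseudo-header fields.
 * Roughly, for IPv4:
 *
 *	sum = partial;
 *	sum += (src >> 16) + (src & 0xffff);
 *	sum += (dst >> 16) + (dst & 0xffff);
 *	while (sum > 0xffff)
 *		sum = (sum >> 16) + (sum & 0xffff);
 */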

SK_INLINE_ATTRIBUTE
static void
flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct __kern_packet *pkt)
{
	struct ifnet *ifp;

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
			return;
		}
		break;
	case IPV6_VERSION:
		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			return;
		}
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	fa->fa_total = pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;

	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) {
		/* if the hardware supports LRO, don't fix up the checksum in the header */
		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op;
	} else {
		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum;
	}
}

static void
flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct mbuf *smbuf, struct __kern_packet *pkt)
{
	FLOW_AGG_CLEAR(fa);

	ASSERT(smbuf != NULL);
	fa->fa_smbuf = smbuf;

	fa->fa_sptr = mtod(smbuf, uint8_t *);
	ASSERT(fa->fa_sptr != NULL);

	/*
	 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
	 * contents of the flow structure which don't exist in 'smbuf'.
	 */
	flow_agg_init_common(fsw, fa, pkt);
}

static void
flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct __kern_packet *spkt, struct __kern_packet *pkt)
{
	FLOW_AGG_CLEAR(fa);

	ASSERT(spkt != NULL);
	fa->fa_spkt = spkt;
	fa->fa_sobj_is_pkt = true;
	VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);

	MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
	ASSERT(fa->fa_sptr != NULL);

	/*
	 * Note here we use 'pkt' instead of 'spkt', since we rely on the
	 * contents of the flow structure which don't exist in 'spkt'.
	 */
	flow_agg_init_common(fsw, fa, pkt);
}

/*
 * -fbounds-safety: The hardcoded values 64 (and 80) are used here because
 * this function calls the 64-byte variant of the sk_memcmp functions (and
 * likewise for the 80-byte variant). In can_agg_fastpath, the TCP header
 * length with options is checked against sizeof(struct tcphdr) +
 * TCPOLEN_TSTAMP_APPA, which is 20 + 12 = 32 bytes. For IPv4, adding the
 * 20-byte IP header makes it 52 bytes; of the sk_memcmp_* variants, the
 * closest one is the 64B option.
 */
SK_INLINE_ATTRIBUTE
static bool
ipv4_tcp_memcmp(const uint8_t *__counted_by(64)h1, const uint8_t *__counted_by(64)h2)
{
	return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
}

SK_INLINE_ATTRIBUTE
static bool
ipv6_tcp_memcmp(const uint8_t *__counted_by(80)h1, const uint8_t *__counted_by(80)h2)
{
	return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
}
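
/*
 * Fast path: a single masked compare of the leading 64 (IPv4) or 80 (IPv6)
 * bytes replaces the field-by-field checks in can_agg_slowpath(). It only
 * applies when the super object's IP/TCP headers are contiguous, the
 * candidate packet is at least MASK_SIZE bytes long, carries exactly the
 * timestamp-only option layout (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA)
 * and has the same payload length as the previous segment; everything else
 * falls back to the slow path.
 */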

SK_INLINE_ATTRIBUTE
static bool
can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	bool match;
	uint8_t *ip_hdr;

	ASSERT(fa->fa_sptr != NULL);
	_CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
	_CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);

	if (__improbable(pkt->pkt_length < MASK_SIZE)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
		goto slow_path;
	}

	if (__improbable(fa->fa_sobj_is_short)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
		goto slow_path;
	}

	if (__improbable(pkt->pkt_flow_tcp_hlen !=
	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
		goto slow_path;
	}

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		/*
		 * -fbounds-safety: pkt->pkt_flow_ip_hdr is a mach_vm_address_t,
		 * so we forge it here. The reason the constant values 64 and 80
		 * are used is because ipv4_tcp_memcmp takes a __counted_by(64)
		 * and __counted_by(80), respectively.
		 */
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 64);
		match = ipv4_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	case IPV6_VERSION:
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 80);
		match = ipv6_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (__improbable(!match)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
		goto slow_path;
	}
	if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
		goto slow_path;
	}

	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;

slow_path:
	return false;
}

SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	uint8_t *sl3_hdr = fa->fa_sptr;
	uint8_t *l3_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
	    pkt->pkt_flow_ip_hdr, pkt->pkt_flow_ip_hlen);
	uint32_t sl3tlen = 0;
	uint16_t sl3hlen = 0;

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options
	 * For IPv4, the options should match exactly
	 * For IPv6, if options are present, bail out
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)(void *)l3_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, DF bit should match */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if extension header is present in
			 * packet. N.B. currently flow switch only classifies
			 * frag header
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, flow info mask covers TOS and flow label */
		if (memcmp((uint8_t *)&sip6->ip6_flow, (uint8_t *)&ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For TCP header, compare ACK number and window size
	 * Compare TCP flags
	 * Compare TCP header length and TCP options
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	/* -fbounds-safety: pkt_flow_tcp_hdr is a mach_vm_address_t */
	struct tcphdr *tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	uint16_t sl4hlen = (stcp->th_off << 2);
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengths are the same thanks to the
	 * above sl4hlen check.
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * The TCP timestamp option is usually preceded by two NOP
		 * options, making the total TCP-option length 12. If that's
		 * the case, we can aggregate as only the TCP timestamp
		 * option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			uint32_t sts_hdr, ts_hdr;
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}

static bool
flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* The aggregated ip_len must not exceed MIN(sk_fsw_rx_agg_tcp, 64K) */
	const uint32_t max_ip_len = MAX_AGG_IP_LEN();
	bool can_agg = false;

	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
	    struct __kern_packet *, pkt);

	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
		pkt->pkt_flow_tcp_agg_fast = 0;
	}
	/*
	 * Don't aggregate if any of the following is true:
	 * 1. TCP flag is other than TH_{ACK,PUSH}
	 * 2. Payload length is 0 (pure ACK)
	 * 3. This is the first packet
	 * 4. TCP sequence number is not expected
	 * 5. We would've exceeded the maximum aggregated size
	 * 6. It's not the first packet and the wake flag is set
	 */
	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
		goto done;
	}
	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
		goto done;
	}
	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
		/* We've reached aggregation limit */
		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
		goto done;
	}
	if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) {
		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
		goto done;
	}

	can_agg = can_agg_fastpath(fa, pkt, fsws);
	if (can_agg) {
		pkt->pkt_flow_tcp_agg_fast = 1;
		goto done;
	}

	can_agg = can_agg_slowpath(fa, pkt, fsws);
	ASSERT(!pkt->pkt_flow_tcp_agg_fast);

done:
	return can_agg;
}

static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
{
	return __packet_fix_sum(csum, old, new);
}

static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum, uint16_t __unused old,
    uint16_t __unused new)
{
	return 0;
}
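
/*
 * Incremental checksum update, per RFC 1624: substituting a 16-bit header
 * field m with m' can be folded into an existing checksum HC as
 *
 *	HC' = ~(~HC + ~m + m')
 *
 * which is what __packet_fix_sum() is assumed to implement; it lets the
 * merge path patch ip_len, the timestamp option and the TCP flags without
 * re-summing the whole super packet. The no-op variant is installed by
 * flow_agg_init_common() when the interface itself does LRO (IFNET_LRO),
 * in which case the header checksum fields are not maintained here (the
 * no-op returns 0) and are presumably regenerated by the hardware.
 */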

static inline void
flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa,
    uint8_t *__sized_by(sizeof(uint32_t))field, uint16_t *csum,
    uint32_t new)
{
	uint32_t old;
	memcpy((uint8_t *)&old, field, sizeof(old));
	memcpy(field, (uint8_t *)&new, sizeof(uint32_t));
	*csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum,
	    (uint16_t)(old >> 16), (uint16_t)(new >> 16)),
	    (uint16_t)(old & 0xffff),
	    (uint16_t)(new & 0xffff));
}

static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    __unused uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/*
	 * The packet being merged should always have full checksum flags
	 * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed
	 * and not enter this function.
	 */
	ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt));
	ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0);

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);
	ASSERT(fa->fa_fix_pkt_sum != NULL);

	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as:
	 * 1. Keep the super packet's IP ID (IPv4 only); IDs of the merged
	 *    segments are not compared
	 * 2. Keep the ttl (aggregation requires the ttls to match)
	 * 3. Increment the IP length by the payload length of the new packet
	 * 4. Incrementally fix up the IP checksum (IPv4 only) for the new
	 *    length
	 * Update the resp. flow classification fields, if any
	 * Nothing to update for TCP header for now
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    (struct tcphdr *)pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			flow_agg_pkt_fix_hdr_sum(fa, sopt + 4, &stcp->th_sum, ntsval);
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			uint16_t old, new;
			tcp_seq *th_ack = &stcp->th_ack;
			/*
			 * -fbounds-safety: C-style cast (uint16_t *)(th_ack+1)
			 * doesn't work here, because th_ack's bound is a single
			 * uint32_t, so going one address above and later
			 * dereferencing it would lead to a panic.
			 */
			uint16_t *next = __unsafe_forge_single(uint16_t *,
			    th_ack + 1);
			old = *next;
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			next = __unsafe_forge_single(uint16_t *, th_ack + 1);
			new = *next;
			stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum */
	if (__improbable(old_l3len & 0x1)) {
		/*
		 * The appended payload starts at an odd byte offset in the
		 * super packet, so swap the byte order of its folded sum
		 * before adding it; refer to RFC 1071 section 2.
		 */
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum);
	}

	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}

/*
 * Copy metadata from source packet to destination packet
 */
static void
pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
{
	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
}

static void
pkt_finalize(kern_packet_t ph)
{
	int err = __packet_finalize(ph);
	VERIFY(err == 0);
#if (DEVELOPMENT || DEBUG)
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	uint8_t *buf;
	MD_BUFLET_ADDR_ABS(pkt, buf);
	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
	    uint8_t *, buf);
#endif
}

static inline uint32_t
estimate_buf_cnt(struct flow_entry *fe, uint32_t total_bytes, uint32_t total_pkts,
    uint32_t min_bufsize, uint32_t agg_bufsize)
{
	uint32_t max_ip_len = MAX_AGG_IP_LEN();
	uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize);
	uint32_t hdr_overhead;

	if (__improbable(sk_fsw_rx_agg_tcp == 0)) {
		return MIN(total_pkts, MAX_BUFLET_COUNT);
	}

	agg_size = MIN(agg_size, agg_bufsize);

	hdr_overhead = (total_bytes / max_ip_len) *
	    (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) +
	    sizeof(struct tcphdr));

	return ((total_bytes + hdr_overhead) / agg_size) + 1;
}
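
/*
 * Worked example (illustrative, assuming sk_fsw_rx_agg_tcp is 16K): for
 * total_bytes = 64K of IPv6 TCP payload with agg_size = 16K, the estimate
 * charges (64K / 16K) * (40 + 20) = 240 bytes of replicated L3/L4 headers
 * and returns (65536 + 240) / 16384 + 1 = 5 buflets.
 */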

SK_INLINE_ATTRIBUTE
static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
    _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
{
	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
		kern_buflet_t buf = dbuf_array->dba_buflet[i];
		VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
		pbuf = buf;
		dbuf_array->dba_buflet[i] = NULL;
	}
	ASSERT(pbuf != NULL);
	dbuf_array->dba_num_dbufs = 0;
	*lbuf = pbuf;
}

SK_INLINE_ATTRIBUTE
static inline void
_free_dbuf_array(struct kern_pbufpool *pp,
    _dbuf_array_t *dbuf_array)
{
	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
		kern_buflet_t buf = dbuf_array->dba_buflet[i];
		pp_free_buflet(pp, buf);
		dbuf_array->dba_buflet[i] = NULL;
	}
	dbuf_array->dba_num_dbufs = 0;
}

static inline void
finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph,
    struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts,
    uint16_t bufcnt)
{
	(*spkts)++;
	if (bufcnt > 1) {
		(*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
	}
	pkt_finalize(*sph);
	if ((*spkt)->pkt_length > *largest_spkt) {
		*largest_spkt = (*spkt)->pkt_length;
	}
	pkt_agg_log(*spkt, kernproc, false);
	DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
	*sph = 0;
	*spkt = NULL;
	FLOW_AGG_CLEAR(fa);
}

static inline void
converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size)
{
	if (fe->fe_rx_largest_size > largest_agg_size) {
		/*
		 * Make it slowly move towards largest_agg_size if we
		 * consistently get non-aggregatable size.
		 *
		 * If we start at 16K, this makes us go to 4K within 6 rounds
		 * and down to 2K within 12 rounds.
		 */
		fe->fe_rx_largest_size -=
		    ((fe->fe_rx_largest_size - largest_agg_size) >> 2);
	} else {
		fe->fe_rx_largest_size +=
		    ((largest_agg_size - fe->fe_rx_largest_size) >> 2);
	}
}
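
/*
 * Illustrative trace: each call moves fe_rx_largest_size a quarter of the
 * way toward largest_agg_size, so starting at 16384 against a steady
 * largest_agg_size of 2048 the sequence runs 16384 -> 12800 -> 10112 ->
 * 8096 -> 6584 -> ..., an exponential decay with ratio 3/4 per round.
 */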

SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, bool is_mbuf)
{
#define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt, _reason, _flags)    do {    \
	pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
	(_pkt) = NULL;                                                     \
	FLOW_AGG_CLEAR(&fa);                                               \
	prev_csum_ok = false;                                              \
} while (0)
	struct flow_agg fa;             /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq super_pkts;         /* dst super packets */
	struct pktq disposed_pkts;      /* done src packets */

	KPKTQ_INIT(&super_pkts);
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_channel_ring *ring;
	ring = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(ring == NULL)) {
		SK_ERR("Rx ring is NULL");
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
		    KPKTQ_LEN(rx_pkts));
		pp_drop_pktq(rx_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
		    DROP_REASON_FSW_DST_NXPORT_INVALID, __func__, __LINE__);
		return;
	}
	struct kern_pbufpool *dpp = ring->ckr_pp;
	ASSERT(dpp->pp_max_frags > 1);

	struct __kern_packet *pkt, *tpkt;
	/* state for super packet */
	struct __kern_packet *__single spkt = NULL;
	kern_packet_t sph = 0;
	kern_buflet_t __single sbuf = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t spkts = 0, bufcnt = 0;
	int err;

	struct fsw_stats *fsws = &fsw->fsw_stats;

	/* state for buflet batch alloc */
	uint32_t bh_cnt, bh_cnt_tmp;
	uint64_t buf_arr[MAX_BUFLET_COUNT];
	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};
	uint32_t largest_spkt = 0; /* largest aggregated packet size */
	uint32_t agg_bufsize;
	uint8_t iter = 0;
	bool large_buffer = false;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(rx_pkts));

	if (__probable(fe->fe_rx_largest_size != 0 &&
	    NX_FSW_TCP_RX_AGG_ENABLED())) {
		if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) ||
		    PP_BUF_SIZE_LARGE(dpp) == 0) {
			agg_bufsize = PP_BUF_SIZE_DEF(dpp);
		} else {
			agg_bufsize = PP_BUF_SIZE_LARGE(dpp);
			large_buffer = true;
		}
		bh_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
		    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
		DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt);
		bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT);
		bh_cnt_tmp = bh_cnt;
	} else {
		/*
		 * No payload, thus it's all small-sized ACKs/...
		 * OR aggregation is disabled.
		 */
		agg_bufsize = PP_BUF_SIZE_DEF(dpp);
		bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(rx_pkts), MAX_BUFLET_COUNT);
		DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt);
	}

	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP,
	    large_buffer);
	if (__improbable(bh_cnt == 0)) {
		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
		    bh_cnt_tmp, err);
	}
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		ASSERT(pkt->pkt_qum.qum_pp != dpp);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/* supports TCP only */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
		uint16_t data_csum = 0;

		KPKTQ_REMOVE(rx_pkts, pkt);
		rx_bytes -= pkt->pkt_flow_ulen;
		err = flow_pkt_track(fe, pkt, true);
		if (__improbable(err != 0)) {
			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST */
			if (err == ENETRESET) {
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
			    DROP_REASON_FSW_FLOW_TRACK_ERR, 0);
			continue;
		}

		if (is_mbuf) {          /* compat */
			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
			if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				pkt->pkt_pflags |= PKT_F_WAKE_PKT;
			}
		}

		if (prev_csum_ok && sbuf) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);

			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
			    sbuf->buf_dlen >= plen - thlen) {
				/*
				 * No need for a new packet, just append to
				 * the current buflet.
				 */
				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
				    is_ipv4, NULL, sbuf, &data_csum, NULL);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "is wrong");
					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
					/*
					 * Turns out, checksum is wrong!
					 * Fallback to no-agg mode.
					 */
					agg_ok = false;
				} else {
					flow_agg_merge_hdr(&fa, pkt,
					    data_csum, fsws);
					goto next;
				}
			}
		}

		/* calculate number of buflets required */
		bh_cnt_tmp = howmany(plen, agg_bufsize);
		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
			    plen);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
			    DROP_REASON_FSW_GSO_NOMEM_PKT, 0);
			continue;
		}
		if (bh_cnt < bh_cnt_tmp) {
			uint32_t tmp;

			if (iter != 0) {
				/*
				 * rearrange the array for additional
				 * allocation
				 */
				uint8_t i;
				for (i = 0; i < bh_cnt; i++, iter++) {
					buf_arr[i] = buf_arr[iter];
					buf_arr[iter] = 0;
				}
				iter = 0;
			}
			tmp = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
			    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
			tmp = MIN(tmp, MAX_BUFLET_COUNT);
			tmp = MAX(tmp, bh_cnt_tmp);
			tmp -= bh_cnt;
			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
			DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp);
			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
			    &tmp, SKMEM_NOSLEEP, large_buffer);
			bh_cnt += tmp;
			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("buflet alloc failed (err %d)", err);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
				    DROP_REASON_FSW_GSO_NOMEM_PKT, 0);
				continue;
			}
		}
		/* Use pre-allocated buflets */
		ASSERT(bh_cnt >= bh_cnt_tmp);
		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
		while (bh_cnt_tmp-- > 0) {
			/*
			 * -fbounds-safety: buf_arr[iter] is a uint64_t, so
			 * forging it
			 */
			dbuf_array.dba_buflet[bh_cnt_tmp] =
			    __unsafe_forge_single(kern_buflet_t, buf_arr[iter]);
			buf_arr[iter] = 0;
			bh_cnt--;
			iter++;
		}
		/* copy and checksum TCP data */
		if (agg_ok) {
			int added = 0;
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
			    is_ipv4, NULL, sbuf, &data_csum, &added);

			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("Checksum for aggregation on new "
				    "mbuf is wrong");
				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
				agg_ok = false;
				/* reset the used buflets */
				uint8_t j;
				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
					VERIFY(kern_buflet_set_data_length(
						    dbuf_array.dba_buflet[j], 0) == 0);
				}
				goto non_agg;
			}

			/*
			 * There was not enough space in the current buflet,
			 * thus we must have copied into the additional
			 * buffers.
			 */
			VERIFY(added > 0);
		} else {
non_agg:
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
			    &data_csum, is_ipv4);
			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("%d incorrect csum", __LINE__);
				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
			}
		}
		if (agg_ok) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			/* update current packet header */
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt += dbuf_array.dba_num_dbufs;
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);
		} else {
			/* Finalize the current super packet */
			if (sph != 0) {
				finalize_super_packet(&spkt, &sph, &fa,
				    &largest_spkt, &spkts, bufcnt);
			}

			/* New super packet */
			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
			if (__improbable(err != 0)) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("packet alloc failed (err %d)", err);
				_free_dbuf_array(dpp, &dbuf_array);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
				    DROP_REASON_FSW_GSO_NOMEM_PKT, 0);
				continue;
			}
			spkt = SK_PTR_ADDR_KPKT(sph);
			pkt_copy_metadata(pkt, spkt);
			/* Packet length for super packet starts from L3 */
			spkt->pkt_length = plen;
			spkt->pkt_flow_ulen = pkt->pkt_flow_ulen;
			spkt->pkt_headroom = 0;
			spkt->pkt_l2_len = 0;
			spkt->pkt_seg_cnt = 1;

			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt = dbuf_array.dba_num_dbufs;
			sbuf = kern_packet_get_next_buflet(sph, NULL);
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);

			KPKTQ_ENQUEUE(&super_pkts, spkt);
			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
			spkt->pkt_policy_id = fe->fe_policy_id;
			spkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
			spkt->pkt_transport_protocol =
			    fe->fe_transport_protocol;
			flow_agg_init_spkt(fsw, &fa, spkt, pkt);
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	/* Free unused buflets */
	STATS_ADD(fsws, FSW_STATS_RX_WASTED_BFLT, bh_cnt);
	while (bh_cnt > 0) {
		/* -fbounds-safety: buf_arr[iter] is a uint64_t, so forging it */
		pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
		    buf_arr[iter]));
		buf_arr[iter] = 0;
		bh_cnt--;
		iter++;
	}
	/* Finalize the last super packet */
	if (sph != 0) {
		finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
		    &spkts, bufcnt);
	}
	converge_aggregation_size(fe, largest_spkt);
	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
	if (__improbable(is_mbuf)) {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
	} else {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
	}
	FLOW_STATS_IN_ADD(fe, spackets, spkts);

	KPKTQ_FINI(rx_pkts);

	if (KPKTQ_LEN(&super_pkts) > 0) {
		fsw_ring_enqueue_tail_drop(fsw, ring, &super_pkts);
	}
	KPKTQ_FINI(&super_pkts);

	pp_free_pktq(&disposed_pkts);
}

/* streamline an smbuf: prune zero-length mbufs from its chain */
static bool
_finalize_smbuf(struct mbuf *smbuf)
{
	/* the 1st mbuf always contains something, so start with the 2nd one */
	struct mbuf *m_chained = smbuf->m_next;
	struct mbuf *prev_m = smbuf;
	bool freed = false;

	while (m_chained != NULL) {
		if (m_chained->m_len != 0) {
			prev_m = m_chained;
			m_chained = m_chained->m_next;
			continue;
		}
		prev_m->m_next = m_chained->m_next;
		m_free(m_chained);
		m_chained = prev_m->m_next;
		freed = true;
	}
	return freed;
}

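/*
 * Aggregate Rx TCP packets destined for the host (BSD) stack: payloads
 * of consecutive in-order segments of the same flow are coalesced into
 * large mbuf chains, which are then passed up via fsw_host_sendup().
 */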
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, bool is_mbuf)
{
#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt, _reason, _flags)    do {   \
	drop_packets++;                                                   \
	drop_bytes += (_pkt)->pkt_length;                                 \
	pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
	(_pkt) = NULL;                                                    \
	FLOW_AGG_CLEAR(&fa);                                              \
	prev_csum_ok = false;                                             \
} while (0)
	struct flow_agg fa;             /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq disposed_pkts;      /* done src packets */
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_packet *pkt, *tpkt;
	/* points to the first mbuf of the chain */
	struct mbuf *m_chain = NULL;
	/* super mbuf; at the end it points to the last mbuf packet */
	struct mbuf *smbuf = NULL, *curr_m = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t smbufs = 0, smbuf_finalized = 0;
	uint32_t bytes = 0, rcvd_ulen = 0;
	uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
	uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
	uint32_t largest_smbuf = 0;
	int err = 0;

	struct fsw_stats *fsws = &fsw->fsw_stats;
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/* state for mbuf batch alloc */
	uint32_t mhead_cnt = 0;
	uint32_t mhead_bufsize = 0;
	struct mbuf *mhead = NULL;

	uint16_t l2len = KPKTQ_FIRST(rx_pkts)->pkt_l2_len;

	SK_DF(logflags, "Rx input queue bytes %u", rx_bytes);

	if (__probable(!is_mbuf)) {
		/*
		 * Batch mbuf alloc is based on
		 * convert_native_pkt_to_mbuf_chain.
		 */
		if (__probable(fe->fe_rx_largest_size != 0 &&
		    NX_FSW_TCP_RX_AGG_ENABLED())) {
			unsigned int num_segs = 1;
			int pktq_len = KPKTQ_LEN(rx_pkts);

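			/*
			 * Pick the smallest cluster size that covers both
			 * the flow's largest recent super packet and the
			 * average packet size of this batch.
			 */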
			if (fe->fe_rx_largest_size <= MCLBYTES &&
			    rx_bytes / pktq_len <= MCLBYTES) {
				mhead_bufsize = MCLBYTES;
			} else if (fe->fe_rx_largest_size <= MBIGCLBYTES &&
			    rx_bytes / pktq_len <= MBIGCLBYTES) {
				mhead_bufsize = MBIGCLBYTES;
			} else if (fe->fe_rx_largest_size <= M16KCLBYTES &&
			    rx_bytes / pktq_len <= M16KCLBYTES) {
				mhead_bufsize = M16KCLBYTES;
			} else {
				mhead_bufsize = M16KCLBYTES * 2;
				num_segs = 2;
			}

try_again:
			if (rx_bytes != 0) {
				mhead_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
				    MCLBYTES, mhead_bufsize);
			} else {
				/* No payload, thus it's all small-sized ACKs/... */
				mhead_bufsize = MHLEN;
				mhead_cnt = pktq_len;
			}

			mhead = m_allocpacket_internal(&mhead_cnt,
			    mhead_bufsize, &num_segs, M_NOWAIT, 1, 0);

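			/*
			 * On batch-alloc failure, retry with progressively
			 * smaller cluster sizes.
			 */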
			if (mhead == NULL) {
				if (mhead_bufsize > M16KCLBYTES) {
					mhead_bufsize = M16KCLBYTES;
					num_segs = 1;
					goto try_again;
				}

				if (mhead_bufsize == M16KCLBYTES) {
					mhead_bufsize = MBIGCLBYTES;
					goto try_again;
				}

				if (mhead_bufsize == MBIGCLBYTES) {
					mhead_bufsize = MCLBYTES;
					goto try_again;
				}
			}
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
		    mhead_bufsize);
	}

	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		/* Validate l2 len, ip vers, is_mbuf */
		ASSERT(pkt->pkt_l2_len == l2len);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/*
		 * As we only aggregate packets with the same header
		 * length, leverage the packet metadata.
		 */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);

		/*
		 * Rather than calling flow_pkt_track() for each
		 * packet here, we accumulate received packet stats
		 * for the call to flow_track_stats() below.  This
		 * is because flow tracking is a no-op for traffic
		 * that belongs to the host stack.
		 */
		rcvd_ulen += pkt->pkt_flow_ulen;
		rcvd_bytes += pkt->pkt_length;
		rcvd_packets++;

		KPKTQ_REMOVE(rx_pkts, pkt);
		rx_bytes -= pkt->pkt_flow_ulen;

		/* packet is for a BSD flow, create an mbuf chain */
		uint32_t len = (l2len + plen);
		uint16_t data_csum = 0;
		struct mbuf *__single m;
		bool is_wake_pkt = false;
		if (__improbable(is_mbuf)) {
			m = pkt->pkt_mbuf;

			if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				is_wake_pkt = true;
			}

			/* Detach mbuf from source pkt */
			KPKT_CLEAR_MBUF_DATA(pkt);

			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= plen);
			/* Remove the trailer */
			if (trailer > 0) {
				m_adj(m, -trailer);
			}
			if ((uint32_t) m->m_len < (l2len + thlen)) {
				m = m_pullup(m, (l2len + thlen));
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}
			/* attached mbuf is already allocated */
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
		} else {                /* native */
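			/*
			 * Pad so that the L3 header starts on a 4-byte
			 * boundary (e.g. a 14-byte Ethernet header gets
			 * 2 bytes of leading pad).
			 */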
			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
			    l2len;
			uint32_t tot_len = (len + pad);
			/* remember largest aggregated packet size */
			if (smbuf) {
				/* include the alignment pad in the recorded size */
				if (largest_smbuf <
				    (uint32_t)m_pktlen(smbuf) + pad) {
					largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad;
				}
			}

			if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
				is_wake_pkt = true;
			}

			if (prev_csum_ok && curr_m) {
				ASSERT(fa.fa_smbuf == smbuf);
				ASSERT(!fa.fa_sobj_is_pkt);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);

				if (agg_ok &&
				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
					/*
					 * No need for a new mbuf,
					 * just append to curr_m.
					 */
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);

					if (!csum_ok) {
						STATS_INC(fsws,
						    FSW_STATS_RX_AGG_BAD_CSUM);
						SK_ERR("Checksum for "
						    "aggregation is wrong");
						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
						/*
						 * Turns out the checksum is
						 * wrong! Fall back to no-agg
						 * mode.
						 */
						agg_ok = false;
					} else {
						/*
						 * We only added payload,
						 * thus -thlen.
						 */
						bytes += (plen - thlen);
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
						goto next;
					}
				}
			}

			/*
			 * If the batch allocation returned partial success,
			 * or this packet needs a larger buffer, fall back to
			 * a per-packet non-blocking allocation here.
			 */
			m = mhead;
			if (__improbable(m == NULL ||
			    tot_len > mhead_bufsize)) {
				unsigned int num_segs = 1;
				if (tot_len > M16KCLBYTES) {
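					/*
					 * A segment count of 0 presumably
					 * lets the allocator choose how many
					 * chunks to use.
					 */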
					num_segs = 0;
				}

				ASSERT(mhead_cnt == 0 || mhead != NULL);
				err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
				    &num_segs, &m);
				if (err != 0) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf alloc failed (err %d), "
					    "maxchunks %d, len %d", err, num_segs,
					    tot_len);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
			} else {
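				/* take the next mbuf off the batch-allocated chain */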
				ASSERT(mhead_cnt > 0);
				mhead = m->m_nextpkt;
				m->m_nextpkt = NULL;
				mhead_cnt--;
			}
			m->m_data += pad;
			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

			/*
			 * Copy and checksum the l3, l4 headers and payload;
			 * as an optimization, the l2 header is copied later
			 * and only if we can't aggregate.
			 */
			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
			if (agg_ok) {
				int added = 0, dbuf_idx = 0;
				struct mbuf *m_tmp = m;
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "on new mbuf is wrong");
					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
					agg_ok = false;
					goto non_agg;
				}

				/*
				 * There was not enough space in curr_m,
				 * thus we must have added to m->m_data.
				 */
				VERIFY(added > 0);
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);

				/*
				 * We account for whatever we added
				 * to m later on, thus - added.
				 */
				bytes += plen - thlen - added;
			} else {
non_agg:
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				struct mbuf *m_tmp = m;
				int dbuf_idx = 0;
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				m->m_len += l2len;
				m->m_pkthdr.len += l2len;
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
				if (__improbable(!csum_ok)) {
					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("%d incorrect csum", __LINE__);
					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
				}
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
			}

			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);

			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			/*
			 * Note that these flags have the same value,
			 * except PACKET_CSUM_PARTIAL.
			 */
			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
			    PACKET_CSUM_RX_FLAGS);

			/* Set the rcvif */
			m->m_pkthdr.rcvif = fsw->fsw_ifp;

			/* Make sure to propagate the wake pkt flag */
			if (is_wake_pkt) {
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
		ASSERT(m != NULL);
		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
		ASSERT((m->m_flags & M_HASFCS) == 0);
		ASSERT(m->m_nextpkt == NULL);

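		/*
		 * For the compat (mbuf) path the aggregation check runs
		 * only now, after mbuf_csum() has verified this packet's
		 * checksum.
		 */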
		if (__improbable(is_mbuf)) {
			if (prev_csum_ok && csum_ok) {
				ASSERT(fa.fa_smbuf == smbuf);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			}
		}

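		/*
		 * Aggregate: m carries only TCP payload at this point
		 * (headers are stripped below for the compat path), so
		 * merge this packet's headers into the super mbuf and
		 * link m onto the tail of its chain.
		 */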
		if (agg_ok) {
			ASSERT(is_wake_pkt == false);
			ASSERT(fa.fa_smbuf == smbuf);
			ASSERT(!fa.fa_sobj_is_pkt);
			if (__improbable(is_mbuf)) {
				bytes += (m_pktlen(m) - l2len);
				/* adjust mbuf past the l2, l3 and l4 headers */
				m_adj(m, l2len + thlen);
			} else {
				bytes += m_pktlen(m);
			}

			m->m_flags &= ~M_PKTHDR;
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			while (curr_m->m_next != NULL) {
				curr_m = curr_m->m_next;
			}
			curr_m->m_next = m;
			curr_m = m;
			m = NULL;
		} else {
			if ((uint32_t) m->m_len < l2len) {
				m = m_pullup(m, l2len);
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}

			/* copy l2 header for native */
			if (__probable(!is_mbuf)) {
				uint16_t llhoff = pkt->pkt_headroom;
				uint8_t *baddr;
				MD_BUFLET_ADDR_ABS(pkt, baddr);
				ASSERT(baddr != NULL);
				baddr += llhoff;
				pkt_copy(baddr, m_mtod_current(m), l2len);
			}
			/* adjust mbuf by l2 hdr */
			m_adj(m, l2len);
			bytes += m_pktlen(m);

			/*
			 * aggregated packets can be skipped by pktap because
			 * the original pre-aggregated chain already passed through
			 * pktap (see fsw_snoop()) before entering this function.
			 */
			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;

			if (m_chain == NULL) {
				/* this is the start of the chain */
				m_chain = m;
				smbuf = m;
				curr_m = m;
			} else if (smbuf != NULL) {
				/*
				 * set m to be next packet
				 */
				mbuf_agg_log(smbuf, kernproc, is_mbuf);
				smbuf->m_nextpkt = m;
				/*
				 * Clean up (finalize) an smbuf; this only
				 * frees anything if the smbuf pre-allocated
				 * more than one segment, which only happens
				 * when mhead_bufsize > M16KCLBYTES.
				 */
				if (_finalize_smbuf(smbuf)) {
					FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
				}
				smbuf_finalized++;
				smbuf = m;
				curr_m = m;
			} else {
				VERIFY(0);
			}

			smbufs++;
			m = NULL;

			flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
			/*
			 * If the super packet is an mbuf which can't
			 * accommodate sizeof(struct ip_tcp_mask) or
			 * sizeof(struct ip6_tcp_mask) in a single buffer,
			 * then do the aggregation check in the slow path.
			 * Note that on Intel platforms, an mbuf without a
			 * cluster has only 80 bytes available for data.
			 * That means if a packet contains an Ethernet
			 * header, the mbuf won't be able to fully contain
			 * "struct ip_tcp_mask" or "struct ip6_tcp_mask"
			 * data in a single buffer, because
			 * sizeof(struct ip_tcp_mask) and
			 * sizeof(struct ip6_tcp_mask) are both 80 bytes.
			 */
			if (__improbable(smbuf->m_len <
			    ((m_mtod_current(smbuf) - (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) + MASK_SIZE))) {
				fa.fa_sobj_is_short = true;
			}
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	KPKTQ_FINI(rx_pkts);

	/* Free any leftover mbufs; this can only happen for the native path */
	if (__improbable(mhead != NULL)) {
		ASSERT(mhead_cnt != 0);
		STATS_ADD(fsws, FSW_STATS_RX_WASTED_MBUF, mhead_cnt);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	converge_aggregation_size(fe, largest_smbuf);

	if (smbufs > 0) {
		/* Last smbuf */
		mbuf_agg_log(smbuf, kernproc, is_mbuf);
		SK_DF(logflags, "smbuf count %u", smbufs);

		ASSERT(m_chain != NULL);
		ASSERT(smbuf != NULL);

		/*
		 * If the last smbuf needs to be finalized
		 * (mhead_bufsize > M16KCLBYTES) but has not been yet
		 * (smbuf_finalized < smbufs), do it now.
		 */
		if (smbuf_finalized < smbufs &&
		    _finalize_smbuf(smbuf)) {
			FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
		}

		/*
		 * Call fsw_host_sendup() with mbuf chain
		 * directly.
		 */
		mchain_agg_log(m_chain, kernproc, is_mbuf);
		fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);

		if (__improbable(is_mbuf)) {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
		} else {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
		}
		FLOW_STATS_IN_ADD(fe, spackets, smbufs);

		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
	}

	/* record (raw) number of packets and bytes */
	ASSERT((int)(rcvd_bytes - drop_bytes) >= 0);
	ASSERT((int)(rcvd_packets - drop_packets) >= 0);
	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);

	pp_free_pktq(&disposed_pkts);
}

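/*
 * Entry point for Rx TCP aggregation: validates the flow's route, then
 * dispatches to the host-stack (mbuf) or channel (packet) aggregation
 * path based on the flow entry's nexus port.
 */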
void
flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, uint32_t flags)
{
#pragma unused(flags)
	struct pktq dropped_pkts;
	bool is_mbuf;

	if (__improbable((flags & FLOW_PROC_FLAG_FRAGMENTS) != 0)) {
		dp_flow_rx_process(fsw, fe, rx_pkts, rx_bytes, FLOW_PROC_FLAG_FRAGMENTS);
		return;
	}

	KPKTQ_INIT(&dropped_pkts);

	if (!dp_flow_rx_route_process(fsw, fe)) {
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
		    KPKTQ_LEN(&dropped_pkts));
		pp_drop_pktq(&dropped_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
		    DROP_REASON_FSW_FLOW_NONVIABLE, __func__, __LINE__);
		return;
	}

	is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(rx_pkts)));

	if (fe->fe_nx_port == FSW_VP_HOST) {
		boolean_t do_rx_agg;

		/* BSD flow */
		if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
			do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
			    SK_FSW_RX_AGG_TCP_HOST_ON);
		} else {
			do_rx_agg = !dlil_has_ip_filter() &&
			    !dlil_has_if_filter(fsw->fsw_ifp);
		}
		if (__improbable(!do_rx_agg)) {
			fsw_host_rx(fsw, rx_pkts);
			return;
		}
		if (__improbable(pktap_total_tap_count != 0)) {
			fsw_snoop(fsw, fe, rx_pkts, true);
		}
		flow_rx_agg_host(fsw, fe, rx_pkts, rx_bytes, is_mbuf);
	} else {
		/* channel flow */
		if (__improbable(pktap_total_tap_count != 0)) {
			fsw_snoop(fsw, fe, rx_pkts, true);
		}
		flow_rx_agg_channel(fsw, fe, rx_pkts, rx_bytes, is_mbuf);
	}
}