/*
* Copyright (c) 2017-2023 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/* $FreeBSD: src/sys/netinet6/frag6.c,v 1.2.2.5 2001/07/03 11:01:50 ume Exp $ */
/* $KAME: frag6.c,v 1.31 2001/05/17 13:45:34 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @file
* flowswitch IP Reassembly for both v4 and v6
*
* Implementation of IPv4/IPv6 fragment reassembly.
*
*/
#include <sys/domain.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#define IPFM_MAX_FRAGS_PER_QUEUE 128 /* RFC 791 64K/(512 min MTU) */
#define IPFM_MAX_QUEUES 1024 /* same as ip/ip6 */
#define IPFM_FRAG_TTL 60 /* RFC 2460 */
#define IPFM_TIMEOUT_TCALL_INTERVAL 1
static uint32_t ipfm_max_frags_per_queue = IPFM_MAX_FRAGS_PER_QUEUE;
static uint32_t ipfm_frag_ttl = IPFM_FRAG_TTL;
static uint32_t ipfm_timeout_tcall_ival = IPFM_TIMEOUT_TCALL_INTERVAL;
#if (DEVELOPMENT || DEBUG)
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
ipfm_max_frags_per_queue, CTLFLAG_RW | CTLFLAG_LOCKED,
&ipfm_max_frags_per_queue, 0, "");
#endif /* !DEVELOPMENT && !DEBUG */
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO, ipfm_frag_ttl,
CTLFLAG_RW | CTLFLAG_LOCKED, &ipfm_frag_ttl, 0, "");
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
ipfm_timeout_tcall_ival, CTLFLAG_RW | CTLFLAG_LOCKED,
&ipfm_timeout_tcall_ival, 0, "");
static LCK_GRP_DECLARE(fsw_ipfm_lock_group, "sk_fsw_ipfm_lock");
static LCK_ATTR_DECLARE(fsw_ipfm_lock_attr, 0, 0);
/* @internal ip fragment wrapper (chained in an ipfq) for __kern_packet */
struct ipf {
struct ipf *ipf_down;
struct ipf *ipf_up;
struct __kern_packet *ipf_pkt;
int ipf_len; /* fragmentable part length */
int ipf_off; /* fragment offset */
uint16_t ipf_mff; /* more fragment bit in frag off */
};
/* @internal ip fragment lookup key */
struct ipf_key {
uint64_t ipfk_addr[4]; /* src + dst ip addr (v4/v6) */
uint32_t ipfk_ident; /* IP identification */
uint16_t ipfk_len; /* len of ipfk_addr field */
};
enum {
IPFK_LEN_V4 = 2 * sizeof(struct in_addr),
IPFK_LEN_V6 = 2 * sizeof(struct in6_addr),
};
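/*
 * @internal key-packing note (derived from the reassembly entry points
 * below): for IPv4 the code copies ip_src and ip_dst, which are
 * contiguous in struct ip, into ipfk_addr, so only the first
 * IPFK_LEN_V4 (8) bytes are significant; ipf_key_cmp() compares
 * ipfk_len before the address bytes, so v4 and v6 keys never match
 * each other even though the rest of ipfk_addr stays uninitialized.
 */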
/*
* @internal
* IP reassembly queue structure. Each fragment (struct ipf)
* being reassembled is attached to one of these structures.
*/
struct ipfq {
struct ipf *ipfq_down; /* fragment chain */
struct ipf *ipfq_up;
struct ipfq *ipfq_next; /* queue chain */
struct ipfq *ipfq_prev;
uint64_t ipfq_timestamp; /* time of creation */
struct ipf_key ipfq_key; /* ipfq search key */
uint16_t ipfq_nfrag; /* # of fragments in queue */
int ipfq_unfraglen; /* len of unfragmentable part */
bool ipfq_is_dirty; /* q is dirty, don't use */
};
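/*
 * @internal layout sketch (derived from the fields above): queues hang
 * off the manager in one circular list, and each queue threads its
 * fragments, sorted by offset, through a second circular list:
 *
 *   ipfm_q <-> ipfq A <-> ipfq B <-> ... <-> ipfm_q      (queue chain)
 *                 |
 *                 +-> ipf off 0 <-> ipf off 1480 <-> (struct ipf *)q
 *
 * Both lists use the head itself as sentinel, which is why the code
 * below casts a struct ipfq * to struct ipf * while walking fragments.
 */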
/*
* @internal (externally opaque)
* flowswitch IP Fragment Manager
*/
struct fsw_ip_frag_mgr {
struct skoid ipfm_skoid;
struct ipfq ipfm_q; /* ip reassembly queues */
uint32_t ipfm_q_limit; /* limit # of reass queues */
uint32_t ipfm_q_count; /* # of allocated reass queues */
uint32_t ipfm_f_limit; /* limit # of ipfs */
uint32_t ipfm_f_count; /* current # of allocated ipfs */
decl_lck_mtx_data(, ipfm_lock); /* guard reass and timeout cleanup */
thread_call_t ipfm_timeout_tcall; /* frag timeout thread */
struct ifnet *ipfm_ifp;
struct fsw_stats *ipfm_stats; /* indirect stats in fsw */
};
static int ipf_process(struct fsw_ip_frag_mgr *, struct __kern_packet **,
struct ipf_key *, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t *,
uint16_t *);
static int ipf_key_cmp(struct ipf_key *, struct ipf_key *);
static void ipf_enq(struct ipf *, struct ipf *);
static void ipf_deq(struct ipf *);
static void ipfq_insque(struct ipfq *, struct ipfq *);
static void ipfq_remque(struct ipfq *);
static uint32_t ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *,
void (*)(struct fsw_ip_frag_mgr *, struct ipf *));
static void ipfq_timeout(thread_call_param_t, thread_call_param_t);
static void ipfq_sched_timeout(struct fsw_ip_frag_mgr *, boolean_t);
static struct ipfq *ipfq_alloc(struct fsw_ip_frag_mgr *mgr);
static void ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q);
static uint32_t ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *));
static struct ipf *ipf_alloc(struct fsw_ip_frag_mgr *mgr);
static void ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f);
static void ipf_free_pkt(struct ipf *f);
static void ipfq_drain(struct fsw_ip_frag_mgr *mgr);
static void ipfq_reap(struct fsw_ip_frag_mgr *mgr);
static int ipfq_drain_sysctl SYSCTL_HANDLER_ARGS;
void ipf_icmp_param_err(struct fsw_ip_frag_mgr *, struct __kern_packet *pkt,
int param);
void ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *, struct ipf *f);
/* Create a flowswitch IP fragment manager. */
struct fsw_ip_frag_mgr *
fsw_ip_frag_mgr_create(struct nx_flowswitch *fsw, struct ifnet *ifp,
size_t f_limit)
{
struct fsw_ip_frag_mgr *mgr;
ASSERT(ifp != NULL);
mgr = sk_alloc_type(struct fsw_ip_frag_mgr, Z_WAITOK | Z_NOFAIL,
skmem_tag_fsw_frag_mgr);
mgr->ipfm_q.ipfq_next = mgr->ipfm_q.ipfq_prev = &mgr->ipfm_q;
lck_mtx_init(&mgr->ipfm_lock, &fsw_ipfm_lock_group, &fsw_ipfm_lock_attr);
mgr->ipfm_timeout_tcall =
thread_call_allocate_with_options(ipfq_timeout, mgr,
THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
VERIFY(mgr->ipfm_timeout_tcall != NULL);
mgr->ipfm_ifp = ifp;
mgr->ipfm_stats = &fsw->fsw_stats;
/* Use the caller-provided limit (the caller knows the pool size) */
ASSERT(f_limit >= 2 && f_limit < UINT32_MAX);
mgr->ipfm_f_limit = (uint32_t)f_limit;
mgr->ipfm_f_count = 0;
mgr->ipfm_q_limit = MIN(IPFM_MAX_QUEUES, mgr->ipfm_f_limit / 2);
mgr->ipfm_q_count = 0;
skoid_create(&mgr->ipfm_skoid, SKOID_DNODE(fsw->fsw_skoid), "ipfm", 0);
skoid_add_uint(&mgr->ipfm_skoid, "frag_limit", CTLFLAG_RW,
&mgr->ipfm_f_limit);
skoid_add_uint(&mgr->ipfm_skoid, "frag_count", CTLFLAG_RD,
&mgr->ipfm_f_count);
skoid_add_uint(&mgr->ipfm_skoid, "queue_limit", CTLFLAG_RW,
&mgr->ipfm_q_limit);
skoid_add_uint(&mgr->ipfm_skoid, "queue_count", CTLFLAG_RD,
&mgr->ipfm_q_count);
skoid_add_handler(&mgr->ipfm_skoid, "drain", CTLFLAG_RW,
ipfq_drain_sysctl, mgr, 0);
return mgr;
}
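/*
 * Sizing example (hypothetical numbers): a caller-provided f_limit of
 * 4096 yields ipfm_q_limit == MIN(IPFM_MAX_QUEUES, 4096 / 2) == 1024;
 * both limits are tunable at runtime through the "ipfm" skoid subtree
 * registered above.
 */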
/* Free a flowswitch IP fragment manager. */
void
fsw_ip_frag_mgr_destroy(struct fsw_ip_frag_mgr *mgr)
{
thread_call_t __single tcall;
lck_mtx_lock(&mgr->ipfm_lock);
if ((tcall = mgr->ipfm_timeout_tcall) != NULL) {
lck_mtx_unlock(&mgr->ipfm_lock);
(void) thread_call_cancel_wait(tcall);
(void) thread_call_free(tcall);
mgr->ipfm_timeout_tcall = NULL;
lck_mtx_lock(&mgr->ipfm_lock);
}
ipfq_drain(mgr);
lck_mtx_unlock(&mgr->ipfm_lock);
lck_mtx_destroy(&mgr->ipfm_lock, &fsw_ipfm_lock_group);
skoid_destroy(&mgr->ipfm_skoid);
sk_free_type(struct fsw_ip_frag_mgr, mgr);
}
/*
* Reassemble a received IPv4 fragment.
*
* @param mgr
* fragment manager
* @param pkt
* received packet (must have ipv4 header validated)
* @param ip4
* pointer to the packet's IPv4 header
* @param nfrags
* number of fragments reassembled
* @param tlen
* incremented by the payload length of each chained fragment after the first
* @return
* Successfully processed (not fully reassembled):
* ret = 0, *pkt = NULL (ipfm owns it), *nfrags = 0
* Successfully reassembled:
* ret = 0, *pkt = 1st fragment (fragments chained in order by pkt_nextpkt),
* *nfrags = number of all fragments (> 0)
* Error:
* ret != 0, *pkt unmodified (caller decides what to do with *pkt),
* *nfrags = 0
*/
int
fsw_ip_frag_reass_v4(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
struct ip *ip4, uint16_t *nfrags, uint16_t *tlen)
{
struct ipf_key key;
uint16_t unfragpartlen, offflag, fragoff, fragpartlen, fragflag;
int err;
uint8_t *src;
STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V4);
src = (uint8_t *)(struct ip *__bidi_indexable)ip4 +
offsetof(struct ip, ip_src);
bcopy(src, (void *)key.ipfk_addr, IPFK_LEN_V4);
key.ipfk_len = IPFK_LEN_V4;
key.ipfk_ident = (uint32_t)ip4->ip_id;
unfragpartlen = (uint16_t)(ip4->ip_hl << 2);
offflag = ntohs(ip4->ip_off);
fragoff = (uint16_t)(offflag << 3);
fragpartlen = ntohs(ip4->ip_len) - (uint16_t)(ip4->ip_hl << 2);
fragflag = offflag & IP_MF;
err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
fragflag, nfrags, tlen);
/*
* If packet has been reassembled compute the user data length.
*/
if (*pkt != NULL) {
struct __kern_packet *p = *pkt;
struct ip *__single iph = __unsafe_forge_single(struct ip *,
(struct ip *)p->pkt_flow_ip_hdr);
p->pkt_flow_ulen = ntohs(iph->ip_len) -
p->pkt_flow_ip_hlen - p->pkt_flow->flow_l4._l4_hlen;
}
return err;
}
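/*
 * Usage sketch (hypothetical caller, illustrative only and not part of
 * the build): the three outcomes documented above would typically be
 * handled along these lines.
 */
#if 0
uint16_t nfrags, tlen = 0;
int err;

err = fsw_ip_frag_reass_v4(mgr, &pkt, ip4, &nfrags, &tlen);
if (err != 0) {
/* error: the caller still owns pkt and decides its fate */
pp_free_packet(__DECONST(struct kern_pbufpool *,
pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
} else if (pkt == NULL) {
/* accepted: ipfm holds this fragment until its peers arrive */
} else {
/* reassembled: walk the nfrags packets chained by pkt_nextpkt */
}
#endif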
/*
* Reassemble a received IPv6 fragment.
*
* @param mgr
* fragment manager
* @param pkt
* received packet (must have ipv6 header validated)
* @param ip6
* pointer to the packet's IPv6 header
* @param ip6f
* pointer to the packet's IPv6 Fragment Header
* @param nfrags
* number of fragments reassembled
* @param tlen
* incremented by the payload length of each chained fragment after the first
* @return
* Successfully processed (not fully reassembled):
* ret = 0, *pkt = NULL (ipfm owns it), *nfrags = 0
* Successfully reassembled:
* ret = 0, *pkt = 1st fragment (fragments chained in order by pkt_nextpkt),
* *nfrags = number of all fragments (> 0)
* Error:
* ret != 0, *pkt unmodified (caller decides what to do with *pkt),
* *nfrags = 0
*/
int
fsw_ip_frag_reass_v6(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
struct ip6_hdr *ip6, struct ip6_frag *ip6f, uint16_t *nfrags,
uint16_t *tlen)
{
struct ipf_key key;
ptrdiff_t ip6f_ptroff = (uintptr_t)ip6f - (uintptr_t)ip6;
uint16_t ip6f_off, fragoff, fragpartlen, unfragpartlen, fragflag;
int err;
uint8_t *src;
STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V6);
/* jumbo payload can't contain a fragment header */
if (ip6->ip6_plen == 0) {
*nfrags = 0;
return ERANGE;
}
ASSERT(ip6f_ptroff < UINT16_MAX);
ip6f_off = (uint16_t)ip6f_ptroff;
fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
fragpartlen = ntohs(ip6->ip6_plen) -
(ip6f_off + sizeof(struct ip6_frag) - sizeof(struct ip6_hdr));
unfragpartlen = ip6f_off;
fragflag = ip6f->ip6f_offlg & IP6F_MORE_FRAG;
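/*
 * Worked example (assuming the fragment header immediately follows the
 * base header): ip6f_off == sizeof(struct ip6_hdr) == 40, so
 * unfragpartlen == 40; with ip6_plen == 1456 the fragmentable part is
 * 1456 - (40 + 8 - 40) == 1448 bytes.
 */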
/*
* RFC 6946: Handle "atomic" fragments (offset and m bit set to 0)
* upfront, unrelated to any reassembly.
*
* Flow classifier should process those as non-frag, ipfm shouldn't see
* them.
*/
ASSERT((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) != 0);
src = (uint8_t *)(struct ip6_hdr *__bidi_indexable)ip6 +
offsetof(struct ip6_hdr, ip6_src);
bcopy(src, (void *)key.ipfk_addr, IPFK_LEN_V6);
key.ipfk_len = IPFK_LEN_V6;
key.ipfk_ident = ip6f->ip6f_ident;
err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
fragflag, nfrags, tlen);
/*
* If packet has been reassembled compute the user data length.
*/
if (*pkt != NULL) {
struct __kern_packet *p = *pkt;
struct ip6_hdr *__single ip6h = __unsafe_forge_single(struct ip6_hdr *,
(struct ip6_hdr *)p->pkt_flow_ip_hdr);
p->pkt_flow_ulen = ntohs(ip6h->ip6_plen) -
p->pkt_flow->flow_l4._l4_hlen;
}
return err;
}
static struct mbuf *
ipf_pkt2mbuf(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt)
{
unsigned int one = 1;
struct mbuf *__single m = NULL;
struct mbuf *pkt_mbuf = pkt->pkt_mbuf;
uint8_t *buf;
struct ip6_hdr *ip6;
uint32_t l3t_len;
int err;
l3t_len = pkt->pkt_length - pkt->pkt_l2_len;
if (pkt->pkt_link_flags & PKT_LINKF_ETHFCS) {
l3t_len -= ETHER_CRC_LEN;
}
err = mbuf_allocpacket(MBUF_WAITOK, l3t_len, &one, &m);
VERIFY(err == 0);
ASSERT(l3t_len <= mbuf_maxlen(m));
if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
if ((pkt_mbuf->m_len < l3t_len) &&
(pkt_mbuf = m_pullup(pkt->pkt_mbuf, l3t_len)) == NULL) {
/* m_pullup freed the original chain; don't leak our new mbuf */
m_freem(m);
return NULL;
} else {
pkt->pkt_mbuf = pkt_mbuf;
bcopy(m_mtod_current(pkt->pkt_mbuf) + pkt->pkt_l2_len,
m_mtod_current(m), l3t_len);
}
} else {
MD_BUFLET_ADDR_ABS(pkt, buf);
buf += (pkt->pkt_headroom + pkt->pkt_l2_len);
bcopy(buf, m_mtod_current(m), l3t_len);
}
m->m_pkthdr.len = m->m_len = l3t_len;
ip6 = mtod(m, struct ip6_hdr *);
/* note on the cast: the IN6_IS_SCOPE_* macros don't require alignment */
if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)(uintptr_t)&ip6->ip6_src)) {
if (in6_embedded_scope) {
ip6->ip6_src.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
}
ip6_output_setsrcifscope(m, mgr->ipfm_ifp->if_index, NULL);
}
if (IN6_IS_SCOPE_EMBED((struct in6_addr *)(uintptr_t)&ip6->ip6_dst)) {
if (in6_embedded_scope) {
ip6->ip6_dst.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
}
ip6_output_setdstifscope(m, mgr->ipfm_ifp->if_index, NULL);
}
return m;
}
/*
* Since this function can be called while holding fsw_ip_frag_mgr.ipfm_lock,
* we need to ensure we don't enter the driver directly because a deadlock
* can happen if this same thread tries to get the workloop lock.
*/
static void
ipf_icmp6_error_flag(struct mbuf *m, int type, int code, int param, int flags)
{
sk_protect_t protect = sk_async_transmit_protect();
icmp6_error_flag(m, type, code, param, flags);
sk_async_transmit_unprotect(protect);
}
/*
* @internal IP fragment ICMP parameter problem error handling
*
* @param param
* offending parameter offset, only applicable to ICMPv6
*/
void
ipf_icmp_param_err(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt,
int param_offset)
{
if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
return;
}
struct mbuf *m = NULL;
m = ipf_pkt2mbuf(mgr, pkt);
if (__probable(m != NULL)) {
ipf_icmp6_error_flag(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
param_offset, 0);
}
/* m will be freed by icmp6_error_flag() */
}
/* @internal IP fragment ICMP timeout error handling */
void
ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
{
struct __kern_packet *pkt = f->ipf_pkt;
ASSERT(pkt != NULL);
/* no icmp error packet for ipv4 */
if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
return;
}
/* only for the first fragment */
if (f->ipf_off != 0) {
return;
}
struct mbuf *m = NULL;
m = ipf_pkt2mbuf(mgr, pkt);
if (__probable(m != NULL)) {
ipf_icmp6_error_flag(m, ICMP6_TIME_EXCEEDED,
ICMP6_TIME_EXCEED_REASSEMBLY, 0, 0);
}
/* m will be freed by icmp6_error_flag() */
}
/* @internal IP fragment processing, v4/v6 agnostic */
int
ipf_process(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt_ptr,
struct ipf_key *key, uint16_t unfraglen, uint16_t fragoff,
uint16_t fragpartlen, uint16_t fragflag, uint16_t *nfrags, uint16_t *tlen)
{
struct __kern_packet *pkt = *pkt_ptr;
struct __kern_packet *pkt_reassed = NULL;
struct ipfq *q, *mq = &mgr->ipfm_q;
struct ipf *f, *f_new, *f_down;
uint32_t nfrags_freed;
int next;
int first_frag = 0;
int err = 0;
int local_ipfq_unfraglen;
*nfrags = 0;
SK_DF(SK_VERB_IP_FRAG, "id %5d fragoff %5d fragpartlen %5d "
"fragflag 0x%x", key->ipfk_ident, fragoff, fragpartlen, fragflag);
/*
* Make sure that all fragments except last one have a data length
* that's a non-zero multiple of 8 bytes.
*/
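/* e.g. 1448 passes (181 * 8); 1452 is rejected (1452 & 0x7 == 4) */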
if (fragflag && (fragpartlen == 0 || (fragpartlen & 0x7) != 0)) {
SK_DF(SK_VERB_IP_FRAG, "frag not multiple of 8 bytes");
STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_BAD_LEN);
ipf_icmp_param_err(mgr, pkt,
offsetof(struct ip6_hdr, ip6_plen));
return ERANGE;
}
lck_mtx_lock(&mgr->ipfm_lock);
/* find ipfq */
for (q = mq->ipfq_next; q != mq; q = q->ipfq_next) {
if (ipf_key_cmp(key, &q->ipfq_key) == 0) {
if (q->ipfq_is_dirty) {
SK_DF(SK_VERB_IP_FRAG, "found dirty q, skip");
err = EINVAL;
goto done;
}
break;
}
}
/* not found, create new ipfq */
if (q == mq) {
first_frag = 1;
q = ipfq_alloc(mgr);
if (q == NULL) {
STATS_INC(mgr->ipfm_stats,
FSW_STATS_RX_FRAG_DROP_NOMEM);
err = ENOMEM;
goto done;
}
ipfq_insque(q, mq);
net_update_uptime();
bcopy(key, &q->ipfq_key, sizeof(struct ipf_key));
q->ipfq_down = q->ipfq_up = (struct ipf *)q;
q->ipfq_unfraglen = -1; /* The 1st fragment has not arrived. */
q->ipfq_nfrag = 0;
q->ipfq_timestamp = _net_uptime;
}
ASSERT(!q->ipfq_is_dirty);
/* this queue has reached per queue frag limit */
if (q->ipfq_nfrag > ipfm_max_frags_per_queue) {
nfrags_freed = ipfq_freefq(mgr, q, NULL);
STATS_ADD(mgr->ipfm_stats,
FSW_STATS_RX_FRAG_DROP_PER_QUEUE_LIMIT, nfrags_freed);
err = ENOMEM;
goto done;
}
local_ipfq_unfraglen = q->ipfq_unfraglen;
/*
* If it's the 1st fragment, record the length of the
* unfragmentable part and the next header of the fragment header.
* Assume the first fragment to arrive will be correct.
* We do not have any duplicate checks here yet so another packet
* with fragoff == 0 could come and overwrite the ipfq_unfraglen
* and worse, the next header, at any time.
*/
if (fragoff == 0 && local_ipfq_unfraglen == -1) {
local_ipfq_unfraglen = unfraglen;
}
/* Check that the reassembled packet would not exceed 65535 bytes. */
if (local_ipfq_unfraglen + fragoff + fragpartlen > IP_MAXPACKET) {
SK_DF(SK_VERB_IP_FRAG, "frag too big");
STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
ipf_icmp_param_err(mgr, pkt, sizeof(struct ip6_hdr) +
offsetof(struct ip6_frag, ip6f_offlg));
err = ERANGE;
goto done;
}
/*
* If it's the 1st fragment, do the above check for each
* fragment already stored in the reassembly queue.
* If an error is found, still return 0, since we don't return
* ownership of a chain of offending packets back to caller.
*/
if (fragoff == 0) {
for (f = q->ipfq_down; f != (struct ipf *)q; f = f_down) {
f_down = f->ipf_down;
if (local_ipfq_unfraglen + f->ipf_off + f->ipf_len >
IP_MAXPACKET) {
SK_DF(SK_VERB_IP_FRAG, "frag too big");
STATS_INC(mgr->ipfm_stats,
FSW_STATS_RX_FRAG_BAD);
ipf_deq(f);
ipf_free_pkt(f);
ipf_free(mgr, f);
}
}
}
f_new = ipf_alloc(mgr);
if (f_new == NULL) {
STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_NOMEM);
err = ENOMEM;
goto done;
}
f_new->ipf_mff = fragflag;
f_new->ipf_off = fragoff;
f_new->ipf_len = fragpartlen;
f_new->ipf_pkt = pkt;
if (first_frag) {
f = (struct ipf *)q;
goto insert;
}
/* Find a segment which begins after this one does. */
for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
if (f->ipf_off > f_new->ipf_off) {
break;
}
}
/*
* If any of the fragments being reassembled overlap with any
* other fragments being reassembled for the same packet,
* reassembly of that packet must be abandoned and all the
* fragments that have been received for that packet must be
* discarded, and no ICMP error messages should be sent.
*
* It should be noted that fragments may be duplicated in the
* network. Instead of treating these exact duplicate fragments
* as overlapping fragments, an implementation may choose to
* detect this case and drop exact duplicate fragments while
* keeping the other fragments belonging to the same packet.
*
* https://tools.ietf.org/html/rfc8200#appendix-B
*
* We apply this rule to both IPv4 and IPv6 here.
*/
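/*
 * Worked example: with an existing fragment at offset 0, length 1448,
 * a new fragment at offset 1440 gives 0 + 1448 - 1440 == 8 > 0 below,
 * so the queue is torn down and marked dirty; an exact second copy of
 * the offset-0/length-1448 fragment instead hits the duplicate case
 * and only the duplicate is dropped.
 */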
if (((f->ipf_up != (struct ipf *)q) && /* prev frag spans into f_new */
(f->ipf_up->ipf_off + f->ipf_up->ipf_len - f_new->ipf_off > 0)) ||
((f != (struct ipf *)q) && /* f_new spans into next */
(f_new->ipf_off + f_new->ipf_len - f->ipf_off > 0))) {
STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
/*
 * Abandon the whole queue unless this is an exact duplicate
 * (same offset and length) of an existing fragment.
 */
if (((f->ipf_up != (struct ipf *)q) &&
((f->ipf_up->ipf_off != f_new->ipf_off) ||
(f->ipf_up->ipf_len != f_new->ipf_len))) ||
((f != (struct ipf *)q) &&
((f->ipf_off != f_new->ipf_off) ||
(f->ipf_len != f_new->ipf_len)))) {
SK_DF(SK_VERB_IP_FRAG, "frag overlap");
ipf_free(mgr, f_new);
/* give up the overlapping fragments' queue */
SK_DF(SK_VERB_IP_FRAG, "free overlapping queue");
ipfq_freef(mgr, q, NULL);
q->ipfq_is_dirty = true;
} else {
ipf_free(mgr, f_new);
SK_DF(SK_VERB_IP_FRAG, "frag dup");
}
err = ERANGE;
goto done;
}
insert:
q->ipfq_unfraglen = local_ipfq_unfraglen;
/*
* Stick new segment in its place;
* check for complete reassembly.
* Move to front of packet queue, as we are
* the most recently active fragmented packet.
*/
ipf_enq(f_new, f->ipf_up);
q->ipfq_nfrag++;
next = 0;
for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
/* there is a hole */
if (f->ipf_off != next) {
goto done;
}
next += f->ipf_len;
}
/* we haven't got last frag yet */
if (f->ipf_up->ipf_mff) {
goto done;
}
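/*
 * Example: fragments at offsets 0, 1448 and 2896, each 1448 bytes
 * long, leave no hole (next advances 0 -> 1448 -> 2896 -> 4344), so
 * reassembly completes once the last one arrives with ipf_mff clear.
 */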
/*
* Reassembly is complete; concatenate fragments.
*/
f = q->ipfq_down;
f_down = f->ipf_down;
pkt_reassed = f->ipf_pkt;
*nfrags = 1;
while (f_down != (struct ipf *)q) {
/* chain __kern_packet with pkt_nextpkt ptr */
f->ipf_pkt->pkt_nextpkt = f_down->ipf_pkt;
(*nfrags)++;
(*tlen) += f_down->ipf_len;
f_down = f->ipf_down;
ipf_deq(f);
ipf_free(mgr, f);
f = f_down;
f_down = f->ipf_down;
}
ipf_free(mgr, f);
err = 0;
STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_REASSED);
ipfq_remque(q);
ipfq_free(mgr, q);
done:
/* on success, ipfm either takes ownership (*pkt_ptr = NULL) or returns the reassembled chain */
if (err == 0) {
/* reass'ed packet if done; NULL otherwise */
*pkt_ptr = pkt_reassed;
}
ipfq_sched_timeout(mgr, FALSE);
lck_mtx_unlock(&mgr->ipfm_lock);
return err;
}
static int
ipf_key_cmp(struct ipf_key *a, struct ipf_key *b)
{
int d;
if ((d = (a->ipfk_len - b->ipfk_len)) != 0) {
return d;
}
if ((d = (a->ipfk_ident - b->ipfk_ident)) != 0) {
return d;
}
return memcmp(a->ipfk_addr, b->ipfk_addr, a->ipfk_len);
}
/*
* Put an ip fragment on a reassembly chain.
* Like insque, but pointers in middle of structure.
*/
static void
ipf_enq(struct ipf *f, struct ipf *up6)
{
f->ipf_up = up6;
f->ipf_down = up6->ipf_down;
up6->ipf_down->ipf_up = f;
up6->ipf_down = f;
}
/*
* To ipf_enq as remque is to insque.
*/
static void
ipf_deq(struct ipf *f)
{
f->ipf_up->ipf_down = f->ipf_down;
f->ipf_down->ipf_up = f->ipf_up;
}
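/*
 * Pointer sketch: ipf_enq(f, up6) splices f directly below up6, so
 * inserting into "A <-> B" with up6 == A yields "A <-> f <-> B";
 * ipf_deq(f) undoes it. ipf_process() passes f->ipf_up as the anchor,
 * placing the new fragment just before the first one that starts past
 * it, which keeps each chain sorted by offset.
 */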
static void
ipfq_insque(struct ipfq *new, struct ipfq *old)
{
new->ipfq_prev = old;
new->ipfq_next = old->ipfq_next;
old->ipfq_next->ipfq_prev = new;
old->ipfq_next = new;
}
static void
ipfq_remque(struct ipfq *p6)
{
p6->ipfq_prev->ipfq_next = p6->ipfq_next;
p6->ipfq_next->ipfq_prev = p6->ipfq_prev;
}
/*
* @internal drain reassembly queues (oldest first) until the target queue count is reached.
*/
static void
_ipfq_reap(struct fsw_ip_frag_mgr *mgr, uint32_t target_q_count,
void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
uint32_t n_freed = 0;
LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
SK_DF(SK_VERB_IP_FRAG, "draining (frag %d/%d queue %d/%d)",
mgr->ipfm_f_count, mgr->ipfm_f_limit, mgr->ipfm_q_count,
mgr->ipfm_q_limit);
while (mgr->ipfm_q.ipfq_next != &mgr->ipfm_q &&
mgr->ipfm_q_count > target_q_count) {
n_freed += ipfq_freefq(mgr, mgr->ipfm_q.ipfq_prev,
mgr->ipfm_q.ipfq_prev->ipfq_is_dirty ? NULL : ipf_cb);
}
STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_REAPED, n_freed);
}
/*
* @internal reap half of the reassembly queues to make room for newer fragments.
*/
static void
ipfq_reap(struct fsw_ip_frag_mgr *mgr)
{
_ipfq_reap(mgr, mgr->ipfm_q_count / 2, ipf_icmp_timeout_err);
}
/*
* @internal reap all reassembly queues, for shutdown etc.
*/
static void
ipfq_drain(struct fsw_ip_frag_mgr *mgr)
{
_ipfq_reap(mgr, 0, NULL);
}
static void
ipfq_timeout(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
struct fsw_ip_frag_mgr *__single mgr = arg0;
struct ipfq *q;
uint64_t now, elapsed;
uint32_t n_freed = 0;
net_update_uptime();
now = _net_uptime;
SK_DF(SK_VERB_IP_FRAG, "run");
lck_mtx_lock(&mgr->ipfm_lock);
q = mgr->ipfm_q.ipfq_next;
if (q) {
while (q != &mgr->ipfm_q) {
q = q->ipfq_next;
elapsed = now - q->ipfq_prev->ipfq_timestamp;
if (elapsed > ipfm_frag_ttl) {
SK_DF(SK_VERB_IP_FRAG, "timing out q id %5d",
q->ipfq_prev->ipfq_key.ipfk_ident);
n_freed += ipfq_freefq(mgr, q->ipfq_prev,
q->ipfq_prev->ipfq_is_dirty ? NULL :
ipf_icmp_timeout_err);
}
}
}
STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_TIMEOUT, n_freed);
/* If running out of resources, drain ipfm queues (oldest one first) */
if (mgr->ipfm_f_count >= mgr->ipfm_f_limit ||
mgr->ipfm_q_count >= mgr->ipfm_q_limit) {
ipfq_reap(mgr);
}
/* re-arm the purge timer if there's work to do */
if (mgr->ipfm_q_count > 0) {
ipfq_sched_timeout(mgr, TRUE);
}
lck_mtx_unlock(&mgr->ipfm_lock);
}
static void
ipfq_sched_timeout(struct fsw_ip_frag_mgr *mgr, boolean_t in_tcall)
{
uint32_t delay = MAX(1, ipfm_timeout_tcall_ival); /* seconds */
thread_call_t __single tcall = mgr->ipfm_timeout_tcall;
uint64_t now = mach_absolute_time();
uint64_t ival, deadline = now;
LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
ASSERT(tcall != NULL);
if (mgr->ipfm_q_count > 0 &&
(!thread_call_isactive(tcall) || in_tcall)) {
nanoseconds_to_absolutetime(delay * NSEC_PER_SEC, &ival);
clock_deadline_for_periodic_event(ival, now, &deadline);
(void) thread_call_enter_delayed(tcall, deadline);
}
}
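/*
 * Timing sketch: with the default ipfm_timeout_tcall_ival of 1, this
 * arms a one-shot thread call roughly one second out; ipfq_timeout()
 * then re-enters with in_tcall == TRUE to keep the purge timer running
 * for as long as any reassembly queue exists.
 */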
static int
ipfq_drain_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
struct fsw_ip_frag_mgr *__single mgr = arg1;
SKOID_PROC_CALL_GUARD;
lck_mtx_lock(&mgr->ipfm_lock);
ipfq_drain(mgr);
lck_mtx_unlock(&mgr->ipfm_lock);
return 0;
}
static struct ipfq *
ipfq_alloc(struct fsw_ip_frag_mgr *mgr)
{
struct ipfq *q;
if (mgr->ipfm_q_count > mgr->ipfm_q_limit) {
ipfq_reap(mgr);
}
ASSERT(mgr->ipfm_q_count <= mgr->ipfm_q_limit);
q = kalloc_type(struct ipfq, Z_WAITOK | Z_ZERO);
if (q != NULL) {
mgr->ipfm_q_count++;
q->ipfq_is_dirty = false;
}
return q;
}
/* free q */
static void
ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q)
{
kfree_type(struct ipfq, q);
mgr->ipfm_q_count--;
}
/*
* Free all fragments, keep q.
* @return: number of frags freed
*/
static uint32_t
ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
struct ipf *f, *down6;
uint32_t nfrags = 0;
for (f = q->ipfq_down; f != (struct ipf *)q; f = down6) {
nfrags++;
down6 = f->ipf_down;
ipf_deq(f);
if (ipf_cb != NULL) {
(*ipf_cb)(mgr, f);
}
ipf_free_pkt(f);
ipf_free(mgr, f);
}
return nfrags;
}
/*
 * Free all fragments and the queue itself.
 * @return: number of frags freed
 */
static uint32_t
ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
uint32_t freed_count;
freed_count = ipfq_freef(mgr, q, ipf_cb);
ipfq_remque(q);
ipfq_free(mgr, q);
return freed_count;
}
static struct ipf *
ipf_alloc(struct fsw_ip_frag_mgr *mgr)
{
struct ipf *f;
if (mgr->ipfm_f_count > mgr->ipfm_f_limit) {
STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_FRAG_LIMIT);
return NULL;
}
f = kalloc_type(struct ipf, Z_WAITOK | Z_ZERO);
if (f != NULL) {
mgr->ipfm_f_count++;
}
return f;
}
static void
ipf_free_pkt(struct ipf *f)
{
struct __kern_packet *pkt = f->ipf_pkt;
ASSERT(pkt != NULL);
pp_free_packet(__DECONST(struct kern_pbufpool *, pkt->pkt_qum.qum_pp),
SK_PTR_ADDR(pkt));
}
static void
ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
{
kfree_type(struct ipf, f);
mgr->ipfm_f_count--;
}