/*
* Copyright (c) 2015-2022 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
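/*
* CL_SKIP_ON(t): bail out of classification when test "t" holds.  It
* logs the failed test plus a bounded dump of the buflet, sets error
* to ENOTSUP and jumps to "done"; per the contract documented below,
* the caller then sends the packet down the regular lookup path
* instead of dropping it.
*/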
#define CL_SKIP_ON(t) \
if (__improbable(t)) { \
SK_ERR("%d: skip " #t, __LINE__); \
SK_ERR("%s %s", if_name(ifp), sk_dump("buf", \
pkt_buf + pkt->pkt_headroom, __packet_get_real_data_length(pkt), \
MIN(128, bdlen), NULL, 0)); \
error = ENOTSUP; \
goto done; \
}
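/*
* CL_SKIP_L4(): stop parsing after the L3 header.  Records the parsed
* IP header length, zeroes the L4 ports in the flow (the TCP and UDP
* fields overlap, see the _CASSERTs below) and reports success; used
* for non-first fragments, which carry no L4 header.
*/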
#define CL_SKIP_L4() \
do { \
pkt->pkt_flow_ip_hlen = l3hlen; \
pkt->pkt_flow_tcp_src = 0; \
pkt->pkt_flow_tcp_dst = 0; \
error = 0; \
goto done; \
} while (0)
/*
* Packet flow parser
*
* Parse a contiguous chunk of packet header fields.
*
* The idea here is that, while the headers are still in the CPU cache,
* we do as much parsing as necessary and store the results in __flow.
*
* We assume that outbound packets from the host (BSD) stack never
* get here, i.e. we only handle channel-based outbound traffic.
*
* @param pkt
*      packet to be classified
* @param ifp
*      associated network interface
* @param af
*      address family
* @param input
*      true for inbound packets, false for (channel-based) outbound packets
*
* @return
* We return ENOTSUP to indicate that we can't classify the packet,
* and that the packet should still be forwarded to the lookup path.
* Any other non-zero value will cause the packet to be dropped.
*
*/
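/*
* Illustrative sketch of the return-value contract above; this is a
* hypothetical caller, not the actual flowswitch call site:
*
*	switch (flow_pkt_classify(pkt, ifp, af, input)) {
*	case 0:
*		// __flow is populated; QUM_F_FLOW_CLASSIFIED is set
*		break;
*	case ENOTSUP:
*		// not classifiable here; still run the flow lookup path
*		break;
*	default:
*		// e.g. EMSGSIZE on output; drop the packet
*		break;
*	}
*/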
int
flow_pkt_classify(struct __kern_packet *pkt, struct ifnet *ifp, sa_family_t af,
bool input)
{
#pragma unused(ifp)
/* these begin at the same offset in the packet, hence the unions */
union {
volatile struct ip *__indexable _iph;
volatile struct ip6_hdr *__indexable _ip6;
} _l3;
#define iph _l3._iph
#define ip6 _l3._ip6
union {
volatile struct tcphdr *_tcph;
volatile struct udphdr *_udph;
} _l4;
#define tcph _l4._tcph
#define udph _l4._udph
uint32_t mtu = ifp->if_mtu;
size_t pkt_len; /* remaining packet length left for parsing */
uint32_t cls_len;
/*
* These lengths are parsed from the packet headers and need to be
* validated incrementally from L3 to L4.
*/
uint8_t l3hlen = 0; /* IP header length */
uint16_t l3tlen = 0; /* total length of IP packet */
uint8_t l4hlen = 0; /* TCP/UDP header length */
uint16_t ulen = 0; /* user data length */
int error = 0;
/* must be 16-byte aligned due to the use of sk_copy* below */
_CASSERT((offsetof(struct __flow, flow_l3) % 16) == 0);
_CASSERT((offsetof(struct __flow, flow_ipv4_src) % 16) == 0);
_CASSERT((offsetof(struct __flow, flow_ipv6_src) % 16) == 0);
_CASSERT((offsetof(struct __flow, flow_l4) % 16) == 0);
_CASSERT((offsetof(struct __flow, flow_tcp_src) % 16) == 0);
_CASSERT((offsetof(struct __flow, flow_udp_src) % 16) == 0);
_CASSERT((offsetof(struct __flow, flow_esp_spi) % 16) == 0);
_CASSERT(sizeof(struct __flow_l3_ipv4_addrs) == 8);
_CASSERT((offsetof(struct __flow_l3_ipv4_addrs, _dst) -
offsetof(struct __flow_l3_ipv4_addrs, _src)) ==
(offsetof(struct ip, ip_dst) - offsetof(struct ip, ip_src)));
_CASSERT(sizeof(struct __flow_l3_ipv6_addrs) == 32);
_CASSERT((offsetof(struct __flow_l3_ipv6_addrs, _dst) -
offsetof(struct __flow_l3_ipv6_addrs, _src)) ==
(offsetof(struct ip6_hdr, ip6_dst) -
offsetof(struct ip6_hdr, ip6_src)));
/* __flow_l4_tcp must mirror tcphdr for the first 16 bytes */
_CASSERT(sizeof(struct __flow_l4_tcp) == 16);
_CASSERT((offsetof(struct __flow_l4_tcp, _dst) -
offsetof(struct __flow_l4_tcp, _src)) ==
(offsetof(struct tcphdr, th_dport) -
offsetof(struct tcphdr, th_sport)));
_CASSERT((offsetof(struct __flow_l4_tcp, _seq) -
offsetof(struct __flow_l4_tcp, _src)) ==
(offsetof(struct tcphdr, th_seq) -
offsetof(struct tcphdr, th_sport)));
_CASSERT((offsetof(struct __flow_l4_tcp, _ack) -
offsetof(struct __flow_l4_tcp, _src)) ==
(offsetof(struct tcphdr, th_ack) -
offsetof(struct tcphdr, th_sport)));
_CASSERT((offsetof(struct __flow_l4_tcp, _flags) -
offsetof(struct __flow_l4_tcp, _src)) ==
(offsetof(struct tcphdr, th_flags) -
offsetof(struct tcphdr, th_sport)));
_CASSERT((offsetof(struct __flow_l4_tcp, _win) -
offsetof(struct __flow_l4_tcp, _src)) ==
(offsetof(struct tcphdr, th_win) -
offsetof(struct tcphdr, th_sport)));
/* ensure the same offsets are used for TCP and UDP */
_CASSERT(sizeof(struct __flow_l4_udp) == 8);
_CASSERT(offsetof(struct __flow, flow_tcp_src) ==
offsetof(struct __flow, flow_udp_src));
_CASSERT(offsetof(struct __flow, flow_tcp_dst) ==
offsetof(struct __flow, flow_udp_dst));
/* parsing starts from l3, count SDU length after l2 header */
ASSERT(pkt->pkt_l2_len <= pkt->pkt_length);
pkt_len = pkt->pkt_length - pkt->pkt_l2_len;
/*
* we restrict the data length available for classification to the
* portion of L3 datagram available in the first buflet.
*/
/*
* compat netif sets the packet length and buflet data length
* metadata to the original length of the packet although the
* actual buffer is limited to NETIF_COMPAT_BUF_SIZE (128 bytes).
*/
uint8_t *pkt_buf, *l3_hdr;
uint32_t bdlen, bdlim, bdoff;
uint32_t pkt_buf_size;
uint8_t *__sized_by(pkt_buf_size) pkt_buf_cpy;
MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
pkt_buf_cpy = pkt_buf;
pkt_buf_size = bdlim;
cls_len = bdlim - bdoff;
cls_len -= pkt->pkt_l2_len;
cls_len = (uint16_t)MIN(cls_len, pkt_len);
VERIFY(pkt_len >= cls_len);
l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
pkt_buf_cpy = l3_hdr;
pkt_buf_size = cls_len;
/* iph and ip6 are a union, so this takes care of the ip6 assignment too */
iph = (volatile struct ip *__indexable)(void *)pkt_buf_cpy;
VERIFY(af != AF_UNSPEC);
pkt->pkt_flow_ip_ver = 0;
/*
* This code is in the hot data path, so we try to be as efficient
* as possible, and hence the use of unrolled loads/stores.
*/
/***************** L3 header (IP/IPv6) *****************/
switch (af) {
case AF_INET:
CL_SKIP_ON(cls_len < sizeof(struct ip));
l3hlen = (uint8_t)(iph->ip_hl << 2);
CL_SKIP_ON(l3hlen < sizeof(struct ip));
CL_SKIP_ON(cls_len < l3hlen);
/* don't allow outgoing channel-based packet with option(s) */
CL_SKIP_ON(!input && l3hlen != sizeof(struct ip));
l3tlen = ntohs(iph->ip_len);
CL_SKIP_ON(l3tlen < l3hlen);
CL_SKIP_ON(pkt_len < l3tlen);
CL_SKIP_ON(iph->ip_v != IPVERSION);
if (__probable(IS_P2ALIGNED(&iph->ip_src, 8))) {
sk_copy64_8(__DECONST(uint64_t *, &iph->ip_src),
(uint64_t *)(void *)&pkt->pkt_flow_ipv4_src);
} else if (IS_P2ALIGNED(&iph->ip_src, 4)) {
uint32_t *src;
uint32_t *dst;
src = (uint32_t *)(void *)(__DEVOLATILE(char *, iph) +
offsetof(struct ip, ip_src));
dst = (uint32_t *__indexable)(&pkt->pkt_flow_ipv4_addrs);
sk_copy32_8(src, dst);
} else {
bcopy(__DECONST(struct __flow_l3_ipv4_addrs *__single, &iph->ip_src),
(struct __flow_l3_ipv4_addrs *__single) &pkt->pkt_flow_ipv4_addrs,
sizeof(struct __flow_l3_ipv4_addrs));
}
pkt->pkt_flow_ip_ver = IPVERSION;
pkt->pkt_flow_ip_proto = iph->ip_p;
pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iph;
if (__improbable(ntohs(iph->ip_off) & ~(IP_DF | IP_RF))) {
pkt->pkt_flow_ip_is_frag = TRUE;
pkt->pkt_flow_ip_frag_id = iph->ip_id;
/* we only parse l4 in the 1st frag */
if ((ntohs(iph->ip_off) & IP_OFFMASK) != 0) {
pkt->pkt_flow_ip_is_first_frag = FALSE;
CL_SKIP_L4();
} else {
pkt->pkt_flow_ip_is_first_frag = TRUE;
}
}
break;
case AF_INET6:
l3hlen = sizeof(struct ip6_hdr);
CL_SKIP_ON(cls_len < l3hlen);
l3tlen = l3hlen + ntohs(ip6->ip6_plen);
CL_SKIP_ON(pkt_len < l3tlen);
CL_SKIP_ON((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION);
struct ipv6addrs {
struct in6_addr src;
struct in6_addr dst;
};
if (__probable(IS_P2ALIGNED(&ip6->ip6_src, 8))) {
uint64_t *src;
uint64_t *dst;
src = (uint64_t *)(void *)(__DEVOLATILE(char *, ip6) +
offsetof(struct ip6_hdr, ip6_src));
dst = (uint64_t *__indexable)(void *)(&pkt->pkt_flow_ipv6_addrs);
sk_copy64_32(src, dst);
} else if (IS_P2ALIGNED(&ip6->ip6_src, 4)) {
uint32_t *src;
uint32_t *dst;
src = (uint32_t *)(void *)(__DEVOLATILE(char *, ip6) +
offsetof(struct ip6_hdr, ip6_src));
dst = (uint32_t *__indexable)(&pkt->pkt_flow_ipv6_addrs);
sk_copy32_32(src, dst);
} else {
bcopy(__DECONST(struct __flow_l3_ipv6_addrs *__single, &ip6->ip6_src),
(struct __flow_l3_ipv6_addrs *__single) &pkt->pkt_flow_ipv6_addrs,
sizeof(struct __flow_l3_ipv6_addrs));
}
pkt->pkt_flow_ip_ver = IPV6_VERSION;
pkt->pkt_flow_ip_proto = ip6->ip6_nxt;
pkt->pkt_flow_ip_hdr = (mach_vm_address_t)ip6;
/* only parse the next immediate extension header for frags */
if (__improbable(ip6->ip6_nxt == IPPROTO_FRAGMENT)) {
volatile struct ip6_frag *ip6f;
ip6f = (volatile struct ip6_frag *)(ip6 + 1);
CL_SKIP_ON(cls_len < l3hlen + sizeof(struct ip6_frag));
pkt->pkt_flow_ip_is_frag = 1;
pkt->pkt_flow_ip_frag_id = ip6f->ip6f_ident;
pkt->pkt_flow_ip_proto = ip6f->ip6f_nxt;
l3hlen += sizeof(struct ip6_frag);
CL_SKIP_ON(l3tlen < l3hlen);
/* we only parse l4 in the 1st frag */
if ((ip6f->ip6f_offlg & IP6F_OFF_MASK) != 0) {
pkt->pkt_flow_ip_is_first_frag = FALSE;
CL_SKIP_L4();
} else {
pkt->pkt_flow_ip_is_first_frag = TRUE;
}
/* process atomic frag as non-frag */
if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
pkt->pkt_flow_ip_is_frag = 0;
}
}
break;
default:
error = ENOTSUP;
goto done;
}
pkt->pkt_flow_ip_hlen = l3hlen;
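/*
* For IP protocols other than TCP and UDP (e.g. ESP, ICMP), only the
* L3 portion of __flow is filled in; classification still succeeds.
*/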
if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_TCP &&
pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
error = 0;
goto done;
}
/**************** L4 header (TCP/UDP) *****************/
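/*
* ulen starts out as the L4 segment length (header plus payload)
* derived from the IP total length; once the L4 header length is
* known, it is reduced to the user data length.
*/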
ulen = (l3tlen - l3hlen);
if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
CL_SKIP_ON((cls_len < l3hlen + sizeof(struct tcphdr)) ||
(ulen < sizeof(struct tcphdr)));
tcph = __DECONST(volatile struct tcphdr *,
(volatile uint8_t *)iph + l3hlen);
l4hlen = (uint8_t)(tcph->th_off << 2);
CL_SKIP_ON(l4hlen < sizeof(*tcph));
CL_SKIP_ON(l4hlen > ulen);
pkt->pkt_flow_tcp_hlen = l4hlen;
pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcph;
} else {
CL_SKIP_ON((cls_len < l3hlen + sizeof(struct udphdr)) ||
(ulen < sizeof(struct udphdr)));
udph = __DECONST(volatile struct udphdr *,
(volatile uint8_t *)iph + l3hlen);
l4hlen = sizeof(*udph);
CL_SKIP_ON(l4hlen > ulen);
pkt->pkt_flow_udp_hlen = l4hlen;
pkt->pkt_flow_udp_hdr = (mach_vm_address_t)udph;
}
if (__probable(!pkt->pkt_flow_ip_is_frag)) {
ulen -= l4hlen;
pkt->pkt_flow_ulen = ulen;
} else {
/*
* We can't determine user data length for fragment until
* it is reassembled.
*/
pkt->pkt_flow_ulen = 0;
}
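/*
* tcph and udph alias the same storage (see the _l4 union above), so
* checking the alignment of tcph->th_sport covers the UDP case too.
*/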
if (__probable(IS_P2ALIGNED(&tcph->th_sport, 4))) {
uint32_t *src;
uint32_t *dst;
if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
src = __unsafe_forge_bidi_indexable(uint32_t *,
__DECONST(uint32_t *, &tcph->th_sport),
sizeof(uint32_t) * 4);
dst = __unsafe_forge_bidi_indexable(uint32_t *,
(uint32_t *)(void *)&pkt->pkt_flow_tcp_src,
sizeof(uint32_t) * 4);
sk_copy32_16(src, dst);
} else {
src = __unsafe_forge_bidi_indexable(uint32_t *,
__DECONST(uint32_t *, &udph->uh_sport),
sizeof(uint32_t) * 2);
dst = __unsafe_forge_bidi_indexable(uint32_t *,
(uint32_t *)(void *) &pkt->pkt_flow_udp_src,
sizeof(uint32_t) * 2);
sk_copy32_8(src, dst);
}
} else {
if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
bcopy(__DECONST(struct __flow_l4_tcp *__single, tcph),
(struct __flow_l4_tcp *__single) &pkt->pkt_flow_tcp,
sizeof(struct __flow_l4_tcp));
} else {
bcopy(__DECONST(struct __flow_l4_udp *__single, udph),
(struct __flow_l4_udp *__single) &pkt->pkt_flow_udp,
sizeof(struct __flow_l4_udp));
}
}
if (!input && pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
pkt->pkt_flow_ulen != 0) {
/*
* Following the logic in tcp_output(), we mark
* this if the payload is non-zero; note that
* the pkt_flow_tcp_seq is in network byte order.
*/
pkt->pkt_pflags |= PKT_F_START_SEQ;
}
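/*
* Both successful classification and the CL_SKIP_* bail-outs jump to
* "done"; any non-zero error (ENOTSUP from CL_SKIP_ON, or EMSGSIZE
* from the output length check) falls through to "fail", which clears
* the flow state before returning.
*/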
done:
if (__probable(error == 0)) {
SK_DF(SK_VERB_FLOW_CLASSIFY, "pkt_length %u l3_ip_len %u "
"l3_ip_ver 0x%x l3_proto %u l4_sport %u l4_dport %u",
pkt->pkt_length, l3tlen, pkt->pkt_flow_ip_ver,
pkt->pkt_flow_ip_proto, ntohs(pkt->pkt_flow_tcp_src),
ntohs(pkt->pkt_flow_tcp_dst));
/* on output, trim metadata length if not same as IP length */
if (!input) {
if (__improbable(pkt->pkt_length != (l3tlen + pkt->pkt_l2_len))) {
SK_ERR("packet is too long (%u), trimming to "
"IP + L2 length (%d)", pkt->pkt_length,
l3tlen + pkt->pkt_l2_len);
METADATA_SET_LEN(pkt, l3tlen + pkt->pkt_l2_len, bdoff);
}
if (__improbable(((pkt->pkt_length > mtu) &&
(pkt->pkt_proto_seg_sz == 0)) ||
(pkt->pkt_proto_seg_sz > mtu))) {
SK_ERR("dropped; length (%u) exceeds MTU (%d) "
" proto_seg_sz %d",
pkt->pkt_length, mtu,
pkt->pkt_proto_seg_sz);
SK_ERR("%s", sk_dump("buf", l3_hdr, cls_len,
128, NULL, 0));
error = EMSGSIZE;
goto fail;
}
}
/*
* Mark QUM_F_FLOW_CLASSIFIED on the packet to indicate
* that the __flow structure has valid info now.
*/
pkt->pkt_qum_qflags |= QUM_F_FLOW_CLASSIFIED;
return 0;
}
fail:
ASSERT(error != 0 && !(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED));
KPKT_CLEAR_FLOW_ALL(pkt->pkt_flow);
return error;
}