/* This is xnu-11215.1.10. See this file in: */
/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * The flowidns (Flow ID namespace) module provides functionality to allocate
 * a globally unique identifier for a flow.
 * Currently we have four modules (flowswitch, inpcb, PF & IPSec driver) in our
 * stack which need to generate flow identifiers. These modules stamp every
 * outgoing packet with a flowID. This flowID can be used by other upstream
 * components in the device for flow classification purposes. For example, the
 * FQ-Codel algorithm relies on this per-packet flowID to avoid parsing every
 * packet header for flow classification. A globally unique flowID can also be
 * used by the networking feature offload engines operating at the link layer
 * to avoid flow classification operations.
 * For performance reasons we use the concept of a flow domain, and the
 * data structures used by the flowidns module are instantiated per domain.
 * These domains represent the above-mentioned four modules generating the
 * flowID. This allows us to avoid using a global lock while allocating and
 * releasing flowIDs. A flowID is a 32-bit unsigned integer and the 2 most
 * significant bits of the flowID are used to encode the domain ID. This
 * encoding also means that the flowID generator only needs to ensure
 * uniqueness of an identifier within a domain.
 */

#include <skywalk/os_skywalk.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/namespace/flowidns.h>
#include <dev/random/randomdev.h>
#include <sys/sdt.h>

/* maximum number of flowID generation retries in case of collision */
#define FLOWIDNS_MAX_FLOWID_GEN_RETRY  5

/*
 * 2 most significant bits of the flowID are used to encode the flow domain.
 *
 * The mask is built from an unsigned constant: shifting a signed int so that
 * a bit lands in (or past) the sign bit is undefined behavior, and a signed
 * mask would also sign-extend if it were ever widened to 64 bits.
 */
#define FLOWIDNS_FLOWID_DOMAIN_SHIFT   30
#define FLOWIDNS_FLOWID_DOMAIN_MASK    (0x03U << FLOWIDNS_FLOWID_DOMAIN_SHIFT)

/*
 * Overwrite the domain bits of the lvalue _fid with _dom, leaving the low
 * FLOWIDNS_FLOWID_DOMAIN_SHIFT bits untouched. _dom is converted to uint32_t
 * before shifting so that the shift is always performed on an unsigned value.
 */
#define FLOWIDNS_FLOWID_SET_DOMAIN(_dom, _fid)    do {                   \
	(_fid) &= ~FLOWIDNS_FLOWID_DOMAIN_MASK;                          \
	(_fid) |= ((uint32_t)(_dom) << FLOWIDNS_FLOWID_DOMAIN_SHIFT);    \
} while (0)

/* Store the domain encoded in the top bits of _fid into the lvalue _dom. */
#define FLOWIDNS_FLOWID_GET_DOMAIN(_dom, _fid)    do {    \
	(_dom) = (_fid) >> FLOWIDNS_FLOWID_DOMAIN_SHIFT;  \
} while (0)

/*
 * Acquire / release the mutex of the flow domain with index _dom in
 * flowidns_domain_array. _dom is used directly as the array index, so it
 * must already have been range-checked by the caller.
 */
#define FLOWIDNS_DOM_LOCK(_dom)    \
	lck_mtx_lock(&(flowidns_domain_array[(_dom)].fd_mtx))
#define FLOWIDNS_DOM_UNLOCK(_dom)    \
	lck_mtx_unlock(&(flowidns_domain_array[(_dom)].fd_mtx))

/*
 * One allocated flowID. Nodes live in the per-domain red-black tree and are
 * ordered by fftn_flowid (see fftn_cmp).
 */
struct flowidns_flowid_tree_node {
	/* red-black tree linkage */
	RB_ENTRY(flowidns_flowid_tree_node) fftn_link;
	/* copy of the flow key supplied by the caller at allocation time */
	struct flowidns_flow_key            fftn_flowkey;
	/* the allocated flowID, including the encoded domain bits */
	flowidns_flowid_t                   fftn_flowid;
};

/* lock group used for the per-domain mutexes */
static LCK_GRP_DECLARE(flowidns_lock_group, "flowidns_lock");
/* set to 1 by flowidns_init() and back to 0 by flowidns_fini() */
static int __flowidns_inited = 0;

/* zone from which flowidns_flowid_tree_node objects are allocated */
static SKMEM_TYPE_DEFINE(flowidns_fftn_zone, struct flowidns_flowid_tree_node);

__attribute__((always_inline))
static inline int
fftn_cmp(const struct flowidns_flowid_tree_node *fftn1,
    const struct flowidns_flowid_tree_node *fftn2)
{
	return (signed)(fftn1->fftn_flowid - fftn2->fftn_flowid);
}

/*
 * Instantiate the red-black tree type flowidns_flowid_tree, whose elements
 * are flowidns_flowid_tree_node linked through fftn_link and ordered by
 * fftn_cmp.
 */
RB_HEAD(flowidns_flowid_tree, flowidns_flowid_tree_node);
RB_PROTOTYPE(flowidns_flowid_tree, flowidns_flowid_tree_node, fftn_link,
    fftn_cmp);
RB_GENERATE(flowidns_flowid_tree, flowidns_flowid_tree_node, fftn_link,
    fftn_cmp);

/*
 * Per-domain state. The tree and the counters are read and modified with
 * fd_mtx held.
 */
struct flowidns_domain {
	/* mutex taken around tree and counter accesses */
	decl_lck_mtx_data(, fd_mtx);
	/* tree of currently allocated flowIDs in this domain */
	struct flowidns_flowid_tree    fd_flowid_tree;
	/* domain index, set by flowidns_init() */
	uint32_t                       fd_id;
	/* number of successful flowID allocations */
	uint64_t                       fd_nallocs;
	/* number of flowID releases */
	uint64_t                       fd_nreleases;
	/* number of times a generated flowID was already present in the tree */
	uint64_t                       fd_ncollisions;
};

/* one entry per flow domain, indexed by domain id */
static struct flowidns_domain flowidns_domain_array[FLOWIDNS_DOMAIN_MAX + 1];

/*
 * Allocate a tree node from flowidns_fftn_zone.
 *
 * can_block selects the allocation flags: Z_WAITOK_ZERO when true,
 * Z_NOWAIT_ZERO otherwise. Returns whatever zalloc_flags() returns, which
 * callers check against NULL.
 */
static struct flowidns_flowid_tree_node *
flowidns_fftn_alloc(bool can_block)
{
	zalloc_flags_t alloc_flags = Z_NOWAIT_ZERO;

	if (can_block) {
		alloc_flags = Z_WAITOK_ZERO;
	}
	return zalloc_flags(flowidns_fftn_zone, alloc_flags);
}

/*
 * Return a tree node to flowidns_fftn_zone. The callers in this file unlink
 * the node from its tree before freeing it.
 */
static void
flowidns_fftn_free(struct flowidns_flowid_tree_node *fftn)
{
	zfree(flowidns_fftn_zone, fftn);
}

/*
 * Look up the node holding `flowid` in the tree of `domain`.
 *
 * Returns the matching node, or NULL when the flowID is not present. The
 * function does no locking itself; its caller in this file holds the
 * domain lock around the call.
 */
static struct flowidns_flowid_tree_node *
flowidns_find_fftn(flowidns_flowid_t flowid, flowidns_domain_id_t domain)
{
	struct flowidns_flowid_tree *tree =
	    &flowidns_domain_array[domain].fd_flowid_tree;
	/* only the flowID matters to the comparator; the rest stays zeroed */
	struct flowidns_flowid_tree_node key = { .fftn_flowid = flowid };

	return RB_FIND(flowidns_flowid_tree, tree, &key);
}

/*
 * Allocate a non-zero flowID that is unique within `domain`.
 *
 * domain     flow domain the ID is allocated in; must lie in
 *            [FLOWIDNS_DOMAIN_MIN, FLOWIDNS_DOMAIN_MAX].
 * pflow_key  flow key to remember alongside the ID (copied); must not be
 *            NULL.
 * pflowid    receives the allocated flowID; must not be NULL.
 *
 * The ID is a random 32-bit value whose domain bits are replaced with
 * `domain`. Argument and state violations are caught with VERIFY, a failed
 * node allocation panics, and more than FLOWIDNS_MAX_FLOWID_GEN_RETRY
 * collisions in a single call trips a VERIFY.
 */
void
flowidns_allocate_flowid(flowidns_domain_id_t domain,
    struct flowidns_flow_key *pflow_key, flowidns_flowid_t *pflowid)
{
	struct flowidns_flowid_tree_node *fftn = NULL, *dup = NULL;
	uint32_t flowid = 0;
	int retry_cnt = 0;

	VERIFY(__flowidns_inited == 1);
	VERIFY(pflowid != NULL);
	VERIFY(pflow_key != NULL);
	VERIFY(domain >= FLOWIDNS_DOMAIN_MIN &&
	    domain <= FLOWIDNS_DOMAIN_MAX);

	/*
	 * Get the node before taking the domain lock: the allocation is
	 * requested with can_block == true and needs no per-domain state, so
	 * there is no reason to hold the lock across it.
	 */
	fftn = flowidns_fftn_alloc(true);
	if (__improbable(fftn == NULL)) {
		panic_plain("failed to allocate flowid node\n");
	}
	fftn->fftn_flowkey = *pflow_key;

	FLOWIDNS_DOM_LOCK(domain);
retry:
	/*
	 * try to get a non-zero flow identifier.
	 * The zero check must look at the final value: a random number with
	 * only domain bits set is non-zero, but becomes 0 once those bits are
	 * replaced with domain 0.
	 */
	do {
		read_frandom(&flowid, sizeof(flowid));
		FLOWIDNS_FLOWID_SET_DOMAIN(domain, flowid);
	} while (__improbable(flowid == 0));

	fftn->fftn_flowid = flowid;
	dup = RB_INSERT(flowidns_flowid_tree,
	    &(flowidns_domain_array[domain].fd_flowid_tree), fftn);

	/* try to get a unique flow identifier */
	if (dup != NULL) {
		retry_cnt++;
		flowidns_domain_array[domain].fd_ncollisions++;
		SK_ERR("duplicate flowid 0x%x generated, retrying %d",
		    flowid, retry_cnt);
		/*
		 * safeguard to check if we need a better hash strategy.
		 */
		VERIFY(retry_cnt <= FLOWIDNS_MAX_FLOWID_GEN_RETRY);
		goto retry;
	}
	*pflowid = flowid;
	flowidns_domain_array[domain].fd_nallocs++;
	/* catch wrap-around of the allocation counter */
	VERIFY(flowidns_domain_array[domain].fd_nallocs != 0);

	FLOWIDNS_DOM_UNLOCK(domain);

	DTRACE_SKYWALK2(fidalloc, uint32_t, domain, uint32_t, flowid);
}

/*
 * Release a flowID previously handed out by flowidns_allocate_flowid().
 *
 * The owning domain is decoded from the top bits of `flowid`. The ID must be
 * non-zero and must currently be allocated in that domain; releasing an
 * unknown ID panics.
 */
void
flowidns_release_flowid(flowidns_flowid_t flowid)
{
	struct flowidns_flowid_tree_node *node;
	struct flowidns_domain *fd;
	flowidns_domain_id_t domain;

	VERIFY(__flowidns_inited == 1);
	VERIFY(flowid != 0);

	/* the domain is carried in the flowID itself */
	FLOWIDNS_FLOWID_GET_DOMAIN(domain, flowid);
	VERIFY(domain >= FLOWIDNS_DOMAIN_MIN &&
	    domain <= FLOWIDNS_DOMAIN_MAX);

	DTRACE_SKYWALK2(fidrel, uint32_t, domain, uint32_t, flowid);

	fd = &flowidns_domain_array[domain];
	lck_mtx_lock(&fd->fd_mtx);

	node = flowidns_find_fftn(flowid, domain);
	if (node == NULL) {
		panic_plain("flowid 0x%x not found in domain %d\n", flowid,
		    domain);
	}

	/* unlink the node, then give it back to the zone */
	RB_REMOVE(flowidns_flowid_tree, &fd->fd_flowid_tree, node);
	ASSERT(node->fftn_flowid == flowid);
	flowidns_fftn_free(node);

	fd->fd_nreleases++;
	/* catch wrap-around of the release counter */
	VERIFY(fd->fd_nreleases != 0);

	lck_mtx_unlock(&fd->fd_mtx);
}

/*
 * Initialize the flow ID namespace: zero every per-domain structure, record
 * its domain index, and set up its mutex and (empty) flowID tree.
 *
 * Must be called exactly once before any other flowidns routine (checked
 * with VERIFY). Always returns 0.
 */
int
flowidns_init(void)
{
	flowidns_domain_id_t domain;

	VERIFY(__flowidns_inited == 0);
	/* the domain ids exported in the stats header must match ours */
	_CASSERT(SFH_DOMAIN_IPSEC == FLOWIDNS_DOMAIN_IPSEC);
	_CASSERT(SFH_DOMAIN_FLOWSWITCH == FLOWIDNS_DOMAIN_FLOWSWITCH);
	_CASSERT(SFH_DOMAIN_INPCB == FLOWIDNS_DOMAIN_INPCB);
	_CASSERT(SFH_DOMAIN_PF == FLOWIDNS_DOMAIN_PF);
	_CASSERT(FLOWIDNS_DOMAIN_MIN == 0);
	/*
	 * FLOWIDNS_FLOWID_DOMAIN_{MASK, SHIFT} macros are based on below
	 * assumption.
	 */
	_CASSERT(FLOWIDNS_DOMAIN_MAX == 3);

	for (domain = FLOWIDNS_DOMAIN_MIN; domain <= FLOWIDNS_DOMAIN_MAX;
	    domain++) {
		bzero(&flowidns_domain_array[domain],
		    sizeof(struct flowidns_domain));
		flowidns_domain_array[domain].fd_id = domain;
		lck_mtx_init(&(flowidns_domain_array[domain].fd_mtx),
		    &flowidns_lock_group, NULL);
		RB_INIT(&(flowidns_domain_array[domain].fd_flowid_tree));
	}

	__flowidns_inited = 1;
	SK_D("initialized flow ID namespace");
	return 0;
}

void
flowidns_fini(void)
{
	flowidns_domain_id_t domain;
	struct flowidns_flowid_tree_node *fftn, *fftn_tmp;

	VERIFY(__flowidns_inited == 1);

	for (domain = FLOWIDNS_DOMAIN_MIN; domain <= FLOWIDNS_DOMAIN_MAX;
	    domain++) {
		FLOWIDNS_DOM_LOCK(domain);

		RB_FOREACH_SAFE(fftn, flowidns_flowid_tree,
		    &(flowidns_domain_array[domain].fd_flowid_tree),
		    fftn_tmp) {
			RB_REMOVE(flowidns_flowid_tree,
			    &(flowidns_domain_array[domain].fd_flowid_tree),
			    fftn);
			flowidns_fftn_free(fftn);
		}

		FLOWIDNS_DOM_UNLOCK(domain);

		lck_mtx_destroy(&(flowidns_domain_array[domain].fd_mtx),
		    &flowidns_lock_group);
	}

	__flowidns_inited = 0;
}

/*
 * Read-only sysctl node "flowidns" under _kern_skywalk_stats, served by
 * flowidns_stats_sysctl(). For each domain the handler emits a
 * struct sk_stats_flowidns_header followed by one
 * struct sk_stats_flowidns_record per allocated flowID.
 */
static int flowidns_stats_sysctl SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flowidns,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, flowidns_stats_sysctl, "-",
    "flowid allocations (struct sk_stats_flowidns_header, "
    "skywalk/os_stats_private.h)");

static int
flowidns_dump_domain(struct sysctl_req *req, struct flowidns_domain *domain)
{
	struct flowidns_flowid_tree_node *fftn;
	struct sk_stats_flowidns_header header;
	struct sk_stats_flowidns_record record;
	uint64_t n_records;
	int err;

	/* Fill out header */
	memset(&header, 0, sizeof(header));
	header.sfh_domain = domain->fd_id;
	header.sfh_nallocs = domain->fd_nallocs;
	header.sfh_nreleases = domain->fd_nreleases;
	header.sfh_ncollisions = domain->fd_ncollisions;
	n_records = domain->fd_nallocs - domain->fd_nreleases;
	VERIFY(n_records <= UINT32_MAX);
	header.sfh_nrecords = (uint32_t)n_records;

	err = SYSCTL_OUT(req, &header, sizeof(header));
	if (err) {
		return err;
	}

	/* Fill out records */
	RB_FOREACH(fftn, flowidns_flowid_tree, &domain->fd_flowid_tree) {
		VERIFY(n_records > 0);
		n_records--;
		bzero(&record, sizeof(record));
		record.sfr_flowid = fftn->fftn_flowid;
		record.sfr_af = fftn->fftn_flowkey.ffk_af;
		record.sfr_ipproto = fftn->fftn_flowkey.ffk_proto;
		record.sfr_protoid = fftn->fftn_flowkey.ffk_protoid;
		_CASSERT(sizeof(fftn->fftn_flowkey.ffk_laddr) ==
		    sizeof(record.sfr_laddr));
		_CASSERT(sizeof(fftn->fftn_flowkey.ffk_raddr) ==
		    sizeof(record.sfr_raddr));
		bcopy(&(fftn->fftn_flowkey.ffk_laddr), &record.sfr_laddr,
		    sizeof(record.sfr_laddr));
		bcopy(&(fftn->fftn_flowkey.ffk_raddr), &record.sfr_raddr,
		    sizeof(record.sfr_raddr));

		err = SYSCTL_OUT(req, &record, sizeof(record));
		if (err) {
			return err;
		}
	}
	VERIFY(n_records == 0);
	return 0;
}

/*
 * sysctl handler dumping the flowID statistics of every domain.
 *
 * Returns EPERM unless the caller's credential passes kauth_cred_issuser(),
 * ENOTSUP when the module has not been initialized, the first error from
 * flowidns_dump_domain() if a copy-out fails, and 0 otherwise. Each domain
 * is dumped with its own mutex held.
 */
static int
flowidns_stats_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	flowidns_domain_id_t dom;
	int error;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}
	if (__flowidns_inited == 0) {
		return ENOTSUP;
	}

	net_update_uptime();

	for (dom = FLOWIDNS_DOMAIN_MIN; dom <= FLOWIDNS_DOMAIN_MAX; dom++) {
		struct flowidns_domain *fd = &flowidns_domain_array[dom];

		lck_mtx_lock(&fd->fd_mtx);
		error = flowidns_dump_domain(req, fd);
		lck_mtx_unlock(&fd->fd_mtx);

		if (error != 0) {
			return error;
		}
	}

	/*
	 * If this is just a request for length, add slop because
	 * this is dynamically changing data
	 */
	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx += 20 * sizeof(struct sk_stats_flowidns_record);
	}
	return 0;
}