/*
* Copyright (c) 2016-2021 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
static uint32_t flow_owner_bucket_purge_common(struct flow_owner_bucket *,
nexus_port_t, boolean_t);
static int fo_cmp(const struct flow_owner *, const struct flow_owner *);
static struct flow_owner *fo_alloc(boolean_t);
static void fo_free(struct flow_owner *);
static LCK_GRP_DECLARE(flow_owner_lock_group, "sk_flow_owner_lock");
static LCK_ATTR_DECLARE(flow_owner_lock_attr, 0, 0);
RB_GENERATE_PREV(flow_owner_tree, flow_owner, fo_link, fo_cmp);
KALLOC_TYPE_VAR_DEFINE(KT_SK_FOB, struct flow_owner_bucket, KT_DEFAULT);
struct flow_owner_bucket *__sized_by(*tot_sz)
flow_owner_buckets_alloc(size_t fob_cnt, size_t *fob_sz, size_t *tot_sz)
{
size_t cache_sz = skmem_cpu_cache_line_size();
struct flow_owner_bucket *fob;
size_t fob_tot_sz;
/* each bucket is CPU cache-aligned */
*fob_sz = P2ROUNDUP(sizeof(*fob), cache_sz);
*tot_sz = fob_tot_sz = fob_cnt * (*fob_sz);
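	/*
	 * For example, with a (hypothetical) 128-byte cache line and a
	 * 200-byte struct flow_owner_bucket, each bucket rounds up to
	 * 256 bytes, so 512 buckets occupy 128 KB in total.
	 */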
	fob = sk_alloc_type_hash(KT_SK_FOB, fob_tot_sz, Z_WAITOK,
	    skmem_tag_fsw_fob_hash);
if (__improbable(fob == NULL)) {
return NULL;
}
#if !KASAN_CLASSIC
/*
* except in KASAN_CLASSIC mode, kalloc will always maintain cacheline
* size alignment if the requested size is a multiple of a cacheline
* size (this is true for any size that is a power of two from 16 to
* PAGE_SIZE).
*
	 * Because this is only an optimization, it is OK for KASAN_CLASSIC
	 * not to respect this.
*/
ASSERT(IS_P2ALIGNED(fob, cache_sz));
#endif
SK_DF(SK_VERB_MEM, "fob 0x%llx fob_cnt %zu fob_sz %zu "
"(total %zu bytes) ALLOC", SK_KVA(fob), fob_cnt,
*fob_sz, fob_tot_sz);
return fob;
}
void
flow_owner_buckets_free(struct flow_owner_bucket *fob, size_t tot_sz)
{
SK_DF(SK_VERB_MEM, "fob 0x%llx FREE", SK_KVA(fob));
sk_free_type_hash(KT_SK_FOB, tot_sz, fob);
}
void
flow_owner_bucket_init(struct flow_owner_bucket *fob)
{
#if !KASAN_CLASSIC
ASSERT(IS_P2ALIGNED(fob, skmem_cpu_cache_line_size()));
#endif /* !KASAN_CLASSIC */
lck_mtx_init(&fob->fob_lock, &flow_owner_lock_group,
&flow_owner_lock_attr);
RB_INIT(&fob->fob_owner_head);
}
void
flow_owner_bucket_destroy(struct flow_owner_bucket *fob)
{
/*
* In the event we are called as part of the nexus destructor,
* we need to wait until all threads have exited the flow close
	 * critical section, and until the flow_owner_bucket is empty.
* By the time we get here, the module initiating the request
* (e.g. NECP) has been quiesced, so any flow open requests would
* have been rejected.
*/
FOB_LOCK(fob);
while (!RB_EMPTY(&fob->fob_owner_head)) {
SK_ERR("waiting for fob 0x%llx to go idle", SK_KVA(fob));
if (++(fob->fob_dtor_waiters) == 0) { /* wraparound */
fob->fob_dtor_waiters++;
}
(void) msleep(&fob->fob_dtor_waiters, &fob->fob_lock,
(PZERO - 1), __FUNCTION__, NULL);
}
while (fob->fob_busy_flags & FOBF_CLOSE_BUSY) {
if (++(fob->fob_close_waiters) == 0) { /* wraparound */
fob->fob_close_waiters++;
}
(void) msleep(&fob->fob_close_waiters, &fob->fob_lock,
(PZERO - 1), __FUNCTION__, NULL);
}
ASSERT(RB_EMPTY(&fob->fob_owner_head));
ASSERT(!(fob->fob_busy_flags & FOBF_OPEN_BUSY));
ASSERT(!(fob->fob_busy_flags & FOBF_CLOSE_BUSY));
FOB_UNLOCK(fob);
lck_mtx_destroy(&fob->fob_lock, &flow_owner_lock_group);
}
static uint32_t
flow_owner_bucket_purge_common(struct flow_owner_bucket *fob,
nexus_port_t nx_port, boolean_t if_idle)
{
	/* when called via flow_owner_bucket_purge_all(), lock is held */
boolean_t locked = (nx_port == NEXUS_PORT_ANY);
struct flow_owner *fo, *tfo;
struct flow_entry *fe, *tfe;
uint32_t cnt = 0;
if (!locked) {
FOB_LOCK(fob);
}
FOB_LOCK_ASSERT_HELD(fob);
RB_FOREACH_SAFE(fo, flow_owner_tree, &fob->fob_owner_head, tfo) {
if (fo->fo_nx_port != nx_port && nx_port != NEXUS_PORT_ANY) {
continue;
}
if (!if_idle || nx_port == NEXUS_PORT_ANY) {
RB_FOREACH_SAFE(fe, flow_entry_id_tree,
&fo->fo_flow_entry_id_head, tfe) {
ASSERT(fe->fe_nx_port == fo->fo_nx_port);
flow_entry_retain(fe);
flow_entry_destroy(fo, fe, FALSE, NULL);
}
}
ASSERT(nx_port != NEXUS_PORT_ANY ||
RB_EMPTY(&fo->fo_flow_entry_id_head));
if (RB_EMPTY(&fo->fo_flow_entry_id_head)) {
flow_owner_free(fob, fo);
++cnt;
} else if (nx_port != NEXUS_PORT_ANY) {
/* let ms_flow_unbind() know this port is gone */
fo->fo_nx_port_destroyed = TRUE;
VERIFY(fo->fo_nx_port_na == NULL);
}
}
if (!locked) {
FOB_UNLOCK(fob);
}
return cnt;
}
void
flow_owner_bucket_purge_all(struct flow_owner_bucket *fob)
{
(void) flow_owner_bucket_purge_common(fob, NEXUS_PORT_ANY, TRUE);
}
static uint32_t
flow_owner_bucket_activate_nx_port_common(struct flow_owner_bucket *fob,
nexus_port_t nx_port, struct nexus_adapter *nx_port_na,
na_activate_mode_t mode)
{
struct flow_owner *fo;
struct flow_entry *fe;
uint32_t cnt = 0;
VERIFY(nx_port != NEXUS_PORT_ANY);
FOB_LOCK(fob);
RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
if (fo->fo_nx_port_destroyed || (fo->fo_nx_port != nx_port)) {
continue;
}
if (mode == NA_ACTIVATE_MODE_ON) {
VERIFY(fo->fo_nx_port_na == NULL);
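			/* const override */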
			*(struct nexus_adapter **)(uintptr_t)&fo->fo_nx_port_na =
			    nx_port_na;
}
RB_FOREACH(fe, flow_entry_id_tree,
&fo->fo_flow_entry_id_head) {
if (fe->fe_flags & FLOWENTF_TORN_DOWN) {
continue;
}
VERIFY(fe->fe_nx_port == fo->fo_nx_port);
if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
if (mode == NA_ACTIVATE_MODE_ON) {
na_flowadv_entry_alloc(
fo->fo_nx_port_na, fe->fe_uuid,
fe->fe_adv_idx, fe->fe_flowid);
} else if (fo->fo_nx_port_na != NULL) {
na_flowadv_entry_free(fo->fo_nx_port_na,
fe->fe_uuid, fe->fe_adv_idx,
fe->fe_flowid);
}
}
}
if (mode != NA_ACTIVATE_MODE_ON && fo->fo_nx_port_na != NULL) {
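			/* const override */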
*(struct nexus_adapter **)(uintptr_t)&fo->fo_nx_port_na = NULL;
}
++cnt;
}
FOB_UNLOCK(fob);
return cnt;
}
uint32_t
flow_owner_activate_nexus_port(struct flow_mgr *fm,
boolean_t pid_bound, pid_t pid, nexus_port_t nx_port,
struct nexus_adapter *nx_port_na, na_activate_mode_t mode)
{
struct flow_owner_bucket *fob;
uint32_t fo_cnt = 0;
VERIFY(nx_port != NEXUS_PORT_ANY);
VERIFY(nx_port_na != NULL);
if (pid_bound) {
fob = flow_mgr_get_fob_by_pid(fm, pid);
fo_cnt = flow_owner_bucket_activate_nx_port_common(fob, nx_port,
nx_port_na, mode);
} else {
uint32_t i;
/*
* Otherwise, this can get expensive since we need to search
* thru all proc-mapping buckets to find the flows that are
* related to this nexus port.
*/
for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
fob = flow_mgr_get_fob_at_idx(fm, i);
fo_cnt += flow_owner_bucket_activate_nx_port_common(fob,
nx_port, nx_port_na, mode);
}
}
	/* There shouldn't be more than one flow owner on a nexus port */
VERIFY(fo_cnt <= 1);
return fo_cnt;
}
static void
flow_owner_bucket_attach_common(struct flow_owner_bucket *fob,
nexus_port_t nx_port)
{
struct flow_owner *fo;
VERIFY(nx_port != NEXUS_PORT_ANY);
FOB_LOCK(fob);
RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
if (fo->fo_nx_port_destroyed && (fo->fo_nx_port == nx_port)) {
fo->fo_nx_port_destroyed = FALSE;
}
}
FOB_UNLOCK(fob);
}
void
flow_owner_attach_nexus_port(struct flow_mgr *fm, boolean_t pid_bound,
pid_t pid, nexus_port_t nx_port)
{
struct flow_owner_bucket *fob;
ASSERT(nx_port != NEXUS_PORT_ANY);
if (pid_bound) {
fob = flow_mgr_get_fob_by_pid(fm, pid);
flow_owner_bucket_attach_common(fob, nx_port);
} else {
uint32_t i;
/*
* Otherwise, this can get expensive since we need to search
* thru all proc-mapping buckets to find the flows that are
* related to this nexus port.
*/
for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
fob = flow_mgr_get_fob_at_idx(fm, i);
flow_owner_bucket_attach_common(fob, nx_port);
}
}
}
uint32_t
flow_owner_detach_nexus_port(struct flow_mgr *fm, boolean_t pid_bound,
pid_t pid, nexus_port_t nx_port, boolean_t if_idle)
{
struct flow_owner_bucket *fob;
uint32_t purged = 0;
ASSERT(nx_port != NEXUS_PORT_ANY);
if (pid_bound) {
fob = flow_mgr_get_fob_by_pid(fm, pid);
purged = flow_owner_bucket_purge_common(fob, nx_port, if_idle);
} else {
uint32_t i;
/*
* Otherwise, this can get expensive since we need to search
* thru all proc-mapping buckets to find the flows that are
* related to this nexus port.
*/
for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
fob = flow_mgr_get_fob_at_idx(fm, i);
purged += flow_owner_bucket_purge_common(fob,
nx_port, if_idle);
}
}
return purged;
}
/* 64-bit mask with range */
#define FO_BMASK64(_beg, _end) \
((((uint64_t)0xffffffffffffffff) >> \
(63 - (_end))) & ~((1ULL << (_beg)) - 1))
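/*
 * Both bounds are inclusive, e.g. FO_BMASK64(0, 3) == 0xf and
 * FO_BMASK64(4, 7) == 0xf0.
 */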
struct flow_owner *
flow_owner_alloc(struct flow_owner_bucket *fob, struct proc *p,
nexus_port_t nx_port, bool nx_port_pid_bound, bool flowadv,
struct nx_flowswitch *fsw, struct nexus_adapter *nx_port_na,
void *context, bool low_latency)
{
struct flow_owner *fo;
const pid_t pid = proc_pid(p);
_CASSERT(true == 1);
_CASSERT(false == 0);
ASSERT(low_latency == true || low_latency == false);
ASSERT(nx_port != NEXUS_PORT_ANY);
FOB_LOCK_ASSERT_HELD(fob);
#if DEBUG
ASSERT(flow_owner_find_by_pid(fob, pid, context, low_latency) == NULL);
RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
if (!fo->fo_nx_port_destroyed && (fo->fo_nx_port == nx_port)) {
VERIFY(0);
/* NOTREACHED */
__builtin_unreachable();
}
}
#endif /* DEBUG */
fo = fo_alloc(TRUE);
if (fo != NULL) {
if (flowadv) {
uint32_t i;
bitmap_t *bmap;
bmap = skmem_cache_alloc(sk_fab_cache, SKMEM_SLEEP);
if (bmap == NULL) {
SK_ERR("failed to alloc flow advisory bitmap");
fo_free(fo);
return NULL;
}
bzero(bmap, sk_fab_size);
fo->fo_flowadv_bmap = bmap;
fo->fo_num_flowadv_bmaps = sk_fadv_nchunks;
fo->fo_flowadv_max = sk_max_flows;
/* set the bits for free indices */
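		/*
		 * For example, assuming FO_FLOWADV_CHUNK is 64 and
		 * (hypothetically) sk_max_flows is 70: chunk 0 below is
		 * set to all-ones for indices 0-63, and chunk 1 gets
		 * FO_BMASK64(0, 5) for the remaining indices 64-69.
		 */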
for (i = 0; i < sk_fadv_nchunks; i++) {
uint32_t end = 63;
if (i == (sk_fadv_nchunks - 1)) {
end = ((sk_max_flows - 1) %
FO_FLOWADV_CHUNK);
}
fo->fo_flowadv_bmap[i] = FO_BMASK64(0, end);
}
}
RB_INIT(&fo->fo_flow_entry_id_head);
/* const override */
*(struct flow_owner_bucket **)(uintptr_t)&fo->fo_bucket = fob;
fo->fo_context = context;
fo->fo_pid = pid;
(void) snprintf(fo->fo_name, sizeof(fo->fo_name), "%s",
proc_name_address(p));
fo->fo_nx_port_pid_bound = nx_port_pid_bound;
fo->fo_low_latency = low_latency;
fo->fo_nx_port = nx_port;
		/* const override */
		*(struct nexus_adapter **)(uintptr_t)&fo->fo_nx_port_na =
		    nx_port_na;
		*(struct nx_flowswitch **)(uintptr_t)&fo->fo_fsw = fsw;
RB_INSERT(flow_owner_tree, &fob->fob_owner_head, fo);
SK_DF(SK_VERB_FLOW, "%s(%d) fob 0x%llx added fo 0x%llx "
"nx_port %d nx_port_pid_bound %d ll %d nx_port_na 0x%llx",
fo->fo_name, fo->fo_pid, SK_KVA(fob), SK_KVA(fo),
(int)nx_port, nx_port_pid_bound, fo->fo_low_latency,
SK_KVA(nx_port_na));
}
return fo;
}
void
flow_owner_free(struct flow_owner_bucket *fob, struct flow_owner *fo)
{
FOB_LOCK_ASSERT_HELD(fob);
ASSERT(fo->fo_bucket == fob);
*(struct flow_owner_bucket **)(uintptr_t)&fo->fo_bucket = NULL;
RB_REMOVE(flow_owner_tree, &fob->fob_owner_head, fo);
ASSERT(fo->fo_num_flowadv == 0);
	/* the bitmap is allocated only when flow advisory is in use */
	if (fo->fo_flowadv_bmap != NULL) {
		skmem_cache_free(sk_fab_cache, fo->fo_flowadv_bmap);
		fo->fo_flowadv_bmap = NULL;
		fo->fo_num_flowadv_bmaps = 0;
	}
/* wake up any thread blocked in flow_owner_bucket_destroy() */
if (RB_EMPTY(&fob->fob_owner_head) && fob->fob_dtor_waiters > 0) {
fob->fob_dtor_waiters = 0;
wakeup(&fob->fob_dtor_waiters);
}
SK_DF(SK_VERB_FLOW, "%s(%d) fob 0x%llx removed fo 0x%llx nx_port %d",
fo->fo_name, fo->fo_pid, SK_KVA(fob), SK_KVA(fo),
(int)fo->fo_nx_port);
fo_free(fo);
}
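/*
 * Allocate a flow advisory table index.  A set bit in fo_flowadv_bmap
 * denotes a free index: the scan below clears the first set bit found,
 * and flow_owner_flowadv_index_free() sets it back.
 */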
int
flow_owner_flowadv_index_alloc(struct flow_owner *fo, flowadv_idx_t *fadv_idx)
{
bitmap_t *bmap = fo->fo_flowadv_bmap;
size_t nchunks, i, j, idx = FLOWADV_IDX_NONE;
FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));
ASSERT(fo->fo_flowadv_max != 0);
nchunks = P2ROUNDUP(fo->fo_flowadv_max, FO_FLOWADV_CHUNK) /
FO_FLOWADV_CHUNK;
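	/* e.g. a 70-entry table with 64-bit chunks yields nchunks == 2 */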
for (i = 0; i < nchunks; i++) {
j = ffsll(bmap[i]);
if (j == 0) {
/* All indices in this chunk are in use */
continue;
}
--j;
/* mark the index as in use */
bit_clear(bmap[i], j);
idx = (i * FO_FLOWADV_CHUNK) + j;
break;
}
if (idx == FLOWADV_IDX_NONE) {
SK_ERR("%s(%d) flow advisory table full: num %u max %u",
fo->fo_name, fo->fo_pid, fo->fo_num_flowadv,
fo->fo_flowadv_max);
VERIFY(fo->fo_num_flowadv == fo->fo_flowadv_max);
*fadv_idx = FLOWADV_IDX_NONE;
return ENOMEM;
}
fo->fo_num_flowadv++;
ASSERT(idx < ((flowadv_idx_t) -1));
*fadv_idx = (flowadv_idx_t)idx;
ASSERT(*fadv_idx < fo->fo_flowadv_max);
return 0;
}
void
flow_owner_flowadv_index_free(struct flow_owner *fo, flowadv_idx_t fadv_idx)
{
uint32_t chunk_idx, bit_pos;
bitmap_t *bmap = fo->fo_flowadv_bmap;
FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));
ASSERT(fo->fo_num_flowadv != 0);
ASSERT((fo->fo_flowadv_max != 0) && (fadv_idx < fo->fo_flowadv_max));
chunk_idx = fadv_idx / FO_FLOWADV_CHUNK;
bit_pos = fadv_idx % FO_FLOWADV_CHUNK;
ASSERT(!bit_test(bmap[chunk_idx], bit_pos));
/* mark the index as free */
bit_set(bmap[chunk_idx], bit_pos);
fo->fo_num_flowadv--;
}
int
flow_owner_destroy_entry(struct flow_owner *fo, uuid_t uuid,
bool nolinger, void *close_params)
{
struct flow_entry *fe = NULL;
int err = 0;
FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));
/* lookup such flow for this process */
fe = flow_entry_find_by_uuid(fo, uuid);
if (fe == NULL) {
err = ENOENT;
} else {
/* free flow entry (OK to linger if caller asked) */
flow_entry_destroy(fo, fe, nolinger, close_params);
}
return err;
}
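/*
 * Red-black tree comparator: flow owners are ordered by pid, then by
 * context pointer, then by the low-latency flag, matching the lookup
 * key used by flow_owner_find_by_pid().
 */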
static inline int
fo_cmp(const struct flow_owner *a, const struct flow_owner *b)
{
if (a->fo_pid > b->fo_pid) {
return 1;
}
if (a->fo_pid < b->fo_pid) {
return -1;
}
if ((intptr_t)a->fo_context > (intptr_t)b->fo_context) {
return 1;
} else if ((intptr_t)a->fo_context < (intptr_t)b->fo_context) {
return -1;
}
if (a->fo_low_latency != b->fo_low_latency) {
if (a->fo_low_latency) {
return 1;
} else {
return -1;
}
}
return 0;
}
static struct flow_owner *
fo_alloc(boolean_t can_block)
{
struct flow_owner *fo;
fo = skmem_cache_alloc(sk_fo_cache,
can_block ? SKMEM_SLEEP : SKMEM_NOSLEEP);
if (fo == NULL) {
return NULL;
}
bzero(fo, sk_fo_size);
SK_DF(SK_VERB_MEM, "fo 0x%llx ALLOC", SK_KVA(fo));
return fo;
}
static void
fo_free(struct flow_owner *fo)
{
ASSERT(fo->fo_bucket == NULL);
ASSERT(RB_EMPTY(&fo->fo_flow_entry_id_head));
ASSERT(fo->fo_flowadv_bmap == NULL);
SK_DF(SK_VERB_MEM, "fo 0x%llx FREE", SK_KVA(fo));
skmem_cache_free(sk_fo_cache, fo);
}