/*
* Copyright (c) 2017-2023 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* Flow Routes.
*
* Each (non-listener) flow entry is always associated with a flow route
* object. Multiple flow entries sharing the same remote address will use
* the same flow route for that address. The flow route object contains
* the route information for the remote node. It gets allocated when a
* flow entry requests to connect, and is garbage-collected when it's no
* longer referred to after its expiration time has passed.
*
* A flow route also contains the default local address that's used to
 * reach the remote node. This isn't necessarily the same local address
 * used by the flow entry, since the entry may have been explicitly bound
 * to another local address. In the majority of cases, though, keeping the
 * local address in the flow route lets us avoid performing source address
 * selection on every connect request.
*
* When the remote node is reachable via a gateway, the gateway address
* portion of the flow route contains its IP address and the flow route
* is marked with FLOWRTF_GATEWAY. We use this to optimize the gateway
* route lookup, since otherwise we'd have to perform an extra lookup
* each time we need to resolve the route.
*
* When the remote node is directly on the link, the FLOWRTF_ONLINK flag
* is set, and the gateway address isn't used. The target address used
 * for resolution will be the remote address itself.
*
* On links with link-layer information, we store the resolved address
* of the target node (which may be the gateway's) in the flow route,
* and mark the flow route with FLOWRTF_HAS_LLINFO.
*
* Each flow route also registers itself to receive route events when
* the underlying rtentry is updated or deleted.
*/
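/*
 * Typical usage, as an illustrative sketch only (the constructor, resolver
 * and argument names below are placeholders, not actual flowswitch symbols):
 *
 *	struct flow_route *fr = NULL;
 *	int err;
 *
 *	err = flow_route_find(nx, fm, ifp, req, my_fr_ctor, my_fr_resolve,
 *	    my_arg, &fr);
 *	if (err == 0) {
 *		(use fr->fr_laddr, fr->fr_eth, etc., then drop the
 *		 caller's reference)
 *		flow_route_release(fr);
 *	}
 */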
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_arp.h>
#include <netinet6/nd6.h>
#include <net/route.h>
extern struct rtstat_64 rtstat;
static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock");
static LCK_ATTR_DECLARE(flow_route_lock_attr, 0, 0);
static int fr_cmp(const struct flow_route *, const struct flow_route *);
static int fr_id_cmp(const struct flow_route *, const struct flow_route *);
static struct flow_route *fr_alloc(boolean_t);
static void fr_free(struct flow_route *);
static uint32_t flow_route_bucket_purge_common(struct flow_route_bucket *,
uint32_t *, boolean_t, boolean_t);
static void flow_route_ev_callback(struct eventhandler_entry_arg,
struct sockaddr *, int, struct sockaddr *, int);
RB_GENERATE_PREV(flow_route_tree, flow_route, fr_link, fr_cmp);
RB_GENERATE_PREV(flow_route_id_tree, flow_route, fr_id_link, fr_id_cmp);
KALLOC_TYPE_VAR_DEFINE(KT_SK_FRB, struct flow_route_bucket, KT_DEFAULT);
KALLOC_TYPE_VAR_DEFINE(KT_SK_FRIB, struct flow_route_id_bucket, KT_DEFAULT);
#define FR_ZONE_NAME "flow.route"
static unsigned int flow_route_size; /* size of flow_route */
struct skmem_cache *flow_route_cache; /* cache for flow_route */
static int __flow_route_inited = 0;
#define FLOW_ROUTE_EXPIRE 600 /* seconds */
static unsigned int flow_route_expire = FLOW_ROUTE_EXPIRE;
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_expire,
CTLFLAG_RW | CTLFLAG_LOCKED, &flow_route_expire, 0, "");
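/*
 * One-time setup/teardown of the flow route object cache.  flow_route_init()
 * must be called before any flow route can be allocated by fr_alloc().
 */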
void
flow_route_init(void)
{
ASSERT(!__flow_route_inited);
flow_route_size = sizeof(struct flow_route);
flow_route_cache = skmem_cache_create(FR_ZONE_NAME, flow_route_size,
sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);
__flow_route_inited = 1;
}
void
flow_route_fini(void)
{
if (__flow_route_inited) {
skmem_cache_destroy(flow_route_cache);
flow_route_cache = NULL;
__flow_route_inited = 0;
}
}
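/*
 * Allocate the array of flow route buckets (hashed by remote address).
 * Each bucket is padded to a CPU cache line; the per-bucket size and the
 * total allocation size are returned through frb_sz and tot_sz.
 */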
struct flow_route_bucket *
__sized_by(*tot_sz)
flow_route_buckets_alloc(size_t frb_cnt, size_t *frb_sz, size_t *tot_sz)
{
uint32_t cache_sz = skmem_cpu_cache_line_size();
struct flow_route_bucket *frb;
size_t frb_tot_sz;
/* each bucket is CPU cache-aligned */
*frb_sz = P2ROUNDUP(sizeof(*frb), cache_sz);
*tot_sz = frb_tot_sz = frb_cnt * (*frb_sz);
frb = sk_alloc_type_hash(KT_SK_FRB, frb_tot_sz, Z_WAITOK,
skmem_tag_fsw_frb_hash);
if (__improbable(frb == NULL)) {
return NULL;
}
#if !KASAN_CLASSIC
/*
 * Except in KASAN_CLASSIC mode, kalloc always maintains cache-line
 * alignment when the requested size is a multiple of the cache-line
 * size (true for any power of two from 16 to PAGE_SIZE).
 *
 * Because this alignment is only an optimization, it is OK for
 * KASAN_CLASSIC to not honor it.
 */
ASSERT(IS_P2ALIGNED(frb, cache_sz));
#endif /* !KASAN_CLASSIC */
SK_DF(SK_VERB_MEM, "frb 0x%llx frb_cnt %zu frb_sz %zu "
"(total %zu bytes) ALLOC", SK_KVA(frb), frb_cnt,
*frb_sz, frb_tot_sz);
return frb;
}
void
flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz)
{
SK_DF(SK_VERB_MEM, "frb 0x%llx FREE", SK_KVA(frb));
sk_free_type_hash(KT_SK_FRB, tot_sz, frb);
}
void
flow_route_bucket_init(struct flow_route_bucket *frb)
{
#if !KASAN_CLASSIC
ASSERT(IS_P2ALIGNED(frb, skmem_cpu_cache_line_size()));
#endif /* !KASAN_CLASSIC */
lck_rw_init(&frb->frb_lock, &flow_route_lock_group,
&flow_route_lock_attr);
RB_INIT(&frb->frb_head);
}
void
flow_route_bucket_destroy(struct flow_route_bucket *frb)
{
ASSERT(RB_EMPTY(&frb->frb_head));
lck_rw_destroy(&frb->frb_lock, &flow_route_lock_group);
}
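/*
 * Look up a flow route by remote address within a bucket.  The caller must
 * hold the bucket lock (reader or writer); on success the flow route is
 * returned with a reference held for the caller.
 */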
static struct flow_route *
flow_route_find_by_addr(struct flow_route_bucket *frb,
union sockaddr_in_4_6 *dst)
{
struct flow_route *fr;
struct flow_route find;
FRB_LOCK_ASSERT_HELD(frb);
switch (SA(dst)->sa_family) {
case AF_INET:
find.fr_af = AF_INET;
find.fr_addr_len = sizeof(struct in_addr);
find.fr_addr_key = (void *)&SIN(dst)->sin_addr;
break;
case AF_INET6:
find.fr_af = AF_INET6;
find.fr_addr_len = sizeof(struct in6_addr);
find.fr_addr_key = (void *)&SIN6(dst)->sin6_addr;
break;
default:
VERIFY(0);
/* NOTREACHED */
__builtin_unreachable();
}
fr = RB_FIND(flow_route_tree, &frb->frb_head, &find);
if (fr != NULL) {
flow_route_retain(fr); /* for the caller */
}
return fr;
}
struct flow_route_id_bucket *
__sized_by(*tot_sz)
flow_route_id_buckets_alloc(size_t frib_cnt, size_t *frib_sz, size_t *tot_sz)
{
uint32_t cache_sz = skmem_cpu_cache_line_size();
struct flow_route_id_bucket *frib;
size_t frib_tot_sz;
/* each bucket is CPU cache-aligned */
*frib_sz = P2ROUNDUP(sizeof(*frib), cache_sz);
*tot_sz = frib_tot_sz = frib_cnt * (*frib_sz);
frib = sk_alloc_type_hash(KT_SK_FRIB, frib_tot_sz, Z_WAITOK,
skmem_tag_fsw_frib_hash);
if (__improbable(frib == NULL)) {
return NULL;
}
#if !KASAN_CLASSIC
/*
 * Except in KASAN_CLASSIC mode, kalloc always maintains cache-line
 * alignment when the requested size is a multiple of the cache-line
 * size (true for any power of two from 16 to PAGE_SIZE).
 *
 * Because this alignment is only an optimization, it is OK for
 * KASAN_CLASSIC to not honor it.
 */
ASSERT(IS_P2ALIGNED(frib, cache_sz));
#endif /* !KASAN_CLASSIC */
SK_DF(SK_VERB_MEM, "frib 0x%llx frib_cnt %zu frib_sz %zu "
"(total %zu bytes) ALLOC", SK_KVA(frib), frib_cnt,
*frib_sz, frib_tot_sz);
return frib;
}
void
flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz)
{
SK_DF(SK_VERB_MEM, "frib 0x%llx FREE", SK_KVA(frib));
sk_free_type_hash(KT_SK_FRIB, tot_sz, frib);
}
void
flow_route_id_bucket_init(struct flow_route_id_bucket *frib)
{
#if !KASAN_CLASSIC
ASSERT(IS_P2ALIGNED(frib, skmem_cpu_cache_line_size()));
#endif /* !KASAN_CLASSIC */
lck_rw_init(&frib->frib_lock, &flow_route_lock_group,
&flow_route_lock_attr);
RB_INIT(&frib->frib_head);
}
void
flow_route_id_bucket_destroy(struct flow_route_id_bucket *frib)
{
ASSERT(RB_EMPTY(&frib->frib_head));
lck_rw_destroy(&frib->frib_lock, &flow_route_lock_group);
}
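/*
 * Look up a flow route by its UUID within an ID bucket.  The caller must
 * hold the ID bucket lock; on success the flow route is returned with a
 * reference held for the caller.
 */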
static struct flow_route *
flow_route_find_by_uuid(struct flow_route_id_bucket *frib, uuid_t id)
{
struct flow_route *fr;
struct flow_route find;
FRIB_LOCK_ASSERT_HELD(frib);
uuid_copy(find.fr_uuid, id);
fr = RB_FIND(flow_route_id_tree, &frib->frib_head, &find);
if (fr != NULL) {
flow_route_retain(fr); /* for the caller */
}
return fr;
}
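/*
 * Allocate a zeroed flow route, initialize its locks and assign it a
 * randomly-generated UUID.  fr_free() below is its counterpart; it may
 * only be called once the flow route is detached and unreferenced.
 */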
static struct flow_route *
fr_alloc(boolean_t cansleep)
{
struct flow_route *fr;
fr = skmem_cache_alloc(flow_route_cache,
(cansleep ? SKMEM_SLEEP : SKMEM_NOSLEEP));
if (fr == NULL) {
return NULL;
}
bzero(fr, flow_route_size);
lck_spin_init(&fr->fr_reflock, &flow_route_lock_group, &flow_route_lock_attr);
lck_mtx_init(&fr->fr_lock, &flow_route_lock_group, &flow_route_lock_attr);
uuid_generate_random(fr->fr_uuid);
SK_DF(SK_VERB_MEM, "allocated fr 0x%llx", SK_KVA(fr));
return fr;
}
static void
fr_free(struct flow_route *fr)
{
SK_DF(SK_VERB_MEM, "freeing fr 0x%llx", SK_KVA(fr));
VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED));
VERIFY(fr->fr_usecnt == 0);
FR_LOCK(fr);
/* callee frees route entry */
flow_route_cleanup(fr);
VERIFY(fr->fr_rt_dst == NULL);
VERIFY(fr->fr_rt_gw == NULL);
VERIFY(fr->fr_rt_evhdlr_tag == NULL);
FR_UNLOCK(fr);
lck_mtx_destroy(&fr->fr_lock, &flow_route_lock_group);
lck_spin_destroy(&fr->fr_reflock, &flow_route_lock_group);
skmem_cache_free(flow_route_cache, fr);
}
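/*
 * RB-tree comparators: the address tree orders flow routes by address
 * family and then by remote address; the ID tree orders them by UUID.
 */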
static inline int
fr_cmp(const struct flow_route *a, const struct flow_route *b)
{
int d;
if ((d = (a->fr_af - b->fr_af)) != 0) {
return d;
}
if ((d = flow_ip_cmp(a->fr_addr_key, b->fr_addr_key,
b->fr_addr_len)) != 0) {
return d;
}
return 0;
}
static inline int
fr_id_cmp(const struct flow_route *a, const struct flow_route *b)
{
return uuid_compare(a->fr_uuid, b->fr_uuid);
}
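/*
 * Decide whether the stable (non-temporary) IPv6 source address should be
 * used: default to the system-wide ip6_prefer_tempaddr setting unless the
 * flow request explicitly overrides the address selection policy.
 */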
static inline int
fr_use_stable_address(struct nx_flow_req *req)
{
int use_stable_address = ip6_prefer_tempaddr ? 0 : 1;
if (req != NULL &&
(req->nfr_flags & NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION)) {
use_stable_address = (req->nfr_flags & NXFLOWREQF_USE_STABLE_ADDRESS) ? 1 : 0;
}
return use_stable_address;
}
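/*
 * (Re)configure a flow route: resolve the destination route (registering
 * for route events on it), record the gateway or on-link state, and select
 * the default local address if needed.  Called with fr_lock held.
 */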
int
flow_route_configure(struct flow_route *fr, struct ifnet *ifp, struct nx_flow_req *req)
{
#if SK_LOG
char old_s[MAX_IPv6_STR_LEN]; /* src */
char src_s[MAX_IPv6_STR_LEN]; /* src */
char dst_s[MAX_IPv6_STR_LEN]; /* dst */
#endif /* SK_LOG */
struct rtentry *rt = NULL, *__single gwrt = NULL;
int err = 0;
FR_LOCK_ASSERT_HELD(fr);
/*
 * If there is already a route entry for the final destination, check
 * whether it is still valid; if not (or if a reconfigure has been
 * requested), perform another routing table lookup.  A non-NULL
 * fr_rt_dst is always associated with a route event registration,
 * and the route reference is held there.
 */
rt = fr->fr_rt_dst;
if (rt == NULL || !(rt->rt_flags & RTF_UP) || fr->fr_want_configure) {
struct eventhandler_entry_arg ee_arg;
/* callee frees route entry */
flow_route_cleanup(fr);
/* lookup destination route */
ASSERT(err == 0);
rt = rtalloc1_scoped(SA(&fr->fr_faddr), 1, 0, ifp->if_index);
if (rt == NULL) {
err = EHOSTUNREACH;
SK_ERR("no route to %s on %s (err %d)",
sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), ifp->if_xname, err);
} else {
/*
 * If the route points to another interface and the
 * route's gateway isn't a link-layer address, reject
 * it.  We make an exception for link-layer gateways,
 * since local interface addresses resolve that way.
 */
if (rt->rt_ifp != ifp && rt->rt_ifp != lo_ifp &&
(rt->rt_gateway == NULL ||
SA(rt->rt_gateway)->sa_family != AF_LINK)) {
err = EHOSTUNREACH;
SK_ERR("route to %s on %s != %s (err %d)",
sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), rt->rt_ifp->if_xname,
ifp->if_xname, err);
}
}
if (err != 0) {
goto done;
}
ASSERT(fr->fr_mgr != NULL);
ASSERT(!uuid_is_null(fr->fr_mgr->fm_uuid));
ASSERT(!uuid_is_null(fr->fr_uuid));
ASSERT(!uuid_is_null(fr->fr_nx_uuid));
bzero(&ee_arg, sizeof(ee_arg));
uuid_copy(ee_arg.ee_fm_uuid, fr->fr_mgr->fm_uuid);
uuid_copy(ee_arg.ee_fr_uuid, fr->fr_uuid);
/*
* Register for changes on destination route; this covers both
* cases where the destination is on-link, or if it is off-link
* and is using a gateway route. This also transfers the refcnt
* of the route entry to the event handler, released later when
* it is deregistered.
*/
ASSERT(fr->fr_rt_dst == NULL);
ASSERT(fr->fr_rt_evhdlr_tag == NULL);
fr->fr_rt_dst = rt; /* move reference to fr */
fr->fr_rt_evhdlr_tag =
EVENTHANDLER_REGISTER(&rt->rt_evhdlr_ctxt, route_event,
&flow_route_ev_callback, ee_arg, EVENTHANDLER_PRI_ANY);
ASSERT(fr->fr_rt_evhdlr_tag != NULL);
os_atomic_andnot(&fr->fr_flags, FLOWRTF_DELETED, relaxed);
/*
* Lookup gateway route (if any); returns locked gwrt
* with a reference bumped up.
*/
err = route_to_gwroute(SA(&fr->fr_faddr), rt, &gwrt);
if (err != 0) {
/*
* Reference held by fr_rt_dst will be taken
* care of by flow_route_cleanup() below, so
* make sure we don't do an extra rtfree().
*/
rt = NULL;
ASSERT(gwrt == NULL);
SK_ERR("no gw route to %s on %s (err %d)",
sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), ifp->if_xname, err);
goto done;
}
/* if RTF_GATEWAY isn't set, gwrt == rt */
ASSERT(gwrt != NULL);
RT_LOCK_ASSERT_HELD(gwrt);
/*
 * These flags must have been cleared by the cleanup above;
 * we're also single-threaded for this fr by virtue of fr_lock.
 */
ASSERT(!(fr->fr_flags & (FLOWRTF_GATEWAY | FLOWRTF_ONLINK)));
if (gwrt != rt && (rt->rt_flags & RTF_GATEWAY) &&
(rt->rt_gateway->sa_family == AF_INET ||
rt->rt_gateway->sa_family == AF_INET6)) {
struct sockaddr_storage ss;
ASSERT(fr->fr_rt_gw == NULL);
/* locked via route_to_gwroute() above */
fr->fr_rt_gw = gwrt; /* move reference to fr */
RT_ADDREF_LOCKED(gwrt); /* for this routine */
/*
 * Destination is off-link and is reachable
 * through an IP gateway route.  Save the IP
 * address of the gateway in fr_gaddr.
 */
(void) sa_copy(rt->rt_gateway, &ss, NULL);
_CASSERT(sizeof(fr->fr_gaddr) <= sizeof(ss));
bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr));
os_atomic_or(&fr->fr_flags, FLOWRTF_GATEWAY, relaxed);
} else if (IS_DIRECT_HOSTROUTE(rt)) {
/*
* Destination is on-link.
*/
os_atomic_or(&fr->fr_flags, FLOWRTF_ONLINK, relaxed);
}
RT_UNLOCK(gwrt);
}
RT_ADDREF(rt); /* for this routine */
/* see if we need to re-select default source address */
int use_stable_address = fr_use_stable_address(req);
if (fr->fr_want_configure ||
fr->fr_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt ||
!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address) {
union sockaddr_in_4_6 old = fr->fr_laddr;
if (use_stable_address) {
os_atomic_or(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
} else {
os_atomic_andnot(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
}
if ((err = flow_route_select_laddr(&fr->fr_laddr, &fr->fr_faddr,
ifp, rt, &fr->fr_laddr_gencnt, use_stable_address)) != 0) {
SK_ERR("no usable src address to reach %s on %s "
"(err %d)", sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), ifp->if_xname, err);
goto done;
}
if (bcmp(&old, &fr->fr_laddr, SA(&old)->sa_len) != 0) {
SK_ERR("src address is now %s (was %s) to reach %s "
"on %s", sk_sa_ntop(SA(&fr->fr_laddr), src_s,
sizeof(src_s)), sk_sa_ntop(SA(&old), old_s,
sizeof(old_s)), sk_sa_ntop(SA(&fr->fr_faddr),
dst_s, sizeof(dst_s)), ifp->if_xname);
}
}
ASSERT(err == 0);
done:
if (__probable(err == 0)) {
os_atomic_store(&fr->fr_want_configure, 0, release);
} else {
/* callee frees route entry */
flow_route_cleanup(fr);
}
if (gwrt != NULL) {
ASSERT(rt != NULL);
if (gwrt == rt) {
RT_REMREF(gwrt);
} else {
rtfree(gwrt);
}
gwrt = NULL;
}
if (rt != NULL) {
rtfree(rt);
rt = NULL;
}
return err;
}
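/*
 * Find or create the flow route for the destination in the flow request.
 * We first search the bucket as a reader; if that fails we upgrade to a
 * writer, re-check, and only then allocate, configure and insert a new
 * flow route, invoking the nexus-specific constructor and resolver.  On
 * success *frp is set with a reference held for the caller.
 */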
int
flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm,
struct ifnet *ifp, struct nx_flow_req *req,
flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
void *arg, struct flow_route **frp)
{
#if SK_LOG
char src_s[MAX_IPv6_STR_LEN]; /* src */
char dst_s[MAX_IPv6_STR_LEN]; /* dst */
char gw_s[MAX_IPv6_STR_LEN]; /* gw */
#endif /* SK_LOG */
union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
struct flow_route_bucket *frb;
struct flow_route_id_bucket *frib;
struct flow_route *fr = NULL;
int err = 0;
ASSERT(fr_ctor != NULL && fr_resolve != NULL);
ASSERT(frp != NULL);
*frp = NULL;
frb = flow_mgr_get_frb_by_addr(fm, daddr);
int use_stable_address = fr_use_stable_address(req);
/* see if there is a cached flow route (as reader) */
FRB_RLOCK(frb);
fr = flow_route_find_by_addr(frb, daddr);
if (fr != NULL) {
if (__improbable(fr->fr_want_configure || fr->fr_laddr_gencnt !=
ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt) ||
__improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
os_atomic_inc(&fr->fr_want_configure, relaxed);
FR_LOCK(fr);
err = flow_route_configure(fr, ifp, req);
if (err != 0) {
SK_ERR("fr 0x%llx error re-configuring dst %s "
"on %s (err %d) [R]", SK_KVA(fr),
sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), ifp->if_xname, err);
}
FR_UNLOCK(fr);
}
if (err == 0) {
SK_DF(SK_VERB_FLOW_ROUTE,
"fr 0x%llx found for dst %s " "on %s [R,%u]",
SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
}
FRB_RUNLOCK(frb); /* reader */
goto done;
}
/*
* Flow route doesn't exist; become a writer and prepare to
* allocate one. We could be racing with other threads here,
* so check first if there is now a cached flow route that
* got created by the winning thread.
*/
if (!FRB_RLOCKTOWLOCK(frb)) {
FRB_WLOCK(frb);
}
fr = flow_route_find_by_addr(frb, daddr);
if (fr != NULL) {
if (__improbable(fr->fr_want_configure) ||
__improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
FR_LOCK(fr);
err = flow_route_configure(fr, ifp, req);
if (err != 0) {
SK_ERR("fr 0x%llx error re-configuring dst %s "
"on %s (err %d) [W]", SK_KVA(fr),
sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), ifp->if_xname, err);
}
FR_UNLOCK(fr);
}
if (err == 0) {
SK_DF(SK_VERB_FLOW_ROUTE,
"fr 0x%llx found for dst %s on %s [W,%u]",
SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
}
FRB_WUNLOCK(frb); /* writer */
goto done;
}
/* allocate one */
fr = fr_alloc(TRUE);
fr->fr_faddr = *daddr; /* remote address */
switch (SA(&fr->fr_faddr)->sa_family) {
case AF_INET:
SIN(&fr->fr_faddr)->sin_port = 0;
fr->fr_addr_len = sizeof(struct in_addr);
fr->fr_addr_key = &SIN(&fr->fr_faddr)->sin_addr;
break;
case AF_INET6:
SIN6(&fr->fr_faddr)->sin6_port = 0;
fr->fr_addr_len = sizeof(struct in6_addr);
fr->fr_addr_key = &SIN6(&fr->fr_faddr)->sin6_addr;
break;
default:
VERIFY(0);
/* NOTREACHED */
__builtin_unreachable();
}
ASSERT(!uuid_is_null(fr->fr_uuid));
uuid_copy(fr->fr_nx_uuid, nx->nx_uuid);
*(struct flow_mgr **)(uintptr_t)&fr->fr_mgr = fm;
/* force configure newly-created flow route */
os_atomic_inc(&fr->fr_want_configure, relaxed);
FR_LOCK(fr);
if ((err = flow_route_configure(fr, ifp, req)) != 0) {
SK_ERR("fr 0x%llx error configuring dst %s on %s (err %d)",
SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), ifp->if_xname, err);
FR_UNLOCK(fr);
FRB_WUNLOCK(frb); /* writer */
/* not yet in tree, so free immediately */
fr_free(fr);
fr = NULL;
goto done;
}
/* execute nexus-specific constructor */
fr_ctor(arg, fr);
FR_UNLOCK(fr);
frib = flow_mgr_get_frib_by_uuid(fm, fr->fr_uuid);
FRIB_WLOCK(frib);
*(struct flow_route_bucket **)(uintptr_t)&fr->fr_frb = frb;
*(struct flow_route_id_bucket **)(uintptr_t)&fr->fr_frib = frib;
FRB_WLOCK_ASSERT_HELD(frb);
FRIB_WLOCK_ASSERT_HELD(frib);
RB_INSERT(flow_route_tree, &frb->frb_head, fr);
RB_INSERT(flow_route_id_tree, &frib->frib_head, fr);
os_atomic_or(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);
#if DEBUG
/* sanity checks for comparator routines */
VERIFY(flow_route_find_by_addr(frb, &fr->fr_faddr) == fr);
flow_route_release(fr);
VERIFY(flow_route_find_by_uuid(frib, fr->fr_uuid) == fr);
flow_route_release(fr);
#endif /* DEBUG */
/* for the trees */
_CASSERT(FLOW_ROUTE_MINREF == 2);
flow_route_retain(fr);
flow_route_retain(fr);
ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF);
/* for the caller */
flow_route_retain(fr);
FRIB_WUNLOCK(frib); /* writer */
FRB_WUNLOCK(frb); /* writer */
/* execute nexus-specific resolver */
if (!(fr->fr_flags & FLOWRTF_RESOLVED) &&
(err = fr_resolve(arg, fr, NULL)) != 0) {
if (fr->fr_flags & FLOWRTF_GATEWAY) {
SK_ERR("fr 0x%llx resolve %s gw %s on %s (err %d)",
SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
"fail"), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s,
sizeof(dst_s)), ifp->if_xname, err);
} else {
SK_ERR("fr 0x%llx resolve %s dst %s on %s (err %d)",
SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
"fail"), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
sizeof(dst_s)), ifp->if_xname, err);
}
if (err == EJUSTRETURN) {
err = 0;
} else {
goto done;
}
}
ASSERT(err == 0);
#if SK_LOG
if (fr->fr_flags & FLOWRTF_GATEWAY) {
SK_DF(SK_VERB_FLOW_ROUTE,
"add fr 0x%llx %s -> %s via gw %s on %s", SK_KVA(fr),
sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)),
ifp->if_xname);
} else {
SK_DF(SK_VERB_FLOW_ROUTE,
"add fr 0x%llx %s -> %s on %s", SK_KVA(fr),
sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
ifp->if_xname);
}
#endif /* SK_LOG */
done:
if (err == 0) {
ASSERT(fr != NULL);
*frp = fr;
} else if (fr != NULL) {
/* can't directly call fr_free() if it's in the tree */
flow_route_release(fr);
fr = NULL;
}
return err;
}
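/*
 * Reference counting: the two tree entries account for FLOW_ROUTE_MINREF
 * references.  Taking an extra reference clears the expiration time, and
 * dropping back down to the minimum re-arms it so that the flow route can
 * later be garbage-collected.  Once detached from the trees, the last
 * release frees the flow route.
 */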
void
flow_route_retain(struct flow_route *fr)
{
lck_spin_lock(&fr->fr_reflock);
if (fr->fr_usecnt++ == FLOW_ROUTE_MINREF) {
fr->fr_expire = 0;
}
lck_spin_unlock(&fr->fr_reflock);
}
void
flow_route_release(struct flow_route *fr)
{
bool should_free = false;
lck_spin_lock(&fr->fr_reflock);
VERIFY(fr->fr_usecnt > 0);
if (fr->fr_flags & FLOWRTF_ATTACHED) {
if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1)) {
fr->fr_expire = _net_uptime + flow_route_expire;
}
} else {
/*
 * fr is no longer in the lookup trees, so no further references
 * should be taken.  If usecnt reaches 0, this is the very last
 * reference and it is safe to unlock and call fr_free().
 */
if (--(fr->fr_usecnt) == 0) {
should_free = true;
}
}
lck_spin_unlock(&fr->fr_reflock);
if (should_free) {
fr_free(fr);
}
}
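/*
 * Walk a bucket and remove flow routes.  With "all" set every entry is
 * removed (the caller holds writer locks on all buckets); otherwise only
 * idle entries that have expired, are marked deleted, or are being expired
 * early are removed.  Returns the number removed; *resid gets the count
 * of entries left behind.
 */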
static uint32_t
flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid,
boolean_t all, boolean_t early_expire)
{
#if SK_LOG
char ss[MAX_IPv6_STR_LEN]; /* src */
char ds[MAX_IPv6_STR_LEN]; /* dst */
char gs[MAX_IPv6_STR_LEN]; /* gw */
#endif /* SK_LOG */
struct flow_route *fr, *tfr;
uint64_t now = net_uptime();
uint32_t i = 0, tot = 0;
FRB_WLOCK_ASSERT_HELD(frb);
RB_FOREACH_SAFE(fr, flow_route_tree, &frb->frb_head, tfr) {
struct flow_route_id_bucket *frib =
__DECONST(struct flow_route_id_bucket *, fr->fr_frib);
++tot;
/*
* We're not holding fr_lock here, since this is a
* best-effort check. If there's a race and we miss
* it now, we'll come back again shortly.
*/
lck_spin_lock(&fr->fr_reflock);
if (!all && (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
(fr->fr_expire > now && !early_expire &&
!(fr->fr_flags & FLOWRTF_DELETED)))) {
lck_spin_unlock(&fr->fr_reflock);
SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr 0x%llx "
"refcnt %u expire %llu", SK_KVA(fr),
fr->fr_usecnt, fr->fr_expire);
continue;
}
lck_spin_unlock(&fr->fr_reflock);
/*
* If "all" is set, flow entries must be gone by now, as
* we must be called by flow_route_bucket_purge_all().
* It also means that the caller has acquired writer lock
* on all flow {route,route_id} buckets, and fr_usecnt
* must be at its minimum value now.
*/
if (!all) {
FRIB_WLOCK(frib);
}
FRIB_WLOCK_ASSERT_HELD(frib);
_CASSERT(FLOW_ROUTE_MINREF == 2);
ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
RB_REMOVE(flow_route_tree, &frb->frb_head, fr);
RB_REMOVE(flow_route_id_tree, &frib->frib_head, fr);
os_atomic_andnot(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);
#if SK_LOG
if (fr->fr_flags & FLOWRTF_GATEWAY) {
SK_DF(SK_VERB_FLOW_ROUTE,
"remove fr 0x%llx %s -> %s via gw %s [exp %lld]",
SK_KVA(fr),
sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)),
(int64_t)(fr->fr_expire - now));
} else {
SK_DF(SK_VERB_FLOW_ROUTE,
"remove fr 0x%llx %s -> %s [exp %lld]", SK_KVA(fr),
sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
(int64_t)(fr->fr_expire - now));
}
#endif /* SK_LOG */
/* for the trees */
flow_route_release(fr);
flow_route_release(fr);
++i;
if (!all) {
FRIB_WUNLOCK(frib);
}
}
if (resid != NULL) {
*resid = (tot - i);
}
return i;
}
void
flow_route_bucket_purge_all(struct flow_route_bucket *frb)
{
(void) flow_route_bucket_purge_common(frb, NULL, TRUE, FALSE);
}
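/*
 * Prune idle flow routes in a bucket.  A first pass under the reader lock
 * loosely counts candidates; only if there are any (and the upgrade to
 * writer succeeds) do we take the writer lock and purge.  When the
 * interface is down, entries are expired early.
 */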
static uint32_t
flow_route_bucket_prune(struct flow_route_bucket *frb, struct ifnet *ifp,
uint32_t *resid)
{
uint64_t now = net_uptime();
struct flow_route *fr;
uint32_t i = 0, tot = 0;
boolean_t ifdown = !(ifp->if_flags & IFF_UP);
FRB_RLOCK(frb);
RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
++tot;
/* loose check; do this without holding fr_reflock */
if (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
(fr->fr_expire > now && !ifdown &&
!(fr->fr_flags & FLOWRTF_DELETED))) {
continue;
}
++i;
}
/*
* If there's nothing to prune or there's a writer, we're done.
* Note that if we failed to upgrade to writer, the lock would
* have been released automatically.
*/
if (i == 0 || !FRB_RLOCKTOWLOCK(frb)) {
if (i == 0) {
FRB_RUNLOCK(frb);
}
if (resid != NULL) {
*resid = (tot - i);
}
return 0;
}
SK_DF(SK_VERB_FLOW_ROUTE, "purging at least %u idle routes on %s",
i, ifp->if_xname);
/* purge idle ones */
i = flow_route_bucket_purge_common(frb, resid, FALSE, ifdown);
FRB_WUNLOCK(frb);
return i;
}
uint32_t
flow_route_prune(struct flow_mgr *fm, struct ifnet *ifp,
uint32_t *tot_resid)
{
uint32_t pruned = 0;
uint32_t resid;
uint32_t i;
for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
pruned += flow_route_bucket_prune(frb, ifp, &resid);
if (tot_resid != NULL) {
*tot_resid += resid;
}
}
return pruned;
}
/*
 * This runs in the context of the eventhandler invocation routine, which
 * loops through all registered callbacks.  Care must be taken not to call
 * any primitives here that would trigger routing changes in the same
 * context, as that would lead to a deadlock in the eventhandler code.
 */
static void
flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,
struct sockaddr *dst, int route_ev, struct sockaddr *gw_addr_orig, int flags)
{
#pragma unused(dst, flags)
#if SK_LOG
char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
struct flow_route_id_bucket *frib = NULL;
struct flow_route *fr = NULL;
struct flow_mgr *fm;
VERIFY(!uuid_is_null(ee_arg.ee_fm_uuid));
VERIFY(!uuid_is_null(ee_arg.ee_fr_uuid));
evhlog(debug, "%s: eventhandler saw event type=route_event event_code=%s",
__func__, route_event2str(route_ev));
/*
 * Upon success, the callee returns with the flow manager lock held
 * as reader, and we'll need to unlock it below.  Otherwise there's
 * nothing to unlock and we simply return.
 */
fm = flow_mgr_find_lock(ee_arg.ee_fm_uuid);
if (fm == NULL) {
SK_ERR("Event %s for dst %s ignored; flow manager not found",
route_event2str(route_ev), sk_sa_ntop(dst, dst_s,
sizeof(dst_s)));
return;
}
SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s event %s", fm->fm_name,
sk_sa_ntop(dst, dst_s, sizeof(dst_s)), route_event2str(route_ev));
do {
frib = flow_mgr_get_frib_by_uuid(fm, ee_arg.ee_fr_uuid);
FRIB_RLOCK(frib);
/* callee returns a reference that we need to release below */
fr = flow_route_find_by_uuid(frib, ee_arg.ee_fr_uuid);
if (fr == NULL) {
SK_ERR("%s: dst %s flow route not found", fm->fm_name,
sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
break;
}
/*
* Grab fr_lock to prevent flow route configuration or
* resolver from using stale info while we are updating.
*/
FR_LOCK(fr);
switch (route_ev) {
case ROUTE_ENTRY_REFRESH:
/*
 * The route entry has been updated (for example through
 * RTM_CHANGE).  Some changes may not warrant another
 * lookup, but some may.  For now, mark the flow route
 * to be looked up again, as the gateway may have changed.
 */
os_atomic_inc(&fr->fr_want_configure, relaxed);
os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route changed",
fm->fm_name, sk_sa_ntop(dst, dst_s,
sizeof(dst_s)));
break;
case ROUTE_ENTRY_DELETED:
/*
 * NOTE: flow_route_cleanup() must not be called to
 * deregister the eventhandler from within an
 * eventhandler callback, as that would deadlock in
 * the eventhandler code.  Instead, just mark the
 * flow route unresolved; the stale eventhandler is
 * deregistered when the flow route is next used
 * or deleted.
 */
os_atomic_inc(&fr->fr_want_configure, relaxed);
os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
os_atomic_or(&fr->fr_flags, FLOWRTF_DELETED, relaxed);
SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route deleted",
fm->fm_name, sk_sa_ntop(dst, dst_s,
sizeof(dst_s)));
break;
case ROUTE_LLENTRY_STALE:
/*
 * The route entry is deemed unreliable, or old enough
 * to warrant another lookup.  Don't reconfigure the
 * flow route; simply attempt to resolve it next time
 * to trigger a probe.
 */
os_atomic_inc(&fr->fr_want_probe, relaxed);
os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry stale",
fm->fm_name, sk_sa_ntop(dst, dst_s,
sizeof(dst_s)));
break;
case ROUTE_LLENTRY_CHANGED:
/*
 * The link-layer info has changed; replace the
 * cached llinfo in the flow route (treated the
 * same as ROUTE_LLENTRY_RESOLVED).
 */
OS_FALLTHROUGH;
case ROUTE_LLENTRY_RESOLVED:
{
/*
* SDL address length may be 0 for cellular.
* If Ethernet, copy into flow route and mark
* it as cached. In all cases, mark the flow
* route as resolved.
*/
/*
* XXX Remove explicit __bidi_indexable once
* rdar://119193012 lands
*/
struct sockaddr_dl *__bidi_indexable gw_addr =
(struct sockaddr_dl *__bidi_indexable) SDL(gw_addr_orig);
ASSERT(gw_addr->sdl_family == AF_LINK);
if (gw_addr->sdl_alen == ETHER_ADDR_LEN) {
FLOWRT_UPD_ETH_DST(fr, LLADDR(gw_addr));
SK_DF(SK_VERB_FLOW_ROUTE,
"%s: dst %s llentry %s", fm->fm_name,
sk_sa_ntop(dst, dst_s, sizeof(dst_s)),
(!(fr->fr_flags & FLOWRTF_HAS_LLINFO) ?
"resolved" : "changed"));
os_atomic_or(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
} else {
os_atomic_andnot(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
}
os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
#if SK_LOG
if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) !=
0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
SK_DF(SK_VERB_FLOW_ROUTE,
"%s: fr 0x%llx eth_type 0x%x "
"eth_src %x:%x:%x:%x:%x:%x "
"eth_dst %x:%x:%x:%x:%x:%x [%s])",
fm->fm_name, SK_KVA(fr),
ntohs(fr->fr_eth.ether_type),
fr->fr_eth.ether_shost[0],
fr->fr_eth.ether_shost[1],
fr->fr_eth.ether_shost[2],
fr->fr_eth.ether_shost[3],
fr->fr_eth.ether_shost[4],
fr->fr_eth.ether_shost[5],
fr->fr_eth.ether_dhost[0],
fr->fr_eth.ether_dhost[1],
fr->fr_eth.ether_dhost[2],
fr->fr_eth.ether_dhost[3],
fr->fr_eth.ether_dhost[4],
fr->fr_eth.ether_dhost[5],
sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
}
#endif /* SK_LOG */
break;
}
case ROUTE_LLENTRY_DELETED:
/*
 * The route entry points to a router and an
 * RTM_DELETE has been issued on it; force the
 * flow route to be reconfigured.
 */
os_atomic_inc(&fr->fr_want_configure, relaxed);
os_atomic_andnot(&fr->fr_flags, (FLOWRTF_HAS_LLINFO | FLOWRTF_RESOLVED), relaxed);
SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry deleted",
fm->fm_name, sk_sa_ntop(dst, dst_s,
sizeof(dst_s)));
break;
case ROUTE_LLENTRY_PROBED:
/*
 * The resolver has begun probing the target;
 * nothing to do here.
 */
SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry probed",
fm->fm_name, sk_sa_ntop(dst, dst_s,
sizeof(dst_s)));
break;
case ROUTE_LLENTRY_UNREACH:
/*
* When the route entry is marked with RTF_REJECT
* or the probes have timed out, reconfigure.
*/
os_atomic_inc(&fr->fr_want_configure, relaxed);
os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
SK_ERR("%s: dst %s llentry unreachable", fm->fm_name,
sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
break;
default:
break;
}
} while (0);
if (fr != NULL) {
flow_route_release(fr);
FR_UNLOCK(fr);
}
if (frib != NULL) {
FRIB_UNLOCK(frib);
}
if (fm != NULL) {
flow_mgr_unlock();
}
}
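/*
 * Select the default local (source) address used to reach dst on ifp.
 * For IPv4 we take the address of the route's interface address (rt_ifa);
 * for IPv6 we run source address selection via in6_selectsrc_core(),
 * honoring the stable-vs-temporary address preference.  The interface
 * address generation count is recorded so stale selections can be detected.
 */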
int
flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst,
struct ifnet *ifp, struct rtentry *rt, uint32_t *ipaddr_gencnt,
int use_stable_address)
{
#if SK_LOG
char src_s[MAX_IPv6_STR_LEN]; /* src */
char dst_s[MAX_IPv6_STR_LEN]; /* dst */
#endif /* SK_LOG */
sa_family_t af = SA(dst)->sa_family;
struct ifnet *__single src_ifp = NULL;
struct ifaddr *__single ifa = NULL;
int err = 0;
/* see comments in flow_route_configure() regarding loopback */
ASSERT(rt->rt_ifp == ifp || rt->rt_ifp == lo_ifp);
switch (af) {
case AF_INET: {
ifnet_lock_shared(ifp);
if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) {
err = EHOSTUNREACH;
SK_ERR("route to %s has src address marked detaching "
"(err %d)", inet_ntop(AF_INET,
&SIN(dst)->sin_addr, dst_s, sizeof(dst_s)), err);
ifnet_lock_done(ifp);
break;
}
SIN(src)->sin_len = sizeof(struct sockaddr_in);
SIN(src)->sin_family = AF_INET;
SIN(src)->sin_addr = IA_SIN(rt->rt_ifa)->sin_addr;
ASSERT(SIN(src)->sin_addr.s_addr != INADDR_ANY);
*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
ifnet_lock_done(ifp);
break;
}
case AF_INET6: {
struct in6_addr src_storage, *in6;
struct route_in6 ro = {};
uint32_t hints = (use_stable_address ? 0 : IPV6_SRCSEL_HINT_PREFER_TMPADDR);
ro.ro_rt = rt;
if ((in6 = in6_selectsrc_core(SIN6(dst), hints,
ifp, 0, &src_storage, &src_ifp, &err, &ifa, &ro, FALSE)) == NULL) {
if (err == 0) {
err = EADDRNOTAVAIL;
}
VERIFY(src_ifp == NULL);
SK_ERR("src address to dst %s on %s not available "
"(err %d)", inet_ntop(AF_INET6,
&SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)),
ifp->if_xname, err);
break;
}
VERIFY(src_ifp != NULL);
VERIFY(ifa != NULL);
if (__improbable(src_ifp != ifp)) {
if (err == 0) {
err = ENETUNREACH;
}
SK_ERR("dst %s, src %s ifp %s != %s (err %d)",
inet_ntop(AF_INET6, &SIN6(dst)->sin6_addr,
dst_s, sizeof(dst_s)),
inet_ntop(AF_INET6, &SIN6(src)->sin6_addr,
src_s, sizeof(src_s)),
src_ifp->if_xname, ifp->if_xname, err);
break;
}
ifnet_lock_shared(ifp);
if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
err = EHOSTUNREACH;
SK_ERR("IPv6 address selected is marked to be "
"detached (err %d)", err);
ifnet_lock_done(ifp);
break;
}
/* clear embedded scope if link-local src */
if (IN6_IS_SCOPE_EMBED(in6)) {
if (in6_embedded_scope) {
SIN6(src)->sin6_scope_id = ntohs(in6->s6_addr16[1]);
in6->s6_addr16[1] = 0;
} else {
SIN6(src)->sin6_scope_id = src_ifp->if_index;
}
}
SIN6(src)->sin6_len = sizeof(struct sockaddr_in6);
SIN6(src)->sin6_family = AF_INET6;
SIN6(src)->sin6_addr = *in6;
ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&SIN6(src)->sin6_addr));
*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
ifnet_lock_done(ifp);
break;
}
default:
VERIFY(0);
/* NOTREACHED */
__builtin_unreachable();
}
if (ifa != NULL) {
ifa_remref(ifa);
}
if (src_ifp != NULL) {
ifnet_release(src_ifp);
}
#if SK_LOG
if (err == 0 && __improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0)) {
SK_DF(SK_VERB_FLOW_ROUTE, "src %s to dst %s on %s",
sk_sa_ntop(SA(src), src_s, sizeof(src_s)),
sk_sa_ntop(SA(dst), dst_s, sizeof(dst_s)),
ifp->if_xname);
}
#endif /* SK_LOG */
return err;
}
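/*
 * Tear down the routing state of a flow route: deregister the route event
 * handler (which also drops the reference held on fr_rt_dst), release any
 * cached gateway route, and clear the gateway/on-link flags.  Called with
 * fr_lock held.
 */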
void
flow_route_cleanup(struct flow_route *fr)
{
#if SK_LOG
char ss[MAX_IPv6_STR_LEN]; /* src */
char ds[MAX_IPv6_STR_LEN]; /* dst */
char gs[MAX_IPv6_STR_LEN]; /* gw */
#endif /* SK_LOG */
FR_LOCK_ASSERT_HELD(fr);
if (fr->fr_rt_evhdlr_tag != NULL) {
ASSERT(fr->fr_rt_dst != NULL);
route_event_enqueue_nwk_wq_entry(fr->fr_rt_dst, NULL,
ROUTE_EVHDLR_DEREGISTER, fr->fr_rt_evhdlr_tag, FALSE);
fr->fr_rt_evhdlr_tag = NULL;
fr->fr_rt_dst = NULL;
}
ASSERT(fr->fr_rt_dst == NULL);
if (fr->fr_rt_gw != NULL) {
rtfree(fr->fr_rt_gw);
fr->fr_rt_gw = NULL;
}
#if SK_LOG
if (fr->fr_flags & FLOWRTF_GATEWAY) {
SK_DF(SK_VERB_FLOW_ROUTE,
"clean fr 0x%llx %s -> %s via gw %s", SK_KVA(fr),
sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)));
} else if (fr->fr_flags & FLOWRTF_ONLINK) {
SK_DF(SK_VERB_FLOW_ROUTE,
"clean fr 0x%llx %s -> %s", SK_KVA(fr),
sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)));
}
#endif /* SK_LOG */
os_atomic_andnot(&fr->fr_flags, (FLOWRTF_GATEWAY | FLOWRTF_ONLINK), relaxed);
}
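/*
 * Check that a local IP address is still assigned to (and usable on) the
 * given interface: the address must not be detaching, IPv6 addresses must
 * be ready and not reserved for CLAT46, and IPv4 is rejected outright on
 * CLAT46 interfaces.  On success the interface address generation count
 * is returned through gencnt.
 */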
static boolean_t
_flow_route_laddr_validate(struct flow_ip_addr *src_ip0, uint8_t ip_v,
struct ifnet *ifp, uint32_t *gencnt)
{
boolean_t address_found = TRUE;
struct ifaddr *ifa = NULL;
struct flow_ip_addr src_ip = {};
uint32_t scope = ifp->if_index;
VERIFY(gencnt != NULL);
VERIFY(ip_v == IPVERSION || ip_v == IPV6_VERSION);
if (ip_v == IPVERSION) {
memcpy(&src_ip._v4, &src_ip0->_v4, sizeof(src_ip._v4));
ifa = (struct ifaddr *)ifa_foraddr_scoped(
src_ip._v4.s_addr, scope);
} else {
memcpy(&src_ip, src_ip0, sizeof(*src_ip0));
if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(&src_ip._v6)) {
src_ip._v6.s6_addr16[1] = htons((uint16_t)scope);
}
ifa = (struct ifaddr *)ifa_foraddr6_scoped(&src_ip._v6,
scope);
}
if (__improbable(ifa == NULL)) {
address_found = FALSE;
goto done;
}
ifnet_lock_shared(ifp);
if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
address_found = FALSE;
ifnet_lock_done(ifp);
goto done;
}
if (ip_v == IPV6_VERSION) {
/*
* -fbounds-safety: ia6 (in6_ifaddr) overlays ifa (ifaddr)
*/
struct in6_ifaddr *ia6 = __container_of(ifa, struct in6_ifaddr,
ia_ifa);
/*
 * Fail if the IPv6 address is not ready or if the
 * address is reserved for CLAT46.
 */
if (__improbable(ia6->ia6_flags &
(IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) != 0) {
address_found = FALSE;
ifnet_lock_done(ifp);
goto done;
}
} else {
/*
 * If the interface has CLAT46 enabled, fail the IPv4
 * bind.  Since this implies the network is NAT64/DNS64,
 * the Internet effectively becomes reachable over IPv6.
 * If on-system IPv4-to-IPv6 translation is required, it
 * should be handled solely through bump-in-the-API; the
 * in-kernel translation is only done for apps directly
 * using low-level networking APIs.
 */
if (__improbable(IS_INTF_CLAT46(ifp))) {
address_found = FALSE;
ifnet_lock_done(ifp);
goto done;
}
}
*gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
ifnet_lock_done(ifp);
done:
if (ifa != NULL) {
ifa_remref(ifa);
}
return address_found;
}
boolean_t
flow_route_laddr_validate(union sockaddr_in_4_6 *saddr, struct ifnet *ifp,
uint32_t *gencnt)
{
VERIFY(saddr->sa.sa_family == AF_INET ||
saddr->sa.sa_family == AF_INET6);
struct flow_ip_addr *ipa;
uint8_t ipv;
if (saddr->sa.sa_family == AF_INET) {
ipv = IPVERSION;
ipa = (struct flow_ip_addr *)(void *)&saddr->sin.sin_addr;
} else {
ipv = IPV6_VERSION;
ipa = (struct flow_ip_addr *)(void *)&saddr->sin6.sin6_addr;
}
return _flow_route_laddr_validate(ipa, ipv, ifp, gencnt);
}
boolean_t
flow_route_key_validate(struct flow_key *fk, struct ifnet *ifp,
uint32_t *gencnt)
{
return _flow_route_laddr_validate(&fk->fk_src, fk->fk_ipver, ifp,
gencnt);
}