/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifndef _SKYWALK_MEM_SKMEMCACHEVAR_H
#define _SKYWALK_MEM_SKMEMCACHEVAR_H

#ifdef BSD_KERNEL_PRIVATE
#include <skywalk/core/skywalk_var.h>
#include <skywalk/os_channel_private.h>
#include <kern/cpu_number.h>
#include <machine/machine_routines.h>

/*
 * Buffer control.
 */
struct skmem_bufctl {
	SLIST_ENTRY(skmem_bufctl) bc_link;      /* bufctl linkage */
	void                    *__sized_by(bc_lim) bc_addr;       /* buffer obj address */
	void                    *bc_addrm;      /* mirrored buffer obj addr */
	struct skmem_slab       *bc_slab;       /* controlling slab */
	uint32_t                bc_lim;         /* buffer obj limit */
	uint32_t                bc_flags;       /* SKMEM_BUFCTL_* flags */
	uint32_t                bc_idx;         /* buffer index within slab */
	volatile uint32_t       bc_usecnt;      /* outstanding use */
};

#define SKMEM_BUFCTL_SHAREOK    0x1             /* supports sharing */

#define SKMEM_STACK_DEPTH       16              /* maximum audit stack depth */

#define SKMEM_CACHE_ALIGN       8               /* min guaranteed alignment */

/*
 * Alternative buffer control if SKM_MODE_AUDIT is set.
 */
struct skmem_bufctl_audit {
	SLIST_ENTRY(skmem_bufctl) bc_link;      /* bufctl linkage */
	void                    *__sized_by(bc_lim) bc_addr;       /* buffer address */
	void                    *bc_addrm;      /* mirrored buffer address */
	struct skmem_slab       *bc_slab;       /* controlling slab */
	uint32_t                bc_lim;         /* buffer obj limit */
	uint32_t                bc_flags;       /* SKMEM_BUFCTL_* flags */
	uint32_t                bc_idx;         /* buffer index within slab */
	volatile uint32_t       bc_usecnt;      /* outstanding use */
	struct thread           *bc_thread;     /* thread doing transaction */
	uint32_t                bc_timestamp;   /* transaction time */
	uint32_t                bc_depth;       /* stack depth */
	void                    *bc_stack[SKMEM_STACK_DEPTH]; /* stack */
};
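
/*
 * The leading fields mirror struct skmem_bufctl exactly, so code that
 * operates on a generic bufctl also works on the audit variant; the
 * trailing fields record the most recent transaction (thread, time
 * and stack) for debugging.
 */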

/*
 * Buffer control hash bucket.
 */
struct skmem_bufctl_bkt {
	SLIST_HEAD(, skmem_bufctl) bcb_head;    /* bufctl allocated list */
};

/*
 * Slab.
 */
struct skmem_slab {
	TAILQ_ENTRY(skmem_slab) sl_link;        /* slab freelist linkage */
	struct skmem_cache      *sl_cache;      /* controlling cache */
	void                    *sl_base;       /* base of allocated memory */
	void                    *sl_basem;      /* base of mirrored memory */
	struct sksegment        *sl_seg;        /* backing segment */
	struct sksegment        *sl_segm;       /* backing mirrored segment */
	SLIST_HEAD(, skmem_bufctl) sl_head;     /* bufctl free list */
	uint32_t                sl_refcnt;      /* outstanding allocations */
	uint32_t                sl_chunks;      /* # of buffers in slab */
};

#define SKMEM_SLAB_IS_PARTIAL(sl)       \
	((sl)->sl_refcnt > 0 && (sl)->sl_refcnt < (sl)->sl_chunks)

#define SKMEM_SLAB_MEMBER(sl, buf)      \
	(((size_t)(buf) - (size_t)(sl)->sl_base) < (sl)->sl_cache->skm_slabsize)
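
/*
 * Note that SKMEM_SLAB_MEMBER() relies on unsigned arithmetic: a buf
 * below sl_base makes the subtraction wrap to a huge value that fails
 * the size comparison, so a single compare covers both ends of the
 * slab's address range.
 */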

/*
 * Magazine type.
 */
struct skmem_magtype {
	int                     mt_magsize;     /* magazine size (# of objs) */
	int                     mt_align;       /* magazine alignment */
	size_t                  mt_minbuf;      /* all smaller bufs qualify */
	size_t                  mt_maxbuf;      /* no larger bufs qualify */
	struct skmem_cache      *mt_cache;      /* magazine cache */
	char                    mt_cname[64];   /* magazine cache name */
};

/*
 * Magazine.
 */
struct skmem_mag {
	SLIST_ENTRY(skmem_mag)  mg_link;        /* magazine linkage */
	struct skmem_magtype    *mg_magtype;    /* magazine type */
	size_t                  mg_count;       /* # of mg_round array elements */
	void                    *mg_round[__counted_by(mg_count)];   /* one or more objs */
};

#define SKMEM_MAG_SIZE(n)       \
	offsetof(struct skmem_mag, mg_round[n])
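
/*
 * Illustrative sketch (an assumption about usage, not a contract):
 * a magazine holding n rounds occupies SKMEM_MAG_SIZE(n) bytes, so a
 * magtype's magazine cache would be created with an object size of at
 * least SKMEM_MAG_SIZE(mt_magsize), with mg_count set to n when the
 * magazine is constructed.
 */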

/*
 * Magazine depot.
 */
struct skmem_maglist {
	SLIST_HEAD(, skmem_mag) ml_list;        /* magazine list */
	uint32_t                ml_total;       /* number of magazines */
	uint32_t                ml_min;         /* min since last update */
	uint32_t                ml_reaplimit;   /* max reapable magazines */
	uint64_t                ml_alloc;       /* allocations from this list */
};
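
/*
 * ml_min tracks the smallest list depth observed since the last
 * working-set update; magazines beyond that working set (capped by
 * ml_reaplimit) are eligible for reaping, in the style of the
 * working-set algorithm from Bonwick's magazine layer.
 */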

/*
 * Per-CPU cache structure.
 */
struct skmem_cpu_cache {
	decl_lck_mtx_data(, cp_lock);
	struct skmem_mag        *cp_loaded;     /* currently filled magazine */
	struct skmem_mag        *cp_ploaded;    /* previously filled magazine */
	uint64_t                cp_alloc;       /* allocations from this cpu */
	uint64_t                cp_free;        /* frees to this cpu */
	int                     cp_rounds;      /* # of objs in filled mag */
	int                     cp_prounds;     /* # of objs in previous mag */
	int                     cp_magsize;     /* # of objs in a full mag */
} __attribute__((aligned(CHANNEL_CACHE_ALIGN_MAX)));
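
/*
 * Allocations are served from cp_loaded; when it runs empty it is
 * exchanged with cp_ploaded if the latter is full (frees do the
 * mirror image against a full cp_loaded).  Only when both magazines
 * are exhausted does the CPU layer fall through to the depot, in the
 * style of Bonwick's magazine layer.
 */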

/*
 * Object's region information.
 *
 * This info is provided to skmem_ctor_fn_t() to assist in the
 * construction of master and slave objects.  It is also provided
 * separately via skmem_cache_get_obj_info() when called on an object
 * that has been allocated from a skmem_cache.  Information about the
 * slave object is available only at construction time.
 */
struct skmem_obj_info {
	void                    *__sized_by(oi_size) oi_addr;       /* object address */
	struct skmem_bufctl     *oi_bc;         /* buffer control (master) */
	uint32_t                oi_size;        /* actual object size */
	obj_idx_t               oi_idx_reg;     /* object idx within region */
	obj_idx_t               oi_idx_seg;     /* object idx within segment */
} __attribute__((__packed__));

/*
 * Generic one-way linked list element structure.  This is used to
 * handle skmem_cache_batch_alloc() requests in order to chain the
 * allocated objects together before returning them to the caller.
 * It is likewise used by callers of skmem_cache_batch_free() when
 * freeing a batch of objects.  Note that this requires each of the
 * region's objects to be at least the size of struct skmem_obj, as
 * we store this information at the beginning of each object in the
 * chain.
 */
struct skmem_obj {
	/*
	 * Given that we overlay this structure on top of whatever
	 * structure that the object represents, the constructor must
	 * ensure that it reserves at least the size of a pointer
	 * at the top for the linkage.
	 */
	struct skmem_obj        *mo_next;       /* next object in the list */
	/*
	 * The following are used only for raw (unconstructed) objects
	 * coming out of the slab layer during allocations.  They are
	 * not touched otherwise by skmem_cache when the object resides
	 * in the magazine.  By utilizing this space, we avoid having
	 * to allocate temporary storage elsewhere.
	 */
	struct skmem_obj_info   mo_info;        /* object's info */
	struct skmem_obj_info   mo_minfo;       /* mirrored object's info */
};
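
/*
 * Illustrative sketch, assuming a cache created with SKMEM_CR_BATCH;
 * skm, objsize and use() are placeholders:
 *
 *      struct skmem_obj *list, *mo;
 *      uint32_t n;
 *
 *      n = skmem_cache_batch_alloc(skm, &list, objsize, 32,
 *          SKMEM_NOSLEEP);
 *      for (mo = list; mo != NULL; mo = mo->mo_next)
 *              use(mo);
 *      skmem_cache_batch_free(skm, list);
 *
 * where n is the number of objects actually chained onto list.
 */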

#define SKMEM_OBJ_ADDR(_oi)     (_oi)->oi_addr
#define SKMEM_OBJ_BUFCTL(_oi)   (_oi)->oi_bc
#define SKMEM_OBJ_SIZE(_oi)     (_oi)->oi_size
#define SKMEM_OBJ_IDX_REG(_oi)  (_oi)->oi_idx_reg
#define SKMEM_OBJ_IDX_SEG(_oi)  (_oi)->oi_idx_seg
/* segment the object belongs to (only for master) */
#define SKMEM_OBJ_SEG(_oi)      (_oi)->oi_bc->bc_slab->sl_seg
/* offset of object relative to the object's own region */
#define SKMEM_OBJ_ROFF(_oi)     \
	((mach_vm_offset_t)(SKMEM_OBJ_SIZE(_oi) * SKMEM_OBJ_IDX_REG(_oi)))
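
/*
 * e.g. with oi_size 2048 and oi_idx_reg 3, SKMEM_OBJ_ROFF() yields a
 * region-relative offset of 6144 bytes; this presumes objects are
 * laid out back to back at oi_size intervals within the region.
 */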

typedef int (*skmem_ctor_fn_t)(struct skmem_obj_info *,
    struct skmem_obj_info *, void *, uint32_t);
typedef void (*skmem_dtor_fn_t)(void *, void *);
typedef void (*skmem_reclaim_fn_t)(void *);
typedef int (*skmem_slab_alloc_fn_t)(struct skmem_cache *,
    struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
typedef void (*skmem_slab_free_fn_t)(struct skmem_cache *, void *);

/*
 * Cache.
 */
struct skmem_cache {
#if KASAN
	void            *skm_start;
	uint32_t        skm_align[0];
#endif
	/*
	 * Commonly-accessed elements during alloc and free.
	 */
	uint32_t        skm_mode;               /* cache mode flags */
	skmem_ctor_fn_t skm_ctor;               /* object constructor */
	skmem_dtor_fn_t skm_dtor;               /* object destructor */
	skmem_reclaim_fn_t skm_reclaim;         /* cache reclaim */
	void            *skm_private;           /* opaque arg to callbacks */

	/*
	 * Depot.
	 */
	decl_lck_mtx_data(, skm_dp_lock);       /* protects depot layer */
	struct skmem_magtype *skm_magtype;      /* magazine type */
	struct skmem_maglist skm_full;          /* full magazines */
	struct skmem_maglist skm_empty;         /* empty magazines */

	/*
	 * Slab.
	 */
	decl_lck_mtx_data(, skm_sl_lock);       /* protects slab layer */
	skmem_slab_alloc_fn_t skm_slab_alloc;   /* slab allocate */
	skmem_slab_free_fn_t skm_slab_free;     /* slab free */
	size_t          skm_chunksize;          /* bufsize + alignment */
	size_t          skm_objsize;            /* actual obj size in slab */
	size_t          skm_slabsize;           /* size of a slab */
	size_t          skm_hash_initial;       /* initial hash table size */
	size_t          skm_hash_limit;         /* hash table size limit */
	size_t          skm_hash_shift;         /* get to interesting bits */
	size_t          skm_hash_mask;          /* hash table mask */
	size_t          skm_hash_size;
	struct skmem_bufctl_bkt *__counted_by(skm_hash_size) skm_hash_table; /* alloc'd buffer htable */
	TAILQ_HEAD(, skmem_slab) skm_sl_partial_list; /* partially-allocated */
	TAILQ_HEAD(, skmem_slab) skm_sl_empty_list;   /* fully-allocated */
	struct skmem_region *skm_region;        /* region source for slabs */

	/*
	 * Statistics.
	 */
	uint32_t        skm_cpu_mag_size;       /* current magazine size */
	uint32_t        skm_cpu_mag_resize;     /* # of magazine resizes */
	uint32_t        skm_cpu_mag_purge;      /* # of magazine purges */
	uint32_t        skm_cpu_mag_reap;       /* # of magazine reaps */
	uint64_t        skm_depot_contention;   /* mutex contention count */
	uint64_t        skm_depot_contention_prev; /* previous snapshot */
	uint32_t        skm_depot_full;         /* # of full magazines */
	uint32_t        skm_depot_empty;        /* # of empty magazines */
	uint32_t        skm_depot_ws_zero;      /* # of working set flushes */
	uint32_t        skm_sl_rescale;         /* # of hash table rescales */
	uint32_t        skm_sl_create;          /* slab creates */
	uint32_t        skm_sl_destroy;         /* slab destroys */
	uint32_t        skm_sl_alloc;           /* slab layer allocations */
	uint32_t        skm_sl_free;            /* slab layer frees */
	uint32_t        skm_sl_partial;         /* # of partial slabs */
	uint32_t        skm_sl_empty;           /* # of empty slabs */
	uint64_t        skm_sl_alloc_fail;      /* total failed allocations */
	uint64_t        skm_sl_bufinuse;        /* total unfreed buffers */
	uint64_t        skm_sl_bufmax;          /* max buffers ever */

	/*
	 * Cache properties.
	 */
	TAILQ_ENTRY(skmem_cache) skm_link;      /* cache linkage */
	char            skm_name[64];           /* cache name */
	uuid_t          skm_uuid;               /* cache uuid */
	size_t          skm_bufsize;            /* buffer size */
	size_t          skm_bufalign;           /* buffer alignment */
	size_t          skm_objalign;           /* object alignment */

	/*
	 * CPU layer, aligned at (maximum) cache line boundary.
	 */
	decl_lck_mtx_data(, skm_rs_lock);       /* protects resizing */
	struct thread    *skm_rs_owner;         /* resize owner */
	uint32_t        skm_rs_busy;            /* prevent resizing */
	uint32_t        skm_rs_want;            /* # of threads blocked */
	size_t          skm_cpu_cache_count;
	struct skmem_cpu_cache  skm_cpu_cache[__counted_by(skm_cpu_cache_count)]
	__attribute__((aligned(CHANNEL_CACHE_ALIGN_MAX)));
};

#define SKMEM_CACHE_SIZE(n)     \
	offsetof(struct skmem_cache, skm_cpu_cache[n])

#define SKMEM_CPU_CACHE(c)                                      \
	((struct skmem_cpu_cache *)((void *)((char *)(c) +      \
	SKMEM_CACHE_SIZE(cpu_number()))))
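
/*
 * SKMEM_CACHE_SIZE(n) is the byte offset of skm_cpu_cache[n] from the
 * start of the cache, so SKMEM_CPU_CACHE(c) simply adds the offset
 * for cpu_number() to (c) to reach the calling CPU's entry.
 */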

/* valid values for skm_mode, set only by skmem_cache_create() */
#define SKM_MODE_NOMAGAZINES    0x00000001      /* disable magazines layer */
#define SKM_MODE_AUDIT          0x00000002      /* audit transactions */
#define SKM_MODE_NOREDIRECT     0x00000004      /* unaffected by defunct */
#define SKM_MODE_BATCH          0x00000008      /* supports batch alloc/free */
#define SKM_MODE_DYNAMIC        0x00000010      /* enable magazine resizing */
#define SKM_MODE_CLEARONFREE    0x00000020      /* zero-out upon slab free */
#define SKM_MODE_PSEUDO         0x00000040      /* external backing store */
#define SKM_MODE_RECLAIM        0x00000080      /* aggressive memory reclaim */

#define SKM_MODE_BITS \
	"\020\01NOMAGAZINES\02AUDIT\03NOREDIRECT\04BATCH\05DYNAMIC"     \
	"\06CLEARONFREE\07PSEUDO\10RECLAIM"

/*
 * Valid flags for sk{mem,region}_alloc().  SKMEM_FAILOK applies only
 * to blocking allocations (SKMEM_SLEEP, the default zero flag), i.e.
 * SKMEM_{NOSLEEP,FAILOK} are mutually exclusive.  When set,
 * SKMEM_FAILOK indicates that the segment allocation may fail, and
 * that the cache layer will handle the retries rather than blocking
 * inside the region allocator.
 */
#define SKMEM_SLEEP             0x0     /* can block for memory; won't fail */
#define SKMEM_NOSLEEP           0x1     /* cannot block for memory; may fail */
#define SKMEM_PANIC             0x2     /* panic upon allocation failure */
#define SKMEM_FAILOK            0x4     /* can fail for blocking alloc */

/* valid flag values for skmem_cache_create() */
#define SKMEM_CR_NOMAGAZINES    0x1     /* disable magazines layer */
#define SKMEM_CR_BATCH          0x2     /* support batch alloc/free */
#define SKMEM_CR_DYNAMIC        0x4     /* enable magazine resizing */
#define SKMEM_CR_CLEARONFREE    0x8     /* zero-out upon slab free */
#define SKMEM_CR_RECLAIM        0x10    /* aggressive memory reclaim */

__BEGIN_DECLS
/*
 * Given a buffer control, add a use count to it.
 */
__attribute__((always_inline))
static inline void
skmem_bufctl_use(struct skmem_bufctl *bc)
{
	uint32_t old, new;

	os_atomic_rmw_loop(&bc->bc_usecnt, old, new, relaxed, {
		new = old + 1;
		VERIFY(new != 0);
		ASSERT(new == 1 || (bc->bc_flags & SKMEM_BUFCTL_SHAREOK));
	});
}

/*
 * Given a buffer control, remove a use count from it (returns new value).
 */
__attribute__((always_inline))
static inline uint32_t
skmem_bufctl_unuse(struct skmem_bufctl *bc)
{
	uint32_t old, new;

	os_atomic_rmw_loop(&bc->bc_usecnt, old, new, relaxed, {
		new = old - 1;
		VERIFY(old != 0);
		ASSERT(old == 1 || (bc->bc_flags & SKMEM_BUFCTL_SHAREOK));
	});

	return new;
}
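
/*
 * A shared buffer thus follows this pattern: each additional claim
 * goes through skmem_bufctl_use(), and the release that sees
 * skmem_bufctl_unuse() return 0 is responsible for the final free.
 */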

extern struct skmem_cache *skmem_slab_cache;    /* cache for skmem_slab */
extern struct skmem_cache *skmem_bufctl_cache;  /* cache for skmem_bufctl */
extern unsigned int bc_size;                    /* size of bufctl */
extern int skmem_slab_alloc_locked(struct skmem_cache *,
    struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
extern void skmem_slab_free_locked(struct skmem_cache *, void *);
extern int skmem_slab_alloc_pseudo_locked(struct skmem_cache *,
    struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
extern void skmem_slab_free_pseudo_locked(struct skmem_cache *, void *);
extern void skmem_slab_free(struct skmem_cache *, void *);
extern void skmem_slab_batch_free(struct skmem_cache *, struct skmem_obj *);
extern uint32_t skmem_slab_batch_alloc(struct skmem_cache *, struct skmem_obj **,
    uint32_t, uint32_t);
extern int skmem_slab_alloc(struct skmem_cache *, struct skmem_obj_info *,
    struct skmem_obj_info *, uint32_t);
extern void skmem_audit_bufctl(struct skmem_bufctl *);
#define SKM_SLAB_LOCK(_skm)                     \
	lck_mtx_lock(&(_skm)->skm_sl_lock)
#define SKM_SLAB_LOCK_ASSERT_HELD(_skm)         \
	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_OWNED)
#define SKM_SLAB_LOCK_ASSERT_NOTHELD(_skm)      \
	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKM_SLAB_UNLOCK(_skm)                   \
	lck_mtx_unlock(&(_skm)->skm_sl_lock)
#define SKMEM_CACHE_HASH_INDEX(_a, _s, _m)      (((_a) >> (_s)) & (_m))
#define SKMEM_CACHE_HASH(_skm, _buf)                                     \
	(&(_skm)->skm_hash_table[SKMEM_CACHE_HASH_INDEX((uintptr_t)_buf, \
	(_skm)->skm_hash_shift, (_skm)->skm_hash_mask)])
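
/*
 * e.g. with skm_hash_shift 11 and skm_hash_mask 0xff, a buffer at
 * 0x12345800 lands in bucket (0x12345800 >> 11) & 0xff == 0x8b; the
 * shift discards the low-order bits shared by all buffers of the
 * same chunk size before masking down to the table size.
 */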

extern void skmem_cache_pre_init(void);
extern void skmem_cache_init(void);
extern void skmem_cache_fini(void);
extern struct skmem_cache *skmem_cache_create(const char *, size_t, size_t,
    skmem_ctor_fn_t, skmem_dtor_fn_t, skmem_reclaim_fn_t, void *,
    struct skmem_region *, uint32_t);
extern void skmem_cache_destroy(struct skmem_cache *);
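
/*
 * Illustrative sketch, assuming the create parameters are (name,
 * bufsize, bufalign, ctor, dtor, reclaim, private, region, cr_flags)
 * and that a NULL region selects a default backing store:
 *
 *      struct skmem_cache *skm;
 *
 *      skm = skmem_cache_create("example", 2048, 64, NULL, NULL,
 *          NULL, NULL, NULL, SKMEM_CR_BATCH | SKMEM_CR_DYNAMIC);
 *      ...
 *      skmem_cache_destroy(skm);
 */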

extern uint32_t skmem_cache_batch_alloc(struct skmem_cache *,
    struct skmem_obj **list, size_t objsize, uint32_t, uint32_t);

/*
 * XXX -fbounds-safety: Sometimes we use skmem_cache_alloc to allocate a struct
 * with a flexible array (e.g. struct skmem_mag). For those, we can't have the
 * alloc function return void *__single, because we lose bounds information.
 */
static inline void *__header_indexable
skmem_cache_alloc(struct skmem_cache *skm, uint32_t skmflag)
{
	struct skmem_obj *__single buf;

	(void) skmem_cache_batch_alloc(skm, &buf, skm->skm_objsize, 1, skmflag);

	/* This is one of the few places where using __unsafe_forge is okay */
	return __unsafe_forge_bidi_indexable(void *, buf, buf ? skm->skm_objsize : 0);
}
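
/*
 * Typical single-object usage (sketch):
 *
 *      void *obj = skmem_cache_alloc(skm, SKMEM_SLEEP);
 *      ...
 *      skmem_cache_free(skm, obj);
 *
 * With SKMEM_SLEEP the call may block but is not expected to fail;
 * with SKMEM_NOSLEEP the caller must be prepared for a NULL return.
 */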

extern void skmem_cache_free(struct skmem_cache *, void *);
extern void skmem_cache_free_nocache(struct skmem_cache *, void *);
extern void skmem_cache_batch_free(struct skmem_cache *, struct skmem_obj *);
extern void skmem_cache_batch_free_nocache(struct skmem_cache *, struct skmem_obj *);
extern void skmem_cache_reap_now(struct skmem_cache *, boolean_t);
extern void skmem_cache_reap(void);
extern void skmem_reap_caches(boolean_t);
extern void skmem_cache_get_obj_info(struct skmem_cache *, void *,
    struct skmem_obj_info *, struct skmem_obj_info *);
extern uint32_t skmem_cache_magazine_max(uint32_t);
extern boolean_t skmem_allow_magazines(void);
#if (DEVELOPMENT || DEBUG)
extern void skmem_cache_test_start(uint32_t);
extern void skmem_cache_test_stop(void);
#endif /* (DEVELOPMENT || DEBUG) */
__END_DECLS
#endif /* BSD_KERNEL_PRIVATE */
#endif /* _SKYWALK_MEM_SKMEMCACHEVAR_H */