/*
* Copyright (c) 2022 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#define LOCK_PRIVATE 1
#include <mach_ldebug.h>
#include <kern/locks_internal.h>
#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/kalloc.h>
#include <kern/thread.h>
#include <mach/machine/sdt.h>
#include <machine/cpu_data.h>
#include <machine/machine_cpu.h>
#if !LCK_MTX_USE_ARCH
/*
* lck_mtx_t
* ~~~~~~~~~
*
* Kernel mutexes in this implementation are made of four 32-bit words:
*
* - word 0: turnstile compact ID (24 bits) and the 0x22 lock tag
* - word 1: padding (to be used for group compact IDs)
* - word 2: mutex state (lock owner + interlock, spin and waiters bits),
* referred to as "data" in the code.
* - word 3: adaptive spin and interlock MCS queue tails.
*
* The 64-bit word made of the last two words is referred to
* as the "mutex state" in the code.
*
*
* Core serialization rules
* ~~~~~~~~~~~~~~~~~~~~~~~~
*
* The mutex has a bit (lck_mtx_t::lck_mtx.ilocked or bit LCK_MTX_ILOCK
* of the data word) that serves as a spinlock for the mutex state.
*
*
* Updating the lock fields must follow these rules:
*
* - It is ok to "steal" the mutex (updating its data field) if no one
* holds the interlock.
*
* - Holding the interlock allows its holder to update the first 3 words
* of the kernel mutex without using RMW atomics (plain stores are OK).
*
* - Holding the interlock is required for a thread to remove itself
* from the adaptive spin queue.
*
* - Threads can enqueue themselves onto the adaptive spin wait queue
* or the interlock wait queue at any time.
*
*
* Waiters bit and turnstiles
* ~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* The turnstile on a kernel mutex is set by waiters, and cleared
* once they have all been resumed and successfully acquired the lock.
*
* LCK_MTX_NEEDS_WAKEUP being set (always with an owner set too)
* forces threads to the lck_mtx_unlock slowpath,
* in order to evaluate whether lck_mtx_unlock_wakeup() must be called.
*
* As a result, it only needs to be set at select times:
*
* - when a thread blocks and "snitches" on the current lock owner,
* so that when that owner unlocks it performs the wakeup,
*
* - when a thread that was woken up resumes its work and becomes
* the inheritor.
*/
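/*
* Illustrative sketch of the layout described above. The authoritative
* definitions live in <kern/locks_internal.h>; the field names below are
* approximations, not the real ones:
*
*     struct lck_mtx_sketch {
*         uint32_t tsid : 24, tag : 8; // word 0: turnstile compact ID + 0x22 tag
*         uint32_t grp;                // word 1: currently holds the group attr ID
*         uint32_t data;               // word 2: owner ctid + ILOCK/SPIN_MODE/
*                                      //         NEEDS_WAKEUP/PROFILE bits
*         uint16_t as_tail;            // word 3: adaptive spin MCS queue tail
*         uint16_t ilk_tail;           //         interlock MCS queue tail
*     };
*
* Words 2 and 3 together form the 64-bit lck_mtx_state_t ("mutex state")
* that the CAS-based paths below operate on as a single unit.
*/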
#define ADAPTIVE_SPIN_ENABLE 0x1
#define NOINLINE __attribute__((noinline))
#define LCK_MTX_EVENT(lck) CAST_EVENT64_T(&(lck)->lck_mtx.data)
#define LCK_EVENT_TO_MUTEX(e) __container_of((uint32_t *)(e), lck_mtx_t, lck_mtx.data)
#define LCK_MTX_HAS_WAITERS(l) ((l)->lck_mtx.data & LCK_MTX_NEEDS_WAKEUP)
#if DEVELOPMENT || DEBUG
TUNABLE(bool, LckDisablePreemptCheck, "-disable_mtx_chk", false);
#endif /* DEVELOPMENT || DEBUG */
extern unsigned int not_in_kdp;
KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);
#define LCK_MTX_NULL_CTID 0x00000000u
__enum_decl(lck_mtx_mode_t, uint32_t, {
LCK_MTX_MODE_SLEEPABLE,
LCK_MTX_MODE_SPIN,
LCK_MTX_MODE_SPIN_ALWAYS,
});
__enum_decl(lck_ilk_mode_t, uint32_t, {
LCK_ILK_MODE_UNLOCK,
LCK_ILK_MODE_DIRECT,
LCK_ILK_MODE_FROM_AS,
});
static inline void
lck_mtx_mcs_clear(lck_mtx_mcs_t mcs)
{
*mcs = (struct lck_mtx_mcs){ };
}
static inline lck_mcs_id_t
lck_mtx_get_mcs_id(void)
{
return lck_mcs_id_current(LCK_MCS_SLOT_0);
}
__pure2
static inline lck_mtx_mcs_t
lck_mtx_get_mcs(lck_mcs_id_t idx)
{
return &lck_mcs_get_other(idx)->mcs_mtx;
}
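/*
* MCS queue nodes live in per-CPU storage (see lck_mcs_get_current())
* and are referenced by small compact IDs rather than by pointers, so
* that the adaptive spin and interlock queue tails both fit in word 3
* of the mutex; a tail of 0 means "empty". Conceptual enqueue, as a
* sketch only (the real code is in lck_mtx_ilk_lock_contended() and
* lck_mtx_lock_adaptive_spin() below):
*
*     prev = os_atomic_xchg(&lock->lck_mtx.ilk_tail, my_id, release);
*     if (prev != 0) {
*         // link lck_mtx_get_mcs(prev) to our node, then spin locally
*     } else {
*         // we are at the head of the queue
*     }
*/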
#pragma mark lck_mtx_t: validation
__abortlike
static void
__lck_mtx_invalid_panic(lck_mtx_t *lck)
{
panic("Invalid/destroyed mutex %p: "
"<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
}
__abortlike
static void
__lck_mtx_not_owned_panic(lck_mtx_t *lock, thread_t thread)
{
panic("Mutex %p is unexpectedly not owned by thread %p", lock, thread);
}
__abortlike
static void
__lck_mtx_owned_panic(lck_mtx_t *lock, thread_t thread)
{
panic("Mutex %p is unexpectedly owned by thread %p", lock, thread);
}
__abortlike
static void
__lck_mtx_lock_is_sleepable_panic(lck_mtx_t *lck)
{
/*
* "Always" variants can never block. If the lock is held as a normal mutex
* then someone is mixing always and non-always calls on the same lock,
* which is forbidden.
*/
panic("Mutex %p is held as a full-mutex (spin-always lock attempted)", lck);
}
#if DEVELOPMENT || DEBUG
__abortlike
static void
__lck_mtx_preemption_disabled_panic(lck_mtx_t *lck, int expected)
{
panic("Attempt to take mutex %p with preemption disabled (%d)",
lck, get_preemption_level() - expected);
}
__abortlike
static void
__lck_mtx_at_irq_panic(lck_mtx_t *lck)
{
panic("Attempt to take mutex %p in IRQ context", lck);
}
/*
* Routine: lck_mtx_check_preemption
*
* Verify preemption is enabled when attempting to acquire a mutex.
*/
static inline void
lck_mtx_check_preemption(lck_mtx_t *lock, thread_t thread, int expected)
{
#pragma unused(thread)
if (lock_preemption_level_for_thread(thread) == expected) {
return;
}
if (LckDisablePreemptCheck) {
return;
}
if (current_cpu_datap()->cpu_hibernate) {
return;
}
if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
return;
}
__lck_mtx_preemption_disabled_panic(lock, expected);
}
static inline void
lck_mtx_check_irq(lck_mtx_t *lock)
{
if (ml_at_interrupt_context()) {
__lck_mtx_at_irq_panic(lock);
}
}
#define LCK_MTX_SNIFF_PREEMPTION(thread) lock_preemption_level_for_thread(thread)
#define LCK_MTX_CHECK_INVARIANTS 1
#else
#define lck_mtx_check_irq(lck) ((void)0)
#define LCK_MTX_SNIFF_PREEMPTION(thread) 0
#define LCK_MTX_CHECK_INVARIANTS 0
#endif /* !DEVELOPMENT && !DEBUG */
#if CONFIG_DTRACE
#define LCK_MTX_SNIFF_DTRACE() lck_debug_state.lds_value
#else
#define LCK_MTX_SNIFF_DTRACE() 0
#endif
#pragma mark lck_mtx_t: alloc/init/destroy/free
lck_mtx_t *
lck_mtx_alloc_init(lck_grp_t *grp, lck_attr_t *attr)
{
lck_mtx_t *lck;
lck = zalloc(KT_LCK_MTX);
lck_mtx_init(lck, grp, attr);
return lck;
}
void
lck_mtx_free(lck_mtx_t *lck, lck_grp_t *grp)
{
lck_mtx_destroy(lck, grp);
zfree(KT_LCK_MTX, lck);
}
void
lck_mtx_init(lck_mtx_t *lck, lck_grp_t *grp, lck_attr_t *attr)
{
if (attr == LCK_ATTR_NULL) {
attr = &lck_attr_default;
}
*lck = (lck_mtx_t){
.lck_mtx_type = LCK_TYPE_MUTEX,
.lck_mtx_grp = grp->lck_grp_attr_id,
};
if (attr->lck_attr_val & LCK_ATTR_DEBUG) {
lck->lck_mtx.data |= LCK_MTX_PROFILE;
}
lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
}
void
lck_mtx_destroy(lck_mtx_t *lck, lck_grp_t *grp)
{
if (lck->lck_mtx_tsid && lck->lck_mtx_type == LCK_TYPE_MUTEX) {
panic("Mutex to destroy still has waiters: %p: "
"<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
}
if (lck->lck_mtx_type != LCK_TYPE_MUTEX ||
(lck->lck_mtx.data & ~LCK_MTX_PROFILE) ||
lck->lck_mtx.as_tail || lck->lck_mtx.ilk_tail) {
__lck_mtx_invalid_panic(lck);
}
LCK_GRP_ASSERT_ID(grp, lck->lck_mtx_grp);
lck->lck_mtx_type = LCK_TYPE_NONE;
lck->lck_mtx.data = LCK_MTX_TAG_DESTROYED;
lck->lck_mtx_grp = 0;
lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
}
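/*
* Minimal usage sketch for the lifecycle functions above. The lock
* group and the surrounding code are hypothetical; LCK_GRP_DECLARE is
* the usual way for callers to obtain a group:
*
*     LCK_GRP_DECLARE(hypo_grp, "hypothetical-subsystem");
*
*     lck_mtx_t *m = lck_mtx_alloc_init(&hypo_grp, LCK_ATTR_NULL);
*     lck_mtx_lock(m);
*     // ... critical section ...
*     lck_mtx_unlock(m);
*     lck_mtx_free(m, &hypo_grp);  // destroys, then frees
*/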
#pragma mark lck_mtx_t: lck_mtx_ilk*
static hw_spin_timeout_status_t
lck_mtx_ilk_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
{
lck_mtx_t *lck = _lock;
panic("Mutex interlock[%p] " HW_SPIN_TIMEOUT_FMT "; "
"current owner: %p, "
"<0x%06x 0x%02x 0x%08x 0x%08x 0x%04x 0x%04x>, "
HW_SPIN_TIMEOUT_DETAILS_FMT,
lck, HW_SPIN_TIMEOUT_ARG(to, st),
ctid_get_thread_unsafe(lck->lck_mtx.owner),
lck->lck_mtx_tsid, lck->lck_mtx_type,
lck->lck_mtx_grp, lck->lck_mtx.data,
lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail,
HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
}
static const struct hw_spin_policy lck_mtx_ilk_timeout_policy = {
.hwsp_name = "lck_mtx_t (ilk)",
.hwsp_timeout_atomic = &lock_panic_timeout,
.hwsp_op_timeout = lck_mtx_ilk_timeout_panic,
};
static void
lck_mtx_ilk_lock_cleanup_as_mcs(
lck_mtx_t *lock,
lck_mcs_id_t idx,
lck_mtx_mcs_t mcs,
hw_spin_timeout_t to,
hw_spin_state_t *ss)
{
lck_mtx_mcs_t nnode = NULL;
lck_mcs_id_t pidx = (lck_mcs_id_t)mcs->lmm_as_prev;
bool was_last;
/*
* This is called when the thread made use
* of the adaptive spin queue and needs
* to remove itself from it.
*/
/*
* If @c mcs is the last node, set the tail to the node before us.
*/
was_last = lock_cmpxchg(&lock->lck_mtx.as_tail, idx, pidx, release);
if (was_last) {
/*
* If @c mcs was last, we need to erase the previous
* node link to it.
*
* However, new nodes could have now taken our place
* and set the previous node's @c lmm_as_next field
* already, so we must CAS rather than blindly set.
*
* We know the previous node is stable because
* we hold the interlock (preventing concurrent
* removals).
*/
if (pidx) {
os_atomic_cmpxchg(&lck_mtx_get_mcs(pidx)->lmm_as_next,
mcs, nnode, relaxed);
}
} else {
/*
* If @c mcs wasn't last, then wait to make sure
* we observe @c lmm_as_next. Once we do, we know
* the field is stable since we hold the interlock
* (preventing concurrent dequeues).
*
* We can then update it to @c mcs next node index
* (which is also stable for similar reasons).
*
* Lastly update the previous node @c lmm_as_next
* field as well to terminate the dequeue.
*/
while (!hw_spin_wait_until(&mcs->lmm_as_next, nnode, nnode)) {
hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
hw_spin_should_keep_spinning(lock, pol, to, ss);
}
os_atomic_store(&nnode->lmm_as_prev, pidx, relaxed);
if (pidx) {
os_atomic_store(&lck_mtx_get_mcs(pidx)->lmm_as_next,
nnode, relaxed);
}
}
/*
* @c mcs's fields are left dangling;
* it is the caller's responsibility
* to finish the cleanup.
*/
}
static NOINLINE void
lck_mtx_ilk_lock_contended(
lck_mtx_t *lock,
lck_mtx_state_t state,
lck_ilk_mode_t mode)
{
hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
hw_spin_timeout_t to = hw_spin_compute_timeout(pol);
hw_spin_state_t ss = { };
lck_mtx_mcs_t mcs, nnode, pnode;
lck_mcs_id_t idx, pidx;
lck_mtx_state_t nstate;
unsigned long ready;
uint64_t spin_start;
/*
* Take a spot in the interlock MCS queue,
* and then spin until we're at the head of it.
*/
idx = lck_mtx_get_mcs_id();
mcs = &lck_mcs_get_current()->mcs_mtx;
if (mode != LCK_ILK_MODE_UNLOCK) {
spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
}
mcs->lmm_ilk_current = lock;
pidx = os_atomic_xchg(&lock->lck_mtx.ilk_tail, idx, release);
if (pidx) {
pnode = lck_mtx_get_mcs(pidx);
os_atomic_store(&pnode->lmm_ilk_next, mcs, relaxed);
while (!hw_spin_wait_until(&mcs->lmm_ilk_ready, ready, ready)) {
hw_spin_should_keep_spinning(lock, pol, to, &ss);
}
}
/*
* We're now first in line: wait for the interlock
* to look available and take it.
*
* We can't just assume the lock is ours for the taking,
* because the fastpaths of lck_mtx_lock_spin{,_always}
* only look at the mutex "data" and might steal it.
*
* Also clear the interlock MCS tail if @c mcs is last.
*/
do {
while (!hw_spin_wait_until(&lock->lck_mtx.val,
state.val, state.ilocked == 0)) {
hw_spin_should_keep_spinning(lock, pol, to, &ss);
}
nstate = state;
nstate.ilocked = 1;
if (nstate.ilk_tail == idx) {
nstate.ilk_tail = 0;
}
} while (!os_atomic_cmpxchg(&lock->lck_mtx, state, nstate, acquire));
/*
* We now have the interlock, let's cleanup the MCS state.
*
* First, if there is a node after us, notify that it
* is at the head of the interlock queue.
*
* Second, perform the adaptive spin MCS cleanup if needed.
*
* Lastly, clear the MCS node.
*/
if (state.ilk_tail != idx) {
while (!hw_spin_wait_until(&mcs->lmm_ilk_next, nnode, nnode)) {
hw_spin_should_keep_spinning(lock, pol, to, &ss);
}
os_atomic_store(&nnode->lmm_ilk_ready, 1, relaxed);
}
if (mode == LCK_ILK_MODE_FROM_AS) {
lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
}
lck_mtx_mcs_clear(mcs);
if (mode != LCK_ILK_MODE_UNLOCK) {
LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
}
}
static void
lck_mtx_ilk_lock_nopreempt(lck_mtx_t *lock, lck_ilk_mode_t mode)
{
lck_mtx_state_t state, nstate;
os_atomic_rmw_loop(&lock->lck_mtx.val, state.val, nstate.val, acquire, {
if (__improbable(state.ilocked || state.ilk_tail)) {
os_atomic_rmw_loop_give_up({
return lck_mtx_ilk_lock_contended(lock, state, mode);
});
}
nstate = state;
nstate.ilocked = true;
});
}
static void
lck_mtx_ilk_unlock_v(lck_mtx_t *lock, uint32_t data)
{
os_atomic_store(&lock->lck_mtx.data, data, release);
lock_enable_preemption();
}
static void
lck_mtx_ilk_unlock(lck_mtx_t *lock)
{
lck_mtx_ilk_unlock_v(lock, lock->lck_mtx.data & ~LCK_MTX_ILOCK);
}
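/*
* Internal usage pattern for the interlock helpers above (a sketch that
* mirrors the "Core serialization rules" at the top of this file; the
* functions below are the real users):
*
*     lock_disable_preemption_for_thread(self);
*     lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_DIRECT);
*     // plain stores to the first three words of the mutex are now allowed
*     lck_mtx_ilk_unlock(lock);  // drops ILOCK and re-enables preemption
*/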
#pragma mark lck_mtx_t: turnstile integration
/*
* Routine: lck_mtx_lock_wait
*
* Invoked in order to wait on contention.
*
* Called with the interlock locked and
* returns it unlocked.
*
* Always aggressively sets the owning thread to promoted,
* even if it's the same or higher priority.
* This prevents it from lowering its own priority while holding a lock.
*
* TODO: Come up with a more efficient way to handle same-priority promotions
* <rdar://problem/30737670> ARM mutex contention logic could avoid taking the thread lock
*/
static struct turnstile *
lck_mtx_lock_wait(
lck_mtx_t *lck,
thread_t self,
thread_t holder,
struct turnstile *ts)
{
uint64_t sleep_start = LCK_MTX_BLOCK_BEGIN();
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
unslide_for_kdebug(lck), (uintptr_t)thread_tid(self), 0, 0, 0);
if (ts == TURNSTILE_NULL) {
ts = turnstile_prepare_compact_id((uintptr_t)lck,
lck->lck_mtx_tsid, TURNSTILE_KERNEL_MUTEX);
if (lck->lck_mtx_tsid == 0) {
lck->lck_mtx_tsid = ts->ts_compact_id;
}
}
assert3u(ts->ts_compact_id, ==, lck->lck_mtx_tsid);
thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
turnstile_update_inheritor(ts, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
waitq_assert_wait64(&ts->ts_waitq, LCK_MTX_EVENT(lck),
THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
lck_mtx_ilk_unlock(lck);
turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
thread_block(THREAD_CONTINUE_NULL);
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
LCK_MTX_BLOCK_END(lck, lck->lck_mtx_grp, sleep_start);
return ts;
}
static void
lck_mtx_lock_wait_done(lck_mtx_t *lck, struct turnstile *ts)
{
if (turnstile_complete_compact_id((uintptr_t)lck, ts,
TURNSTILE_KERNEL_MUTEX)) {
lck->lck_mtx_tsid = 0;
}
}
/*
* Routine: lck_mtx_lock_will_need_wakeup
*
* Returns whether the thread is the current turnstile inheritor,
* which means it will have to call lck_mtx_unlock_wakeup()
* on unlock.
*/
__attribute__((always_inline))
static bool
lck_mtx_lock_will_need_wakeup(lck_mtx_t *lck, thread_t self)
{
uint32_t tsid = lck->lck_mtx_tsid;
return tsid && turnstile_get_by_id(tsid)->ts_inheritor == self;
}
/*
* Routine: lck_mtx_unlock_wakeup
*
* Invoked on unlock when there is contention.
*
* Called with the interlock locked.
*
* NOTE: callers should call turnstile_cleanup() after
* dropping the interlock.
*/
static void
lck_mtx_unlock_wakeup(
lck_mtx_t *lck,
__kdebug_only thread_t thread)
{
struct turnstile *ts;
kern_return_t did_wake;
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START,
unslide_for_kdebug(lck), (uintptr_t)thread_tid(thread), 0, 0, 0);
ts = turnstile_get_by_id(lck->lck_mtx_tsid);
/*
* We can skip turnstile_{prepare,cleanup} because
* we hold the interlock of the primitive,
* and enqueues/wakeups all happen under the interlock,
* which means the turnstile is stable.
*/
did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(lck),
THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
assert(did_wake == KERN_SUCCESS);
turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
#pragma mark lck_mtx_t: lck_mtx_lock
static inline bool
lck_mtx_ctid_on_core(uint32_t ctid)
{
thread_t th = ctid_get_thread_unsafe(ctid);
return th && machine_thread_on_core_allow_invalid(th);
}
#define LCK_MTX_OWNER_FOR_TRACE(lock) \
VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe((lock)->lck_mtx.owner))
static void
lck_mtx_lock_adaptive_spin(lck_mtx_t *lock, lck_mtx_state_t state)
{
__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
hw_spin_timeout_t to = hw_spin_compute_timeout(pol);
hw_spin_state_t ss = { };
uint64_t deadline;
lck_mtx_mcs_t mcs, node;
lck_mcs_id_t idx, pidx, clear_idx;
unsigned long prev;
lck_mtx_state_t nstate;
ast_t *const astp = ast_pending();
idx = lck_mtx_get_mcs_id();
mcs = &lck_mcs_get_current()->mcs_mtx;
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
/*
* Take a spot in the adaptive spin queue,
* and then spin until we're at the head of it.
*
* Until we're at the head, we do not need to monitor
* whether the current owner is on core or not:
*
* 1. the head of the queue is doing it already,
*
* 2. when the entire adaptive spin queue "gives up"
* as a result of the owner going off core, we want
* to avoid a thundering herd and let the AS queue
* pour into the interlock queue slowly, one at a time.
*
* Do give up if the scheduler signals that something
* more important has shown up.
*
* Note: this function is optimized so that we do not touch
* our local mcs node when we're the head of the queue.
*
* This allows us, when the contention is between
* only two cores, to avoid touching this
* cacheline at all.
*/
pidx = os_atomic_xchg(&lock->lck_mtx.as_tail, idx, release);
if (pidx) {
node = lck_mtx_get_mcs(pidx);
mcs->lmm_as_prev = pidx;
os_atomic_store(&node->lmm_as_next, mcs, release);
while (!hw_spin_wait_until(&mcs->lmm_as_prev, prev,
prev == 0 || (os_atomic_load(astp, relaxed) & AST_URGENT))) {
hw_spin_should_keep_spinning(lock, pol, to, &ss);
}
if (__improbable(prev)) {
goto adaptive_spin_fail;
}
clear_idx = 0;
} else {
clear_idx = idx;
}
/*
* We're now first in line.
*
* It's our responsibility to monitor the lock's state
* for whether (1) the lock has become available,
* (2) its owner has gone off core, (3) the scheduler
* wants its CPU back, or (4) we've spun for too long.
*/
deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed);
for (;;) {
state.val = lock_load_exclusive(&lock->lck_mtx.val, acquire);
if (__probable(!state.ilocked && !state.ilk_tail && !state.owner)) {
/*
* 2-core contention: if we can, try to dequeue
* ourselves from the adaptive spin queue
* as part of this CAS in order to avoid
* the cost of lck_mtx_ilk_lock_cleanup_as_mcs()
* and zeroing the mcs node entirely.
*
* Because the queue is designed to limit contention,
* using store-exclusive over an armv8.1 LSE atomic
* is actually marginally better (presumably due to
* the better codegen).
*/
nstate = state;
nstate.ilocked = true;
if (state.as_tail == clear_idx) {
nstate.as_tail = 0;
}
if (__probable(lock_store_exclusive(&lock->lck_mtx.val,
state.val, nstate.val, acquire))) {
break;
}
} else {
lock_wait_for_event();
}
if (__improbable(ml_get_timebase() > deadline ||
(os_atomic_load(astp, relaxed) & AST_URGENT) ||
(!state.ilocked && !state.ilk_tail && state.owner &&
!lck_mtx_ctid_on_core(state.owner)))) {
goto adaptive_spin_fail;
}
}
/*
* If we're here, we got the lock; we just have to clean up
* the MCS nodes and return.
*/
if (state.as_tail != clear_idx) {
lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
lck_mtx_mcs_clear(mcs);
}
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(current_thread()),
lock->lck_mtx_tsid, 0, 0);
return;
adaptive_spin_fail:
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
return lck_mtx_ilk_lock_contended(lock, state, LCK_ILK_MODE_FROM_AS);
}
static NOINLINE void
lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, lck_mtx_mode_t mode)
{
struct turnstile *ts = TURNSTILE_NULL;
lck_mtx_state_t state;
uint32_t ctid = thread->ctid;
uint32_t data;
#if CONFIG_DTRACE
int first_miss = 0;
#endif /* CONFIG_DTRACE */
bool direct_wait = false;
uint64_t spin_start;
uint32_t profile;
lck_mtx_check_irq(lock);
if (mode == LCK_MTX_MODE_SLEEPABLE) {
lock_disable_preemption_for_thread(thread);
}
for (;;) {
/*
* Load the current state and perform sanity checks
*
* Note that the various "corrupt" values are designed
* so that the slowpath is taken when a mutex was used
* after destruction, so that we do not have to do
* sanity checks in the fast path.
*/
state = os_atomic_load(&lock->lck_mtx, relaxed);
if (state.owner == ctid) {
__lck_mtx_owned_panic(lock, thread);
}
if (lock->lck_mtx_type != LCK_TYPE_MUTEX ||
state.data == LCK_MTX_TAG_DESTROYED) {
__lck_mtx_invalid_panic(lock);
}
profile = (state.data & LCK_MTX_PROFILE);
/*
* Attempt steal
*
* When the lock state is 0, no thread can be queued
* for adaptive spinning or for the interlock yet.
*
* As such we can attempt to take the interlock.
* (we can't take the mutex directly because we need
* the interlock to do turnstile operations on the way out).
*/
if ((state.val & ~(uint64_t)LCK_MTX_PROFILE) == 0) {
if (!os_atomic_cmpxchgv(&lock->lck_mtx.val,
state.val, state.val | LCK_MTX_ILOCK,
&state.val, acquire)) {
continue;
}
break;
}
#if CONFIG_DTRACE
if (profile) {
LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
}
#endif /* CONFIG_DTRACE */
if (mode == LCK_MTX_MODE_SLEEPABLE) {
spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
} else {
spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
}
/*
* Adaptive spin or interlock
*
* Evaluate if adaptive spinning should be attempted,
* and if yes go to adaptive spin.
*
* Otherwise (and this includes always-spin mutexes),
* go for the interlock.
*/
if (mode != LCK_MTX_MODE_SPIN_ALWAYS &&
(state.ilocked || state.as_tail || !state.owner ||
lck_mtx_ctid_on_core(state.owner))) {
lck_mtx_lock_adaptive_spin(lock, state);
} else {
direct_wait = true;
lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_DIRECT);
}
if (mode == LCK_MTX_MODE_SLEEPABLE) {
LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
} else {
LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
}
/*
* Take or sleep
*
* We now have the interlock. Either the owner
* isn't set, and the mutex is ours to claim,
* or we must go to sleep.
*
* If we go to sleep, we need to set LCK_MTX_NEEDS_WAKEUP
* to force the current lock owner to call
* lck_mtx_unlock_wakeup().
*/
state = os_atomic_load(&lock->lck_mtx, relaxed);
if (state.owner == LCK_MTX_NULL_CTID) {
break;
}
if (mode == LCK_MTX_MODE_SPIN_ALWAYS) {
__lck_mtx_lock_is_sleepable_panic(lock);
}
#if CONFIG_DTRACE
if (profile) {
LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
direct_wait, &first_miss);
}
#endif /* CONFIG_DTRACE */
os_atomic_store(&lock->lck_mtx.data,
state.data | LCK_MTX_ILOCK | LCK_MTX_NEEDS_WAKEUP,
compiler_acq_rel);
ts = lck_mtx_lock_wait(lock, thread,
ctid_get_thread(state.owner), ts);
/* returns interlock unlocked and preemption re-enabled */
lock_disable_preemption_for_thread(thread);
}
/*
* We can take the lock!
*
* We hold only the interlock, and the owner field is 0.
*
* Perform various turnstile cleanups if needed,
* claim the lock, and re-enable preemption (if needed).
*/
if (ts) {
lck_mtx_lock_wait_done(lock, ts);
}
data = ctid | profile;
if (lck_mtx_lock_will_need_wakeup(lock, thread)) {
data |= LCK_MTX_NEEDS_WAKEUP;
}
if (mode != LCK_MTX_MODE_SLEEPABLE) {
data |= LCK_MTX_ILOCK | LCK_MTX_SPIN_MODE;
}
os_atomic_store(&lock->lck_mtx.data, data, release);
if (mode == LCK_MTX_MODE_SLEEPABLE) {
lock_enable_preemption();
}
assert(thread->turnstile != NULL);
if (ts) {
turnstile_cleanup();
}
LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
mode != LCK_MTX_MODE_SLEEPABLE, profile);
}
#if LCK_MTX_CHECK_INVARIANTS || CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static void
lck_mtx_lock_slow(
lck_mtx_t *lock,
thread_t thread,
lck_mtx_state_t state,
lck_mtx_mode_t mode)
{
#pragma unused(state)
#if CONFIG_DTRACE
lck_mtx_state_t ostate = {
.data = LCK_MTX_PROFILE,
};
#endif /* CONFIG_DTRACE */
#if LCK_MTX_CHECK_INVARIANTS
if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
lck_mtx_check_preemption(lock, thread,
(mode == LCK_MTX_MODE_SPIN));
}
#endif /* LCK_MTX_CHECK_INVARIANTS */
#if CONFIG_DTRACE
if (state.val == ostate.val) {
state.data = thread->ctid | LCK_MTX_PROFILE;
if (mode != LCK_MTX_MODE_SLEEPABLE) {
state.ilocked = true;
state.spin_mode = true;
}
os_atomic_cmpxchgv(&lock->lck_mtx.val,
ostate.val, state.val, &state.val, acquire);
}
if ((state.val & ~ostate.val) == 0) {
LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
mode != LCK_MTX_MODE_SLEEPABLE,
state.data & LCK_MTX_PROFILE);
return;
}
#endif /* CONFIG_DTRACE */
lck_mtx_lock_contended(lock, thread, mode);
}
static __attribute__((always_inline)) void
lck_mtx_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
thread_t thread = current_thread();
lck_mtx_state_t state = {
.data = thread->ctid,
};
uint64_t take_slowpath = 0;
if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
take_slowpath |= LCK_MTX_SNIFF_PREEMPTION(thread);
}
take_slowpath |= LCK_MTX_SNIFF_DTRACE();
if (mode != LCK_MTX_MODE_SLEEPABLE) {
lock_disable_preemption_for_thread(thread);
state.ilocked = true;
state.spin_mode = true;
}
/*
* Do the CAS on the entire mutex state,
* which requires the ILK/AS queues
* to be empty (which is fairer).
*/
lock_cmpxchgv(&lock->lck_mtx.val,
0, state.val, &state.val, acquire);
take_slowpath |= state.val;
if (__improbable(take_slowpath)) {
return lck_mtx_lock_slow(lock, thread, state, mode);
}
}
void
lck_mtx_lock(lck_mtx_t *lock)
{
lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
}
void
lck_mtx_lock_spin(lck_mtx_t *lock)
{
lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
}
void
lck_mtx_lock_spin_always(lck_mtx_t *lock)
{
lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
}
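/*
* Informal summary of the three entry points above, derived from the
* mode checks in the slow paths (illustrative, not normative):
*
*     lck_mtx_lock(m);              // full mutex: may block, returns with
*                                   // preemption enabled
*     lck_mtx_lock_spin(m);         // returns with the lock held in spin
*                                   // mode (interlock held, preemption
*                                   // disabled); may still block to acquire
*     lck_mtx_lock_spin_always(m);  // spin mode, never blocks; panics if
*                                   // the lock is used as a full mutex
*/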
#pragma mark lck_mtx_t: lck_mtx_try_lock
static __attribute__((always_inline)) bool
lck_mtx_try_lock_slow_inline(
lck_mtx_t *lock,
thread_t thread,
uint32_t odata,
uint32_t ndata,
bool spin)
{
#pragma unused(lock, thread, odata, ndata)
#if CONFIG_DTRACE
if (odata == LCK_MTX_PROFILE) {
os_atomic_cmpxchgv(&lock->lck_mtx.data,
odata, ndata | LCK_MTX_PROFILE, &odata, acquire);
}
if ((odata & ~LCK_MTX_PROFILE) == 0) {
LCK_MTX_TRY_ACQUIRED(lock, lock->lck_mtx_grp,
spin, odata & LCK_MTX_PROFILE);
return true;
}
if (odata & LCK_MTX_PROFILE) {
LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &(int){ 0 });
}
#endif /* CONFIG_DTRACE */
if (spin) {
lock_enable_preemption();
}
return false;
}
#if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static bool
lck_mtx_try_lock_slow(
lck_mtx_t *lock,
thread_t thread,
uint32_t odata,
uint32_t ndata)
{
return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, false);
}
#if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static bool
lck_mtx_try_lock_slow_spin(
lck_mtx_t *lock,
thread_t thread,
uint32_t odata,
uint32_t ndata)
{
return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, true);
}
static __attribute__((always_inline)) bool
lck_mtx_try_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
thread_t thread = current_thread();
uint32_t odata, ndata = thread->ctid;
uint32_t take_slowpath = 0;
#if CONFIG_DTRACE
take_slowpath |= lck_debug_state.lds_value;
#endif
if (mode != LCK_MTX_MODE_SLEEPABLE) {
lock_disable_preemption_for_thread(thread);
ndata |= LCK_MTX_SPIN_MODE | LCK_MTX_ILOCK;
}
/*
* Because try_lock is likely to be used for cases
* like lock-inversion resolution, it tries a bit harder
* than lck_mtx_lock() to take the lock: it ignores the
* adaptive spin / interlock queues by doing the CAS
* on the 32-bit mutex data only.
*/
lock_cmpxchgv(&lock->lck_mtx.data, 0, ndata, &odata, acquire);
take_slowpath |= odata;
if (__probable(!take_slowpath)) {
return true;
}
if (mode == LCK_MTX_MODE_SPIN_ALWAYS &&
(odata & LCK_MTX_CTID_MASK) &&
!(odata & LCK_MTX_SPIN_MODE)) {
__lck_mtx_lock_is_sleepable_panic(lock);
}
if (mode == LCK_MTX_MODE_SLEEPABLE) {
return lck_mtx_try_lock_slow(lock, thread, odata, ndata);
} else {
return lck_mtx_try_lock_slow_spin(lock, thread, odata, ndata);
}
}
boolean_t
lck_mtx_try_lock(lck_mtx_t *lock)
{
return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
}
boolean_t
lck_mtx_try_lock_spin(lck_mtx_t *lock)
{
return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
}
boolean_t
lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
{
return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
}
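/*
* Sketch of the lock-inversion pattern that try_lock is typically used
* for (see the comment in lck_mtx_try_lock_fastpath() above); the locks
* and the retry policy here are hypothetical:
*
*     lck_mtx_lock(&a);
*     if (!lck_mtx_try_lock(&b)) {
*         // Blocking here with "a" held could deadlock against a
*         // thread taking the locks in the opposite order, so back
*         // off and retake both locks in a consistent order instead.
*         lck_mtx_unlock(&a);
*         lck_mtx_lock(&b);
*         lck_mtx_lock(&a);
*     }
*/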
#pragma mark lck_mtx_t: lck_mtx_unlock
static NOINLINE void
lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
bool cleanup = false;
#if !CONFIG_DTRACE
/*
* This check is done by lck_mtx_unlock_slow() when it is enabled.
*/
if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
__lck_mtx_not_owned_panic(lock, thread);
}
#endif /* !CONFIG_DTRACE */
if ((data & LCK_MTX_SPIN_MODE) == 0) {
lock_disable_preemption_for_thread(thread);
lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_UNLOCK);
}
/*
* We must re-load the data: we might have taken
* the slowpath because another thread had taken
* the interlock and set the NEEDS_WAKEUP bit
* while we were spinning to get it.
*/
data = os_atomic_load(&lock->lck_mtx.data, compiler_acq_rel);
if (data & LCK_MTX_NEEDS_WAKEUP) {
lck_mtx_unlock_wakeup(lock, thread);
cleanup = true;
}
lck_mtx_ilk_unlock_v(lock, data & LCK_MTX_PROFILE);
LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, data & LCK_MTX_PROFILE);
/*
* Do not do any turnstile operations outside of this block.
*
* lock/unlock is called at an early stage of boot while single
* threaded, before turnstiles are available.
* Even without contention we can come through the slow path
* if the mutex is acquired as a spin lock.
*/
if (cleanup) {
turnstile_cleanup();
}
}
#if CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static void
lck_mtx_unlock_slow(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
#if CONFIG_DTRACE
/*
* If DTrace is enabled, locks can be profiled,
* which causes the unlock fastpath to fail.
*/
if ((data & LCK_MTX_BITS_MASK) == LCK_MTX_PROFILE) {
os_atomic_cmpxchgv(&lock->lck_mtx.data, data, LCK_MTX_PROFILE,
&data, release);
}
if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
__lck_mtx_not_owned_panic(lock, thread);
}
if ((data & (LCK_MTX_BITS_MASK & ~LCK_MTX_PROFILE)) == 0) {
LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, false);
return;
}
#endif /* CONFIG_DTRACE */
lck_mtx_unlock_contended(lock, thread, data);
}
void
lck_mtx_unlock(lck_mtx_t *lock)
{
thread_t thread = current_thread();
uint32_t take_slowpath = 0;
uint32_t data;
take_slowpath |= LCK_MTX_SNIFF_DTRACE();
/*
* The fast path ignores the ILK/AS queues on purpose:
* those really are a "lock" concept, not an unlock one.
*/
if (__probable(lock_cmpxchgv(&lock->lck_mtx.data,
thread->ctid, 0, &data, release))) {
if (__probable(!take_slowpath)) {
return;
}
}
lck_mtx_unlock_slow(lock, thread, data);
}
#pragma mark lck_mtx_t: misc
void
lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
{
lck_mtx_state_t state = os_atomic_load(&lock->lck_mtx, relaxed);
thread_t thread = current_thread();
if (type == LCK_MTX_ASSERT_OWNED) {
if (state.owner != thread->ctid) {
__lck_mtx_not_owned_panic(lock, thread);
}
} else if (type == LCK_MTX_ASSERT_NOTOWNED) {
if (state.owner == thread->ctid) {
__lck_mtx_owned_panic(lock, thread);
}
} else {
panic("lck_mtx_assert(): invalid arg (%u)", type);
}
}
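/*
* Example (illustrative, "obj" is hypothetical): code touching state
* protected by a mutex commonly documents that requirement with an
* ownership assert:
*
*     lck_mtx_assert(&obj->lock, LCK_MTX_ASSERT_OWNED);
*     obj->counter++;
*/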
/*
* Routine: lck_mtx_convert_spin
*
* Convert a mutex held in spin mode into a held full mutex.
*/
void
lck_mtx_convert_spin(lck_mtx_t *lock)
{
lck_mtx_state_t state = os_atomic_load(&lock->lck_mtx, relaxed);
thread_t thread = current_thread();
uint32_t data = thread->ctid;
if (state.owner != data) {
__lck_mtx_not_owned_panic(lock, thread);
}
if (state.spin_mode) {
/*
* Note: we can acquire the lock in spin mode
* _and_ be the inheritor if we waited.
*
* We must only clear ilocked and spin_mode,
* but preserve owner and needs_wakeup.
*/
state.ilocked = false;
state.spin_mode = false;
lck_mtx_ilk_unlock_v(lock, state.data);
turnstile_cleanup();
}
}
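/*
* Typical use of lck_mtx_convert_spin(), as a sketch (the predicate is
* hypothetical): take the lock cheaply in spin mode, then convert if
* the critical section turns out to need to block:
*
*     lck_mtx_lock_spin(m);
*     if (may_need_to_block) {
*         lck_mtx_convert_spin(m);  // now a full mutex, preemption re-enabled
*         // ... code that may block ...
*     }
*     lck_mtx_unlock(m);
*/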
/*
* Routine: kdp_lck_mtx_lock_spin_is_acquired
* NOT SAFE: To be used only by kernel debugger to avoid deadlock.
*/
boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
{
lck_mtx_state_t state = os_atomic_load(&lck->lck_mtx, relaxed);
if (not_in_kdp) {
panic("panic: spinlock acquired check done outside of kernel debugger");
}
if (state.data == LCK_MTX_TAG_DESTROYED) {
return false;
}
return state.owner || state.ilocked;
}
void
kdp_lck_mtx_find_owner(
struct waitq *waitq __unused,
event64_t event,
thread_waitinfo_t *waitinfo)
{
lck_mtx_t *mutex = LCK_EVENT_TO_MUTEX(event);
lck_mtx_state_t state = os_atomic_load(&mutex->lck_mtx, relaxed);
assert3u(state.data, !=, LCK_MTX_TAG_DESTROYED);
waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
waitinfo->owner = thread_tid(ctid_get_thread(state.owner));
}
#endif /* !LCK_MTX_USE_ARCH */
/*
* Routine: mutex_pause
*
* Called by former callers of simple_lock_pause().
*/
#define MAX_COLLISION_COUNTS 32
#define MAX_COLLISION 8
unsigned int max_collision_count[MAX_COLLISION_COUNTS];
uint32_t collision_backoffs[MAX_COLLISION] = {
10, 50, 100, 200, 400, 600, 800, 1000
};
void
mutex_pause(uint32_t collisions)
{
wait_result_t wait_result;
uint32_t back_off;
if (collisions >= MAX_COLLISION_COUNTS) {
collisions = MAX_COLLISION_COUNTS - 1;
}
max_collision_count[collisions]++;
if (collisions >= MAX_COLLISION) {
collisions = MAX_COLLISION - 1;
}
back_off = collision_backoffs[collisions];
wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
assert(wait_result == THREAD_WAITING);
wait_result = thread_block(THREAD_CONTINUE_NULL);
assert(wait_result == THREAD_TIMED_OUT);
}
unsigned int mutex_yield_wait = 0;
unsigned int mutex_yield_no_wait = 0;
boolean_t
lck_mtx_yield(
lck_mtx_t *lck)
{
bool has_waiters = LCK_MTX_HAS_WAITERS(lck);
#if DEBUG
lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
#endif /* DEBUG */
if (!has_waiters) {
mutex_yield_no_wait++;
} else {
mutex_yield_wait++;
lck_mtx_unlock(lck);
mutex_pause(0);
lck_mtx_lock(lck);
}
return has_waiters;
}