/*
* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* @OSF_COPYRIGHT@
*/
/*
* Mach Operating System
* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
* All Rights Reserved.
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie Mellon
* the rights to redistribute these changes.
*/
/*
* File: kern/lock.c
* Author: Avadis Tevanian, Jr., Michael Wayne Young
* Date: 1985
*
* Locking primitives implementation
*/
#define LOCK_PRIVATE 1
#include <mach_ldebug.h>
#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/cpu_data.h>
#include <kern/cpu_number.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>
#include <string.h>
#include <i386/machine_routines.h> /* machine_timeout_suspended() */
#include <i386/tsc.h>
#include <machine/atomic.h>
#include <machine/machine_cpu.h>
#include <i386/mp.h>
#include <sys/kdebug.h>
#if LCK_MTX_USE_ARCH
#include <i386/locks_i386_inlines.h>
#endif /* LCK_MTX_USE_ARCH */
#include <os/hash.h>
#include <i386/cpuid.h>
#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
uint64_t _Atomic lock_panic_timeout = 0xf000000; /* 251e6 TSC ticks */
/* Forwards */
#if USLOCK_DEBUG
/*
* Perform simple lock checks.
*/
int uslock_check = 1;
int max_lock_loops = 100000000;
decl_simple_lock_data(extern, printf_lock);
decl_simple_lock_data(extern, panic_lock);
#endif /* USLOCK_DEBUG */
extern unsigned int not_in_kdp;
#if !LCK_GRP_USE_ARG
#define usimple_lock_nopreempt(lck, grp) \
usimple_lock_nopreempt(lck)
#define usimple_lock_try_nopreempt(lck, grp) \
usimple_lock_try_nopreempt(lck)
#endif
static void usimple_lock_nopreempt(usimple_lock_t, lck_grp_t *);
static unsigned int usimple_lock_try_nopreempt(usimple_lock_t, lck_grp_t *);
/*
* We often want to know the addresses of the callers
* of the various lock routines. However, this information
* is only used for debugging and statistics.
*/
typedef void *pc_t;
#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
#if ANY_LOCK_DEBUG
#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
#define DECL_PC(pc) pc_t pc;
#else /* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
#ifdef lint
/*
* Eliminate lint complaints about unused local pc variables.
*/
#define OBTAIN_PC(pc) ++pc
#else /* lint */
#define OBTAIN_PC(pc)
#endif /* lint */
#endif /* ANY_LOCK_DEBUG */
KALLOC_TYPE_DEFINE(KT_LCK_SPIN, lck_spin_t, KT_PRIV_ACCT);
#if LCK_MTX_USE_ARCH
KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);
#endif /* LCK_MTX_USE_ARCH */
/*
* atomic exchange API is a low level abstraction of the operations
* to atomically read, modify, and write a pointer. This abstraction works
* for both Intel and ARMv8.1 compare and exchange atomic instructions as
* well as the ARM exclusive instructions.
*
* atomic_exchange_begin() - begin exchange and retrieve current value
* atomic_exchange_complete() - conclude an exchange
* atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
*/
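/*
 * A minimal usage sketch of this protocol (illustrative only; the
 * atomic_test_and_set32() routine below is the in-tree example):
 *
 *	uint32_t value, prev;
 *
 *	value = atomic_exchange_begin32(&word, &prev, memory_order_acquire);
 *	if (value & BUSY_BIT) {
 *		atomic_exchange_abort();	// nothing to undo on Intel, needed for ARM exclusives
 *	} else if (atomic_exchange_complete32(&word, prev, value | BUSY_BIT, memory_order_acquire)) {
 *		// exchange succeeded
 *	}
 *
 * `word` and `BUSY_BIT` are hypothetical names used only for illustration.
 */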
uint32_t
atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
{
uint32_t val;
(void)ord; // Memory order not used
val = os_atomic_load(target, relaxed);
*previous = val;
return val;
}
boolean_t
atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
{
return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
}
void
atomic_exchange_abort(void)
{
}
boolean_t
atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
{
uint32_t value, prev;
for (;;) {
value = atomic_exchange_begin32(target, &prev, ord);
if (value & test_mask) {
if (wait) {
cpu_pause();
} else {
atomic_exchange_abort();
}
return FALSE;
}
value |= set_mask;
if (atomic_exchange_complete32(target, prev, value, ord)) {
return TRUE;
}
}
}
/*
* Portable lock package implementation of usimple_locks.
*/
#if USLOCK_DEBUG
#define USLDBG(stmt) stmt
void usld_lock_init(usimple_lock_t, unsigned short);
void usld_lock_pre(usimple_lock_t, pc_t);
void usld_lock_post(usimple_lock_t, pc_t);
void usld_unlock(usimple_lock_t, pc_t);
void usld_lock_try_pre(usimple_lock_t, pc_t);
void usld_lock_try_post(usimple_lock_t, pc_t);
int usld_lock_common_checks(usimple_lock_t, char *);
#else /* USLOCK_DEBUG */
#define USLDBG(stmt)
#endif /* USLOCK_DEBUG */
#if LCK_MTX_USE_ARCH
/*
* Forward definitions
*/
static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state);
static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
#endif /* LCK_MTX_USE_ARCH */
/*
* Routine: lck_spin_alloc_init
*/
lck_spin_t *
lck_spin_alloc_init(
lck_grp_t *grp,
lck_attr_t *attr)
{
lck_spin_t *lck;
lck = zalloc(KT_LCK_SPIN);
lck_spin_init(lck, grp, attr);
return lck;
}
/*
* Routine: lck_spin_free
*/
void
lck_spin_free(
lck_spin_t *lck,
lck_grp_t *grp)
{
lck_spin_destroy(lck, grp);
zfree(KT_LCK_SPIN, lck);
}
/*
* Routine: lck_spin_init
*/
void
lck_spin_init(
lck_spin_t *lck,
lck_grp_t *grp,
__unused lck_attr_t *attr)
{
usimple_lock_init((usimple_lock_t) lck, 0);
if (grp) {
lck_grp_reference(grp, &grp->lck_grp_spincnt);
}
}
/*
* Routine: lck_spin_destroy
*/
void
lck_spin_destroy(
lck_spin_t *lck,
lck_grp_t *grp)
{
if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
return;
}
lck->interlock = LCK_SPIN_TAG_DESTROYED;
if (grp) {
lck_grp_deallocate(grp, &grp->lck_grp_spincnt);
}
return;
}
/*
* Routine: lck_spin_lock
*/
void
lck_spin_lock_grp(
lck_spin_t *lck,
lck_grp_t *grp)
{
#pragma unused(grp)
usimple_lock((usimple_lock_t) lck, grp);
}
void
lck_spin_lock(
lck_spin_t *lck)
{
usimple_lock((usimple_lock_t) lck, NULL);
}
void
lck_spin_lock_nopreempt(
lck_spin_t *lck)
{
usimple_lock_nopreempt((usimple_lock_t) lck, NULL);
}
void
lck_spin_lock_nopreempt_grp(
lck_spin_t *lck,
lck_grp_t *grp)
{
#pragma unused(grp)
usimple_lock_nopreempt((usimple_lock_t) lck, grp);
}
/*
* Routine: lck_spin_unlock
*/
void
lck_spin_unlock(
lck_spin_t *lck)
{
usimple_unlock((usimple_lock_t) lck);
}
void
lck_spin_unlock_nopreempt(
lck_spin_t *lck)
{
usimple_unlock_nopreempt((usimple_lock_t) lck);
}
boolean_t
lck_spin_try_lock_grp(
lck_spin_t *lck,
lck_grp_t *grp)
{
#pragma unused(grp)
boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
#if DEVELOPMENT || DEBUG
if (lrval) {
pltrace(FALSE);
}
#endif
return lrval;
}
/*
* Routine: lck_spin_try_lock
*/
boolean_t
lck_spin_try_lock(
lck_spin_t *lck)
{
boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
#if DEVELOPMENT || DEBUG
if (lrval) {
pltrace(FALSE);
}
#endif
return lrval;
}
int
lck_spin_try_lock_nopreempt(
lck_spin_t *lck)
{
boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, LCK_GRP_NULL);
#if DEVELOPMENT || DEBUG
if (lrval) {
pltrace(FALSE);
}
#endif
return lrval;
}
int
lck_spin_try_lock_nopreempt_grp(
lck_spin_t *lck,
lck_grp_t *grp)
{
#pragma unused(grp)
boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, grp);
#if DEVELOPMENT || DEBUG
if (lrval) {
pltrace(FALSE);
}
#endif
return lrval;
}
/*
* Routine: lck_spin_assert
*/
void
lck_spin_assert(const lck_spin_t *lock, unsigned int type)
{
thread_t thread, holder;
uintptr_t state;
if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
panic("lck_spin_assert(): invalid arg (%u)", type);
}
state = lock->interlock;
holder = (thread_t)state;
thread = current_thread();
if (type == LCK_ASSERT_OWNED) {
if (__improbable(holder == THREAD_NULL)) {
panic("Lock not owned %p = %lx", lock, state);
}
if (__improbable(holder != thread)) {
panic("Lock not owned by current thread %p = %lx", lock, state);
}
} else if (type == LCK_ASSERT_NOTOWNED) {
if (__improbable(holder != THREAD_NULL)) {
if (holder == thread) {
panic("Lock owned by current thread %p = %lx", lock, state);
}
}
}
}
/*
* Routine: kdp_lck_spin_is_acquired
* NOT SAFE: To be used only by kernel debugger to avoid deadlock.
* Returns: TRUE if lock is acquired.
*/
boolean_t
kdp_lck_spin_is_acquired(lck_spin_t *lck)
{
if (not_in_kdp) {
panic("panic: spinlock acquired check done outside of kernel debugger");
}
return (lck->interlock != 0)? TRUE : FALSE;
}
/*
* Initialize a usimple_lock.
*
* No change in preemption state.
*/
void
usimple_lock_init(
usimple_lock_t l,
__unused unsigned short tag)
{
USLDBG(usld_lock_init(l, tag));
hw_lock_init(&l->interlock);
}
static hw_spin_timeout_status_t
usimple_lock_acquire_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
{
usimple_lock_t l = _lock;
uintptr_t lowner;
lck_spinlock_to_info_t lsti;
if (machine_timeout_suspended()) {
return HW_LOCK_TIMEOUT_CONTINUE;
}
lowner = (uintptr_t)l->interlock.lock_data;
lsti = lck_spinlock_timeout_hit(l, lowner);
panic("Spinlock[%p] " HW_SPIN_TIMEOUT_FMT " ;"
"lock owner thread=0x%lx, current_thread: %p, "
"lock owner active on CPU %d, "
HW_SPIN_TIMEOUT_DETAILS_FMT,
l, HW_SPIN_TIMEOUT_ARG(to, st),
lowner, current_thread(), lsti->owner_cpu,
HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
}
static const struct hw_spin_policy usimple_lock_spin_policy = {
.hwsp_name = "usimple_lock_t",
.hwsp_timeout = &LockTimeOutTSC,
.hwsp_op_timeout = usimple_lock_acquire_timeout_panic,
};
/*
* Acquire a usimple_lock.
*
* Returns with preemption disabled. Note
* that the hw_lock routines are responsible for
* maintaining preemption state.
*/
void
(usimple_lock)(
usimple_lock_t l
LCK_GRP_ARG(lck_grp_t *grp))
{
DECL_PC(pc);
OBTAIN_PC(pc);
USLDBG(usld_lock_pre(l, pc));
(void)hw_lock_to(&l->interlock, &usimple_lock_spin_policy, grp);
#if DEVELOPMENT || DEBUG
pltrace(FALSE);
#endif
USLDBG(usld_lock_post(l, pc));
#if CONFIG_DTRACE
LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
#endif
}
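/*
 * Minimal usage sketch (illustrative only, assuming a caller-owned
 * usimple_lock_data_t named `lock` and no lock group, mirroring the
 * two-argument call form used elsewhere in this file):
 *
 *	usimple_lock_init(&lock, 0);
 *	usimple_lock(&lock, NULL);	// returns with preemption disabled
 *	... critical section ...
 *	usimple_unlock(&lock);		// re-enables preemption
 */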
/*
* Acquire a usimple_lock_nopreempt
*
* Called and returns with preemption disabled. Note
* that the hw_lock routines are responsible for
* maintaining preemption state.
*/
static void
usimple_lock_nopreempt(
usimple_lock_t l,
lck_grp_t *grp)
{
DECL_PC(pc);
OBTAIN_PC(pc);
USLDBG(usld_lock_pre(l, pc));
(void)hw_lock_to_nopreempt(&l->interlock, &usimple_lock_spin_policy, grp);
#if DEVELOPMENT || DEBUG
pltrace(FALSE);
#endif
USLDBG(usld_lock_post(l, pc));
#if CONFIG_DTRACE
LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
#endif
}
/*
* Release a usimple_lock.
*
* Returns with preemption enabled. Note
* that the hw_lock routines are responsible for
* maintaining preemption state.
*/
void
usimple_unlock(
usimple_lock_t l)
{
DECL_PC(pc);
OBTAIN_PC(pc);
USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
pltrace(TRUE);
#endif
hw_lock_unlock(&l->interlock);
}
/*
 * Release a usimple_lock without re-enabling preemption.
 *
 * Called and returns with preemption disabled. Note
* that the hw_lock routines are responsible for
* maintaining preemption state.
*/
void
usimple_unlock_nopreempt(
usimple_lock_t l)
{
DECL_PC(pc);
OBTAIN_PC(pc);
USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
pltrace(TRUE);
#endif
hw_lock_unlock_nopreempt(&l->interlock);
}
/*
* Conditionally acquire a usimple_lock.
*
* On success, returns with preemption disabled.
* On failure, returns with preemption in the same state
* as when first invoked. Note that the hw_lock routines
* are responsible for maintaining preemption state.
*
* XXX No stats are gathered on a miss; I preserved this
* behavior from the original assembly-language code, but
* doesn't it make sense to log misses? XXX
*/
unsigned int
usimple_lock_try(
usimple_lock_t l,
lck_grp_t *grp)
{
unsigned int success;
DECL_PC(pc);
OBTAIN_PC(pc);
USLDBG(usld_lock_try_pre(l, pc));
if ((success = hw_lock_try(&l->interlock, grp))) {
#if DEVELOPMENT || DEBUG
pltrace(FALSE);
#endif
USLDBG(usld_lock_try_post(l, pc));
}
return success;
}
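/*
 * Illustrative try-lock pattern (sketch only, hypothetical caller):
 *
 *	if (usimple_lock_try(&lock, LCK_GRP_NULL)) {
 *		// acquired: preemption is now disabled
 *		usimple_unlock(&lock);
 *	} else {
 *		// busy: preemption state is unchanged
 *	}
 */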
void
usimple_lock_assert(usimple_lock_t l, unsigned int type)
{
hw_lock_assert(&l->interlock, type);
}
/*
* Conditionally acquire a usimple_lock.
*
* Called and returns with preemption disabled. Note
* that the hw_lock routines are responsible for
* maintaining preemption state.
*
* XXX No stats are gathered on a miss; I preserved this
* behavior from the original assembly-language code, but
* doesn't it make sense to log misses? XXX
*/
static unsigned int
usimple_lock_try_nopreempt(
usimple_lock_t l,
lck_grp_t *grp)
{
unsigned int success;
DECL_PC(pc);
OBTAIN_PC(pc);
USLDBG(usld_lock_try_pre(l, pc));
if ((success = hw_lock_try_nopreempt(&l->interlock, grp))) {
#if DEVELOPMENT || DEBUG
pltrace(FALSE);
#endif
USLDBG(usld_lock_try_post(l, pc));
}
return success;
}
/*
* Acquire a usimple_lock while polling for pending cpu signals
* and spinning on a lock.
*
*/
unsigned
int
(usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
uint64_t deadline
LCK_GRP_ARG(lck_grp_t *grp))
{
boolean_t istate = ml_get_interrupts_enabled();
if (deadline < mach_absolute_time()) {
return 0;
}
while (!simple_lock_try(l, grp)) {
if (!istate) {
cpu_signal_handler(NULL);
}
if (deadline < mach_absolute_time()) {
return 0;
}
cpu_pause();
}
return 1;
}
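/*
 * Illustrative call (sketch only): spin for roughly 10ms of absolute time,
 * servicing cross-CPU signals if interrupts are disabled.
 *
 *	uint64_t interval, deadline;
 *
 *	nanoseconds_to_absolutetime(10 * NSEC_PER_MSEC, &interval);
 *	deadline = mach_absolute_time() + interval;
 *	if (!usimple_lock_try_lock_mp_signal_safe_loop_deadline(&lock, deadline, LCK_GRP_NULL)) {
 *		// deadline expired without acquiring the lock
 *	}
 */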
void
(usimple_lock_try_lock_loop)(usimple_lock_t l
LCK_GRP_ARG(lck_grp_t *grp))
{
/* When the lock is not contended, grab the lock and go. */
if (!simple_lock_try(l, grp)) {
usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
}
}
unsigned
int
(usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
uint64_t duration
LCK_GRP_ARG(lck_grp_t *grp))
{
uint64_t deadline;
uint64_t base_at;
uint64_t duration_at;
/* Fast track for uncontended locks */
if (simple_lock_try(l, grp)) {
return 1;
}
base_at = mach_absolute_time();
nanoseconds_to_absolutetime(duration, &duration_at);
deadline = base_at + duration_at;
if (deadline < base_at) {
/* deadline has overflowed, make it saturate */
deadline = ULLONG_MAX;
}
return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
}
#if USLOCK_DEBUG
/*
* States of a usimple_lock. The default when initializing
* a usimple_lock is setting it up for debug checking.
*/
#define USLOCK_CHECKED 0x0001 /* lock is being checked */
#define USLOCK_TAKEN 0x0002 /* lock has been taken */
#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
#define USLOCK_CHECKING(l) (uslock_check && \
((l)->debug.state & USLOCK_CHECKED))
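/*
 * For reference: a freshly initialized, checked lock has
 * debug.state == USLOCK_INITIALIZED (0xBAA1); while held it reads
 * USLOCK_INITIALIZED | USLOCK_TAKEN (0xBAA3).
 */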
/*
* Initialize the debugging information contained
* in a usimple_lock.
*/
void
usld_lock_init(
usimple_lock_t l,
__unused unsigned short tag)
{
if (l == USIMPLE_LOCK_NULL) {
panic("lock initialization: null lock pointer");
}
l->lock_type = USLOCK_TAG;
l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
l->debug.lock_cpu = l->debug.unlock_cpu = 0;
l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
l->debug.duration[0] = l->debug.duration[1] = 0;
}
/*
* These checks apply to all usimple_locks, not just
* those with USLOCK_CHECKED turned on.
*/
int
usld_lock_common_checks(
usimple_lock_t l,
char *caller)
{
if (l == USIMPLE_LOCK_NULL) {
panic("%s: null lock pointer", caller);
}
if (l->lock_type != USLOCK_TAG) {
panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
}
if (!(l->debug.state & USLOCK_INIT)) {
panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
}
return USLOCK_CHECKING(l);
}
/*
* Debug checks on a usimple_lock just before attempting
* to acquire it.
*/
/* ARGSUSED */
void
usld_lock_pre(
usimple_lock_t l,
pc_t pc)
{
char caller[] = "usimple_lock";
if (!usld_lock_common_checks(l, caller)) {
return;
}
/*
 * Note that we have a weird case where we are getting a lock when we are
* in the process of putting the system to sleep. We are running with no
* current threads, therefore we can't tell if we are trying to retake a lock
* we have or someone on the other processor has it. Therefore we just
* ignore this test if the locking thread is 0.
*/
if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
l->debug.lock_thread == (void *) current_thread()) {
printf("%s: lock %p already locked (at %p) by",
caller, l, l->debug.lock_pc);
printf(" current thread %p (new attempt at pc %p)\n",
l->debug.lock_thread, pc);
panic("%s", caller);
}
mp_disable_preemption();
mp_enable_preemption();
}
/*
* Debug checks on a usimple_lock just after acquiring it.
*
* Pre-emption has been disabled at this point,
* so we are safe in using cpu_number.
*/
void
usld_lock_post(
usimple_lock_t l,
pc_t pc)
{
unsigned int mycpu;
char caller[] = "successful usimple_lock";
if (!usld_lock_common_checks(l, caller)) {
return;
}
if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
panic("%s: lock %p became uninitialized",
caller, l);
}
if ((l->debug.state & USLOCK_TAKEN)) {
panic("%s: lock 0x%p became TAKEN by someone else",
caller, l);
}
mycpu = (unsigned int)cpu_number();
assert(mycpu <= UCHAR_MAX);
l->debug.lock_thread = (void *)current_thread();
l->debug.state |= USLOCK_TAKEN;
l->debug.lock_pc = pc;
l->debug.lock_cpu = (unsigned char)mycpu;
}
/*
* Debug checks on a usimple_lock just before
* releasing it. Note that the caller has not
* yet released the hardware lock.
*
* Preemption is still disabled, so there's
* no problem using cpu_number.
*/
void
usld_unlock(
usimple_lock_t l,
pc_t pc)
{
unsigned int mycpu;
char caller[] = "usimple_unlock";
if (!usld_lock_common_checks(l, caller)) {
return;
}
mycpu = cpu_number();
assert(mycpu <= UCHAR_MAX);
if (!(l->debug.state & USLOCK_TAKEN)) {
panic("%s: lock 0x%p hasn't been taken",
caller, l);
}
if (l->debug.lock_thread != (void *) current_thread()) {
panic("%s: unlocking lock 0x%p, owned by thread %p",
caller, l, l->debug.lock_thread);
}
if (l->debug.lock_cpu != mycpu) {
printf("%s: unlocking lock 0x%p on cpu 0x%x",
caller, l, mycpu);
printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
panic("%s", caller);
}
l->debug.unlock_thread = l->debug.lock_thread;
l->debug.lock_thread = INVALID_THREAD;
l->debug.state &= ~USLOCK_TAKEN;
l->debug.unlock_pc = pc;
l->debug.unlock_cpu = (unsigned char)mycpu;
}
/*
* Debug checks on a usimple_lock just before
* attempting to acquire it.
*
* Preemption isn't guaranteed to be disabled.
*/
void
usld_lock_try_pre(
usimple_lock_t l,
__unused pc_t pc)
{
char caller[] = "usimple_lock_try";
if (!usld_lock_common_checks(l, caller)) {
return;
}
}
/*
* Debug checks on a usimple_lock just after
* successfully attempting to acquire it.
*
* Preemption has been disabled by the
* lock acquisition attempt, so it's safe
* to use cpu_number.
*/
void
usld_lock_try_post(
usimple_lock_t l,
pc_t pc)
{
unsigned int mycpu;
char caller[] = "successful usimple_lock_try";
if (!usld_lock_common_checks(l, caller)) {
return;
}
if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
panic("%s: lock 0x%p became uninitialized",
caller, l);
}
if ((l->debug.state & USLOCK_TAKEN)) {
panic("%s: lock 0x%p became TAKEN by someone else",
caller, l);
}
mycpu = cpu_number();
assert(mycpu <= UCHAR_MAX);
l->debug.lock_thread = (void *) current_thread();
l->debug.state |= USLOCK_TAKEN;
l->debug.lock_pc = pc;
l->debug.lock_cpu = (unsigned char)mycpu;
}
#endif /* USLOCK_DEBUG */
#if LCK_MTX_USE_ARCH
/*
* Slow path routines for lck_mtx locking and unlocking functions.
*
* These functions were previously implemented in x86 assembly,
 * and some optimizations are in place in this C code so that the compiled code
 * is as performant and compact as the assembly version.
 *
 * To avoid inlining these functions into the fast path, every function directly called by
 * the fast paths is marked __attribute__((noinline)). They are also implemented
 * in such a way that the fast path can tail call into them, so the return address
 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
 *
 * Slow path code is structured so that there are no calls to functions that return
 * in the context of the caller, i.e. every function called is either a tail call
 * or an inline function. The tail-called functions take fewer than six arguments,
 * so they can be passed in registers and do not need to be pushed on the stack.
 * This allows the compiler to avoid creating a stack frame for these functions.
 *
 * __improbable and __probable are used to compile the slow path code in such a way
 * that the fast path case lands on a sequence of instructions with as few jumps as possible,
 * making that case the most optimized even when falling through to the slow path.
*/
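/*
 * Shape of the pattern described above (illustrative sketch with hypothetical
 * names; the real fast paths are the inline routines in i386/locks_i386_inlines.h
 * included above):
 *
 *	static inline void
 *	example_lock(lck_mtx_t *lock)
 *	{
 *		if (__probable(fast_path_cmpxchg(lock))) {
 *			return;				// uncontended: no call, no frame
 *		}
 *		return example_lock_slow(lock);		// tail call into the noinline slow path
 *	}
 *
 *	__attribute__((noinline))
 *	static void
 *	example_lock_slow(lck_mtx_t *lock)
 *	{
 *		...	// few enough arguments to stay in registers; ends in tail calls
 *	}
 */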
/*
* Intel lock invariants:
*
* lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
*
* The lock owner is promoted to the max priority of all its waiters only if it
* was a lower priority when it acquired or was an owner when a waiter waited.
* Max priority is capped at MAXPRI_PROMOTE.
*
* The last waiter will not be promoted as it is woken up, but the last
* lock owner may not have been the last thread to have been woken up depending on the
* luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
* flag set.
*
* TODO: Figure out an algorithm for stopping a lock holder which is already at the right
* priority from dropping priority in the future without having to take thread lock
* on acquire.
*/
/*
* Routine: lck_mtx_alloc_init
*/
lck_mtx_t *
lck_mtx_alloc_init(
lck_grp_t *grp,
lck_attr_t *attr)
{
lck_mtx_t *lck;
lck = zalloc(KT_LCK_MTX);
lck_mtx_init(lck, grp, attr);
return lck;
}
/*
* Routine: lck_mtx_free
*/
void
lck_mtx_free(
lck_mtx_t *lck,
lck_grp_t *grp)
{
lck_mtx_destroy(lck, grp);
zfree(KT_LCK_MTX, lck);
}
/*
* Routine: lck_mtx_init
*/
void
lck_mtx_init(
lck_mtx_t *lck,
lck_grp_t *grp,
lck_attr_t *attr)
{
if (attr == LCK_ATTR_NULL) {
attr = &lck_attr_default;
}
*lck = (lck_mtx_t){
.lck_mtx_grp = grp->lck_grp_attr_id,
};
if (__improbable((attr->lck_attr_val & LCK_ATTR_DEBUG) ||
(grp->lck_grp_attr_id & LCK_GRP_ATTR_DEBUG))) {
lck->lck_mtx_profile = 1;
}
lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
}
/*
* Routine: lck_mtx_destroy
*/
void
lck_mtx_destroy(
lck_mtx_t *lck,
lck_grp_t *grp)
{
if (lck->lck_mtx_state == LCK_MTX_TAG_DESTROYED) {
return;
}
#if MACH_LDEBUG
lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
#endif
ordered_store_mtx_state_release(lck, LCK_MTX_TAG_DESTROYED);
lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
}
#if DEVELOPMENT || DEBUG
__attribute__((noinline))
void
lck_mtx_owner_check_panic(
lck_mtx_t *lock)
{
thread_t owner = ctid_get_thread_unsafe(lock->lck_mtx_owner);
panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
}
#endif
/*
* Routine: lck_mtx_unlock_slow
*
* Unlocks a mutex held by current thread.
*
* It will wake up waiters if necessary.
*
* Interlock can be held.
*/
__attribute__((noinline))
void
lck_mtx_unlock_slow(
lck_mtx_t *lock)
{
thread_t thread;
uint32_t state;
state = ordered_load_mtx_state(lock);
thread = current_thread();
#if DEVELOPMENT || DEBUG
if (__improbable(lock->lck_mtx_owner != thread->ctid)) {
lck_mtx_owner_check_panic(lock);
}
#endif
/* check if it is held as a spinlock */
if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
goto unlock;
}
lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
unlock:
/* preemption disabled, interlock held and mutex not held */
/* clear owner */
ordered_store_mtx_owner(lock, 0);
if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
return lck_mtx_unlock_wakeup_tail(lock, state);
}
/* release interlock, promotion and clear spin flag */
state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
/* re-enable preemption */
lck_mtx_unlock_finish_inline(lock, state);
}
#define LCK_MTX_LCK_WAIT_CODE 0x20
#define LCK_MTX_LCK_WAKEUP_CODE 0x21
#define LCK_MTX_LCK_SPIN_CODE 0x22
#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
#define LCK_MTX_LCK_DEMOTE_CODE 0x24
/*
* Routine: lck_mtx_unlock_wakeup_tail
*
* Invoked on unlock when there is
* contention, i.e. the assembly routine sees
* that mutex->lck_mtx_waiters != 0
*
 * neither the mutex nor the interlock is held
*
* Note that this routine might not be called if there are pending
* waiters which have previously been woken up, and they didn't
* end up boosting the old owner.
*
* assembly routine previously did the following to mutex:
* (after saving the state in prior_lock_state)
* decremented lck_mtx_waiters if nonzero
*
* This function needs to be called as a tail call
* to optimize the compiled code.
*/
__attribute__((noinline))
static void
lck_mtx_unlock_wakeup_tail(
lck_mtx_t *mutex,
uint32_t state)
{
struct turnstile *ts;
__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
kern_return_t did_wake;
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
ts = turnstile_prepare_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(mutex),
THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
assert(did_wake == KERN_SUCCESS);
turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
turnstile_complete_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
state -= LCK_MTX_WAITER;
state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
ordered_store_mtx_state_release(mutex, state);
assert(current_thread()->turnstile != NULL);
turnstile_cleanup();
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
lck_mtx_unlock_finish_inline(mutex, state);
}
/*
* Routine: lck_mtx_lock_acquire_x86
*
* Invoked on acquiring the mutex when there is
 * contention (i.e. the assembly routine sees that
 * mutex->lck_mtx_waiters != 0).
*
* mutex is owned... interlock is held... preemption is disabled
*/
__attribute__((always_inline))
static void
lck_mtx_lock_acquire_inline(
lck_mtx_t *mutex,
struct turnstile *ts)
{
__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
if (mutex->lck_mtx_waiters > 0) {
if (ts == NULL) {
ts = turnstile_prepare_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
}
turnstile_update_inheritor(ts, current_thread(),
TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
}
if (ts != NULL) {
turnstile_complete_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
}
assert(current_thread()->turnstile != NULL);
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
}
void
lck_mtx_lock_acquire_x86(
lck_mtx_t *mutex)
{
return lck_mtx_lock_acquire_inline(mutex, NULL);
}
/*
* Tail call helpers for lock functions that perform
* lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
* the caller's compiled code.
*/
__attribute__((noinline))
static void
lck_mtx_lock_acquire_tail(
lck_mtx_t *mutex,
struct turnstile *ts)
{
lck_mtx_lock_acquire_inline(mutex, ts);
lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex));
}
__attribute__((noinline))
static boolean_t
lck_mtx_try_lock_acquire_tail(
lck_mtx_t *mutex)
{
lck_mtx_lock_acquire_inline(mutex, NULL);
lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
return TRUE;
}
__attribute__((noinline))
static void
lck_mtx_convert_spin_acquire_tail(
lck_mtx_t *mutex)
{
lck_mtx_lock_acquire_inline(mutex, NULL);
lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
}
static void
lck_mtx_ilk_unlock(
lck_mtx_t *mutex)
{
lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
}
static inline void
lck_mtx_interlock_lock_set_and_clear_flags(
lck_mtx_t *mutex,
uint32_t xor_flags,
uint32_t and_flags,
uint32_t *new_state)
{
uint32_t state, prev;
state = *new_state;
for (;;) {
/* have to wait for interlock to clear */
while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
cpu_pause();
state = ordered_load_mtx_state(mutex);
}
prev = state; /* prev contains snapshot for exchange */
state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
state &= ~and_flags; /* clear flags */
disable_preemption();
if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
break;
}
enable_preemption();
cpu_pause();
state = ordered_load_mtx_state(mutex);
}
*new_state = state;
return;
}
static inline void
lck_mtx_interlock_lock_clear_flags(
lck_mtx_t *mutex,
uint32_t and_flags,
uint32_t *new_state)
{
return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
}
static inline void
lck_mtx_interlock_lock(
lck_mtx_t *mutex,
uint32_t *new_state)
{
return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
}
static inline int
lck_mtx_interlock_try_lock_set_flags(
lck_mtx_t *mutex,
uint32_t or_flags,
uint32_t *new_state)
{
uint32_t state, prev;
state = *new_state;
/* have to wait for interlock to clear */
if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
return 0;
}
prev = state; /* prev contains snapshot for exchange */
state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
disable_preemption();
if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
*new_state = state;
return 1;
}
enable_preemption();
return 0;
}
__attribute__((noinline))
static void
lck_mtx_lock_contended(
lck_mtx_t *lock,
bool profile,
boolean_t *first_miss)
{
lck_mtx_spinwait_ret_type_t ret;
uint32_t state;
thread_t thread;
struct turnstile *ts = NULL;
try_again:
if (profile) {
LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, first_miss);
}
ret = lck_mtx_lock_spinwait_x86(lock);
state = ordered_load_mtx_state(lock);
switch (ret) {
case LCK_MTX_SPINWAIT_NO_SPIN:
/*
* owner not on core, lck_mtx_lock_spinwait_x86 didn't even
* try to spin.
*/
/* just fall through to the LCK_MTX_SPINWAIT_SPUN_* cases */
OS_FALLTHROUGH;
case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
/*
* mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
* interlock not held
*/
lck_mtx_interlock_lock(lock, &state);
assert(state & LCK_MTX_ILOCKED_MSK);
if (state & LCK_MTX_MLOCKED_MSK) {
if (profile) {
LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
ret == LCK_MTX_SPINWAIT_NO_SPIN, first_miss);
}
lck_mtx_lock_wait_x86(lock, &ts);
/*
* interlock is not held here.
*/
goto try_again;
} else {
/* grab the mutex */
state |= LCK_MTX_MLOCKED_MSK;
ordered_store_mtx_state_release(lock, state);
thread = current_thread();
ordered_store_mtx_owner(lock, thread->ctid);
}
break;
case LCK_MTX_SPINWAIT_ACQUIRED:
/*
* mutex has been acquired by lck_mtx_lock_spinwait_x86
* interlock is held and preemption disabled
* owner is set and mutex marked as locked
* statistics updated too
*/
break;
default:
panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p", ret, lock);
}
/*
* interlock is already acquired here
*/
/* mutex has been acquired */
if (state & LCK_MTX_WAITERS_MSK) {
/*
* lck_mtx_lock_acquire_tail will call
* turnstile_complete.
*/
return lck_mtx_lock_acquire_tail(lock, ts);
}
if (ts != NULL) {
turnstile_complete_hash((uintptr_t)lock, TURNSTILE_KERNEL_MUTEX);
}
assert(current_thread()->turnstile != NULL);
/* release the interlock */
lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock));
}
/*
* Helper noinline functions for calling
* panic to optimize compiled code.
*/
__attribute__((noinline)) __abortlike
static void
lck_mtx_destroyed(
lck_mtx_t *lock)
{
panic("trying to interlock destroyed mutex (%p)", lock);
}
__attribute__((noinline))
static boolean_t
lck_mtx_try_destroyed(
lck_mtx_t *lock)
{
panic("trying to interlock destroyed mutex (%p)", lock);
return FALSE;
}
__attribute__((always_inline))
static boolean_t
lck_mtx_lock_wait_interlock_to_clear(
lck_mtx_t *lock,
uint32_t* new_state)
{
uint32_t state;
for (;;) {
cpu_pause();
state = ordered_load_mtx_state(lock);
if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
*new_state = state;
return TRUE;
}
if (state & LCK_MTX_MLOCKED_MSK) {
/* if it is held as mutex, just fail */
return FALSE;
}
}
}
__attribute__((always_inline))
static boolean_t
lck_mtx_try_lock_wait_interlock_to_clear(
lck_mtx_t *lock,
uint32_t* new_state)
{
uint32_t state;
for (;;) {
cpu_pause();
state = ordered_load_mtx_state(lock);
if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
/* if it is held as mutex or spin, just fail */
return FALSE;
}
if (!(state & LCK_MTX_ILOCKED_MSK)) {
*new_state = state;
return TRUE;
}
}
}
/*
* Routine: lck_mtx_lock_slow
*
* Locks a mutex for current thread.
* If the lock is contended this function might
* sleep.
*
* Called with interlock not held.
*/
__attribute__((noinline))
void
lck_mtx_lock_slow(
lck_mtx_t *lock)
{
bool profile;
uint32_t state;
int first_miss = 0;
state = ordered_load_mtx_state(lock);
profile = (state & LCK_MTX_PROFILE_MSK);
if (__improbable(profile)) {
if (state & LCK_MTX_SPIN_MSK) {
/* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
assert(state & LCK_MTX_ILOCKED_MSK);
LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
}
}
/* is the interlock or mutex held */
if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
/*
* Note: LCK_MTX_TAG_DESTROYED has
* LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
*/
/* is the mutex already held */
if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
/* no, must have been the mutex */
return lck_mtx_lock_contended(lock, profile, &first_miss);
}
/* check to see if it is marked destroyed */
if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
lck_mtx_destroyed(lock);
}
}
/* no - can't be DESTROYED or locked */
while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
return lck_mtx_lock_contended(lock, profile, &first_miss);
}
}
/* lock and interlock acquired */
/* record owner of mutex */
ordered_store_mtx_owner(lock, current_thread()->ctid);
/*
* Check if there are waiters to
* inherit their priority.
*/
if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
return lck_mtx_lock_acquire_tail(lock, NULL);
}
/* release the interlock */
lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock));
}
__attribute__((noinline))
boolean_t
lck_mtx_try_lock_slow(
lck_mtx_t *lock)
{
bool profile;
uint32_t state;
int first_miss = 0;
state = ordered_load_mtx_state(lock);
profile = (state & LCK_MTX_PROFILE_MSK);
/* is the interlock or mutex held */
if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
/*
* Note: LCK_MTX_TAG_DESTROYED has
* LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
*/
/* is the mutex already held */
if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
return FALSE;
}
/* check to see if it is marked destroyed */
if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
lck_mtx_try_destroyed(lock);
}
if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
if (__improbable(profile)) {
LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
}
return FALSE;
}
}
if (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
if (__improbable(profile)) {
LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
}
return FALSE;
}
/* lock and interlock acquired */
/* record owner of mutex */
ordered_store_mtx_owner(lock, current_thread()->ctid);
/*
* Check if there are waiters to
* inherit their priority.
*/
if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
return lck_mtx_try_lock_acquire_tail(lock);
}
/* release the interlock */
lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
return TRUE;
}
__attribute__((noinline))
void
lck_mtx_lock_spin_slow(
lck_mtx_t *lock)
{
bool profile;
uint32_t state;
int first_miss = 0;
state = ordered_load_mtx_state(lock);
profile = (state & LCK_MTX_PROFILE_MSK);
if (__improbable(profile)) {
if (state & LCK_MTX_SPIN_MSK) {
/* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
assert(state & LCK_MTX_ILOCKED_MSK);
LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
}
}
/* is interlock or mutex held */
if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
/*
* Note: LCK_MTX_TAG_DESTROYED has
* LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
*/
/* is the mutex already held */
if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
/* no, must have been the mutex */
return lck_mtx_lock_contended(lock, profile, &first_miss);
}
/* check to see if it is marked destroyed */
if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
lck_mtx_destroyed(lock);
}
}
bool acquired = true;
#if CONFIG_DTRACE
uint64_t spin_start;
if (__probable(lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
goto past_spin;
}
spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
#endif /* CONFIG_DTRACE */
/* note - can't be DESTROYED or locked */
/* note - preemption is not disabled while spinning */
while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
acquired = false;
break;
}
}
LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
#if CONFIG_DTRACE
past_spin:
#endif /* CONFIG_DTRACE */
if (__improbable(!acquired)) {
return lck_mtx_lock_contended(lock, profile, &first_miss);
}
/* lock as spinlock and interlock acquired */
/* record owner of mutex */
ordered_store_mtx_owner(lock, current_thread()->ctid);
LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp, true, profile);
/* return with the interlock held and preemption disabled */
return;
}
__attribute__((noinline))
boolean_t
lck_mtx_try_lock_spin_slow(
lck_mtx_t *lock)
{
bool profile;
uint32_t state;
int first_miss = 0;
state = ordered_load_mtx_state(lock);
profile = (state & LCK_MTX_PROFILE_MSK);
/* is the interlock or mutex held */
if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
/*
* Note: LCK_MTX_TAG_DESTROYED has
* LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
*/
/* is the mutex already held */
if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
return FALSE;
}
/* check to see if it is marked destroyed */
if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
lck_mtx_try_destroyed(lock);
}
}
/* note - can't be DESTROYED or locked */
if (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
if (__improbable(profile)) {
LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
}
return FALSE;
}
/* lock and interlock acquired */
/* record owner of mutex */
ordered_store_mtx_owner(lock, current_thread()->ctid);
#if CONFIG_DTRACE
LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_SPIN_ACQUIRE, lock, 0);
#endif
return TRUE;
}
__attribute__((noinline))
void
lck_mtx_convert_spin(
lck_mtx_t *lock)
{
uint32_t state;
state = ordered_load_mtx_state(lock);
assertf(lock->lck_mtx_owner == current_thread()->ctid,
"lock %p not owned by thread ctid %d (current owner %d)",
lock, current_thread()->ctid, lock->lck_mtx_owner);
if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
/* already owned as a mutex, just return */
return;
}
assert(get_preemption_level() > 0);
assert(state & LCK_MTX_ILOCKED_MSK);
assert(state & LCK_MTX_SPIN_MSK);
/*
* Check if there are waiters to
* inherit their priority.
*/
if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
return lck_mtx_convert_spin_acquire_tail(lock);
}
lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
return;
}
static inline boolean_t
lck_mtx_lock_grab_mutex(
lck_mtx_t *lock)
{
uint32_t state;
state = ordered_load_mtx_state(lock);
if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
return FALSE;
}
/* lock and interlock acquired */
/* record owner of mutex */
ordered_store_mtx_owner(lock, current_thread()->ctid);
return TRUE;
}
__attribute__((noinline))
void
lck_mtx_assert(
lck_mtx_t *lock,
unsigned int type)
{
thread_t thread;
uint32_t state;
thread = current_thread();
state = ordered_load_mtx_state(lock);
if (type == LCK_MTX_ASSERT_OWNED) {
if ((lock->lck_mtx_owner != thread->ctid) ||
!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
panic("mutex (%p) not owned", lock);
}
} else {
assert(type == LCK_MTX_ASSERT_NOTOWNED);
if (lock->lck_mtx_owner == thread->ctid) {
panic("mutex (%p) owned", lock);
}
}
}
/*
* Routine: lck_mtx_lock_spinwait_x86
*
* Invoked trying to acquire a mutex when there is contention but
* the holder is running on another processor. We spin for up to a maximum
* time waiting for the lock to be released.
*
* Called with the interlock unlocked.
* returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
* returns LCK_MTX_SPINWAIT_SPUN if we spun
* returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
*/
__attribute__((noinline))
lck_mtx_spinwait_ret_type_t
lck_mtx_lock_spinwait_x86(
lck_mtx_t *mutex)
{
__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
ctid_t owner, prev_owner;
uint64_t window_deadline, sliding_deadline, high_deadline;
uint64_t spin_start, start_time, cur_time, avg_hold_time, bias, delta;
lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
int loopcount = 0;
int total_hold_time_samples, window_hold_time_samples, unfairness;
uint prev_owner_cpu;
bool adjust;
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe(mutex->lck_mtx_owner)),
mutex->lck_mtx_waiters, 0, 0);
spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
start_time = mach_absolute_time();
/*
* window_deadline represents the "learning" phase.
* The thread collects statistics about the lock during
* window_deadline and then it makes a decision on whether to spin more
* or block according to the concurrency behavior
* observed.
*
* Every thread can spin at least low_MutexSpin.
*/
window_deadline = start_time + low_MutexSpin;
/*
* Sliding_deadline is the adjusted spin deadline
* computed after the "learning" phase.
*/
sliding_deadline = window_deadline;
/*
* High_deadline is a hard deadline. No thread
* can spin more than this deadline.
*/
if (high_MutexSpin >= 0) {
high_deadline = start_time + high_MutexSpin;
} else {
high_deadline = start_time + low_MutexSpin * real_ncpus;
}
/*
 * We do not yet know which cpu the owner is running on.
 * Initialize prev_owner_cpu with the next cpu.
*/
prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
total_hold_time_samples = 0;
window_hold_time_samples = 0;
avg_hold_time = 0;
adjust = TRUE;
bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;
prev_owner = mutex->lck_mtx_owner;
/*
* Spin while:
* - mutex is locked, and
* - it's locked as a spin lock, and
* - owner is running on another processor, and
* - we haven't spun for long enough.
*/
do {
/*
* Try to acquire the lock.
*/
if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
retval = LCK_MTX_SPINWAIT_ACQUIRED;
break;
}
cur_time = mach_absolute_time();
/*
* Never spin past high_deadline.
*/
if (cur_time >= high_deadline) {
retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
break;
}
/*
* Check if owner is on core. If not block.
*/
owner = mutex->lck_mtx_owner;
if (owner) {
uint32_t i = prev_owner_cpu;
bool owner_on_core = false;
disable_preemption();
owner = mutex->lck_mtx_owner;
/*
* For scalability we want to check if the owner is on core
* without locking the mutex interlock.
* If we do not lock the mutex interlock, the owner that we see might be
* invalid, so we cannot dereference it. Therefore we cannot check
* any field of the thread to tell us if it is on core.
 * Check whether the thread running on any of the other cpus matches the owner.
*/
if (owner) {
thread_t owner_thread = ctid_get_thread_unsafe(owner);
do {
if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner_thread)) {
owner_on_core = true;
break;
}
if (++i >= real_ncpus) {
i = 0;
}
} while (i != prev_owner_cpu);
}
enable_preemption();
if (owner == 0) {
/* nothing to do, we didn't find an owner */
} else if (owner_on_core) {
prev_owner_cpu = i;
} else {
prev_owner = owner;
owner = mutex->lck_mtx_owner;
if (owner == prev_owner) {
/*
* Owner is not on core.
* Stop spinning.
*/
if (loopcount == 0) {
retval = LCK_MTX_SPINWAIT_NO_SPIN;
} else {
retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
}
break;
}
/*
* Fall through if the owner changed while we were scanning.
* The new owner could potentially be on core, so loop
* again.
*/
}
}
/*
* Save how many times we see the owner changing.
* We can roughly estimate the mutex hold
* time and the fairness with that.
*/
if (owner != prev_owner) {
prev_owner = owner;
total_hold_time_samples++;
window_hold_time_samples++;
}
/*
* Learning window expired.
* Try to adjust the sliding_deadline.
*/
if (cur_time >= window_deadline) {
/*
 * If there was no contention during the window,
 * stop spinning.
*/
if (window_hold_time_samples < 1) {
retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
break;
}
if (adjust) {
/*
* For a fair lock, we'd wait for at most (NCPU-1) periods,
* but the lock is unfair, so let's try to estimate by how much.
*/
unfairness = total_hold_time_samples / real_ncpus;
if (unfairness == 0) {
/*
* We observed the owner changing `total_hold_time_samples` times which
* let us estimate the average hold time of this mutex for the duration
* of the spin time.
* avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
*
* In this case spin at max avg_hold_time * (real_ncpus - 1)
*/
delta = cur_time - start_time;
sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
} else {
/*
* In this case at least one of the other cpus was able to get the lock twice
* while I was spinning.
* We could spin longer but it won't necessarily help if the system is unfair.
* Try to randomize the wait to reduce contention.
*
* We compute how much time we could potentially spin
* and distribute it over the cpus.
*
 * bias is an integer between 0 and real_ncpus - 1.
* distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
*/
delta = high_deadline - cur_time;
sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
adjust = FALSE;
}
}
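/*
 * Worked example with illustrative numbers (not from the source): with
 * real_ncpus = 8, start_time = 0 and cur_time = 1000 ticks, observing the
 * owner change 4 times gives unfairness = 4 / 8 = 0, so
 * sliding_deadline = 0 + (1000 * 7) / 4 = 1750 ticks, i.e. roughly
 * (NCPU - 1) average hold times. Observing 16 changes gives unfairness = 2;
 * with bias = 3 and high_deadline - cur_time = 7000 ticks,
 * sliding_deadline = 1000 + (7000 * 3) / 8 = 3625 ticks and no further
 * adjustment is made.
 */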
window_deadline += low_MutexSpin;
window_hold_time_samples = 0;
}
/*
 * Stop spinning if we are past
 * the adjusted deadline.
*/
if (cur_time >= sliding_deadline) {
retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
break;
}
if (mutex->lck_mtx_owner != 0) {
cpu_pause();
}
loopcount++;
} while (TRUE);
LCK_MTX_ADAPTIVE_SPIN_END(mutex, mutex->lck_mtx_grp, spin_start);
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe(mutex->lck_mtx_owner)),
mutex->lck_mtx_waiters, retval, 0);
return retval;
}
/*
* Routine: lck_mtx_lock_wait_x86
*
* Invoked in order to wait on contention.
*
* Called with the interlock locked and
* preemption disabled...
* returns it unlocked and with preemption enabled
*
* lck_mtx_waiters is 1:1 with a wakeup needing to occur.
* A runnable waiter can exist between wait and acquire
* without a waiters count being set.
* This allows us to never make a spurious wakeup call.
*
* Priority:
* This avoids taking the thread lock if the owning thread is the same priority.
* This optimizes the case of same-priority threads contending on a lock.
* However, that allows the owning thread to drop in priority while holding the lock,
* because there is no state that the priority change can notice that
* says that the targeted thread holds a contended mutex.
*
* One possible solution: priority changes could look for some atomic tag
* on the thread saying 'holding contended lock', and then set up a promotion.
* Needs a story for dropping that promotion - the last contended unlock
* has to notice that this has happened.
*/
__attribute__((noinline))
void
lck_mtx_lock_wait_x86(
lck_mtx_t *mutex,
struct turnstile **ts)
{
thread_t self = current_thread();
uint64_t sleep_start;
__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
thread_t holder = ctid_get_thread(mutex->lck_mtx_owner);
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder),
mutex->lck_mtx_waiters, 0, 0);
sleep_start = LCK_MTX_BLOCK_BEGIN();
mutex->lck_mtx_waiters++;
/*
 * lck_mtx_lock_wait_x86 might be called in a loop. Call prepare just once and reuse
 * the same turnstile while looping; the matching turnstile complete will be called
 * by lck_mtx_lock_contended when the lock is finally acquired.
*/
if (*ts == NULL) {
*ts = turnstile_prepare_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
}
struct turnstile *turnstile = *ts;
thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
waitq_assert_wait64(&turnstile->ts_waitq, LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
lck_mtx_ilk_unlock(mutex);
turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
thread_block(THREAD_CONTINUE_NULL);
KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe(mutex->lck_mtx_owner)),
mutex->lck_mtx_waiters, 0, 0);
LCK_MTX_BLOCK_END(mutex, mutex->lck_mtx_grp, sleep_start);
}
/*
* Routine: kdp_lck_mtx_lock_spin_is_acquired
* NOT SAFE: To be used only by kernel debugger to avoid deadlock.
* Returns: TRUE if lock is acquired.
*/
boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
{
if (not_in_kdp) {
panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
}
if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
return TRUE;
}
return FALSE;
}
void
kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
{
lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
waitinfo->owner = thread_tid(ctid_get_thread(mutex->lck_mtx_owner));
}
#endif /* LCK_MTX_USE_ARCH */
#if CONFIG_PV_TICKET
__startup_func
void
lck_init_pv(void)
{
uint32_t pvtck = 1;
PE_parse_boot_argn("pvticket", &pvtck, sizeof(pvtck));
if (pvtck == 0) {
return;
}
has_lock_pv = cpuid_vmm_present() &&
(cpuid_vmm_get_kvm_features() & CPUID_KVM_FEATURE_PV_UNHALT) != 0;
}
STARTUP(LOCKS, STARTUP_RANK_FIRST, lck_init_pv);
#endif