/*
* Copyright (c) 2011-2021 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* This file manages the timers used for on-CPU samples and PET.
*
 * Each timer configured by a tool is represented by a kptimer structure.
 * The deadlines tracked in each structure are used to program CPU-local
 * timers.  As each timer fires, that CPU samples itself and schedules
 * another timer to fire at the next deadline.  The first timer to fire
 * across all CPUs determines that deadline.  This causes the timers to fire
 * at a consistent cadence.
*
* Traditional PET uses a timer call to wake up its sampling thread and take
* on-CPU samples.
*
* Synchronization for start and stop is provided by the ktrace subsystem lock.
* Global state is stored in a single struct, to ease debugging.
*/
#include <mach/mach_types.h>
#include <kern/cpu_data.h> /* current_thread() */
#include <kern/kalloc.h>
#include <kern/timer_queue.h>
#include <libkern/section_keywords.h>
#include <stdatomic.h>
#include <sys/errno.h>
#include <sys/vm.h>
#include <sys/ktrace.h>
#include <machine/machine_routines.h>
#if defined(__x86_64__)
#include <i386/mp.h>
#endif /* defined(__x86_64__) */
#include <kperf/kperf.h>
#include <kperf/buffer.h>
#include <kperf/context.h>
#include <kperf/action.h>
#include <kperf/kptimer.h>
#include <kperf/pet.h>
#include <kperf/sample.h>
#define KPTIMER_PET_INACTIVE (999) /* sentinel timer ID: no timer drives PET */
#define KPTIMER_MAX (8) /* maximum number of timers that tools can configure */
struct kptimer {
uint32_t kt_actionid;
uint64_t kt_period_abs;
/*
* The `kt_cur_deadline` field represents when the timer should next fire.
* It's used to synchronize between timers firing on each CPU. In the timer
* handler, each CPU will take the `kt_lock` and see if the
* `kt_cur_deadline` still needs to be updated for the timer fire. If so,
* it updates it and logs the timer fire event under the lock.
*/
lck_spin_t kt_lock;
uint64_t kt_cur_deadline;
#if DEVELOPMENT || DEBUG
	/*
	 * Set by the first CPU to handle each timer fire, as a debugging aid
	 * for timeouts, in case kperf happens to be on-CPU when they occur.
	 */
uint64_t kt_fire_time;
#endif /* DEVELOPMENT || DEBUG */
};
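/*
 * Global kptimer state, kept in a single structure to ease debugging.
 * `g_timers` is a permanent array of `KPTIMER_MAX` timers, `g_cpu_deadlines`
 * is a flattened per-CPU array of each timer's next deadline on that CPU, and
 * `g_pet_timer` is the timer call used by traditional PET.
 */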
static struct {
struct kptimer *g_timers;
uint64_t *g_cpu_deadlines;
unsigned int g_ntimers;
unsigned int g_pet_timerid;
bool g_setup:1;
bool g_pet_active:1;
bool g_started:1;
struct timer_call g_pet_timer;
} kptimer = {
.g_pet_timerid = KPTIMER_PET_INACTIVE,
};
SECURITY_READ_ONLY_LATE(static uint64_t) kptimer_minperiods_mtu[KTPL_MAX];
/*
* Enforce a minimum timer period to prevent interrupt storms.
*/
const uint64_t kptimer_minperiods_ns[KTPL_MAX] = {
#if defined(__x86_64__)
[KTPL_FG] = 20 * NSEC_PER_USEC, /* The minimum timer period in xnu, period. */
[KTPL_BG] = 1 * NSEC_PER_MSEC,
[KTPL_FG_PET] = 1 * NSEC_PER_MSEC,
[KTPL_BG_PET] = 1 * NSEC_PER_MSEC,
#elif defined(__arm64__)
[KTPL_FG] = 50 * NSEC_PER_USEC,
[KTPL_BG] = 1 * NSEC_PER_MSEC,
[KTPL_FG_PET] = 1 * NSEC_PER_MSEC,
[KTPL_BG_PET] = 1 * NSEC_PER_MSEC,
#else
#error unexpected architecture
#endif
};
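/*
 * These nanosecond minimums are converted once, in `kptimer_init`, to Mach
 * time units and cached in `kptimer_minperiods_mtu` for use by
 * `kptimer_min_period_abs`.
 */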
static void kptimer_pet_handler(void * __unused param1, void * __unused param2);
static void kptimer_stop_cpu(processor_t processor);
void
kptimer_init(void)
{
for (int i = 0; i < KTPL_MAX; i++) {
nanoseconds_to_absolutetime(kptimer_minperiods_ns[i],
&kptimer_minperiods_mtu[i]);
}
}
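/*
 * Per-CPU deadlines are stored in a single flattened array, indexed first by
 * CPU and then by timer ID.  For illustration, with `KPTIMER_MAX` of 8, the
 * deadline for timer 3 on CPU 2 lives at index (2 * 8) + 3 = 19.
 */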
static void
kptimer_set_cpu_deadline(int cpuid, int timerid, uint64_t deadline)
{
kptimer.g_cpu_deadlines[(cpuid * KPTIMER_MAX) + timerid] =
deadline;
}
static void
kptimer_setup(void)
{
if (kptimer.g_setup) {
return;
}
static lck_grp_t kptimer_lock_grp;
lck_grp_init(&kptimer_lock_grp, "kptimer", LCK_GRP_ATTR_NULL);
const size_t timers_size = KPTIMER_MAX * sizeof(struct kptimer);
kptimer.g_timers = zalloc_permanent_tag(timers_size,
ZALIGN(struct kptimer), VM_KERN_MEMORY_DIAG);
for (int i = 0; i < KPTIMER_MAX; i++) {
lck_spin_init(&kptimer.g_timers[i].kt_lock, &kptimer_lock_grp,
LCK_ATTR_NULL);
}
const size_t deadlines_size = machine_info.logical_cpu_max * KPTIMER_MAX *
sizeof(kptimer.g_cpu_deadlines[0]);
kptimer.g_cpu_deadlines = zalloc_permanent_tag(deadlines_size,
ZALIGN_64, VM_KERN_MEMORY_DIAG);
for (int i = 0; i < KPTIMER_MAX; i++) {
for (int j = 0; j < machine_info.logical_cpu_max; j++) {
kptimer_set_cpu_deadline(j, i, EndOfAllTime);
}
}
timer_call_setup(&kptimer.g_pet_timer, kptimer_pet_handler, NULL);
kptimer.g_setup = true;
}
void
kptimer_reset(void)
{
kptimer_stop();
kptimer_set_pet_timerid(KPTIMER_PET_INACTIVE);
for (unsigned int i = 0; i < kptimer.g_ntimers; i++) {
kptimer.g_timers[i].kt_period_abs = 0;
kptimer.g_timers[i].kt_actionid = 0;
for (int j = 0; j < machine_info.logical_cpu_max; j++) {
kptimer_set_cpu_deadline(j, i, EndOfAllTime);
}
}
}
#pragma mark - deadline management
static uint64_t
kptimer_get_cpu_deadline(int cpuid, int timerid)
{
return kptimer.g_cpu_deadlines[(cpuid * KPTIMER_MAX) + timerid];
}
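/*
 * Sample the current CPU from interrupt context, using the interrupt sample
 * buffer and deferring any user space sampling with `SAMPLE_FLAG_PEND_USER`.
 */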
static void
kptimer_sample_curcpu(unsigned int actionid, unsigned int timerid,
uint32_t flags)
{
struct kperf_sample *intbuf = kperf_intr_sample_buffer();
#if DEVELOPMENT || DEBUG
intbuf->sample_time = mach_absolute_time();
#endif /* DEVELOPMENT || DEBUG */
BUF_DATA(PERF_TM_HNDLR | DBG_FUNC_START);
thread_t thread = current_thread();
task_t task = get_threadtask(thread);
struct kperf_context ctx = {
.cur_thread = thread,
.cur_task = task,
.cur_pid = task_pid(task),
.trigger_type = TRIGGER_TYPE_TIMER,
.trigger_id = timerid,
};
(void)kperf_sample(intbuf, &ctx, actionid,
SAMPLE_FLAG_PEND_USER | flags);
BUF_INFO(PERF_TM_HNDLR | DBG_FUNC_END);
}
static void
kptimer_lock(struct kptimer *timer)
{
lck_spin_lock(&timer->kt_lock);
}
static void
kptimer_unlock(struct kptimer *timer)
{
lck_spin_unlock(&timer->kt_lock);
}
/*
* If the deadline expired in the past, find the next deadline to program,
* locked into the cadence provided by the period.
*/
static inline uint64_t
dead_reckon_deadline(uint64_t now, uint64_t deadline, uint64_t period)
{
if (deadline < now) {
uint64_t time_since = now - deadline;
uint64_t extra_time = period - (time_since % period);
return now + extra_time;
}
return deadline;
}
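/*
 * For illustration (values hypothetical): with `now` at 1050, an expired
 * `deadline` of 700, and a `period` of 100, `time_since` is 350 and
 * `extra_time` is 100 - (350 % 100) = 50, so the reckoned deadline is 1100 --
 * still on the 700 + n * 100 cadence, skipping the fires that were missed.
 */

/*
 * Handle a single timer firing on this CPU and return the next deadline it
 * should program; a return value of 0 (from a timer that was concurrently
 * stopped) tells the caller to clear the per-CPU deadline.
 */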
static uint64_t
kptimer_fire(struct kptimer *timer, unsigned int timerid,
uint64_t deadline, int __unused cpuid, uint64_t now)
{
bool first = false;
uint64_t next_deadline = deadline + timer->kt_period_abs;
/*
* It's not straightforward to replace this lock with a compare-exchange,
* since the PERF_TM_FIRE event must be emitted *before* any subsequent
* PERF_TM_HNDLR events, so tools can understand the handlers are responding
* to this timer fire.
*/
kptimer_lock(timer);
if (timer->kt_cur_deadline < next_deadline) {
first = true;
next_deadline = dead_reckon_deadline(now, next_deadline,
timer->kt_period_abs);
timer->kt_cur_deadline = next_deadline;
BUF_DATA(PERF_TM_FIRE, timerid, timerid == kptimer.g_pet_timerid,
timer->kt_period_abs, timer->kt_actionid);
#if DEVELOPMENT || DEBUG
/*
* Debugging aid to see the last time this timer fired.
*/
timer->kt_fire_time = mach_absolute_time();
#endif /* DEVELOPMENT || DEBUG */
if (timerid == kptimer.g_pet_timerid && kppet_get_lightweight_pet()) {
os_atomic_inc(&kppet_gencount, relaxed);
}
} else {
/*
* In case this CPU has missed several timer fires, get it back on track
* by synchronizing with the latest timer fire.
*/
next_deadline = timer->kt_cur_deadline;
}
kptimer_unlock(timer);
if (!first && !kperf_action_has_non_system(timer->kt_actionid)) {
/*
* The first timer to fire will sample the system, so there's
* no need to run other timers if those are the only samplers
* for this action.
*/
return next_deadline;
}
kptimer_sample_curcpu(timer->kt_actionid, timerid,
first ? SAMPLE_FLAG_SYSTEM : 0);
return next_deadline;
}
/*
 * Determine which of the timers fired on this CPU, sample for each expired
 * deadline, and re-arm the running timer for the earliest upcoming deadline.
 */
void
kptimer_expire(processor_t processor, int cpuid, uint64_t now)
{
uint64_t min_deadline = UINT64_MAX;
enum kperf_sampling status = os_atomic_load(&kperf_status, acquire);
switch (status) {
case KPERF_SAMPLING_ON:
break;
case KPERF_SAMPLING_SHUTDOWN:
kptimer_stop_cpu(processor);
return;
case KPERF_SAMPLING_OFF:
panic("kperf: timer fired at %llu, but sampling is disabled", now);
default:
panic("kperf: unknown sampling state 0x%x", status);
}
for (unsigned int i = 0; i < kptimer.g_ntimers; i++) {
struct kptimer *timer = &kptimer.g_timers[i];
if (timer->kt_period_abs == 0) {
continue;
}
uint64_t cpudeadline = kptimer_get_cpu_deadline(cpuid, i);
if (now > cpudeadline) {
uint64_t deadline = kptimer_fire(timer, i, cpudeadline, cpuid, now);
if (deadline == 0) {
kptimer_set_cpu_deadline(cpuid, i, EndOfAllTime);
} else {
kptimer_set_cpu_deadline(cpuid, i, deadline);
if (deadline < min_deadline) {
min_deadline = deadline;
}
}
}
}
if (min_deadline < UINT64_MAX) {
running_timer_enter(processor, RUNNING_TIMER_KPERF, NULL,
min_deadline, mach_absolute_time());
}
}
#pragma mark - start/stop
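/*
 * Run `fn` on every CPU and wait for all of them to finish.  On Intel,
 * `mp_cpus_call` handles the rendezvous itself; elsewhere, `xcsync` tracks
 * the outstanding CPUs and each handler acknowledges by calling
 * `kptimer_broadcast_ack`, waking the waiter when the count reaches zero.
 */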
static void
kptimer_broadcast(void (*fn)(void *))
{
ktrace_assert_lock_held();
#if defined(__x86_64__)
(void)mp_cpus_call(CPUMASK_ALL, ASYNC, fn, NULL);
#else /* defined(__x86_64__) */
_Atomic uint32_t xcsync = 0;
cpu_broadcast_xcall((uint32_t *)&xcsync, TRUE /* include self */, fn,
&xcsync);
#endif /* !defined(__x86_64__) */
}
static void
kptimer_broadcast_ack(void *arg)
{
#if defined(__x86_64__)
#pragma unused(arg)
#else /* defined(__x86_64__) */
_Atomic uint32_t *xcsync = arg;
int pending = os_atomic_dec(xcsync, relaxed);
if (pending == 0) {
thread_wakeup(xcsync);
}
#endif /* !defined(__x86_64__) */
}
static void
kptimer_sample_pet_remote(void * __unused arg)
{
if (!kperf_is_sampling()) {
return;
}
struct kptimer *timer = &kptimer.g_timers[kptimer.g_pet_timerid];
kptimer_sample_curcpu(timer->kt_actionid, kptimer.g_pet_timerid, 0);
}
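/*
 * On non-Intel platforms, the PET broadcast is delivered as a dedicated CPU
 * signal (`SIGPkppet`) rather than a cross call; `kperf_signal_handler` is
 * its entry point and runs the same remote sampling routine.
 */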
#if !defined(__x86_64__)
#include <arm/cpu_internal.h>
void kperf_signal_handler(void);
void
kperf_signal_handler(void)
{
kptimer_sample_pet_remote(NULL);
}
#endif /* !defined(__x86_64__) */
static void
kptimer_broadcast_pet(void)
{
#if defined(__x86_64__)
(void)mp_cpus_call(CPUMASK_OTHERS, NOSYNC, kptimer_sample_pet_remote,
NULL);
#else /* defined(__x86_64__) */
int curcpu = cpu_number();
for (int i = 0; i < machine_info.logical_cpu_max; i++) {
if (i != curcpu) {
cpu_signal(cpu_datap(i), SIGPkppet, NULL, NULL);
}
}
#endif /* !defined(__x86_64__) */
}
static void
kptimer_pet_handler(void * __unused param1, void * __unused param2)
{
if (!kptimer.g_pet_active) {
return;
}
struct kptimer *timer = &kptimer.g_timers[kptimer.g_pet_timerid];
BUF_DATA(PERF_TM_FIRE, kptimer.g_pet_timerid, 1, timer->kt_period_abs,
timer->kt_actionid);
	/*
	 * To get the on-CPU samples as close to this timer fire as possible,
	 * first broadcast to the other CPUs so they sample themselves.
	 */
kptimer_broadcast_pet();
	/*
	 * Wake up the PET thread afterwards so it's not inadvertently sampled (it's a
	 * high-priority kernel thread). If the scheduler needs to IPI to run it,
	 * that IPI will be handled after the IPIs issued during the broadcast.
	 */
kppet_wake_thread();
	/*
	 * Finally, sample this CPU, whose stack and state have been preserved while
	 * running this handler. Make sure to include system measurements.
	 */
kptimer_sample_curcpu(timer->kt_actionid, kptimer.g_pet_timerid,
SAMPLE_FLAG_SYSTEM);
BUF_INFO(PERF_TM_FIRE | DBG_FUNC_END);
/*
* The PET thread will re-arm the timer when it's done.
*/
}
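/*
 * Re-arm the PET timer once the PET thread finishes a sampling pass.  The
 * next period is shortened by the duration of that pass so fires stay near
 * the configured cadence.  For illustration (values hypothetical): with a
 * 10ms period and a pass that took 2ms, the timer is re-armed 8ms out,
 * subject to the PET minimum period.
 */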
void
kptimer_pet_enter(uint64_t sampledur_abs)
{
if (!kperf_is_sampling()) {
return;
}
uint64_t period_abs = kptimer.g_timers[kptimer.g_pet_timerid].kt_period_abs;
uint64_t orig_period_abs = period_abs;
if (period_abs > sampledur_abs) {
period_abs -= sampledur_abs;
}
period_abs = MAX(kptimer_min_period_abs(true), period_abs);
uint64_t deadline_abs = mach_absolute_time() + period_abs;
BUF_INFO(PERF_PET_SCHED, orig_period_abs, period_abs, sampledur_abs,
deadline_abs);
timer_call_enter(&kptimer.g_pet_timer, deadline_abs, TIMER_CALL_SYS_CRITICAL);
}
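/*
 * Dead-reckon each active timer's deadline from this CPU's perspective,
 * record the results in the per-CPU deadline array, and return the earliest
 * deadline, or UINT64_MAX if no timers are active.
 */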
static uint64_t
kptimer_earliest_deadline(processor_t processor, uint64_t now)
{
uint64_t min_deadline = UINT64_MAX;
for (unsigned int i = 0; i < kptimer.g_ntimers; i++) {
struct kptimer *timer = &kptimer.g_timers[i];
uint64_t cur_deadline = timer->kt_cur_deadline;
if (cur_deadline == 0) {
continue;
}
cur_deadline = dead_reckon_deadline(now, cur_deadline,
timer->kt_period_abs);
kptimer_set_cpu_deadline(processor->cpu_id, i, cur_deadline);
if (cur_deadline < min_deadline) {
min_deadline = cur_deadline;
}
}
return min_deadline;
}
void kptimer_running_setup(processor_t processor, uint64_t now);
void
kptimer_running_setup(processor_t processor, uint64_t now)
{
uint64_t deadline = kptimer_earliest_deadline(processor, now);
if (deadline < UINT64_MAX) {
running_timer_setup(processor, RUNNING_TIMER_KPERF, NULL, deadline,
now);
}
}
static void
kptimer_start_cpu(processor_t processor)
{
uint64_t now = mach_absolute_time();
uint64_t deadline = kptimer_earliest_deadline(processor, now);
if (deadline < UINT64_MAX) {
running_timer_enter(processor, RUNNING_TIMER_KPERF, NULL, deadline,
now);
}
}
static void
kptimer_start_remote(void *arg)
{
kptimer_start_cpu(current_processor());
kptimer_broadcast_ack(arg);
}
static void
kptimer_stop_cpu(processor_t processor)
{
for (unsigned int i = 0; i < kptimer.g_ntimers; i++) {
kptimer_set_cpu_deadline(processor->cpu_id, i, EndOfAllTime);
}
running_timer_cancel(processor, RUNNING_TIMER_KPERF);
}
void
kptimer_stop_curcpu(void)
{
kptimer_stop_cpu(current_processor());
}
static void
kptimer_stop_remote(void * __unused arg)
{
assert(ml_get_interrupts_enabled() == FALSE);
kptimer_stop_cpu(current_processor());
kptimer_broadcast_ack(arg);
}
/*
* Called when a CPU is brought online. Handles the cases where the kperf timer may have
* been either enabled or disabled while the CPU was offline (preventing the enabling/disabling
* IPIs from reaching this CPU).
*/
void
kptimer_curcpu_up(void)
{
enum kperf_sampling status = os_atomic_load(&kperf_status, acquire);
processor_t processor = current_processor();
assert(ml_get_interrupts_enabled() == FALSE);
	/*
	 * If kperf was enabled while this CPU was offline, the CPU missed the
	 * enabling IPI, so arm the timer here.  If the CPU went offline after
	 * kperf was already enabled, recompute the deadline (a timer update may
	 * have been missed) and keep the timer armed.
	 */
if (status == KPERF_SAMPLING_ON) {
kptimer_start_cpu(processor);
} else {
		/*
		 * Similarly, if the CPU armed the kperf timer before going down
		 * and kperf is now disabled, cancel the kperf running timer on
		 * this CPU.
		 */
kptimer_stop_cpu(processor);
}
}
void
kptimer_start(void)
{
ktrace_assert_lock_held();
if (kptimer.g_started) {
return;
}
uint64_t now = mach_absolute_time();
unsigned int ntimers_active = 0;
kptimer.g_started = true;
for (unsigned int i = 0; i < kptimer.g_ntimers; i++) {
struct kptimer *timer = &kptimer.g_timers[i];
if (timer->kt_period_abs == 0 || timer->kt_actionid == 0) {
/*
* No period or action means the timer is inactive.
*/
continue;
} else if (!kppet_get_lightweight_pet() &&
i == kptimer.g_pet_timerid) {
kptimer.g_pet_active = true;
timer_call_enter(&kptimer.g_pet_timer, now + timer->kt_period_abs,
TIMER_CALL_SYS_CRITICAL);
} else {
timer->kt_cur_deadline = now + timer->kt_period_abs;
ntimers_active++;
}
}
if (ntimers_active > 0) {
kptimer_broadcast(kptimer_start_remote);
}
}
void
kptimer_stop(void)
{
ktrace_assert_lock_held();
if (!kptimer.g_started) {
return;
}
int intrs_en = ml_set_interrupts_enabled(FALSE);
if (kptimer.g_pet_active) {
kptimer.g_pet_active = false;
timer_call_cancel(&kptimer.g_pet_timer);
}
kptimer.g_started = false;
kptimer_broadcast(kptimer_stop_remote);
for (unsigned int i = 0; i < kptimer.g_ntimers; i++) {
kptimer.g_timers[i].kt_cur_deadline = 0;
}
ml_set_interrupts_enabled(intrs_en);
}
#pragma mark - accessors
int
kptimer_get_period(unsigned int timerid, uint64_t *period_abs)
{
if (timerid >= kptimer.g_ntimers) {
return EINVAL;
}
*period_abs = kptimer.g_timers[timerid].kt_period_abs;
return 0;
}
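/*
 * Set the period of a timer; a period of 0 deactivates it.  Non-zero periods
 * below the applicable minimum are silently clamped up to it.  For
 * illustration (values hypothetical): requesting a 5us period on x86_64 in
 * the foreground yields the 20us minimum instead.
 */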
int
kptimer_set_period(unsigned int timerid, uint64_t period_abs)
{
if (timerid >= kptimer.g_ntimers) {
return EINVAL;
}
if (kptimer.g_started) {
return EBUSY;
}
bool pet = kptimer.g_pet_timerid == timerid;
uint64_t min_period = kptimer_min_period_abs(pet);
if (period_abs != 0 && period_abs < min_period) {
period_abs = min_period;
}
if (pet && !kppet_get_lightweight_pet()) {
kppet_config(kptimer.g_timers[timerid].kt_actionid);
}
kptimer.g_timers[timerid].kt_period_abs = period_abs;
return 0;
}
int
kptimer_get_action(unsigned int timerid, unsigned int *actionid)
{
if (timerid >= kptimer.g_ntimers) {
return EINVAL;
}
*actionid = kptimer.g_timers[timerid].kt_actionid;
return 0;
}
int
kptimer_set_action(unsigned int timerid, unsigned int actionid)
{
if (timerid >= kptimer.g_ntimers) {
return EINVAL;
}
if (kptimer.g_started) {
return EBUSY;
}
kptimer.g_timers[timerid].kt_actionid = actionid;
if (kptimer.g_pet_timerid == timerid && !kppet_get_lightweight_pet()) {
kppet_config(actionid);
}
return 0;
}
unsigned int
kptimer_get_count(void)
{
return kptimer.g_ntimers;
}
int
kptimer_set_count(unsigned int count)
{
kptimer_setup();
if (kptimer.g_started) {
return EBUSY;
}
if (count > KPTIMER_MAX) {
return EINVAL;
}
kptimer.g_ntimers = count;
return 0;
}
uint64_t
kptimer_min_period_abs(bool pet)
{
enum kptimer_period_limit limit = 0;
if (ktrace_background_active()) {
limit = pet ? KTPL_BG_PET : KTPL_BG;
} else {
limit = pet ? KTPL_FG_PET : KTPL_FG;
}
return kptimer_minperiods_mtu[limit];
}
uint32_t
kptimer_get_pet_timerid(void)
{
return kptimer.g_pet_timerid;
}
int
kptimer_set_pet_timerid(uint32_t petid)
{
if (kptimer.g_started) {
return EBUSY;
}
if (petid >= kptimer.g_ntimers) {
kppet_config(0);
} else {
kppet_config(kptimer.g_timers[petid].kt_actionid);
uint64_t period_abs = MAX(kptimer_min_period_abs(true),
kptimer.g_timers[petid].kt_period_abs);
kptimer.g_timers[petid].kt_period_abs = period_abs;
}
kptimer.g_pet_timerid = petid;
return 0;
}