/*
 * Copyright (c) 2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * BTI Telemetry is a debug feature intended to support a safer, slower rollout
 * of ARMv8.5's Branch Target Identification (BTI) in and across the kernel.
 * Telemetry mode converts normally fatal BTI exceptions into non-fatal,
 * analytics-generating events.
 */
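
/*
 * For orientation, a minimal sketch of how the synchronous exception handler
 * is expected to hand BTI faults to this module. This is illustrative only;
 * the dispatch site and the exception-class label below are assumptions, not
 * the actual sleh implementation.
 *
 *   #if CONFIG_BTI_TELEMETRY
 *   case ESR_EC_BTI_FAIL:        // hypothetical exception-class label
 *           if (bti_telemetry_handle_exception(state)) {
 *                   return;      // recovered; resume at the faulting instruction
 *           }
 *           break;
 *   #endif
 */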

#include <libkern/coreanalytics/coreanalytics.h>
#include <kern/percpu.h>
#include <arm64/bti_telemetry.h>
#include <libkern/tree.h>
#include <kern/locks.h>
#include <kern/thread_call.h>
#include <kern/kalloc.h>
#include <kern/cpu_data.h>
#include <machine/machine_routines.h>
#include <vm/pmap.h>
#include <libkern/OSKextLibPrivate.h>
#include <libkern/kernel_mach_header.h>
#include <kern/assert.h>

#ifdef CONFIG_BTI_TELEMETRY
#define TAG "[bti_telemetry] "

/* ~* Module Configuration *~ */
/**
 * Enable reporting via CoreAnalytics in addition to local gathering.
 */
#define BTI_TELEMETRY_USE_CORE_ANALYTICS (1)

typedef struct bti_telemetry_record {
	SPLAY_ENTRY(bti_telemetry_record) link;

	/** Slid address at which the exception was thrown */
	uintptr_t faulting_address;

	/** The raw BTYPE for this exception */
	uint8_t branch_type;
} bti_telemetry_record_s;

/* ~* Core Analytics *~ */
CA_EVENT(arm_bti_exceptions,
    CA_INT, branch_type,
    CA_INT, faulting_offset,
    CA_STATIC_STRING(CA_UUID_LEN), faulting_uuid);

/* ~* Splay tree *~ */
static int
bti_telemetry_record_compare(bti_telemetry_record_s *a1_r,
    bti_telemetry_record_s *a2_r)
{
	/* Compare on fault address */
	if (a1_r->faulting_address > a2_r->faulting_address) {
		return 1;
	} else if (a1_r->faulting_address < a2_r->faulting_address) {
		return -1;
	}

	/* Same address but different BTI exception type? */
	if (a1_r->branch_type > a2_r->branch_type) {
		return 1;
	} else if (a1_r->branch_type < a2_r->branch_type) {
		return -1;
	} else {
		return 0;
	}
}

SPLAY_HEAD(bti_telemetry_tree, bti_telemetry_record);
/* These functions are generated by SPLAY_PROTOTYPE but are currently unused. */
__unused static struct bti_telemetry_record *bti_telemetry_tree_SPLAY_NEXT(
	struct bti_telemetry_tree *head, struct bti_telemetry_record *elm);
__unused static struct bti_telemetry_record *bti_telemetry_tree_SPLAY_SEARCH(
	struct bti_telemetry_tree *head, struct bti_telemetry_record *elm);
__unused static struct bti_telemetry_record *bti_telemetry_tree_SPLAY_MIN_MAX(
	struct bti_telemetry_tree *head, int val);
SPLAY_PROTOTYPE(bti_telemetry_tree,
    bti_telemetry_record,
    link,
    bti_telemetry_record_compare);
SPLAY_GENERATE(bti_telemetry_tree,
    bti_telemetry_record,
    link,
    bti_telemetry_record_compare);

/* ~* Globals *~ */
/* Lock which protects the event submission queue */
static LCK_GRP_DECLARE(bti_telemetry_lock_grp, "bti_telemetry_lock");
static LCK_SPIN_DECLARE(bti_telemetry_lock, &bti_telemetry_lock_grp);

/*
 * Since BTI exceptions are, naturally, caught in an exception context, it is
 * not safe to allocate memory or perform other complex operations, such as
 * calling into CoreAnalytics. To solve this, we use a short submission ring
 * buffer which collects records for processing on the submission thread.
 *
 * This ring buffer is locked by BTI_TELEMETRY_LOCK.
 */
#define RECORD_SUBMISSION_BUFFER_LENGTH (16)
static bti_telemetry_record_s record_submission_buffer[RECORD_SUBMISSION_BUFFER_LENGTH];
static size_t rsb_rd_idx;
static size_t rsb_wr_idx;
static size_t rsb_count;
static bool rsb_is_draining;
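
/*
 * Ring buffer invariants (stated here for clarity; they follow from the
 * enqueue/dequeue arithmetic below and hold whenever BTI_TELEMETRY_LOCK is
 * not held):
 *
 *   0 <= rsb_count <= RECORD_SUBMISSION_BUFFER_LENGTH
 *   rsb_wr_idx == (rsb_rd_idx + rsb_count) % RECORD_SUBMISSION_BUFFER_LENGTH
 */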

/**
 * For local telemetry and deduplication, we store hit records in a splay tree.
 * We use a splay here for performance reasons since BTI exceptions exhibit a
 * degree of temporal locality.
 */
static struct bti_telemetry_tree telemetry_splay_tree;
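
/*
 * Usage sketch (mirroring the real call sites later in this file): lookups
 * for deduplication go through SPLAY_FIND in submit_telemetry_record(), and
 * new records are inserted with SPLAY_INSERT on the drain thread call:
 *
 *   if (SPLAY_FIND(bti_telemetry_tree, &telemetry_splay_tree, record)) {
 *           // already reported; drop the record
 *   }
 */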

/**
 * Flag indicating whether this CPU is currently trying to acquire the
 * telemetry lock or has already acquired the lock.
 * This is used as a deadlock avoidance mechanism.
 */
static uint8_t PERCPU_DATA(per_cpu_telemetry_lock_blocked);

/**
 * Thread call which is responsible for draining the submission buffer by
 * submitting its records to CoreAnalytics and the local tree.
 */
static struct thread_call *drain_record_submission_buffer_callout;

/* ~* Implementation *~ */
/**
 * Enqueue SRC into the record submission buffer. Returns true if successful,
 * false otherwise. BTI_TELEMETRY_LOCK must be held during this operation.
 */
static bool
rsb_enqueue_locked(bti_telemetry_record_s *src)
{
	if (rsb_count == RECORD_SUBMISSION_BUFFER_LENGTH) {
		return false;
	}

	rsb_count += 1;
	bti_telemetry_record_s *dst = record_submission_buffer + rsb_wr_idx;
	memcpy(dst, src, sizeof(bti_telemetry_record_s));
	rsb_wr_idx = (rsb_wr_idx + 1) % RECORD_SUBMISSION_BUFFER_LENGTH;

	return true;
}

/**
 * Try to acquire the telemetry spin lock in an interrupt-deadlock-safe way.
 *
 * This function differs from the standard lck_spin_try_lock function in that it
 * will block if the lock is expected to be acquired *eventually* but will not
 * block if it detects that the lock will never be acquired (such as when the
 * current CPU owns the lock, which can happen if a BTI exception is taken while
 * handling a telemetry operation under the lock).
 */
static inline bool OS_WARN_RESULT
safe_telemetry_lock_try_lock(void)
{
	uint8_t *telemetry_lock_blocked = NULL;

	/*
	 * Disable preemption to ensure that our block signal always corresponds
	 * to the CPU we're actually running on.
	 *
	 * If we did not disable preemption, there is a case where we may mark that
	 * we are trying to acquire the lock on core A, get approved, get preempted,
	 * get rescheduled on core B, and then take the lock there. If we then take
	 * a BTI exception on core B while handling the original exception (e.g. we
	 * take an IRQ and a BTI exception is generated there), we may re-enter on
	 * core B, (incorrectly) see that we are not blocked, try to acquire the
	 * lock, and ultimately deadlock.
	 */
	disable_preemption();

	telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
	if (!os_atomic_cmpxchg(telemetry_lock_blocked, 0, 1, relaxed)) {
		/*
		 * This CPU has already acquired/is blocked on the telemetry lock.
		 * Attempting to acquire again on this CPU will deadlock. Refuse the
		 * operation.
		 */
		enable_preemption();
		return false;
	}

	/* We've been approved to acquire the lock on this core! */
	lck_spin_lock(&bti_telemetry_lock);
	return true;
}
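
/*
 * Typical usage, as followed by submit_telemetry_record() below:
 *
 *   if (!safe_telemetry_lock_try_lock()) {
 *           return false;   // nested fault on this CPU; drop the record
 *   }
 *   ...touch record_submission_buffer / telemetry_splay_tree...
 *   safe_telemetry_lock_unlock();
 */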

/**
 * Acquire the telemetry lock, panicking if it cannot be acquired.
 */
static void
safe_telemetry_lock_lock(void)
{
	if (!safe_telemetry_lock_try_lock()) {
		panic("Unexpectedly could not acquire telemetry lock (nested acquire will deadlock)");
	}
}

/**
 * Unlock the telemetry lock after it was acquired with safe_telemetry_lock_try_lock.
 */
static inline void
safe_telemetry_lock_unlock(void)
{
	uint8_t *telemetry_lock_blocked = NULL;

	lck_spin_unlock(&bti_telemetry_lock);
	/*
	 * Clear the block only AFTER having dropped the lock so that we can't
	 * hit a really narrow deadlock race where we get interrupted between
	 * clearing the block and dropping the lock.
	 */
	telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
	os_atomic_store(telemetry_lock_blocked, (uint8_t)0, relaxed);

	/* Finally, reenable preemption as this thread is now safe to move */
	enable_preemption();
}

/**
 * Get the UUID and __TEXT_EXEC based offset of ADDR into its respective
 * binary image. Copy each into UUID and OFFSET. Returns negative on error.
 *
 * Acquires a sleeping lock, do not call while interrupts are disabled.
 */
static int
get_uuid_and_text_offset_for_addr(uintptr_t addr, uuid_t *uuid, size_t *offset)
{
	kernel_mach_header_t *mh = NULL;
	kernel_segment_command_t *seg_text_exec = NULL;
	void *mh_uuid = NULL;
	unsigned long mh_uuid_len = 0;

	if (!(mh = OSKextKextForAddress((void *)addr))) {
		return -1;
	}

	if (!(seg_text_exec = getsegbynamefromheader(mh, "__TEXT_EXEC"))) {
		return -2;
	}

	if (!(mh_uuid = getuuidfromheader(mh, &mh_uuid_len))) {
		return -3;
	}

	if (mh_uuid_len != sizeof(*uuid)) {
		return -4;
	}

	memcpy(uuid, mh_uuid, sizeof(*uuid));
	*offset = addr - seg_text_exec->vmaddr;

	return 0;
}

static void __unused
dump_telemetry_record(bti_telemetry_record_s *record,
    uuid_string_t uuid_str,
    size_t offset)
{
	printf(
		TAG "Unexpected BTI exception (pc=0x%08lx, BTYPE=%d)\n"
		TAG "\t<UUID: %s, offset: 0x%08lx>\n",
		record->faulting_address, record->branch_type,
		uuid_str, offset);
}
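
/*
 * Example console output from the above (address, UUID, and offset are
 * illustrative values only):
 *
 *   [bti_telemetry] Unexpected BTI exception (pc=0xfffffe0007abc120, BTYPE=2)
 *   [bti_telemetry]         <UUID: 1E2D3C4B-5A69-7887-96A5-B4C3D2E1F001, offset: 0x00004a20>
 */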

/**
 * Thread call which drains the record submission buffer.
 * There must be no more than one instance of this thread running at a time.
 */
static void
drain_record_submission_buffer_thread_call(__unused thread_call_param_t p0,
    __unused thread_call_param_t p1)
{
	size_t drain_count = 0;
	size_t drain_rd_idx = 0;
	bti_telemetry_record_s *record_allocations[RECORD_SUBMISSION_BUFFER_LENGTH];

	/*
	 * We never expect the submission thread to be scheduled while another
	 * handler is suspended above it (acquiring disables preemption) or while
	 * another submission thread is suspended above it (only one submission
	 * thread should ever be running). Thus, failing to acquire the lock
	 * indicates that something is seriously wrong.
	 */
	safe_telemetry_lock_lock();

	if (rsb_is_draining) {
		panic("Unexpectedly found multiple concurrent drains!");
	}
	rsb_is_draining = true;

	/*
	 * Iteratively drain the submission queue until no entries remain.
	 * Drops and reacquires the telemetry lock.
	 */
	while ((drain_count = rsb_count)) {
		/* LOCKED IN */
		drain_rd_idx = rsb_rd_idx;
		safe_telemetry_lock_unlock();

		/*
		 * It is safe to read these entries based on snapshots of DRAIN_COUNT
		 * and DRAIN_RD_IDX without holding the lock because all of the records'
		 * writes will have already become visible due to the lock's store
		 * release on the enqueue side. We may miss some records in this pass if
		 * they enqueue after the snapshot but we'll just pick them up in the
		 * next loop iteration. Additionally, since only one instance of this
		 * function will be running at a time, we don't need to worry about
		 * duplicate allocations/work.
		 */

		for (size_t i = 0; i < drain_count; i++) {
			/* Create persistent copies of the entries in the RSB */
			size_t rsb_i = (drain_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
			bti_telemetry_record_s *record_i = record_submission_buffer + rsb_i;

			bti_telemetry_record_s *new_record =
			    kalloc_type(bti_telemetry_record_s, Z_WAITOK | Z_NOFAIL);

			memcpy(new_record, record_i, sizeof(bti_telemetry_record_s));
			record_allocations[i] = new_record;
		}

		safe_telemetry_lock_lock();
		/* Insert all draining entries into the splay */
		for (size_t i = 0; i < drain_count; i++) {
			bti_telemetry_record_s *duplicate = SPLAY_INSERT(bti_telemetry_tree,
			    &telemetry_splay_tree,
			    record_allocations[i]);
			if (duplicate) {
				/*
				 * Since we scan both the RSB and the splay tree before
				 * submitting a record, we never expect to have multiple
				 * instances of any record. If this occurs, it's a bug!
				 */
				panic("Unexpected duplicate splay entry!");
			}
		}

		/* Dequeue the entries from the RSB */
		rsb_rd_idx =
		    (rsb_rd_idx + drain_count) % RECORD_SUBMISSION_BUFFER_LENGTH;
		rsb_count -= drain_count;
		safe_telemetry_lock_unlock();

		/* Report entries */
		for (size_t i = 0; i < drain_count; i++) {
			int result = 0;
			uuid_t uuid;
			uuid_string_t uuid_str;
			size_t offset = 0;
			bti_telemetry_record_s *record_i = record_allocations[i];

			if ((result = get_uuid_and_text_offset_for_addr(
				    record_i->faulting_address,
				    &uuid, &offset)) < 0) {
				/*
				 * We couldn't get the required data for symbolication for some
				 * odd reason. Report a NULL UUID and the address raw so we can
				 * track these invalid events.
				 */
				memset(&uuid, 0x00, sizeof(uuid));
				offset = VM_KERNEL_UNSLIDE(record_i->faulting_address);
			}
			uuid_unparse(uuid, uuid_str);

			/* Print events to the console for local debug */
			dump_telemetry_record(record_i, uuid_str, offset);

#if BTI_TELEMETRY_USE_CORE_ANALYTICS
			/* Report to CoreAnalytics */
			ca_event_t ca_event = CA_EVENT_ALLOCATE(arm_bti_exceptions);
			CA_EVENT_TYPE(arm_bti_exceptions) * event_data = ca_event->data;

			event_data->branch_type = record_i->branch_type;
			event_data->faulting_offset = offset;
			strlcpy(event_data->faulting_uuid, uuid_str, CA_UUID_LEN);

			CA_EVENT_SEND(ca_event);
#endif /* BTI_TELEMETRY_USE_CORE_ANALYTICS */
		}

		safe_telemetry_lock_lock();
		/* LOCKED OUT */
	}

	/* Done for now; if submitters enqueue more entries, they'll schedule another call. */
	rsb_is_draining = false;
	safe_telemetry_lock_unlock();
}

__startup_func
void
bti_telemetry_init(void)
{
	printf(TAG "bti_telemetry_init\n");
	SPLAY_INIT(&telemetry_splay_tree);

	drain_record_submission_buffer_callout = thread_call_allocate_with_options(
		drain_record_submission_buffer_thread_call, NULL,
		THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);

	if (!drain_record_submission_buffer_callout) {
		panic("Failed to allocate drain callout!");
	}
}

/**
 * Submit RECORD to the submission queue. Returns true if the record was
 * ingested (either enqueued or deduplicated), false otherwise.
 */
static bool
submit_telemetry_record(bti_telemetry_record_s *record)
{
	bool did_ingest = true;
	bool should_flush_submission_buffer = false;
	bti_telemetry_record_s *splay_found_record = NULL;
	if (!safe_telemetry_lock_try_lock()) {
		/*
		 * Failed to acquire the lock!
		 * We're likely in a nested exception. Since we can't safely do anything
		 * else with the record, just drop it.
		 */
		return false;
	}

	/* First, scan the submission queue for matching, queued records */
	for (size_t i = 0; i < rsb_count; i++) {
		size_t rsb_i = (rsb_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
		bti_telemetry_record_s *record_i = record_submission_buffer + rsb_i;
		if (bti_telemetry_record_compare(record, record_i) == 0) {
			/* Match, no need to report again. */
			goto DONE_LOCKED;
		}
	}

	/* Next, try for a record in the splay */
	splay_found_record = SPLAY_FIND(bti_telemetry_tree,
	    &telemetry_splay_tree,
	    record);
	if (splay_found_record) {
		/* Match, no need to report again. */
		goto DONE_LOCKED;
	}

	/*
	 * If we haven't hit anywhere, this means we have a new event that needs to
	 * be enqueued for reporting.
	 */
	did_ingest = rsb_enqueue_locked(record);
	should_flush_submission_buffer = did_ingest && !rsb_is_draining;

DONE_LOCKED:
	safe_telemetry_lock_unlock();

	if (should_flush_submission_buffer) {
		/*
		 * We submitted a new entry while the drain thread was either exiting or
		 * not running. Queue a new flush. Multiple calls here before the drain
		 * starts running will not result in multiple calls being queued due to
		 * THREAD_CALL_OPTIONS_ONCE.
		 */
		thread_call_enter(drain_record_submission_buffer_callout);
	}

	return did_ingest;
}

/** Convert a BTI exception frame into a telemetry record */
static void
generate_telemetry_record(arm_saved_state_t *state,
    bti_telemetry_record_s *record)
{
	uintptr_t pc = 0;
	uint64_t esr = 0;

	pc = get_saved_state_pc(state);
	esr = get_saved_state_esr(state);

	/* Generate the exception record */
	record->branch_type = (uint8_t)(esr & ISS_BTI_BTYPE_MASK);
	record->faulting_address = pc;
}

/*
 * Try to recover from a BTI exception. Returns true if we are able to recover,
 * false otherwise.
 */
static bool
recover_from_bti_exception(arm_saved_state_t *state)
{
	/*
	 * Since BTI raises on a mismatched PSTATE.BTYPE, we can simply clear BTYPE
	 * and directly return from the exception to continue executing as if
	 * the exception never happened.
	 */
	uint32_t psr = get_saved_state_cpsr(state);
	psr &= ~PSR_BTYPE_MASK;
	set_saved_state_cpsr(state, psr);

	return true;
}

bool
bti_telemetry_handle_exception(arm_saved_state_t *state)
{
	bti_telemetry_record_s record = { 0 };

	/* Generate the telemetry record and hand it to the submission thread */
	generate_telemetry_record(state, &record);
	(void)submit_telemetry_record(&record);

	/* Recover and prepare to keep executing */
	return recover_from_bti_exception(state);
}

#endif /* CONFIG_BTI_TELEMETRY */