#include "hvtest_arm64.h"
#include "hvtest_guest.h"

#include <ptrauth.h>
#include <darwintest.h>
#include <darwintest_perf.h>
#include <mach/mach.h>
#include <stdatomic.h>
#include <stdlib.h>

T_GLOBAL_META(
	T_META_NAMESPACE("xnu.arm.hv"),
	T_META_REQUIRES_SYSCTL_EQ("kern.hv_support", 1),
	// Temporary workaround for not providing an x86_64 slice
	T_META_REQUIRES_SYSCTL_EQ("hw.optional.arm64", 1),
	T_META_RADAR_COMPONENT_NAME("xnu"),
	T_META_RADAR_COMPONENT_VERSION("arm - hypervisor"),
	T_META_OWNER("p_newman")
	);

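// Load a guest PC from a host function symbol. ptrauth_strip() removes the
// pointer authentication signature so the raw address can be written to the
// guest's PC register.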
#define SET_PC(vcpu, symbol) \
{ \
	vcpu_entry_function entry = ptrauth_strip(&symbol, 0); \
	uint64_t entry_addr = (uintptr_t)entry; \
	(void)hv_vcpu_set_reg(vcpu, HV_REG_PC, entry_addr); \
}

// Note that expect_*(), set_reg(), and get_reg() cannot be used in benchmarks,
// as the T_ASSERT() checks they perform are severely detrimental to results.
//
// The helpers below should be used in their place.

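// Move the guest PC over a 4-byte instruction (forward or back) without the
// T_ASSERT() overhead of set_reg()/get_reg().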
static void
quick_bump_pc(hv_vcpu_t vcpu, const bool forward)
{
	uint64_t pc;
	(void)hv_vcpu_get_reg(vcpu, HV_REG_PC, &pc);
	pc = forward ? pc + 4 : pc - 4;
	(void)hv_vcpu_set_reg(vcpu, HV_REG_PC, pc);
}

static void
vtimer_benchmark(hv_vcpu_t vcpu, hv_vcpu_exit_t *exit)
{
	dt_stat_thread_cycles_t stat = dt_stat_thread_cycles_create(
		"VTimer interruption");
	SET_PC(vcpu, spin_vcpu_entry);
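	// CVAL=0 places the compare value in the past; CTL=1 sets ENABLE with
	// IMASK clear, so the virtual timer fires as soon as the guest runs.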
	set_sys_reg(vcpu, HV_SYS_REG_CNTV_CVAL_EL0, 0);
	set_sys_reg(vcpu, HV_SYS_REG_CNTV_CTL_EL0, 1);
	// Dry-run twice to ensure that the timer is re-armed.
	run_to_next_vm_fault(vcpu, exit);
	T_ASSERT_EQ_UINT(exit->reason, HV_EXIT_REASON_VTIMER_ACTIVATED,
	    "check for timer");
	hv_vcpu_set_vtimer_mask(vcpu, false);
	run_to_next_vm_fault(vcpu, exit);
	T_ASSERT_EQ_UINT(exit->reason, HV_EXIT_REASON_VTIMER_ACTIVATED,
	    "check for timer");
	hv_vcpu_set_vtimer_mask(vcpu, false);
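	// Each hv_vcpu_run() now exits with a VTimer-activated event; clearing
	// the mask re-arms delivery for the next iteration.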
	T_STAT_MEASURE_LOOP(stat) {
		hv_vcpu_run(vcpu);
		hv_vcpu_set_vtimer_mask(vcpu, false);
	}
	dt_stat_finalize(stat);
	// Disable the timer before running other benchmarks, otherwise they will be
	// interrupted.
	set_sys_reg(vcpu, HV_SYS_REG_CNTV_CTL_EL0, 0);
}

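// Measure trap-handling cost in batches: X0 tells the guest loop how many
// trapping iterations to run, and the guest signals completion with a final
// HVC (consumed by expect_hvc()). For traps that don't advance the guest PC,
// increment_pc steps past the faulting instruction after each exit.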
static void
trap_benchmark(dt_stat_thread_cycles_t trap_stat, hv_vcpu_t vcpu,
    hv_vcpu_exit_t *exit, const uint64_t batch, const bool increment_pc)
{
	while (!dt_stat_stable(trap_stat)) {
		set_reg(vcpu, HV_REG_X0, batch);
		dt_stat_token start = dt_stat_thread_cycles_begin(trap_stat);
		for (uint32_t i = 0; i < batch; i++) {
			hv_vcpu_run(vcpu);
			if (increment_pc) {
				quick_bump_pc(vcpu, true);
			}
		}
		dt_stat_thread_cycles_end_batch(trap_stat, (int)batch, start);
		expect_hvc(vcpu, exit, 2);
	}
	dt_stat_finalize(trap_stat);
}

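// With the VMM-visible HCR_TACR bit clear, guest ACTLR_EL1 reads are handled
// by the host kernel instead of being forwarded here: the whole MRS loop runs
// under a single hv_vcpu_run() call, and only the terminating HVC is visible
// as an exit.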
static void
mrs_bench_kernel(hv_vcpu_t vcpu, hv_vcpu_exit_t *exit, const char *name)
{
	const uint64_t batch = 1000;
	SET_PC(vcpu, mrs_actlr_bench_loop);
	set_control(vcpu, _HV_CONTROL_FIELD_HCR,
	    get_control(vcpu, _HV_CONTROL_FIELD_HCR) & ~HCR_TACR);
	dt_stat_thread_cycles_t stat = dt_stat_thread_cycles_create(name);
	while (!dt_stat_stable(stat)) {
		set_reg(vcpu, HV_REG_X0, batch);
		dt_stat_token start = dt_stat_thread_cycles_begin(stat);
		hv_vcpu_run(vcpu);
		dt_stat_thread_cycles_end_batch(stat, (int)batch, start);
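		// Only the guest's terminating HVC should be visible as an exit
		// (ESR_EL2.EC 0x16: HVC executed from AArch64).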
		T_QUIET; T_ASSERT_EQ_UINT(exit->reason, HV_EXIT_REASON_EXCEPTION,
		    "check for exception");
		T_QUIET; T_ASSERT_EQ((uint32_t)exit->exception.syndrome >> 26, 0x16,
		    "check for HVC64");
	}
	dt_stat_finalize(stat);
}

static void *
trap_bench_monitor(void *arg __unused, hv_vcpu_t vcpu, hv_vcpu_exit_t *exit)
{
	// In all benchmark testcases using quick_run_vcpu(), dry run all guest code
	// to fault in pages so that run_to_next_vm_fault() isn't needed while
	// recording measurements.

	vtimer_benchmark(vcpu, exit);

	// dry-run hvc_bench_loop
	SET_PC(vcpu, hvc_bench_loop);
	set_reg(vcpu, HV_REG_X0, 1);
	expect_hvc(vcpu, exit, 1);
	expect_hvc(vcpu, exit, 2);

	SET_PC(vcpu, hvc_bench_loop);
	trap_benchmark(dt_stat_thread_cycles_create("HVC handled by VMM"),
	    vcpu, exit, 1000, false);

	// dry-run data_abort_bench_loop
	SET_PC(vcpu, data_abort_bench_loop);
	set_reg(vcpu, HV_REG_X0, 1);
	expect_trapped_store(vcpu, exit, get_reserved_start());
	expect_hvc(vcpu, exit, 2);

	SET_PC(vcpu, data_abort_bench_loop);
	trap_benchmark(dt_stat_thread_cycles_create("data abort handled by VMM"),
	    vcpu, exit, 1000, true);

	// dry-run mrs_actlr_bench_loop
	SET_PC(vcpu, mrs_actlr_bench_loop);
	set_reg(vcpu, HV_REG_X0, 1);
	set_control(vcpu, _HV_CONTROL_FIELD_HCR,
	    get_control(vcpu, _HV_CONTROL_FIELD_HCR) & ~HCR_TACR);
	// Confirm no visible trap from MRS
	expect_hvc(vcpu, exit, 2);

	mrs_bench_kernel(vcpu, exit, "MRS trap handled by kernel");

	SET_PC(vcpu, mrs_actlr_bench_loop);
	set_reg(vcpu, HV_REG_X0, 1);
	set_control(vcpu, _HV_CONTROL_FIELD_HCR,
	    get_control(vcpu, _HV_CONTROL_FIELD_HCR) | HCR_TACR);
	// Confirm MRS trap from test loop
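	// (ESR_EL2.EC 0x18: trapped MSR/MRS/system-register access)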
	expect_exception(vcpu, exit, 0x18);
	quick_bump_pc(vcpu, true);
	expect_hvc(vcpu, exit, 2);
	SET_PC(vcpu, mrs_actlr_bench_loop);
	trap_benchmark(dt_stat_thread_cycles_create("MRS trap handled by VMM"),
	    vcpu, exit, 1000, true);

	SET_PC(vcpu, activate_debug);
	expect_hvc(vcpu, exit, 0);

	SET_PC(vcpu, hvc_bench_loop);
	trap_benchmark(dt_stat_thread_cycles_create(
		    "debug-enabled HVC handled by VMM"), vcpu, exit, 1000, false);

	mrs_bench_kernel(vcpu, exit, "debug-enabled MRS trap handled by kernel");

	return NULL;
}

T_DECL(trap_benchmark, "trap-processing benchmark")
{
	vm_setup();
	pthread_t vcpu_thread = create_vcpu_thread(hvc_bench_loop, 0,
	    trap_bench_monitor, NULL);
	T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu_thread, NULL), "join vcpu");
	vm_cleanup();
}

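// The leader and follower ping-pong on these semaphores; `stage` tells the
// follower which benchmark phase the leader is currently measuring.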
static semaphore_t sem1;
static semaphore_t sem2;
static _Atomic uint32_t stage;

static void
switch_and_return(bool leader)
{
	// semaphore_wait_signal() atomically signals the peer and blocks until
	// the peer signals back (arguments: wait_semaphore, signal_semaphore).
	(void)semaphore_wait_signal(leader ? sem2 : sem1, leader ? sem1 : sem2);
}

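// Leader side of the switch benchmarks. Each measured iteration runs this
// VCPU once, then hands the shared core to the follower and waits to be
// signaled back; `stage` advances the follower through its matching loops.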
static void *
vcpu_switch_leader(void *arg __unused, hv_vcpu_t vcpu, hv_vcpu_exit_t *exit)
{
	dt_stat_thread_cycles_t baseline = dt_stat_thread_cycles_create(
		"baseline VCPU run, no switch");
	dt_stat_thread_cycles_t thread = dt_stat_thread_cycles_create(
		"VCPU-thread switch");
	dt_stat_thread_cycles_t basic = dt_stat_thread_cycles_create(
		"basic VCPU-VCPU switch");
	dt_stat_thread_cycles_t baseline_debug = dt_stat_thread_cycles_create(
		"baseline debug-enabled VCPU run, no switch");
	dt_stat_thread_cycles_t basic_debug = dt_stat_thread_cycles_create(
		"basic VCPU <-> debug-enabled VCPU switch");
	dt_stat_thread_cycles_t debug_debug = dt_stat_thread_cycles_create(
		"debug-enabled VCPU <-> debug-enabled VCPU switch");

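	// Pin to the same core as the follower so each handoff forces a VCPU
	// state switch on a single core.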
	bind_to_cpu(0);

	// Activate minimal VCPU state
	SET_PC(vcpu, hvc_loop);
	expect_hvc(vcpu, exit, 0);
	T_STAT_MEASURE_LOOP(baseline) {
		hv_vcpu_run(vcpu);
	}
	dt_stat_finalize(baseline);

	T_STAT_MEASURE_LOOP(thread) {
		hv_vcpu_run(vcpu);
		switch_and_return(true);
	}
	dt_stat_finalize(thread);
	atomic_store_explicit(&stage, 1, memory_order_relaxed);

	T_STAT_MEASURE_LOOP(basic) {
		hv_vcpu_run(vcpu);
		switch_and_return(true);
	}
	dt_stat_finalize(basic);
	atomic_store_explicit(&stage, 2, memory_order_relaxed);

	T_STAT_MEASURE_LOOP(basic_debug) {
		hv_vcpu_run(vcpu);
		switch_and_return(true);
	}
	dt_stat_finalize(basic_debug);
	atomic_store_explicit(&stage, 3, memory_order_relaxed);

	SET_PC(vcpu, activate_debug);
	expect_hvc(vcpu, exit, 0);
	SET_PC(vcpu, hvc_loop);
	T_STAT_MEASURE_LOOP(baseline_debug) {
		hv_vcpu_run(vcpu);
	}
	dt_stat_finalize(baseline_debug);

	T_STAT_MEASURE_LOOP(debug_debug) {
		hv_vcpu_run(vcpu);
		switch_and_return(true);
	}
	dt_stat_finalize(debug_debug);
	atomic_store_explicit(&stage, 4, memory_order_relaxed);

	T_ASSERT_MACH_SUCCESS(semaphore_signal(sem1), "final signal to follower");

	return NULL;
}

static void *
vcpu_switch_follower(void *arg __unused, hv_vcpu_t vcpu, hv_vcpu_exit_t *exit)
{
	bind_to_cpu(0);

	// Don't signal until we've been signaled once.
	T_ASSERT_MACH_SUCCESS(semaphore_wait(sem1),
	    "wait for first signal from leader");

	// For a baseline, don't enter the VCPU at all. This should result in a
	// negligible VCPU switch cost.
	while (atomic_load_explicit(&stage, memory_order_relaxed) == 0) {
		switch_and_return(false);
	}

	// Enter the VCPU once to activate a minimal amount of state.
	SET_PC(vcpu, hvc_loop);
	expect_hvc(vcpu, exit, 0);

	while (atomic_load_explicit(&stage, memory_order_relaxed) == 1) {
		hv_vcpu_run(vcpu);
		switch_and_return(false);
	}

	// Use debug state
	SET_PC(vcpu, activate_debug);
	expect_hvc(vcpu, exit, 0);
	SET_PC(vcpu, hvc_loop);

	while (atomic_load_explicit(&stage, memory_order_relaxed) == 2) {
		hv_vcpu_run(vcpu);
		switch_and_return(false);
	}

	while (atomic_load_explicit(&stage, memory_order_relaxed) == 3) {
		hv_vcpu_run(vcpu);
		switch_and_return(false);
	}

	return NULL;
}

T_DECL(vcpu_switch_benchmark, "vcpu state-switching benchmarks",
    T_META_BOOTARGS_SET("enable_skstb=1"))
{
	bind_to_cpu(0);

	T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &sem1,
	    SYNC_POLICY_FIFO, 0), "semaphore_create 1");
	T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &sem2,
	    SYNC_POLICY_FIFO, 0), "semaphore_create 2");

	vm_setup();
	pthread_t vcpu1_thread = create_vcpu_thread(hvc_loop, 0,
	    vcpu_switch_leader, NULL);
	pthread_t vcpu2_thread = create_vcpu_thread(hvc_loop, 0,
	    vcpu_switch_follower, NULL);

	T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu1_thread, NULL), "join vcpu1");
	T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu2_thread, NULL), "join vcpu2");

	vm_cleanup();
}

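// Time a batch of benign kernel round-trips. This gauges how much concurrent
// guest activity on other VCPUs slows unrelated kernel entry/exit paths.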
static void
kernel_trap_loop(const char *desc)
{
	const int batch = 10000;
	dt_stat_time_t s = dt_stat_time_create(desc);
	while (!dt_stat_stable(s)) {
		dt_stat_token start = dt_stat_time_begin(s);
		for (int i = 0; i < batch; i++) {
			// We don't care what this does, just that it enters and leaves the
			// kernel and doesn't interfere with the test.
			(void)hv_test_trap(TRAP_HV_VCPU_SYSREGS_SYNC, 0);
		}
		dt_stat_time_end_batch(s, batch, start);
	}
	dt_stat_finalize(s);
}

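// Monitor that lets the guest run uninterrupted until hv_vcpus_exit()
// cancels it from the main thread.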
static void *
invalidate_monitor(void *arg __unused, hv_vcpu_t vcpu, hv_vcpu_exit_t *exit)
{
	run_to_next_vm_fault(vcpu, exit);
	T_ASSERT_EQ_UINT(exit->reason, HV_EXIT_REASON_CANCELED, "vcpu interrupted");
	return NULL;
}

T_DECL(icache_inval_bench, "measure perf impact of IC IALLU flood",
    // Don't run this on hardware without the filters, as the impact could
    // cause timeouts in the host OS.
    T_META_REQUIRES_SYSCTL_EQ("hw.optional.ic_inval_filters", 1))
{
	vm_setup();

#define inval_vcpus 4
	pthread_t inval_threads[inval_vcpus];
	for (uint32_t i = 0; i < inval_vcpus; i++) {
		inval_threads[i] = create_vcpu_thread(ic_iallu_vcpu_entry, 0,
		    invalidate_monitor, NULL);
	}

	kernel_trap_loop("kernel trap while VCPUs executing IC IALLU");

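	// hv_vcpus_exit() takes VCPU handles; this assumes handles are assigned
	// sequentially from 0 as the threads are created.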
	hv_vcpu_t inval_vcpu_array[inval_vcpus];
	for (uint32_t i = 0; i < inval_vcpus; i++) {
		inval_vcpu_array[i] = i;
	}
	T_ASSERT_EQ(hv_vcpus_exit(inval_vcpu_array, inval_vcpus), HV_SUCCESS,
	    "cancel inval vcpus");
	for (uint32_t i = 0; i < inval_vcpus; i++) {
		T_ASSERT_POSIX_SUCCESS(pthread_join(inval_threads[i], NULL),
		    "join inval_threads[%u]", i);
	}

	// Repeat with the same number of VCPUs, but with guests that merely spin,
	// to provide a baseline measurement.
	for (uint32_t i = 0; i < inval_vcpus; i++) {
		inval_threads[i] = create_vcpu_thread(spin_vcpu_entry, 0,
		    invalidate_monitor, NULL);
	}

	kernel_trap_loop("kernel trap while VCPUs spinning");

	for (uint32_t i = 0; i < inval_vcpus; i++) {
		inval_vcpu_array[i] = i;
	}
	T_ASSERT_EQ(hv_vcpus_exit(inval_vcpu_array, inval_vcpus), HV_SUCCESS,
	    "cancel inval vcpus");
	for (uint32_t i = 0; i < inval_vcpus; i++) {
		T_ASSERT_POSIX_SUCCESS(pthread_join(inval_threads[i], NULL),
		    "join inval_threads[%u]", i);
	}
	vm_cleanup();
}

struct thread_params {
	uint32_t id;
	uint32_t iter;
	pthread_t thread;
};

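// Measure the cost of hv_vcpus_exit() on this thread's own VCPU. The VCPU is
// never run here; the benchmark targets the API call itself.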
static void *
run_cancel_monitor(void *arg, hv_vcpu_t vcpu, hv_vcpu_exit_t *exit __unused)
{
	struct thread_params *param = (struct thread_params *)arg;
	dt_stat_time_t s = dt_stat_time_create("hv_vcpus_exit time vcpu%u",
	    param->id);
	while (!dt_stat_stable(s)) {
		dt_stat_token start = dt_stat_time_begin(s);
		for (uint32_t i = 0; i < param->iter; i++) {
			hv_vcpus_exit(&vcpu, 1);
		}
		dt_stat_time_end_batch(s, (int)param->iter, start);
	}
	dt_stat_finalize(s);
	return NULL;
}

static void
run_cancel_call(uint32_t vcpu_count, uint32_t iter)
{
	struct thread_params *threads = calloc(vcpu_count, sizeof(*threads));
	vm_setup();
	for (uint32_t i = 0; i < vcpu_count; i++) {
		threads[i].id = i;
		threads[i].iter = iter;
		threads[i].thread = create_vcpu_thread(hvc_loop, 0, run_cancel_monitor,
		    &threads[i]);
	}
	for (uint32_t i = 0; i < vcpu_count; i++) {
		T_ASSERT_POSIX_SUCCESS(pthread_join(threads[i].thread, NULL),
		    "join vcpu%u", i);
	}
	free(threads);
	vm_cleanup();
}

T_DECL(api_benchmarks, "API call parallel performance")
{
	run_cancel_call(1, 1000);
	run_cancel_call(4, 1000);
}