/*
 * Copyright (c) 2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <arm/misc_protos.h>
#include <arm64/proc_reg.h>
#include <libkern/section_keywords.h>

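/* Highest SME version reported by ID_AA64PFR1_EL1; probed once on the boot CPU. */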
SECURITY_READ_ONLY_LATE(unsigned int) sme_version = 0;

/**
 * Returns the version of SME supported on this platform.
 *
 * In contrast to the compile-time HAS_ARM_FEAT_SME/HAS_ARM_FEAT_SME2 checks
 * that indicate compiler support, arm_sme_version() is a runtime check that
 * indicates actual processor support.
 *
 * @return the highest SME ISA version supported on this platform
 * (where 0 indicates no SME support)
 */
unsigned int
arm_sme_version(void)
{
	return sme_version;
}

#if HAS_ARM_FEAT_SME

#include <kern/assert.h>
#include <kern/cpu_data.h>
#include <kern/thread.h>
#include <string.h>

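/**
 * Detects SME support and configures the SME control registers on the current
 * CPU.
 *
 * @param is_boot_cpu whether this CPU is the boot CPU; the boot CPU
 * additionally probes ID_AA64PFR1_EL1 to determine the supported SME version
 */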
void
arm_sme_init(bool is_boot_cpu)
{
	if (is_boot_cpu) {
		uint64_t aa64pfr1_el1 = __builtin_arm_rsr64("ID_AA64PFR1_EL1");
		sme_version = (aa64pfr1_el1 & ID_AA64PFR1_EL1_SME_MASK) >> ID_AA64PFR1_EL1_SME_OFFSET;
	}

	if (!sme_version) {
		return;
	}

	/* enable SME at EL1 only */
	uint64_t cpacr_el1 = __builtin_arm_rsr64("CPACR_EL1");
	cpacr_el1 &= ~CPACR_SMEN_MASK;
	cpacr_el1 |= CPACR_SMEN_EL0_TRAP;
	__builtin_arm_wsr64("CPACR_EL1", cpacr_el1);
	__builtin_arm_isb(ISB_SY);

	/* set vector length to max supported by hardware */
	uint64_t smcr_el1 = SMCR_EL1_LEN(~0);
#ifdef APPLEH16
	/*
	 * fastsim bug: rdar://96247932 (SME streaming vector length seems to be uncapped)
	 *
	 * SME saved-state with the max-size SVL is too large to use with the
	 * zone allocator.  H16G hardware is expected to cap SVL at 64 bytes.
	 */
	const unsigned int H16_SME_SVL_B = 64;
	smcr_el1 = SMCR_EL1_LEN((H16_SME_SVL_B / 16) - 1);
#endif
#if HAS_ARM_FEAT_SME2
	/* enable ZT0 access */
	smcr_el1 |= SMCR_EL1_EZT0;
#endif
	__builtin_arm_wsr64("SMCR_EL1", smcr_el1);

	/* disable SME prioritization */
	const uint64_t smpri_el1 = SMPRI_EL1_PRIORITY(0);
	__builtin_arm_wsr64("SMPRI_EL1", smpri_el1);

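	/* clear TPIDR2_EL0, which the SME software ABI reserves for tracking lazy-save state */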
	__builtin_arm_wsr64("TPIDR2_EL0", 0);
}

/**
 * Returns the streaming SVE vector length in bytes (SVL_B).  The total size of
 * the ZA array is SVL_B x SVL_B bytes; for example, a 64-byte SVL implies a
 * 4096-byte ZA.
 *
 * @return the number of 8-bit elements in a streaming SVE vector
 */
uint16_t
arm_sme_svl_b(void)
{
	uint64_t ret = 0;
	asm volatile (
                "rdsvl	%[ret], #1"
                : [ret] "=r"(ret)
        );

	/* SME requires SVL to be a power of two between 128 and 2048 bits (16-256 bytes) */
	assert(__builtin_popcountll(ret) == 1);
	assert(ret >= 16);
	assert(ret <= 256);

	return (uint16_t)ret;
}

/**
 * Save the current CPU's ZA array to the provided storage space.
 *
 * @param sme_ss destination ZA storage
 * @param svl_b SVL corresponding to sme_ss, in bytes
 */
void
arm_save_sme_za(arm_sme_context_t *sme_ss, uint16_t svl_b)
{
	uint8_t *za = arm_sme_za(sme_ss, svl_b);
	/*
	 * SME adds ldr and str variants convenient for context-switching ZA:
	 *
	 *   <ldr|str> za[<Wv>, #<imm>], [<Xn>, #<imm>, mul vl]
	 *
	 * If we view ZA as a 2D array with dimensions SVL_B x SVL_B, then these
	 * instructions copy data between ZA[<Wv> + <imm>][] and an SVL_B-sized
	 * block of memory starting at address <Xn> + <imm> * SVL_B.
	 *
	 * <imm> is between 0-15, so we can perform up to 16 copies before
	 * updating <Wv> and <Xn>.  <Wv> also must be one of W12-W15.  This is
	 * an unusual restriction for AArch64 that can't be represented with
	 * extended asm register constraints, so we need to manually constrain
	 * this operand with the register keyword.
	 *
	 * For example, with a 64-byte SVL this loop runs four times, and each
	 * iteration copies 16 rows of 64 bytes.
	 */
	for (register uint16_t i asm("w12") = 0; i < svl_b; i += 16) {
		asm volatile (
                        "str    za[%w[i],  #0], [%[addr],  #0, mul vl]"   "\n"
                        "str    za[%w[i],  #1], [%[addr],  #1, mul vl]"   "\n"
                        "str    za[%w[i],  #2], [%[addr],  #2, mul vl]"   "\n"
                        "str    za[%w[i],  #3], [%[addr],  #3, mul vl]"   "\n"
                        "str    za[%w[i],  #4], [%[addr],  #4, mul vl]"   "\n"
                        "str    za[%w[i],  #5], [%[addr],  #5, mul vl]"   "\n"
                        "str    za[%w[i],  #6], [%[addr],  #6, mul vl]"   "\n"
                        "str    za[%w[i],  #7], [%[addr],  #7, mul vl]"   "\n"
                        "str    za[%w[i],  #8], [%[addr],  #8, mul vl]"   "\n"
                        "str    za[%w[i],  #9], [%[addr],  #9, mul vl]"   "\n"
                        "str    za[%w[i], #10], [%[addr], #10, mul vl]"   "\n"
                        "str    za[%w[i], #11], [%[addr], #11, mul vl]"   "\n"
                        "str    za[%w[i], #12], [%[addr], #12, mul vl]"   "\n"
                        "str    za[%w[i], #13], [%[addr], #13, mul vl]"   "\n"
                        "str    za[%w[i], #14], [%[addr], #14, mul vl]"   "\n"
                        "str    za[%w[i], #15], [%[addr], #15, mul vl]"   "\n"
                        :
                        : [i] "r"(i),
                          [addr] "r"(za + (i * svl_b))
                );
	}
}

/**
 * Load the current CPU's ZA array from the provided storage space.
 *
 * @param sme_ss source ZA storage
 * @param svl_b SVL corresponding to sme_ss, in bytes
 */
void
arm_load_sme_za(const arm_sme_context_t *sme_ss, uint16_t svl_b)
{
	const uint8_t *za = const_arm_sme_za(sme_ss, svl_b);
	for (register uint16_t i asm("w12") = 0; i < svl_b; i += 16) {
		asm volatile (
                        "ldr    za[%w[i],  #0], [%[addr],  #0, mul vl]"   "\n"
                        "ldr    za[%w[i],  #1], [%[addr],  #1, mul vl]"   "\n"
                        "ldr    za[%w[i],  #2], [%[addr],  #2, mul vl]"   "\n"
                        "ldr    za[%w[i],  #3], [%[addr],  #3, mul vl]"   "\n"
                        "ldr    za[%w[i],  #4], [%[addr],  #4, mul vl]"   "\n"
                        "ldr    za[%w[i],  #5], [%[addr],  #5, mul vl]"   "\n"
                        "ldr    za[%w[i],  #6], [%[addr],  #6, mul vl]"   "\n"
                        "ldr    za[%w[i],  #7], [%[addr],  #7, mul vl]"   "\n"
                        "ldr    za[%w[i],  #8], [%[addr],  #8, mul vl]"   "\n"
                        "ldr    za[%w[i],  #9], [%[addr],  #9, mul vl]"   "\n"
                        "ldr    za[%w[i], #10], [%[addr], #10, mul vl]"   "\n"
                        "ldr    za[%w[i], #11], [%[addr], #11, mul vl]"   "\n"
                        "ldr    za[%w[i], #12], [%[addr], #12, mul vl]"   "\n"
                        "ldr    za[%w[i], #13], [%[addr], #13, mul vl]"   "\n"
                        "ldr    za[%w[i], #14], [%[addr], #14, mul vl]"   "\n"
                        "ldr    za[%w[i], #15], [%[addr], #15, mul vl]"   "\n"
                        :
                        : [i] "r"(i),
                          [addr] "r"(za + (i * svl_b))
                );
	}
}

/**
 * Configures CPACR_EL1 to trap or enable SME instructions at EL0.
 *
 * The caller does not need to issue any instruction barriers;
 * arm_context_switch_requires_sync() is automatically invoked if needed.
 *
 * @param trap_enabled whether to trap SME instructions at EL0
 */
void
arm_sme_trap_at_el0(bool trap_enabled)
{
	uint64_t cpacr_el1 = __builtin_arm_rsr64("CPACR_EL1");
	unsigned int prev_mode = (unsigned int)(cpacr_el1 & CPACR_SMEN_MASK);
	unsigned int new_mode = trap_enabled ? CPACR_SMEN_EL0_TRAP : CPACR_SMEN_ENABLE;

	if (prev_mode != new_mode) {
		cpacr_el1 &= ~CPACR_SMEN_MASK;
		cpacr_el1 |= new_mode;
		__builtin_arm_wsr64("CPACR_EL1", cpacr_el1);
		arm_context_switch_requires_sync();
	}
}

/**
 * Returns whether the current thread has an active SME context.
 */
boolean_t
arm_sme_is_active(void)
{
	/* Kernel entry clobbers SVCR.SM, so check the saved state instead of live register state */
	arm_sme_saved_state_t *sme_ss = machine_thread_get_sme_state(current_thread());
	return sme_ss && (sme_ss->svcr & (SVCR_SM | SVCR_ZA));
}

#if HAS_ARM_FEAT_SME2
/**
 * Save the current CPU's ZT0 array to the provided storage space.
 *
 * @param sme_ss destination ZT0 storage
 */
void
arm_save_sme_zt0(arm_sme_context_t *sme_ss)
{
	asm volatile (
                "str zt0, [%[addr]]"
                :
                : [addr] "r"(sme_ss->zt0)
        );
}

/**
 * Load the current CPU's ZT0 array from the provided storage space.
 *
 * @param sme_ss source ZT0 storage
 */
void
arm_load_sme_zt0(const arm_sme_context_t *sme_ss)
{
	asm volatile (
                "ldr	zt0, [%[addr]]"
                :
                : [addr] "r"(sme_ss->zt0)
        );
}
#endif /* HAS_ARM_FEAT_SME2 */

/**
 * Save the current CPU's ZA and ZT0 arrays to the provided storage space.
 *
 * If this CPU does not support SME2, ZT0 storage is zeroed out instead.
 *
 * @param sme_ss destination storage
 * @param svl_b SVL corresponding to sme_ss, in bytes
 */
void
arm_save_sme_za_zt0(arm_sme_context_t *sme_ss, uint16_t svl_b)
{
	arm_save_sme_za(sme_ss, svl_b);
#if HAS_ARM_FEAT_SME2
	if (arm_sme_version() >= 2) {
		arm_save_sme_zt0(sme_ss);
	} else {
		bzero(sme_ss->zt0, sizeof(sme_ss->zt0));
	}
#else
	bzero(sme_ss->zt0, sizeof(sme_ss->zt0));
#endif
}

/**
 * Load the current CPU's ZA and ZT0 arrays from the provided storage space.
 *
 * If this CPU does not support SME2, ZT0 storage is ignored.
 *
 * @param sme_ss source storage
 * @param svl_b SVL corresponding to sme_ss, in bytes
 */
void
arm_load_sme_za_zt0(const arm_sme_context_t *sme_ss, uint16_t svl_b)
{
	arm_load_sme_za(sme_ss, svl_b);
#if HAS_ARM_FEAT_SME2
	if (arm_sme_version() >= 2) {
		arm_load_sme_zt0(sme_ss);
	}
#endif
}

#endif /* HAS_ARM_FEAT_SME */