/*
 * Copyright (c) 2022 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <arm/misc_protos.h>
#include <arm64/proc_reg.h>
#include <libkern/section_keywords.h>
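/*
 * Highest SME ISA version reported by ID_AA64PFR1_EL1.SME (0 = no SME).
 * Probed once on the boot CPU in arm_sme_init() and read-only thereafter.
 */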
SECURITY_READ_ONLY_LATE(unsigned int) sme_version = 0;
/**
* Returns the version of SME supported on this platform.
*
* In contrast to the compile-time HAS_ARM_FEAT_SME/HAS_ARM_FEAT_SME2 checks
* that indicate compiler support, arm_sme_version() is a runtime check that
* indicates actual processor support.
*
* @return the highest SME ISA version supported on this platform
* (where 0 indicates no SME support)
*/
unsigned int
arm_sme_version(void)
{
return sme_version;
}
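/*
 * Example (hypothetical caller, not part of this file): runtime feature
 * gating layered over the compile-time checks. ID_AA64PFR1_EL1.SME reports
 * 1 for FEAT_SME and 2 for FEAT_SME2, so a caller might write:
 *
 *	if (arm_sme_version() >= 2) {
 *		// SME2-only state such as ZT0 is available
 *	} else if (arm_sme_version() >= 1) {
 *		// base SME only
 *	}
 */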
#if HAS_ARM_FEAT_SME
#include <kern/assert.h>
#include <kern/cpu_data.h>
#include <kern/thread.h>
#include <string.h>
void
arm_sme_init(bool is_boot_cpu)
{
if (is_boot_cpu) {
uint64_t aa64pfr1_el1 = __builtin_arm_rsr64("ID_AA64PFR1_EL1");
sme_version = (aa64pfr1_el1 & ID_AA64PFR1_EL1_SME_MASK) >> ID_AA64PFR1_EL1_SME_OFFSET;
}
if (!sme_version) {
return;
}
/* enable SME at EL1 only; SME instructions at EL0 trap until explicitly enabled */
uint64_t cpacr_el1 = __builtin_arm_rsr64("CPACR_EL1");
cpacr_el1 &= ~CPACR_SMEN_MASK;
cpacr_el1 |= CPACR_SMEN_EL0_TRAP;
__builtin_arm_wsr64("CPACR_EL1", cpacr_el1);
__builtin_arm_isb(ISB_SY);
/*
 * Set the vector length to the max supported by hardware: SMCR_EL1.LEN
 * encodes SVL as 16 * (LEN + 1) bytes, and out-of-range values are
 * clamped to the largest supported SVL.
 */
uint64_t smcr_el1 = SMCR_EL1_LEN(~0);
#ifdef APPLEH16
/*
* fastsim bug: rdar://96247932 (SME streaming vector length seems to be uncapped)
*
* SME saved-state with the max-size SVL is too large to use with the
* zone allocator. H16G hardware is expected to cap SVL at 64 bytes.
*/
const unsigned int H16_SME_SVL_B = 64;
smcr_el1 = SMCR_EL1_LEN((H16_SME_SVL_B / 16) - 1);
#endif
#if HAS_ARM_FEAT_SME2
/* enable ZT0 access */
smcr_el1 |= SMCR_EL1_EZT0;
#endif
__builtin_arm_wsr64("SMCR_EL1", smcr_el1);
/* disable SME prioritization */
const uint64_t smpri_el1 = SMPRI_EL1_PRIORITY(0);
__builtin_arm_wsr64("SMPRI_EL1", smpri_el1);
__builtin_arm_wsr64("TPIDR2_EL0", 0);
}
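/*
 * Example (assumed call site, for illustration only): arm_sme_init() is
 * meant to run on every CPU; only the boot CPU probes ID_AA64PFR1_EL1,
 * and secondaries reuse the cached sme_version:
 *
 *	arm_sme_init(cpu_number() == master_cpu);
 */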
/**
* Returns the streaming SVE vector length. The total size of the ZA array is
* SVL_B x SVL_B bytes.
*
* @return the number of 8-bit elements in a streaming SVE vector
*/
uint16_t
arm_sme_svl_b(void)
{
uint64_t ret = 0;
asm volatile (
"rdsvl %[ret], #1"
: [ret] "=r"(ret)
);
assert(__builtin_popcountll(ret) == 1);
assert(ret >= 16);
assert(ret <= 256);
return (uint16_t)ret;
}
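/*
 * Worked example: with SVL_B = 64 (the H16 cap above), ZA is a 64 x 64
 * byte array, so a full ZA save needs 64 * 64 = 4096 bytes; at the
 * architectural maximum SVL_B = 256 it grows to 256 * 256 = 64 KiB.
 * A (hypothetical) sizing sketch:
 *
 *	uint16_t svl_b = arm_sme_svl_b();
 *	size_t za_size = (size_t)svl_b * svl_b;	// bytes needed for ZA
 */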
/**
* Save the current CPU's ZA array to the provided storage space.
*
* @param sme_ss destination ZA storage
* @param svl_b SVL corresponding to sme_ss, in bytes
*/
void
arm_save_sme_za(arm_sme_context_t *sme_ss, uint16_t svl_b)
{
uint8_t *za = arm_sme_za(sme_ss, svl_b);
/*
* SME adds ldr and str variants convenient for context-switching ZA:
*
* <ldr|str> za[<Wv>, #<imm>], [<Xn>, #<imm>, mul vl]
*
* If we view ZA as a 2D array with dimensions SVL_B x SVL_B, then these
* instructions copy data between ZA[<Wv> + <imm>][] and an SVL_B-sized
* block of memory starting at address <Xn> + <imm> * SVL_B.
*
* <imm> is between 0 and 15, so we can perform up to 16 copies before
* updating <Wv> and <Xn>. <Wv> must also be one of W12-W15. This is
* an unusual restriction for AArch64 that can't be represented with
* extended asm register constraints, so we need to manually constrain
* this operand with the register keyword.
*/
for (register uint16_t i asm("w12") = 0; i < svl_b; i += 16) {
asm volatile (
"str za[%w[i], #0], [%[addr], #0, mul vl]" "\n"
"str za[%w[i], #1], [%[addr], #1, mul vl]" "\n"
"str za[%w[i], #2], [%[addr], #2, mul vl]" "\n"
"str za[%w[i], #3], [%[addr], #3, mul vl]" "\n"
"str za[%w[i], #4], [%[addr], #4, mul vl]" "\n"
"str za[%w[i], #5], [%[addr], #5, mul vl]" "\n"
"str za[%w[i], #6], [%[addr], #6, mul vl]" "\n"
"str za[%w[i], #7], [%[addr], #7, mul vl]" "\n"
"str za[%w[i], #8], [%[addr], #8, mul vl]" "\n"
"str za[%w[i], #9], [%[addr], #9, mul vl]" "\n"
"str za[%w[i], #10], [%[addr], #10, mul vl]" "\n"
"str za[%w[i], #11], [%[addr], #11, mul vl]" "\n"
"str za[%w[i], #12], [%[addr], #12, mul vl]" "\n"
"str za[%w[i], #13], [%[addr], #13, mul vl]" "\n"
"str za[%w[i], #14], [%[addr], #14, mul vl]" "\n"
"str za[%w[i], #15], [%[addr], #15, mul vl]" "\n"
:
: [i] "r"(i),
[addr] "r"(za + (i * svl_b))
: "memory"
);
}
}
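/*
 * Worked example of the loop above: with svl_b = 64, the loop body runs
 * for i = 0, 16, 32, 48. Each asm block stores the 16 rows ZA[i]..ZA[i+15]
 * to the 16 consecutive SVL_B-sized memory blocks starting at
 * za + i * 64, covering all 64 rows in four iterations.
 */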
/**
* Load the current CPU's ZA array from the provided storage space.
*
* @param sme_ss source ZA storage
* @param svl_b SVL corresponding to sme_ss, in bytes
*/
void
arm_load_sme_za(const arm_sme_context_t *sme_ss, uint16_t svl_b)
{
const uint8_t *za = const_arm_sme_za(sme_ss, svl_b);
for (register uint16_t i asm("w12") = 0; i < svl_b; i += 16) {
asm volatile (
"ldr za[%w[i], #0], [%[addr], #0, mul vl]" "\n"
"ldr za[%w[i], #1], [%[addr], #1, mul vl]" "\n"
"ldr za[%w[i], #2], [%[addr], #2, mul vl]" "\n"
"ldr za[%w[i], #3], [%[addr], #3, mul vl]" "\n"
"ldr za[%w[i], #4], [%[addr], #4, mul vl]" "\n"
"ldr za[%w[i], #5], [%[addr], #5, mul vl]" "\n"
"ldr za[%w[i], #6], [%[addr], #6, mul vl]" "\n"
"ldr za[%w[i], #7], [%[addr], #7, mul vl]" "\n"
"ldr za[%w[i], #8], [%[addr], #8, mul vl]" "\n"
"ldr za[%w[i], #9], [%[addr], #9, mul vl]" "\n"
"ldr za[%w[i], #10], [%[addr], #10, mul vl]" "\n"
"ldr za[%w[i], #11], [%[addr], #11, mul vl]" "\n"
"ldr za[%w[i], #12], [%[addr], #12, mul vl]" "\n"
"ldr za[%w[i], #13], [%[addr], #13, mul vl]" "\n"
"ldr za[%w[i], #14], [%[addr], #14, mul vl]" "\n"
"ldr za[%w[i], #15], [%[addr], #15, mul vl]" "\n"
:
: [i] "r"(i),
[addr] "r"(za + (i * svl_b))
: "memory"
);
}
}
/**
* Configures CPACR_EL1 to trap or enable SME instructions at EL0.
*
* The caller does not need to issue any instruction barriers;
* arm_context_switch_requires_sync() is automatically invoked if needed.
*
* @param trap_enabled whether to trap SME instructions at EL0
*/
void
arm_sme_trap_at_el0(bool trap_enabled)
{
uint64_t cpacr_el1 = __builtin_arm_rsr64("CPACR_EL1");
unsigned int prev_mode = (unsigned int)(cpacr_el1 & CPACR_SMEN_MASK);
unsigned int new_mode = trap_enabled ? CPACR_SMEN_EL0_TRAP : CPACR_SMEN_ENABLE;
if (prev_mode != new_mode) {
cpacr_el1 &= ~CPACR_SMEN_MASK;
cpacr_el1 |= new_mode;
__builtin_arm_wsr64("CPACR_EL1", cpacr_el1);
arm_context_switch_requires_sync();
}
}
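/*
 * Example (hypothetical caller, not part of this file): a lazy-save scheme
 * might re-arm the EL0 trap when switching away from a thread with live SME
 * state, then grant access again once that state has been restored:
 *
 *	arm_sme_trap_at_el0(true);	// next EL0 SME instruction faults
 *	...
 *	arm_sme_trap_at_el0(false);	// EL0 may use SME again
 */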
/**
* Returns whether the current thread has an active SME context.
*/
boolean_t
arm_sme_is_active(void)
{
/* Kernel entry clobbers SVCR.SM, so check the saved state instead of live register state */
arm_sme_saved_state_t *sme_ss = machine_thread_get_sme_state(current_thread());
return sme_ss && (sme_ss->svcr & (SVCR_SM | SVCR_ZA));
}
#if HAS_ARM_FEAT_SME2
/**
 * Save the current CPU's ZT0 array to the provided storage space.
 *
 * ZT0 is a fixed 512-bit register introduced by FEAT_SME2; unlike ZA, its
 * size does not scale with SVL.
 *
 * @param sme_ss destination ZT0 storage
 */
void
arm_save_sme_zt0(arm_sme_context_t *sme_ss)
{
asm volatile (
"str zt0, [%[addr]]"
:
: [addr] "r"(sme_ss->zt0)
: "memory"
);
}
/**
* Load the current CPU's ZT0 array from the provided storage space.
*
* @param sme_ss source ZT0 storage
*/
void
arm_load_sme_zt0(const arm_sme_context_t *sme_ss)
{
asm volatile (
"ldr zt0, [%[addr]]"
:
: [addr] "r"(sme_ss->zt0)
: "memory"
);
}
#endif /* HAS_ARM_FEAT_SME2 */
/**
* Save the current CPU's ZA and ZT0 arrays to the provided storage space.
*
* If this CPU does not support SME2, ZT0 storage is zeroed out instead.
*
* @param sme_ss destination storage
* @param svl_b SVL corresponding to sme_ss, in bytes
*/
void
arm_save_sme_za_zt0(arm_sme_context_t *sme_ss, uint16_t svl_b)
{
arm_save_sme_za(sme_ss, svl_b);
#if HAS_ARM_FEAT_SME2
if (arm_sme_version() >= 2) {
arm_save_sme_zt0(sme_ss);
return;
}
#endif /* HAS_ARM_FEAT_SME2 */
bzero(sme_ss->zt0, sizeof(sme_ss->zt0));
}
/**
* Load the current CPU's ZA and ZT0 arrays from the provided storage space.
*
* If this CPU does not support SME2, ZT0 storage is ignored.
*
* @param sme_ss source storage
* @param svl_b SVL corresponding to sme_ss, in bytes
*/
void
arm_load_sme_za_zt0(const arm_sme_context_t *sme_ss, uint16_t svl_b)
{
arm_load_sme_za(sme_ss, svl_b);
#if HAS_ARM_FEAT_SME2
if (arm_sme_version() >= 2) {
arm_load_sme_zt0(sme_ss);
}
#endif /* HAS_ARM_FEAT_SME2 */
}
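/*
 * Example (hypothetical, for illustration only): a context-switch style
 * pairing of the primitives above, where old_ss and new_ss are assumed
 * arm_sme_context_t buffers sized for this CPU's SVL:
 *
 *	uint16_t svl_b = arm_sme_svl_b();
 *	arm_save_sme_za_zt0(old_ss, svl_b);	// spill outgoing ZA (+ ZT0 on SME2)
 *	arm_load_sme_za_zt0(new_ss, svl_b);	// fill incoming ZA (+ ZT0 on SME2)
 */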
#endif /* HAS_ARM_FEAT_SME */