/*
* Copyright (c) 2020 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/**
* This header file is used to store the types, prototypes, and inline functions
* that define some of the most important data structures used in the pmap. This
* header is only meant for sharing types within the pmap; if a type is meant to
* be used by the rest of the kernel, then put it into osfmk/arm64/sptm/pmap/pmap.h.
*/
#pragma once
#include <stdint.h>
#include <kern/ledger.h>
#include <mach/vm_types.h>
#include <mach_assert.h>
#include <vm/vm_page.h>
#include <arm/cpu_data.h>
#include <arm/machine_routines.h>
#include <arm64/proc_reg.h>
#if HIBERNATION
#include <arm64/hibernate_secure_hmac.h>
#endif /* HIBERNATION */
/* Temporary include before moving all ledger functions into pmap_data.c */
#include <os/refcnt.h>
/**
* These headers are safe to be included in this file since they shouldn't rely
* on any of the internal pmap header files (so no circular dependencies).
*/
#include <arm64/sptm/pmap/pmap.h>
#include <arm64/sptm/pmap/pmap_pt_geometry.h>
#include <arm64/sptm/sptm.h>
/**
* These values represent the first and last kernel-managed physical addresses.
 * We keep track of extra metadata on kernel-managed pages that we don't keep
 * for other pages (typically memory carved out by iBoot, or I/O memory).
*/
extern pmap_paddr_t vm_first_phys, vm_last_phys;
#define PMAP_HIB_STATE_REACHED(states) false
#define PMAP_ASSERT_NOT_WRITING_HIB()
#define PMAP_IS_HIBERNATING() false
/**
* Return whether the given address represents a kernel-managed physical page.
*
* Whether a page is considered "kernel-managed" is determined by the BootArgs
* passed by the bootloader. Typically memory carved out by the bootloader as
* well as I/O memory should return false.
*
* @param pa The physical address to check.
*/
static inline bool
pa_valid(pmap_paddr_t pa)
{
return (pa >= vm_first_phys) && (pa < vm_last_phys);
}
/* Sentinel value indicating an invalid physical address index. */
#define INVALID_PAI UINT_MAX
/**
* The pmap has a variety of data structures (pv_head_table/pp_attr_table) that
 * contain an entry for every kernel-managed page in the system. These data
 * structures are indexed by physical address indices ("pai") generated by this
 * function.
*
* The logic is simple since there should be one entry in each of these data
* structures for each kernel-managed physical page in the system. These data
* structures are allocated on boot based on the amount of memory available.
*
* @note PAIs are defined using the VM page size, which might not be identical
* to the underlying hardware page size for an arbitrary address space.
* This means that the data structures relying on PAIs will contain one
* entry for each VM page, not hardware page.
*
* @note This function is only valid for physical addresses that are
* kernel-managed.
*/
static inline unsigned int
pa_index(pmap_paddr_t pa)
{
return (unsigned int)atop(pa - vm_first_phys);
}
/**
* Convert from a physical address index (pai) back to a raw physical address.
*
* @param pai The physical address index to convert to a PA.
*
* @return The page-aligned physical address corresponding to [pai].
*/
static inline pmap_paddr_t
pai_to_pa(unsigned int pai)
{
return ptoa((pmap_paddr_t)pai) + vm_first_phys;
}
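/**
 * Illustrative sketch (not compiled; the base address is hypothetical):
 * converting between physical addresses and PAIs is plain offset-and-shift
 * arithmetic. Assuming vm_first_phys == 0x800000000 and a 16K VM page size
 * (PAGE_SHIFT == 14):
 *
 *     pmap_paddr_t pa = 0x800004000;       // second VM page of managed memory
 *     unsigned int pai = pa_index(pa);     // (0x4000 >> 14) == 1
 *     pmap_paddr_t back = pai_to_pa(pai);  // (1 << 14) + vm_first_phys == pa
 *     assert(pa_valid(pa) && (back == pa));
 */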
/* See the definition of pv_head_table for more information. */
extern uintptr_t *pv_head_table;
/* Represents a NULL entry in the pv_head_table. */
#define PV_ENTRY_NULL ((pv_entry_t *) 0)
/**
* Given a physical address index, return the corresponding pv_head_table entry.
*
* @note The returned entry might be invalid, or a pointer to a pt_entry_t,
* pv_entry_t, or pt_desc_t depending on the type for this entry.
* Determine the type using pvh_test_type().
*
* @param pai The index returned by pa_index() for the page whose pv_head_table
* entry should be retrieved.
*/
static inline uintptr_t
pai_to_pvh(unsigned int pai)
{
return pv_head_table[pai];
}
/**
* Each pv_head_table entry can be one of four different types:
*
* - PVH_TYPE_NULL: No mappings to the physical page exist outside of the
* physical aperture. Physical aperture mappings are not
* tracked in the pv_head_table.
*
* - PVH_TYPE_PVEP: There are multiple mappings to the physical page.
* These entries are linked lists of pv_entry_t objects (which
* each contain a pointer to the associated PTE and a pointer
* to the next entry in the list).
*
* - PVH_TYPE_PTEP: There is a single mapping to the physical page. Once more
* mappings are created, this entry will get upgraded to an
* entry of type PVH_TYPE_PVEP. These entries are pointers
 * directly to the page table entry that contains the mapping
* (pt_entry_t*).
*
* - PVH_TYPE_PTDP: The physical page is being used as a page table. These
* entries are pointers to page table descriptor structures
* (pt_desc_t) which contain metadata related to each page
* table.
*
* The type is stored in the bottom two bits of each pv_head_table entry. That
* type needs to be checked before dereferencing the pointer to determine which
* pointer type to dereference as.
*/
#define PVH_TYPE_NULL 0x0UL
#define PVH_TYPE_PVEP 0x1UL
#define PVH_TYPE_PTEP 0x2UL
#define PVH_TYPE_PTDP 0x3UL
#define PVH_TYPE_MASK (0x3UL)
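/**
 * Illustrative sketch (hypothetical caller; uses accessors defined further
 * down in this header): decoding a pv_head_table entry always starts with a
 * type check before the pointer is interpreted. The PVH lock for [pai] is
 * assumed to be held:
 *
 *     const uintptr_t pvh = pai_to_pvh(pai);
 *     if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
 *         pt_entry_t *ptep = pvh_ptep(pvh);      // single mapping
 *     } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
 *         pv_entry_t *pvep = pvh_pve_list(pvh);  // list of mappings
 *     } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
 *         pt_desc_t *ptdp = pvh_ptd(pvh);        // page is itself a page table
 *     } else {
 *         // PVH_TYPE_NULL: no tracked mappings
 *     }
 */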
/**
* PV_HEAD_TABLE Flags.
*
* All flags listed below are stored in the pv_head_table entry/pointer
* (per-physical-page) unless otherwise noted.
*
* Please update the pv_walk LLDB macro if these flags are changed or added to.
*/
/**
* This flag is set for every mapping created by an IOMMU.
*
* Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
* entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
*/
#define PVH_FLAG_IOMMU 0x4UL
/**
* This flag is only valid when PVH_FLAG_IOMMU is set. For an IOMMU mapping, if
* this bit is set, then the PTE pointer points directly into the IOMMU page
* table for this mapping. If this bit is cleared, then the "PTE pointer" is
* actually a pointer to the IOMMU descriptor object that owns this mapping.
*
* There are cases where it's not easy to tie an IOMMU mapping directly to a
* specific page table, so this allows us to at least get a pointer to which
* IOMMU created this mapping which is useful for debugging purposes.
*
* Stored in each PTE pointer (for PVH_TYPE_PVEP lists), or in the pv_head_table
* entry/pointer for single-PTE entries (PVH_TYPE_PTEP).
*/
#define PVH_FLAG_IOMMU_TABLE (1ULL << 63)
/**
* This flag is set when the first CPU (non-IOMMU) mapping is created. This is
* important to keep track of because various accounting statistics are based on
* the options specified for the first CPU mapping. This flag, and thus the
 * accounting statistics, will persist as long as there are *any* mappings of the
* page (including IOMMU mappings). This works because the accounting for a page
* should not need to change until the page is recycled by the VM layer, and we
* double-check that there are no mappings (CPU or IOMMU) when a page is
* recycled (see: pmap_verify_free()).
*/
#define PVH_FLAG_CPU (1ULL << 62)
/* This bit is used as a lock when modifying a pv_head_table entry. */
#define PVH_LOCK_BIT 61
#define PVH_FLAG_LOCK (1ULL << PVH_LOCK_BIT)
/**
* This flag is set when there are any executable mappings to this physical
* page. This is used to prevent any writable mappings from being created at
* the same time an executable mapping exists.
*/
#define PVH_FLAG_EXEC (1ULL << 60)
/**
* This flag is used to mark that a page has been hashed into the hibernation
* image.
*
* The hibernation driver will use this to ensure that all PPL-owned memory is
* correctly included into the hibernation image (a missing PPL page could be
* a security concern when coming out of hibernation).
*/
#define PVH_FLAG_HASHED (1ULL << 58)
#define PVH_FLAG_RETIRED 0
#define PVH_FLAG_TAGGED 0
/**
* This flag is used to mark that a PV head entry has been placed into
* "sleep mode", which typically happens when the lock owner needs to
* process a long PV list. If this bit is set, threads which contend
* on the PVH lock must call thread_block() to wait until they are awakened
* by the current lock owner releasing the lock.
*/
#define PVH_FLAG_SLEEP (1ULL << 54)
/**
* These bits need to be set to safely dereference a pv_head_table
* entry/pointer.
*
* Any change to this #define should also update the copy located in the pmap.py
* LLDB macros file.
*/
#define PVH_MUTABLE_FLAGS (PVH_FLAG_CPU | PVH_FLAG_EXEC | PVH_FLAG_HASHED | PVH_FLAG_RETIRED | PVH_FLAG_TAGGED)
#define PVH_LOCK_FLAGS (PVH_FLAG_LOCK | PVH_FLAG_SLEEP)
#define PVH_HIGH_FLAGS (PVH_MUTABLE_FLAGS | PVH_LOCK_FLAGS)
/* Mask used to clear out the TYPE bits from a pv_head_table entry/pointer. */
#define PVH_LIST_MASK (~PVH_TYPE_MASK)
/* Which 32-bit word in each pv_head_table entry/pointer contains the LOCK bit. */
#define PVH_LOCK_WORD 1 /* Assumes little-endian */
/**
* Assert that a pv_head_table entry is locked. Will panic if the lock isn't
* acquired.
*
* @param index The physical address index to check.
*/
static inline void
pvh_assert_locked(__assert_only unsigned int index)
{
assertf(os_atomic_load(&pv_head_table[index], relaxed) & PVH_LOCK_FLAGS,
"%s: PVH %p (=%p) for pai 0x%x not locked or in sleep mode", __func__,
&pv_head_table[index], (void*)(os_atomic_load(&pv_head_table[index], relaxed)), index);
}
/**
* Helper function for returning the 32-bit PVH lock word corresponding
* to a physical address index.
*
* @param index The physical address index of the pv_head_table entry
*
* @return A pointer to the 32-bit word containing the lock bit
*/
static inline uint32_t*
pvh_lock_word(unsigned int index)
{
return (uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD;
}
/**
* Helper macro for computing the lock bit offset within the 32-bit
* lock word for each PV head entry.
*
* @return A 32-bit integer containing the lock bit offset.
*/
#define PVH_LOCK_BIT_OFFSET (PVH_LOCK_BIT - (PVH_LOCK_WORD * 32))
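/**
 * Worked value: with the current constants (PVH_LOCK_BIT == 61 and
 * PVH_LOCK_WORD == 1), this evaluates to 61 - 32 == 29, i.e. the lock is bit
 * 29 of the upper 32-bit word of the entry on a little-endian layout.
 */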
/**
* Lock a pv_head_table entry, and return the value stored in the pv_head_table array.
*
* @param index The physical address index of the pv_head_table entry to lock.
*
* @return A wrapper object with the contents of the locked pv_head_table entry.
*/
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock(unsigned int index)
{
extern unsigned int not_in_kdp;
const bool was_preemptible = preemption_enabled();
assert(was_preemptible || (startup_phase < STARTUP_SUB_EARLY_BOOT) ||
PMAP_IS_HIBERNATING() || !not_in_kdp);
bool (^check_preemption)(void) = ^bool (void) {
return was_preemptible && pmap_pending_preemption();
};
hw_lock_status_t ret;
locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
do {
ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
&hw_lock_bit_policy, check_preemption, &pmap_lck_grp);
if (ret == HW_LOCK_ACQUIRED) {
locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
wait_result_t wres;
wres = assert_wait(&pv_head_table[index], THREAD_UNINT);
hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
assertf(wres == THREAD_WAITING, "%s: unexpected wait result %d", __func__, wres);
thread_block(THREAD_CONTINUE_NULL);
ret = HW_LOCK_CONTENDED;
}
}
} while (ret != HW_LOCK_ACQUIRED);
return locked_pvh;
}
/**
 * Lock a pv_head_table entry, possibly in a preemption-disabled context.
*
* @note This function is only meant for special use cases in which pmap
* functions must be invoked with preemption disabled. These cases
* are expected to be rare and limited. If you think you need to
* use this in more places, you're probably wrong.
*
* @param index The physical address index of the pv_head_table entry to lock.
*
* @return A wrapper object with the contents of the locked pv_head_table entry.
*/
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_lock_nopreempt(unsigned int index)
{
if (__improbable(preemption_enabled())) {
return pvh_lock(index);
}
hw_lock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);
const locked_pvh_t locked_pvh = {.pvh = os_atomic_load(&pv_head_table[index], relaxed), .pai = index};
if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
panic("%s invoked on sleep-mode PVH %p for pai 0x%x", __func__, &pv_head_table[index], index);
}
return locked_pvh;
}
/**
* Attempt to lock a pv_head_table entry, failing if the lock can't be immediately acquired.
*
* @param index The physical address index of the pv_head_table entry to lock.
*
 * @return A wrapper object with the contents of the locked pv_head_table entry
 *         if successful; on failure (contention, or the entry being in sleep
 *         mode), a wrapper whose pvh field is 0.
*/
static inline locked_pvh_t __attribute__((warn_unused_result))
pvh_try_lock(unsigned int index)
{
locked_pvh_t locked_pvh = {.pvh = 0, .pai = index};
bool locked = hw_lock_bit_try(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET, &pmap_lck_grp);
if (locked) {
locked_pvh.pvh = os_atomic_load(&pv_head_table[index], relaxed);
assert(locked_pvh.pvh != 0);
if (__improbable(locked_pvh.pvh & PVH_FLAG_SLEEP)) {
hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
locked_pvh.pvh = 0;
}
}
return locked_pvh;
}
/**
* Helper for determining whether a preceding pvh_try_lock() call succeeded.
*
* @param locked_pvh A wrapper representing a possibly-locked PV head table entry
* returned by pvh_try_lock().
*
* @return True if [locked_pvh] represents a successfully-locked PVH, false otherwise.
*/
static inline bool
pvh_try_lock_success(const locked_pvh_t *locked_pvh)
{
assert(locked_pvh != NULL);
return locked_pvh->pvh != 0;
}
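/**
 * Illustrative sketch of the try-lock pattern (hypothetical caller; not part
 * of the API surface):
 *
 *     locked_pvh_t locked_pvh = pvh_try_lock(pai);
 *     if (!pvh_try_lock_success(&locked_pvh)) {
 *         // contended or in sleep mode; back off and retry later
 *         return;
 *     }
 *     // ... inspect or update the PV list ...
 *     pvh_unlock(&locked_pvh);
 */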
/**
* Place a pv_head_table entry in sleep mode, so that other threads contending on the PVH
* lock will sleep until this thread calls pvh_unlock().
*
* @note It is legal to call this function if the lock is already in sleep mode.
* In that case, the call will have no effect.
 * @note This function must not be called with preemption disabled by anything
 * other than the PVH lock represented by [locked_pvh] itself. Preemption
 * must be fully re-enabled by the time
* this function returns, either because it was already enabled (because the
* lock was already in sleep mode), or because this function enabled it by placing
* the lock in sleep mode.
*
* @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
*/
static inline void
pvh_lock_enter_sleep_mode(locked_pvh_t *locked_pvh)
{
assert(locked_pvh != NULL);
assert(locked_pvh->pvh != 0);
unsigned int index = locked_pvh->pai;
pvh_assert_locked(index);
const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
if (!(old_pvh & PVH_FLAG_SLEEP)) {
assert(old_pvh & PVH_FLAG_LOCK);
os_atomic_store(&pv_head_table[index], old_pvh | PVH_FLAG_SLEEP, relaxed);
/**
* Tell the scheduler that this thread may need a priority boost if it needs to go
* off-core, to reduce the likelihood of priority inversion.
*/
locked_pvh->pri_token = thread_priority_floor_start();
hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
}
/* Hibernation runs single-core so we can skip this check. */
assert(preemption_enabled() || PMAP_IS_HIBERNATING());
}
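/**
 * Illustrative sketch of the sleep-mode pattern (hypothetical caller; the
 * length check is made up): a thread expecting to hold the PVH lock for a long
 * time places the entry in sleep mode so contending threads block instead of
 * spinning; pvh_unlock() later clears the sleep state and wakes them:
 *
 *     locked_pvh_t locked_pvh = pvh_lock(pai);
 *     if (pv_list_is_long) {
 *         pvh_lock_enter_sleep_mode(&locked_pvh);
 *     }
 *     // ... potentially lengthy PV list processing ...
 *     pvh_unlock(&locked_pvh);
 */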
/**
* Check that a pv_head_table entry/pointer is a specific type.
*
* @param pvh The pv_head_table entry/pointer to check.
* @param type The type to check for.
*
* @return True if the pv_head_table entry is of the passed in type, false
* otherwise.
*/
static inline bool
pvh_test_type(uintptr_t pvh, uintptr_t type)
{
return (pvh & PVH_TYPE_MASK) == type;
}
/**
* Unlock a pv_head_table entry, updating the contents of the entry with the passed-in value.
*
* @note Only the non-lock flags, pointer, and type fields of the entry will be updated
* according to the passed-in value. PVH_LOCK_FLAGS will be ignored as they are
* directly manipulated by this function.
*
* @param locked_pvh Pointer to a wrapper object representing the locked PV head table entry.
* The pvh field from this entry, except for the PVH_LOCK_FLAGS bits, will be stored
* in pv_head_table to reflect any updates that may have been performed on the PV list
* while the lock was held.
*/
static inline void
pvh_unlock(locked_pvh_t *locked_pvh)
{
assert(locked_pvh != NULL);
assert(locked_pvh->pvh != 0);
unsigned int index = locked_pvh->pai;
pvh_assert_locked(index);
const uintptr_t old_pvh = os_atomic_load(&pv_head_table[index], relaxed);
bool pri_floor_end = false;
if (__improbable(old_pvh & PVH_FLAG_SLEEP)) {
pri_floor_end = true;
const bool was_preemptible = preemption_enabled();
bool (^check_preemption)(void) = ^bool (void) {
return was_preemptible && pmap_pending_preemption();
};
hw_lock_status_t ret;
do {
ret = hw_lock_bit_to_b(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET,
&hw_lock_bit_policy, check_preemption, &pmap_lck_grp);
} while (ret != HW_LOCK_ACQUIRED);
os_atomic_store(&pv_head_table[index],
(locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
thread_wakeup(&pv_head_table[index]);
} else if ((old_pvh & ~PVH_LOCK_FLAGS) != (locked_pvh->pvh & ~PVH_LOCK_FLAGS)) {
os_atomic_store(&pv_head_table[index],
(locked_pvh->pvh & ~PVH_FLAG_SLEEP) | PVH_FLAG_LOCK, relaxed);
}
hw_unlock_bit(pvh_lock_word(index), PVH_LOCK_BIT_OFFSET);
if (__improbable(pri_floor_end)) {
thread_priority_floor_end(&locked_pvh->pri_token);
}
locked_pvh->pvh = 0;
}
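/**
 * Illustrative sketch of the basic lock/update/unlock flow (hypothetical
 * caller; not part of the API surface). The accessors pvh_get_flags(),
 * pvh_set_flags() and pvh_update_head() defined below only stage changes in
 * the locked_pvh_t wrapper; it is pvh_unlock() that publishes them to the
 * pv_head_table:
 *
 *     locked_pvh_t locked_pvh = pvh_lock(pai);
 *     const uintptr_t flags = pvh_get_flags(locked_pvh.pvh);
 *     pvh_set_flags(&locked_pvh, flags | PVH_FLAG_EXEC);   // staged only
 *     pvh_unlock(&locked_pvh);                             // stored here
 */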
/**
* Convert a pv_head_table entry/pointer into a page table entry pointer. This
* should only be done if the type of this entry is PVH_TYPE_PTEP.
*
* @param pvh The pv_head_table entry/pointer to convert into a pt_entry_t*.
*
 * @return Return back a safe to dereference pointer to the single mapping of this
* physical page by masking off the TYPE bits and adding any missing
* flags to the upper portion of the pointer.
*/
static inline pt_entry_t*
pvh_ptep(uintptr_t pvh)
{
assert(pvh_test_type(pvh, PVH_TYPE_PTEP));
return (pt_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
}
/**
* Convert a pv_head_table entry/pointer into a PVE list pointer. This
* should only be done if the type of this entry is PVH_TYPE_PVEP.
*
* @param pvh The pv_head_table entry/pointer to convert into a safe to
* dereference pv_entry_t*.
*
 * @return Return back a safe to dereference pointer to the first mapping of this
* physical page by masking off the TYPE bits and adding any missing
* flags to the upper portion of the pointer.
*/
static inline pv_entry_t*
pvh_pve_list(uintptr_t pvh)
{
assert(pvh_test_type(pvh, PVH_TYPE_PVEP));
return (pv_entry_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
}
/**
* Return the mutable flags associated with a pv_head_table entry/pointer.
*
* @param pvh The pv_head_table entry whose flags to get.
*
* @return The mutable flags encoded in [pvh].
*/
static inline uintptr_t
pvh_get_flags(uintptr_t pvh)
{
return pvh & PVH_MUTABLE_FLAGS;
}
/**
* Update the flags associated with a pv_head_table entry/pointer.
*
* @note This function does not actually modify the pv_head_table,
* it only installs an updated pv_head_table entry in [locked_pvh]
* that can later be passed to pvh_unlock() to update the actual array
* entry.
*
* @param locked_pvh A wrapper struct containing the pv_head_table
* entry/pointer to update.
 * @param flags The new mutable flag values to stage into [locked_pvh]. Only
 * bits within PVH_MUTABLE_FLAGS are applied; all others are ignored.
 */
static inline void
pvh_set_flags(locked_pvh_t *locked_pvh, uintptr_t flags)
{
locked_pvh->pvh = (locked_pvh->pvh & ~PVH_MUTABLE_FLAGS) | (flags & PVH_MUTABLE_FLAGS);
}
/**
* Update a pv_head_table entry/pointer to be a different type and/or point to
* a different object.
*
* @note This function does not actually modify the pv_head_table,
* it only installs an updated pv_head_table entry in [locked_pvh]
* that can later be passed to pvh_unlock() to update the actual array
* entry.
*
* @param locked_pvh A wrapper struct containing the pv_head_table
* entry/pointer to update.
* @param pvep The new entry to use. This could be either a pt_entry_t*,
* pv_entry_t*, or pt_desc_t* depending on the type.
* @param type The type of the new entry.
*/
static inline void
pvh_update_head(locked_pvh_t *locked_pvh, void *pvep, unsigned int type)
{
assert(!((uintptr_t)pvep & PVH_TYPE_MASK));
const uintptr_t pvh_flags = locked_pvh->pvh & PVH_HIGH_FLAGS;
locked_pvh->pvh = ((uintptr_t)pvep & ~PVH_HIGH_FLAGS) | type | pvh_flags;
}
/**
* Given a page table entry pointer retrieved from the pv_head_table (from an
* entry of type PVH_TYPE_PTEP or PVH_TYPE_PVEP), return back whether the PTE is
* an IOMMU mapping.
*
 * @note This function determines whether the passed-in pointer refers to an
 * IOMMU PTE by checking for a special flag stored in the lower bits of the
 * pointer. This flag is only set on pointers stored in the pv_head_table, so
 * this function only works on pointers retrieved from the pv_head_table. If a
 * PTE pointer were obtained directly from an IOMMU's page tables, this
 * function would always return false even if it actually is an IOMMU PTE.
*
* @param ptep A PTE pointer obtained from the pv_head_table to check.
*
* @return True if the entry is an IOMMU mapping, false otherwise.
*/
static inline bool
pvh_ptep_is_iommu(const pt_entry_t *ptep)
{
#ifdef PVH_FLAG_IOMMU
return (uintptr_t)ptep & PVH_FLAG_IOMMU;
#else /* PVH_FLAG_IOMMU */
#pragma unused(ptep)
return false;
#endif /* PVH_FLAG_IOMMU */
}
/**
* Sometimes the PTE pointers retrieved from the pv_head_table (from an entry of
* type PVH_TYPE_PTEP or PVH_TYPE_PVEP) contain flags themselves. This function
 * strips out those flags and returns back a dereferenceable pointer.
*
* @param ptep The PTE pointer to strip out the unwanted flags.
*
 * @return A valid dereferenceable pointer to the page table entry.
*/
static inline const pt_entry_t*
pvh_strip_ptep(const pt_entry_t *ptep)
{
#ifdef PVH_FLAG_IOMMU
const uintptr_t pte_va = (uintptr_t)ptep;
return (const pt_entry_t*)((pte_va & ~PVH_FLAG_IOMMU) | PVH_FLAG_IOMMU_TABLE);
#else /* PVH_FLAG_IOMMU */
return ptep;
#endif /* PVH_FLAG_IOMMU */
}
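/**
 * Illustrative sketch (hypothetical caller): a PTE pointer taken from the
 * pv_head_table must be checked for the IOMMU marker and stripped of PV-list
 * flags before being dereferenced:
 *
 *     pt_entry_t *ptep = pvh_ptep(pvh);          // or pve_get_ptep(pvep, idx)
 *     if (pvh_ptep_is_iommu(ptep)) {
 *         // IOMMU-owned mapping: points to an IOMMU PTE or descriptor,
 *         // depending on PVH_FLAG_IOMMU_TABLE.
 *     } else {
 *         const pt_entry_t pte = *pvh_strip_ptep(ptep);
 *     }
 */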
/**
* PVH_TYPE_PVEP Helper Functions.
*
* The following are methods used to manipulate PVE lists. This is the type of
* pv_head_table entry used when there are multiple mappings to a single
* physical page.
*/
/**
* Whether a physical page is using "alternate accounting" (ALTACCT) for its
* ledger statistics is something that needs to be tracked on a per-mapping
* basis, not on a per-physical-page basis. Because of that, it's tracked
* differently depending on whether there's a single mapping to a page
* (PVH_TYPE_PTEP) or multiple (PVH_TYPE_PVEP). For single mappings, the bit is
* tracked in the pp_attr_table. But when there are multiple mappings, the least
* significant bit of the corresponding "pve_pte" pointer in each pv_entry object
* is used as a marker for pages using alternate accounting.
*
* @note See the definition for PP_ATTR_ALTACCT for a more detailed description
* of what "alternate accounting" actually means in respect to the
* footprint ledger.
*
 * Since some code (e.g. KernelDiskImages) might map a physical page as
* "device" memory (i.e. external) while it's also being used as regular
* "anonymous" memory (i.e. internal) in user space, we have to manage the
* "internal" attribute per mapping rather than per physical page.
* When there are multiple mappings, we use the next least significant bit of
* the corresponding "pve_pte" pointer for that.
*/
#define PVE_PTEP_ALTACCT ((uintptr_t) 0x1)
#define PVE_PTEP_INTERNAL ((uintptr_t) 0x2)
#define PVE_PTEP_FLAGS (PVE_PTEP_ALTACCT | PVE_PTEP_INTERNAL)
/**
* Set the ALTACCT bit for a specific PTE pointer.
*
* @param pvep A pointer to the current pv_entry mapping in the linked list of
* mappings.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline void
pve_set_altacct(pv_entry_t *pvep, unsigned idx)
{
assert(idx < PTE_PER_PVE);
pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_ALTACCT);
}
/**
* Set the INTERNAL bit for a specific PTE pointer.
*
* @param pvep A pointer to the current pv_entry mapping in the linked list of
* mappings.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline void
pve_set_internal(pv_entry_t *pvep, unsigned idx)
{
assert(idx < PTE_PER_PVE);
pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] | PVE_PTEP_INTERNAL);
}
/**
* Clear the ALTACCT bit for a specific PTE pointer.
*
* @param pvep A pointer to the current pv_entry mapping in the linked list of
* mappings.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline void
pve_clr_altacct(pv_entry_t *pvep, unsigned idx)
{
assert(idx < PTE_PER_PVE);
pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_ALTACCT);
}
/**
* Clear the INTERNAL bit for a specific PTE pointer.
*
* @param pvep A pointer to the current pv_entry mapping in the linked list of
* mappings.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline void
pve_clr_internal(pv_entry_t *pvep, unsigned idx)
{
assert(idx < PTE_PER_PVE);
pvep->pve_ptep[idx] = (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_INTERNAL);
}
/**
* Return the ALTACCT bit for a specific PTE pointer.
*
* @param pvep A pointer to the current pv_entry mapping in the linked list of
* mappings.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline bool
pve_get_altacct(pv_entry_t *pvep, unsigned idx)
{
assert(idx < PTE_PER_PVE);
return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_ALTACCT;
}
/**
* Return the INTERNAL bit for a specific PTE pointer.
*
* @param pvep A pointer to the current pv_entry mapping in the linked list of
* mappings.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline bool
pve_get_internal(pv_entry_t *pvep, unsigned idx)
{
assert(idx < PTE_PER_PVE);
return (uintptr_t)pvep->pve_ptep[idx] & PVE_PTEP_INTERNAL;
}
/**
* Return the next mapping (pv_entry) in a linked list of mappings. This applies
* to pv_head_table entries of type PVH_TYPE_PVEP.
*
* @param pvep A pointer to the current pv_entry mapping in the linked list of
* mappings.
*
* @return The next virtual mapping for a physical page, or PV_ENTRY_NULL if the
* end of the list has been reached.
*/
static inline pv_entry_t *
pve_next(pv_entry_t *pvep)
{
return pvep->pve_next;
}
/**
* Return a pointer to the pve_next field in a pv_entry. This value is used
* when adding and removing entries to a PVE list.
*
* @param pvep The pv_entry whose pve_next field is being accessed.
*
* @return Pointer to the pve_next field.
*/
static inline pv_entry_t **
pve_next_ptr(pv_entry_t *pvep)
{
return &pvep->pve_next;
}
/**
* Return a pointer to the page table entry for this mapping.
*
* @param pvep The pv_entry whose pve_ptep field is to be returned.
* @param idx Index of the chosen PTE pointer inside the PVE.
*
* @return Pointer to the page table entry.
*/
static inline pt_entry_t *
pve_get_ptep(pv_entry_t *pvep, unsigned idx)
{
assert(idx < PTE_PER_PVE);
return (pt_entry_t *)((uintptr_t)pvep->pve_ptep[idx] & ~PVE_PTEP_FLAGS);
}
/**
* Update the page table entry for a specific physical to virtual mapping.
*
* @param pvep The pv_entry to update.
* @param idx Index of the chosen PTE pointer inside the PVE.
* @param ptep_new The new page table entry.
*/
static inline void
pve_set_ptep(pv_entry_t *pvep, unsigned idx, pt_entry_t *ptep_new)
{
assert(idx < PTE_PER_PVE);
pvep->pve_ptep[idx] = ptep_new;
}
/**
* Initialize all fields in a PVE to NULL.
*
* @param pvep The pv_entry to initialize.
*/
static inline void
pve_init(pv_entry_t *pvep)
{
pvep->pve_next = PV_ENTRY_NULL;
for (int i = 0; i < PTE_PER_PVE; i++) {
pvep->pve_ptep[i] = PT_ENTRY_NULL;
}
}
/**
* Find PTE pointer in PVE and return its index.
*
* @param pvep The PVE to search.
* @param ptep PTE to search for.
*
* @return Index of the found entry, or -1 if no entry exists.
*/
static inline int
pve_find_ptep_index(pv_entry_t *pvep, pt_entry_t *ptep)
{
for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
if (pve_get_ptep(pvep, i) == ptep) {
return (int)i;
}
}
return -1;
}
/**
 * Return whether no PTEs are currently associated with this PVE.
*
* @param pvep The PVE to search.
*
 * @return True if no PTEs are currently associated with this PVE, false otherwise.
*/
static inline bool
pve_is_empty(pv_entry_t *pvep)
{
for (unsigned int i = 0; i < PTE_PER_PVE; i++) {
if (pve_get_ptep(pvep, i) != PT_ENTRY_NULL) {
return false;
}
}
return true;
}
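/**
 * Illustrative sketch (hypothetical caller): walking every mapping of a
 * physical page whose pv_head_table entry is of type PVH_TYPE_PVEP. The PVH
 * lock for the page is assumed to be held:
 *
 *     for (pv_entry_t *pvep = pvh_pve_list(locked_pvh.pvh);
 *          pvep != PV_ENTRY_NULL; pvep = pve_next(pvep)) {
 *         for (unsigned idx = 0; idx < PTE_PER_PVE; idx++) {
 *             pt_entry_t *ptep = pve_get_ptep(pvep, idx);
 *             if (ptep == PT_ENTRY_NULL) {
 *                 continue;
 *             }
 *             // ... process the mapping ...
 *         }
 *     }
 */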
/**
* Prepend a new pv_entry node to a PVE list.
*
* @note This function does not actually modify the pv_head_table,
* it only installs an updated pv_head_table entry in [locked_pvh]
* that can later be passed to pvh_unlock() to update the actual array
* entry.
*
* @param locked_pvh A wrapper struct containing the pv_head_table
* entry/pointer to update. This entry represents
* the linked list of mappings to update.
* @param pvep The new mapping to add to the linked list.
*/
static inline void
pve_add(locked_pvh_t *locked_pvh, pv_entry_t *pvep)
{
assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
pvep->pve_next = pvh_pve_list(locked_pvh->pvh);
pvh_update_head(locked_pvh, pvep, PVH_TYPE_PVEP);
}
/**
* Remove an entry from a PVE list of mappings.
*
* @note This function does not actually modify the pv_head_table,
* it only installs an updated pv_head_table entry in [locked_pvh]
* that can later be passed to pvh_unlock() to update the actual array
* entry.
*
* @param locked_pvh A wrapper struct containing the pv_head_table entry/pointer
* to update. This entry represents the linked list of mappings
* from which to remove an entry.
* @param pvepp A pointer to the pv_entry_t* that's being removed. If this entry
* is the first in the linked list of mappings, then NULL should be
* passed here and the removal will be reflected in the returned
* pv_head_table entry.
* @param pvep The entry that should be removed. Should be identical to a
* dereference of the pvepp parameter (unless it's the pv_head_table
* entry).
*/
static inline void
pve_remove(locked_pvh_t *locked_pvh, pv_entry_t **pvepp, pv_entry_t *pvep)
{
assert(pvh_test_type(locked_pvh->pvh, PVH_TYPE_PVEP));
if (pvepp == NULL) {
assertf(pvh_pve_list(locked_pvh->pvh) == pvep, "%s: pvh %p != pvep %p",
__func__, (void*)locked_pvh->pvh, pvep);
if (pve_next(pvep) == PV_ENTRY_NULL) {
/* The last mapping to this page is being removed. */
pvh_update_head(locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
} else {
/**
* There are still mappings left, make the next one the new head of
* the list. This effectively removes the first entry from the list.
*/
pvh_update_head(locked_pvh, pve_next(pvep), PVH_TYPE_PVEP);
}
} else {
/**
* Move the previous entry's next field to the entry after the one being
* removed. This will clobber the ALTACCT and INTERNAL bits.
*/
*pvepp = pve_next(pvep);
}
}
/**
* PVH_TYPE_PTDP Types and Helper Functions.
*
* The following are types and methods used to manipulate page table descriptor
* (PTD) objects. This is the type of pv_head_table entry used when a page is
* being used as a page table.
*/
/**
* Page table descriptor (PTD) info structure.
*
* Contains information about a page table. These pieces of data are separate
* from the PTD itself because in address spaces where the VM page size doesn't
* match the underlying hardware page size, one PTD could represent multiple
* page tables (and so will need multiple PTD info structures).
*
* These fields are also in their own struct so that they can be allocated
* separately from the associated pt_desc_t object. This allows us to allocate
* the counts in this structure in a way that ensures they don't fall within the
* same cache line as the main pt_desc_t object. This is important because the
* fields in this structure are atomically updated which could cause false
* sharing cache performance issues with the "va" field in pt_desc_t if all of
* the fields were within the same structure.
*/
typedef struct {
/*
* For non-leaf pagetables, should be 0.
* For leaf pagetables, should reflect the number of wired entries.
* For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU
* operations are implicitly wired).
*/
unsigned short wiredcnt;
} ptd_info_t;
/**
* This type is used to identify a specific IOMMU driver and an instance of
* that driver which owns a specific page or page table. This type will be used
* within both PTD and PVE lists to track IOMMU-owned pages and IOMMU mappings
* respectively.
*
* Despite the fact this value is not a pointer, we need to make this value sort
* of look like a kernel pointer: the bottom 3-bits must be zero and the upper
* bits must all be ones by default. This is due to the fact that this type can
* be embedded into the PVH table to represent an IOMMU mapping. The PVH table
* code expects "kernel-pointer-like" properties so it can store flags in those
* areas of the 64-bit value.
*/
typedef uint64_t iommu_instance_t;
/* 8-bit ID of the IOMMU driver which the instance derives from. */
#define IOMMU_ID_SHIFT 8U
#define IOMMU_ID_MASK 0x000000000000FF00ULL
#define GET_IOMMU_ID(x) ((sptm_iommu_id_t)(((x) & IOMMU_ID_MASK) >> IOMMU_ID_SHIFT))
#define SET_IOMMU_ID(x) (((uint64_t)(x) << IOMMU_ID_SHIFT) & IOMMU_ID_MASK)
/**
* An IOMMU token is a 32-bit value unique to each instance of an IOMMU driver.
* This is strictly used to help with debugging and provides a mechanism to
* trace a mapping or page table back to the exact IOMMU instance that owns it.
* Typically, this would be the instance ID, but for drivers that use only a
* single global instance, this could be something else like a root page table
* ppnum_t.
*/
#define IOMMU_TOKEN_SHIFT 16U
#define IOMMU_TOKEN_MASK 0x0000FFFFFFFF0000ULL
#define GET_IOMMU_TOKEN(x) ((iommu_token_t)(((x) & IOMMU_TOKEN_MASK) >> IOMMU_TOKEN_SHIFT))
#define SET_IOMMU_TOKEN(x) (((uint64_t)(x) << IOMMU_TOKEN_SHIFT) & IOMMU_TOKEN_MASK)
/**
* The default value for iommu_instance_t. See the type definition for more
* details on why the upper bits need to initially be all ones.
*/
#define IOMMU_INSTANCE_DEFAULT 0xFFFF000000000000ULL
/**
* Since "zero" is a valid IOMMU ID and token, the "NULL" value of an IOMMU
* instance sets the ID and token to all ones as a sentinel invalid value.
*/
#define IOMMU_INSTANCE_NULL 0xFFFFFFFFFFFFFF00ULL
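/**
 * Illustrative sketch (hypothetical ID/token values): composing and
 * decomposing an iommu_instance_t with the accessors above:
 *
 *     iommu_instance_t inst = IOMMU_INSTANCE_DEFAULT |
 *         SET_IOMMU_ID(3) | SET_IOMMU_TOKEN(0xDEAD);
 *     sptm_iommu_id_t id = GET_IOMMU_ID(inst);        // == 3
 *     iommu_token_t token = GET_IOMMU_TOKEN(inst);    // == 0xDEAD
 */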
/**
* Page Table Descriptor (PTD).
*
* Provides a per-table data structure and a way of keeping track of all page
* tables in the system.
*
* This structure is also used as a convenient way of keeping track of IOMMU
* pages (which may or may not be used as page tables). In that case the SPTM
* frame type for the page will be XNU_IOMMU, the "iommu" field will describe
* the owner of the page, and ptd_info[0].wiredcnt can be used as an arbitrary
* refcnt controlled by the IOMMU driver.
*/
typedef struct pt_desc {
/* Each page table is either owned by a pmap or a specific IOMMU. */
union {
struct pmap *pmap;
};
/**
* The following fields contain per-page-table properties, and as such,
* might have multiple elements each. This is due to a single PTD
* potentially representing multiple page tables (in address spaces where
* the VM page size differs from the hardware page size). Use the
* ptd_get_index() function to get the correct index for a specific page
* table.
*/
/**
* The first address of the virtual address space this page table is
* translating for, or a value set by an IOMMU driver if this PTD is being
* used to track an IOMMU page.
*/
vm_offset_t va;
/**
* ptd_info_t's are allocated separately so as to reduce false sharing
* with the va field. This is desirable because ptd_info_t's are updated
* atomically from all CPUs.
*/
ptd_info_t *ptd_info;
} pt_desc_t;
/**
* Per-CPU structure for tracking in-flight SPTM retype operations.
*
* This structure is intended to be embedded in the pmap per-CPU data object,
* and is meant to be used for situations in which the caller needs to ensure
* that potentially sensitive concurrent SPTM operations have completed on other
* CPUs prior to retyping a page. If these sensitive operations haven't completed
* when the retype occurs, and they happen to involve the page being retyped
* (either directly or through mappings thereof), an SPTM violation panic may
* result.
*/
typedef struct {
/**
* Critical section sequence number of the local CPU. A value of zero
* indicates that no retype epoch critical section is currently active on
* the CPU.
*/
uint64_t local_seq;
/**
* The sequence number to use the next time a retype epoch critical section
* is entered on the local CPU. This should monotonically increase.
*/
uint64_t next_seq;
/**
* This array stores the retype sequence numbers observed on remote CPUs.
* When the local CPU needs to wait for critical sections to complete on
* other CPUs, this is intended to provide an initial sample of those other
* CPUs' critical section state. The caller can then wait for each remote
* CPU's sequence number to return to zero or advance beyond the value
* stored in its entry in this array.
*/
uint64_t remote_seq[MAX_CPUS];
/**
* Flags used to track the state of an active retype epoch drain operation
* on the local CPU.
*/
/**
* This flag indicates that a drain operation has been prepared on the
* local CPU by sampling remote CPU epoch states into the remote_seq array.
* This must be set before the drain operation can be performed.
*/
#define PMAP_RETYPE_EPOCH_PREPARED (1 << 0)
/**
* This flag indicates that one or more remote CPUs had a non-zero retype
* epoch value when the remote_seq array was most recently sampled.
* If this flag is not set, then we already know that no remote CPUs can
* be in a critical section in which prior mapping state for the page to
* be retyped may have been observed, so we can skip the drain operation.
*/
#define PMAP_RETYPE_EPOCH_DRAIN_REQUIRED (1 << 1)
uint8_t flags;
} pmap_retype_epoch_t;
#define PMAP_SPTM_PCPU_ALIGN (8192)
typedef struct {
/**
* Per-CPU array of SPTM_MAPPING_LIMIT PTE records, obtained from SPTM
* during bootstrap.
*/
sptm_pte_t *sptm_prev_ptes;
/**
* A piece of per-cpu scratch memory used by IOMMU drivers when passing data
* into the SPTM. The size is defined by PMAP_IOMMU_SCRATCH_SIZE.
*/
void *sptm_iommu_scratch;
/* Accumulator for batched disjoint SPTM ops, to avoid excessive stack usage. */
sptm_disjoint_op_t sptm_ops[SPTM_MAPPING_LIMIT];
/* Accumulator for batched VA-contiguous SPTM ops, to avoid excessive stack usage. */
sptm_pte_t sptm_templates[SPTM_MAPPING_LIMIT];
/* Base PA of ops array, for passing the ops into the SPTM. */
pmap_paddr_t sptm_ops_pa;
/* Base PA of templates array, for passing templates into the SPTM. */
pmap_paddr_t sptm_templates_pa;
/* PMAP pagetable descriptors associated with each element of sptm_ops. */
pt_desc_t *sptm_ptds[SPTM_MAPPING_LIMIT];
/* PTD info objects associated with each pmap PTE pointer. */
ptd_info_t *sptm_ptd_info[SPTM_MAPPING_LIMIT];
/* Accounting-related flags for each element of sptm_ops. */
#define PMAP_SPTM_FLAG_INTERNAL (0x1)
#define PMAP_SPTM_FLAG_ALTACCT (0x2)
uint8_t sptm_acct_flags[SPTM_MAPPING_LIMIT];
/* Retype epoch tracking structure. */
pmap_retype_epoch_t retype_epoch;
/* Guest virtual machine dispatch structure. */
sptm_guest_dispatch_t sptm_guest_dispatch;
/* Guest virtual machine dispatch structure physical address. */
pmap_paddr_t sptm_guest_dispatch_paddr;
/* SPTM Logical CPU ID */
uint16_t sptm_cpu_id;
/* Read index associated with this CPU's SPTM trace buffer */
uint64_t sptm_trace_buffer_read_index;
} __attribute__((aligned(PMAP_SPTM_PCPU_ALIGN))) pmap_sptm_percpu_data_t;
_Static_assert((PAGE_SIZE % PMAP_SPTM_PCPU_ALIGN) == 0,
"SPTM per-CPU data alignment does not fit evenly within a page");
_Static_assert(sizeof(pmap_sptm_percpu_data_t) <= PMAP_SPTM_PCPU_ALIGN,
"sizeof(pmap_sptm_percpu_data_t) is larger than PMAP_SPTM_PCPU_ALIGN");
PERCPU_DECL(pmap_sptm_percpu_data_t, pmap_sptm_percpu);
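/**
 * Illustrative sketch (hypothetical caller; assumes the standard PERCPU_GET()
 * accessor): the SPTM per-CPU data must be accessed with preemption disabled
 * so the thread cannot migrate while the batched-operation accumulators are
 * in use:
 *
 *     disable_preemption();
 *     pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
 *     // ... stage up to SPTM_MAPPING_LIMIT entries in sptm_pcpu->sptm_ops,
 *     //     then hand sptm_pcpu->sptm_ops_pa to the SPTM ...
 *     enable_preemption();
 */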
/**
* Convert a pv_head_table entry/pointer into a page table descriptor pointer.
* This should only be done if the type of this entry is PVH_TYPE_PTDP.
*
* @param pvh The pv_head_table entry/pointer to convert into a safe to
* dereference pt_desc_t*.
*
 * @return Return back a safe to dereference pointer to the page table descriptor
* for this physical page by masking off the TYPE bits and adding any
* missing flags to the upper portion of the pointer.
*/
static inline pt_desc_t*
pvh_ptd(uintptr_t pvh)
{
return (pt_desc_t *)((pvh & PVH_LIST_MASK) | PVH_HIGH_FLAGS);
}
/**
* Given an arbitrary page table entry, return back the page table descriptor
* (PTD) object for the page table that contains that entry.
*
* @param ptep Pointer to a PTE whose page table descriptor object to return.
*
* @return The PTD object for the passed in page table.
*/
static inline pt_desc_t *
ptep_get_ptd(const pt_entry_t *ptep)
{
assert(ptep != NULL);
const vm_offset_t pt_base_va = (vm_offset_t)ptep;
uintptr_t pvh = pai_to_pvh(pa_index(kvtophys(pt_base_va)));
if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
panic("%s: invalid PV head 0x%llx for PTE %p", __func__, (uint64_t)pvh, ptep);
}
return pvh_ptd(pvh);
}
/**
* Given an arbitrary page table entry, return back the pmap that owns that
* page table.
*
* @note This won't work correctly for page tables owned by IOMMUs, because
 * those tables aren't owned by any specific pmap.
*
* @param ptep Pointer to a page table entry whose owner we're trying to return.
*
* @return The pmap that owns the given page table entry.
*/
static inline struct pmap *
ptep_get_pmap(const pt_entry_t *ptep)
{
return ptep_get_ptd(ptep)->pmap;
}
/**
* Given an arbitrary translation table entry, get the page table descriptor
* (PTD) object for the page table pointed to by the TTE.
*
* @param tte The translation table entry to parse. For instance, if this is an
* L2 TTE, then the PTD for the L3 table this entry points to will be
* returned.
*
* @return The page table descriptor (PTD) for the page table pointed to by this
* TTE.
*/
static inline pt_desc_t *
tte_get_ptd(const tt_entry_t tte)
{
const vm_offset_t pt_base_va = (vm_offset_t)(tte & ~((tt_entry_t)PAGE_MASK));
uintptr_t pvh = pai_to_pvh(pa_index(pt_base_va));
if (__improbable(!pvh_test_type(pvh, PVH_TYPE_PTDP))) {
panic("%s: invalid PV head 0x%llx for TTE 0x%llx", __func__, (uint64_t)pvh, (uint64_t)tte);
}
return pvh_ptd(pvh);
}
/**
* This function returns the ptd_info_t structure associated with a given
* page table descriptor.
*
* @param ptd The page table descriptor that's being accessed.
*
* @return ptd_info_t structure associated with [ptd].
*/
static inline ptd_info_t *
ptd_get_info(pt_desc_t *ptd)
{
assert(ptd != NULL);
return ptd->ptd_info;
}
/**
* Given a pointer to a page table entry, return back the ptd_info structure
* for the page table that contains that entry.
*
* @param ptep Pointer to a PTE whose ptd_info object to return.
*
* @return The ptd_info object for the page table that contains the passed in
* page table entry.
*/
static inline ptd_info_t *
ptep_get_info(const pt_entry_t *ptep)
{
return ptd_get_info(ptep_get_ptd(ptep));
}
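/**
 * Illustrative sketch (hypothetical caller): the wired count of the page table
 * containing a given PTE is typically adjusted through this accessor, e.g.
 * when wiring a mapping:
 *
 *     ptd_info_t *ptd_info = ptep_get_info(ptep);
 *     os_atomic_inc(&ptd_info->wiredcnt, relaxed);
 */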
/**
* Return the virtual address mapped by the passed in leaf page table entry,
* using an already-retrieved pagetable descriptor.
*
* @param ptdp pointer to the descriptor for the pagetable containing ptep
* @param ptep Pointer to a PTE to parse
*/
static inline vm_map_address_t
ptd_get_va(const pt_desc_t *ptdp, const pt_entry_t *ptep)
{
const pt_attr_t * const pt_attr = pmap_get_pt_attr(ptdp->pmap);
vm_map_address_t va = ptdp->va;
const uint64_t pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(ptdp->pmap));
const vm_offset_t ptep_page = (vm_offset_t)ptep >> pmap_page_shift;
/**
* Use the difference between the VM page shift and the hardware page shift
* to get the index of the correct page table. In practice, this equates to
 * extracting the bottom two bits of the PTE's hardware page number, which give
 * the index of the L3 table within this PTD, in address spaces where the VM
 * page size is greater than the hardware page size. In address
* spaces where they're identical, the index will always be zero.
*/
const unsigned int ttep_index = ptep_page & ((1U << (PAGE_SHIFT - pmap_page_shift)) - 1);
va += ttep_index * pt_attr_twig_size(pt_attr);
/* Increment VA now to target the VA space covered by this specific PTE */
const vm_offset_t ptep_index = ((vm_offset_t)ptep & pt_attr_leaf_offmask(pt_attr)) / sizeof(*ptep);
va += (ptep_index << pt_attr_leaf_shift(pt_attr));
return va;
}
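/**
 * Worked example (illustrative, with assumed geometry): with a 16K VM page
 * size over 4K hardware pages (PAGE_SHIFT == 14, pt_attr_leaf_shift() == 12),
 * one PTD covers four 4K leaf tables. For a PTE at byte offset 0x1008 within
 * that 16K page, ptep_page & 0x3 == 1 selects the second leaf table, so
 * va = ptdp->va + 1 * pt_attr_twig_size(pt_attr); the PTE's index within its
 * table is 0x008 / sizeof(pt_entry_t) == 1, adding one further
 * (1 << pt_attr_leaf_shift(pt_attr)) bytes to the result.
 */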
/**
* Return the virtual address that is being mapped by the passed in leaf page
* table entry.
*
* @param ptep Pointer to a PTE to parse.
*/
static inline vm_map_address_t
ptep_get_va(const pt_entry_t *ptep)
{
return ptd_get_va(ptep_get_ptd(ptep), ptep);
}
/**
* Physical Page Attribute Table (pp_attr_table) defines and helper functions.
*/
/* How many bits to use for flags on a per-VM-page basis. */
typedef uint16_t pp_attr_t;
/* See the definition of pp_attr_table for more information. */
extern volatile pp_attr_t* pp_attr_table;
/**
* Flags stored in the pp_attr_table on a per-physical-page basis.
*
* Please update the pv_walk LLDB macro if these flags are changed or added to.
*/
/**
* The bottom 6-bits are used to store the default WIMG (cacheability and memory
* type) setting for this physical page. This can be changed by calling
* pmap_set_cache_attributes().
*
* If a default WIMG setting isn't set for a page, then the default is Normal,
* Cached memory (VM_WIMG_DEFAULT).
*/
#define PP_ATTR_WIMG_MASK 0x003F
#define PP_ATTR_WIMG(x) ((x) & PP_ATTR_WIMG_MASK)
/**
* The reference and modify bits keep track of whether a page has been accessed
* or modified since the last time the bits were cleared. These bits are used to
* enforce policy decisions in the VM layer.
*/
#define PP_ATTR_REFERENCED 0x0040
#define PP_ATTR_MODIFIED 0x0080
/**
* This physical page is being used as anonymous memory that's internally
* managed by the VM and is not connected to an external pager. This flag is
* only set/cleared on the first CPU mapping of a page (see PVH_FLAG_CPU). Any
* subsequent mappings won't set/clear this flag until all mappings are removed
* and a new CPU mapping is added.
*/
#define PP_ATTR_INTERNAL 0x0100
/**
* This flag is used to keep track of pages that are still resident but are not
* considered dirty and can be reclaimed under memory pressure. These pages do
* not count as a part of the memory footprint, so the footprint ledger does not
* need to be updated for these pages. This is hinted to the VM by the
* `madvise(MADV_FREE_REUSABLE)` system call.
*/
#define PP_ATTR_REUSABLE 0x0200
/**
* This flag denotes that a page is utilizing "alternate accounting". This means
* that the pmap doesn't need to keep track of these pages with regards to the
* footprint ledger because the VM is already accounting for them in a different
* way. These include IOKit mappings (VM adds their entire virtual size to the
* footprint), and purgeable pages (VM counts them only when non-volatile and
* only for one "owner"), among others.
*
* Note that alternate accounting status is tracked on a per-mapping basis (not
* per-page). Because of that the ALTACCT flag in the pp_attr_table is only used
* when there's a single mapping to a page. When there are multiple mappings,
* the status of this flag is tracked in the pv_head_table (see PVE_PTEP_ALTACCT
* above).
*/
#define PP_ATTR_ALTACCT 0x0400
/**
* This bit was originally used on x86 to keep track of what pages to not
* encrypt during the hibernation process as a performance optimization when
* encryption was done in software. This doesn't apply to the ARM
* hibernation process because all pages are automatically encrypted using
* hardware acceleration. Despite that, the pmap still keeps track of this flag
* as a debugging aid on internal builds.
*
* TODO: This bit can probably be reclaimed:
* rdar://70740650 (PMAP Cleanup: Potentially reclaim the PP_ATTR_NOENCRYPT bit on ARM)
*/
#define PP_ATTR_NOENCRYPT 0x0800
/**
* These bits denote that a physical page is expecting the next access or
* modification to set the PP_ATTR_REFERENCED and PP_ATTR_MODIFIED flags
* respectively.
*/
#define PP_ATTR_REFFAULT 0x1000
#define PP_ATTR_MODFAULT 0x2000
/**
* Atomically set some flags in a pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
* @param bits The flags to set in the entry.
*/
static inline void
ppattr_set_bits(unsigned int pai, pp_attr_t bits)
{
volatile pp_attr_t *ppattr = &pp_attr_table[pai];
os_atomic_or(ppattr, bits, relaxed);
}
/**
* Atomically clear some flags in a pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
* @param bits The flags to clear in the entry.
*/
static inline void
ppattr_clear_bits(unsigned int pai, pp_attr_t bits)
{
volatile pp_attr_t *ppattr = &pp_attr_table[pai];
os_atomic_andnot(ppattr, bits, relaxed);
}
/**
* General-purpose function for atomically modifying flags in a pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
* @param bits_to_clear Mask of bits to atomically clear from the entry.
* @param bits_to_set Mask of bits to atomically set in the entry.
*
* @note [bits_to_clear] and [bits_to_set] must not overlap.
*/
static inline void
ppattr_modify_bits(unsigned int pai, pp_attr_t bits_to_clear, pp_attr_t bits_to_set)
{
assert((bits_to_set & bits_to_clear) == 0);
pp_attr_t prev_ppattr, new_ppattr;
os_atomic_rmw_loop(&pp_attr_table[pai], prev_ppattr, new_ppattr, relaxed, {
new_ppattr = (prev_ppattr & ~bits_to_clear) | bits_to_set;
});
}
/**
* Return true if the pp_attr_table entry contains the passed in bits.
*
* @param pai The physical address index for the entry to test.
* @param bits The flags to check for.
*/
static inline bool
ppattr_test_bits(unsigned int pai, pp_attr_t bits)
{
const volatile pp_attr_t *ppattr = &pp_attr_table[pai];
return (*ppattr & bits) == bits;
}
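/**
 * Illustrative sketch (hypothetical caller): the helpers above are typically
 * combined to update per-page state atomically, e.g. marking a page referenced
 * and modified, or converting a "fault expected" bit into the real bit:
 *
 *     ppattr_set_bits(pai, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
 *     ppattr_modify_bits(pai, PP_ATTR_REFFAULT, PP_ATTR_REFERENCED);
 *     if (ppattr_test_bits(pai, PP_ATTR_MODIFIED)) {
 *         // ... page has been written since the flag was last cleared ...
 *     }
 */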
/**
* Only set some flags in a pp_attr_table entry if the passed in physical
* address is a kernel-managed address.
*
* @param pa The physical address for the entry to update.
* @param bits The flags to set in the entry.
*/
static inline void
ppattr_pa_set_bits(pmap_paddr_t pa, pp_attr_t bits)
{
if (pa_valid(pa)) {
ppattr_set_bits(pa_index(pa), bits);
}
}
/**
* Only clear some flags in a pp_attr_table entry if the passed in physical
* address is a kernel-managed address.
*
* @param pa The physical address for the entry to update.
* @param bits The flags to clear in the entry.
*/
static inline void
ppattr_pa_clear_bits(pmap_paddr_t pa, pp_attr_t bits)
{
if (pa_valid(pa)) {
ppattr_clear_bits(pa_index(pa), bits);
}
}
/**
* Only test flags in a pp_attr_table entry if the passed in physical address
* is a kernel-managed page.
*
* @param pa The physical address for the entry to test.
* @param bits The flags to check for.
*
* @return False if the PA isn't a kernel-managed page, otherwise true/false
* depending on whether the bits are set.
*/
static inline bool
ppattr_pa_test_bits(pmap_paddr_t pa, pp_attr_t bits)
{
return pa_valid(pa) ? ppattr_test_bits(pa_index(pa), bits) : false;
}
/**
* Set the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the passed
* in physical address is a kernel-managed page.
*
* @param pa The physical address for the entry to update.
*/
static inline void
ppattr_pa_set_modify(pmap_paddr_t pa)
{
ppattr_pa_set_bits(pa, PP_ATTR_MODIFIED);
}
/**
* Clear the PP_ATTR_MODIFIED flag on a specific pp_attr_table entry if the
* passed in physical address is a kernel-managed page.
*
* @param pa The physical address for the entry to update.
*/
static inline void
ppattr_pa_clear_modify(pmap_paddr_t pa)
{
ppattr_pa_clear_bits(pa, PP_ATTR_MODIFIED);
}
/**
* Set the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
* passed in physical address is a kernel-managed page.
*
* @param pa The physical address for the entry to update.
*/
static inline void
ppattr_pa_set_reference(pmap_paddr_t pa)
{
ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
}
/**
* Clear the PP_ATTR_REFERENCED flag on a specific pp_attr_table entry if the
* passed in physical address is a kernel-managed page.
*
* @param pa The physical address for the entry to update.
*/
static inline void
ppattr_pa_clear_reference(pmap_paddr_t pa)
{
ppattr_pa_clear_bits(pa, PP_ATTR_REFERENCED);
}
/**
* Set the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_set_internal(unsigned int pai)
{
ppattr_set_bits(pai, PP_ATTR_INTERNAL);
}
/**
* Clear the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_clear_internal(unsigned int pai)
{
ppattr_clear_bits(pai, PP_ATTR_INTERNAL);
}
/**
* Return true if the pp_attr_table entry has the PP_ATTR_INTERNAL flag set.
*
* @param pai The physical address index for the entry to test.
*/
static inline bool
ppattr_test_internal(unsigned int pai)
{
return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
}
/**
* Set the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_set_reusable(unsigned int pai)
{
ppattr_set_bits(pai, PP_ATTR_REUSABLE);
}
/**
* Clear the PP_ATTR_REUSABLE flag on a specific pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_clear_reusable(unsigned int pai)
{
ppattr_clear_bits(pai, PP_ATTR_REUSABLE);
}
/**
* Return true if the pp_attr_table entry has the PP_ATTR_REUSABLE flag set.
*
* @param pai The physical address index for the entry to test.
*/
static inline bool
ppattr_test_reusable(unsigned int pai)
{
return ppattr_test_bits(pai, PP_ATTR_REUSABLE);
}
/**
* Set the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
*
* @note This is only valid when the ALTACCT flag is being tracked using the
* pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
* PP_ATTR_ALTACCT definitions for more information.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_set_altacct(unsigned int pai)
{
ppattr_set_bits(pai, PP_ATTR_ALTACCT);
}
/**
* Clear the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
*
* @note This is only valid when the ALTACCT flag is being tracked using the
* pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
* PP_ATTR_ALTACCT definitions for more information.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_clear_altacct(unsigned int pai)
{
ppattr_clear_bits(pai, PP_ATTR_ALTACCT);
}
/**
* Get the PP_ATTR_ALTACCT flag on a specific pp_attr_table entry.
*
* @note This is only valid when the ALTACCT flag is being tracked using the
* pp_attr_table. See the descriptions above the PVE_PTEP_ALTACCT and
* PP_ATTR_ALTACCT definitions for more information.
*
* @param pai The physical address index for the entry to test.
*
* @return True if the passed in page uses alternate accounting, false
* otherwise.
*/
static inline bool
ppattr_is_altacct(unsigned int pai)
{
return ppattr_test_bits(pai, PP_ATTR_ALTACCT);
}
/**
* Get the PP_ATTR_INTERNAL flag on a specific pp_attr_table entry.
*
* @note This is only valid when the INTERNAL flag is being tracked using the
* pp_attr_table. See the descriptions above the PVE_PTEP_INTERNAL and
* PP_ATTR_INTERNAL definitions for more information.
*
* @param pai The physical address index for the entry to test.
*
* @return True if the passed in page is accounted for as "internal", false
* otherwise.
*/
static inline bool
ppattr_is_internal(unsigned int pai)
{
return ppattr_test_bits(pai, PP_ATTR_INTERNAL);
}
/**
* The "alternate accounting" (ALTACCT) status for a page is tracked differently
* depending on whether there are one or multiple mappings to a page. This
* function abstracts out the difference between single and multiple mappings to
* a page and provides a single function for determining whether alternate
* accounting is set for a mapping.
*
* @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
* definitions for more information.
*
* @param pai The physical address index for the entry to test.
* @param pvep Pointer to the pv_entry_t object containing that mapping.
* @param idx Index of the chosen PTE pointer inside the PVE.
*
* @return True if the passed in page uses alternate accounting, false
* otherwise.
*/
static inline bool
ppattr_pve_is_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
{
return (pvep == PV_ENTRY_NULL) ? ppattr_is_altacct(pai) : pve_get_altacct(pvep, idx);
}
/**
* The "internal" (INTERNAL) status for a page is tracked differently
* depending on whether there are one or multiple mappings to a page. This
* function abstracts out the difference between single and multiple mappings to
* a page and provides a single function for determining whether "internal"
* is set for a mapping.
*
* @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
* definitions for more information.
*
* @param pai The physical address index for the entry to test.
* @param pvep Pointer to the pv_entry_t object containing that mapping.
* @param idx Index of the chosen PTE pointer inside the PVE.
*
* @return True if the passed in page is "internal", false otherwise.
*/
static inline bool
ppattr_pve_is_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
{
return (pvep == PV_ENTRY_NULL) ? ppattr_is_internal(pai) : pve_get_internal(pvep, idx);
}
/**
* The "alternate accounting" (ALTACCT) status for a page is tracked differently
* depending on whether there are one or multiple mappings to a page. This
* function abstracts out the difference between single and multiple mappings to
* a page and provides a single function for setting the alternate accounting status
* for a mapping.
*
* @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
* definitions for more information.
*
* @param pai The physical address index for the entry to update.
* @param pvep Pointer to the pv_entry_t object containing that mapping.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline void
ppattr_pve_set_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
{
if (pvep == PV_ENTRY_NULL) {
ppattr_set_altacct(pai);
} else {
pve_set_altacct(pvep, idx);
}
}
/**
* The "internal" (INTERNAL) status for a page is tracked differently
* depending on whether there are one or multiple mappings to a page. This
* function abstracts out the difference between single and multiple mappings to
* a page and provides a single function for setting the "internal" status
* for a mapping.
*
* @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
* definitions for more information.
*
* @param pai The physical address index for the entry to update.
* @param pvep Pointer to the pv_entry_t object containing that mapping.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline void
ppattr_pve_set_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
{
if (pvep == PV_ENTRY_NULL) {
ppattr_set_internal(pai);
} else {
pve_set_internal(pvep, idx);
}
}
/**
* The "alternate accounting" (ALTACCT) status for a page is tracked differently
* depending on whether there are one or multiple mappings to a page. This
* function abstracts out the difference between single and multiple mappings to
* a page and provides a single function for clearing the alternate accounting status
* for a mapping.
*
* @note See the descriptions above the PVE_PTEP_ALTACCT and PP_ATTR_ALTACCT
* definitions for more information.
*
* @param pai The physical address index for the entry to update.
* @param pvep Pointer to the pv_entry_t object containing that mapping.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline void
ppattr_pve_clr_altacct(unsigned int pai, pv_entry_t *pvep, unsigned idx)
{
if (pvep == PV_ENTRY_NULL) {
ppattr_clear_altacct(pai);
} else {
pve_clr_altacct(pvep, idx);
}
}
/**
* The "internal" (INTERNAL) status for a page is tracked differently
* depending on whether there are one or multiple mappings to a page. This
* function abstracts out the difference between single and multiple mappings to
* a page and provides a single function for clearing the "internal" status
* for a mapping.
*
* @note See the descriptions above the PVE_PTEP_INTERNAL and PP_ATTR_INTERNAL
* definitions for more information.
*
* @param pai The physical address index for the entry to update.
* @param pvep Pointer to the pv_entry_t object containing that mapping.
* @param idx Index of the chosen PTE pointer inside the PVE.
*/
static inline void
ppattr_pve_clr_internal(unsigned int pai, pv_entry_t *pvep, unsigned idx)
{
if (pvep == PV_ENTRY_NULL) {
ppattr_clear_internal(pai);
} else {
pve_clr_internal(pvep, idx);
}
}
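/*
 * Illustrative sketch (not part of this header): a hypothetical caller tearing
 * down a mapping can use the wrappers above to query and clear the accounting
 * state without caring whether the page currently has a single mapping (flags
 * live in the pp_attr_table) or multiple mappings (flags live in the PVE).
 * The names 'pvep' and 'pve_ptep_idx' below are placeholders for whatever the
 * caller already has in hand; 'pvep' is PV_ENTRY_NULL in the single-mapping case.
 *
 *     const bool was_internal = ppattr_pve_is_internal(pai, pvep, pve_ptep_idx);
 *     const bool was_altacct = ppattr_pve_is_altacct(pai, pvep, pve_ptep_idx);
 *     if (was_altacct) {
 *         ppattr_pve_clr_altacct(pai, pvep, pve_ptep_idx);
 *     }
 *     if (was_internal) {
 *         ppattr_pve_clr_internal(pai, pvep, pve_ptep_idx);
 *     }
 */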
/**
* Set the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_set_reffault(unsigned int pai)
{
ppattr_set_bits(pai, PP_ATTR_REFFAULT);
}
/**
* Clear the PP_ATTR_REFFAULT flag on a specific pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_clear_reffault(unsigned int pai)
{
ppattr_clear_bits(pai, PP_ATTR_REFFAULT);
}
/**
* Return true if the pp_attr_table entry has the PP_ATTR_REFFAULT flag set.
*
* @param pai The physical address index for the entry to test.
*/
static inline bool
ppattr_test_reffault(unsigned int pai)
{
return ppattr_test_bits(pai, PP_ATTR_REFFAULT);
}
/**
* Set the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_set_modfault(unsigned int pai)
{
ppattr_set_bits(pai, PP_ATTR_MODFAULT);
}
/**
* Clear the PP_ATTR_MODFAULT flag on a specific pp_attr_table entry.
*
* @param pai The physical address index for the entry to update.
*/
static inline void
ppattr_clear_modfault(unsigned int pai)
{
ppattr_clear_bits(pai, PP_ATTR_MODFAULT);
}
/**
* Return true if the pp_attr_table entry has the PP_ATTR_MODFAULT flag set.
*
* @param pai The physical address index for the entry to test.
*/
static inline bool
ppattr_test_modfault(unsigned int pai)
{
return ppattr_test_bits(pai, PP_ATTR_MODFAULT);
}
/**
* Retype epoch operations:
*
* The retype epoch facility provides an SMR/RCU-like mechanism by which the SPTM pmap
* can ensure all CPUs have observed updated mapping state before retyping a physical page.
*
* There are certain cases in which the pmap, while issuing an SPTM call that modifies
* mappings, cannot hold locks such as the PVH lock which would prevent the page from
* being concurrently retyped. This is particularly true for batched operations such
* as pmap_remove(), phys_attribute_clear_range(), and pmap_batch_set_cache_attributes().
* In these cases, the pmap may call pmap_retype_epoch_enter() to note that it is
* performing such a sensitive operation on the local CPU. It must then call
* pmap_retype_epoch_exit() upon completion of the sensitive operation.
*
* Then, for any instance in which the pmap needs to retype a page without being
* otherwise guaranteed (e.g. by VM layer locking or the existing page type) that such
* a sensitive operation is not in progress on some other CPU, it must drain these
* sensitive operations from other CPUs. Specifically, it must ensure that any
* sensitive operation which may have observed prior mapping state of the page that
* is to be retyped has completed. This is accomplished by first calling
* pmap_retype_epoch_prepare_drain() to record the initial retype epoch state of
* all CPUs, followed by pmap_retype_epoch_drain() to ensure all remote CPUs are
* either not in an epoch or have advanced beyond the initially recorded epoch.
* These are exposed as two separate functions in order to allow the calling CPU
* to do other work between calling pmap_retype_epoch_prepare_drain() and
* pmap_retype_epoch_drain(), as a best-effort attempt to minimize time wasted
* spinning in pmap_retype_epoch_drain().
*
* When draining the retype epoch, the following assumptions must hold true:
*
* 1) The calling thread must guarantee that prior updates needed to bring the page
* into the correct mapping state for retyping have already been performed and made
* globally visible using the appropriate barriers. In most cases this means that
* all existing mappings of the page must have been removed. For any alterations
* of mapping state, global visibility is conveniently already guaranteed by the
* DSBs that are architecturally required to synchronize PTE updates and the TLBIs
* that follow them.
*
* 2) The calling thread must have some means of ensuring that new mappings which
* would bring the page out of the correct state for retyping cannot be added.
* This is typically done by holding the PVH lock and/or the exclusive pmap lock
* such that pmap_enter() cannot concurrently execute against the page.
*
* 3) The calling thread must not perform any operation which requires preemptibility
* between calling pmap_retype_epoch_prepare_drain() and pmap_retype_epoch_drain().
*/
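/*
 * Illustrative sketch (not part of this header) of how the two halves of the
 * retype epoch protocol described above are typically paired. The elided SPTM
 * calls are placeholders for whatever operation the caller is performing.
 *
 *     // Mapping side: a batched SPTM operation that cannot hold locks which
 *     // would otherwise prevent a concurrent retype.
 *     pmap_retype_epoch_enter();
 *     ...issue the mapping-modifying SPTM call(s)...
 *     pmap_retype_epoch_exit();
 *
 *     // Retyping side: prior mapping updates are assumed to already be
 *     // globally visible (see assumption 1 above).
 *     pmap_retype_epoch_prepare_drain();
 *     ...optionally do other non-preemptible work...
 *     pmap_retype_epoch_drain();
 *     ...now issue the retype...
 *
 * The pmap_prepare_unmapped_page_for_retype()/pmap_retype_unmapped_page()
 * helpers later in this file follow exactly this prepare/drain pattern.
 */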
/**
* Enter the retype epoch on the local CPU to indicate an in-progress SPTM operation
* that may be sensitive to a concurrent retype operation on another CPU.
*
* @note This function increments the thread's preemption disable count and returns
* with preemption disabled.
*
* @note This function issues all required barriers to ensure correct ordering of
* the epoch update relative to ensuing SPTM accesses.
*/
static inline void
pmap_retype_epoch_enter(void)
{
mp_disable_preemption();
pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
assert(!preemption_enabled());
/* Must not already be in a retype epoch on this CPU. */
assert(retype_epoch->local_seq == 0);
retype_epoch->local_seq = ++retype_epoch->next_seq;
/* Unsigned 64-bit per-CPU integer should never overflow on any human timescale. */
assert(retype_epoch->local_seq != 0);
/**
* Issue a store-load barrier to ensure that remote observers of any ensuing
* SPTM accesses will also observe the epoch update.
*/
os_atomic_thread_fence(seq_cst);
}
/**
* Exit the retype epoch on the local CPU to indicate completion of an SPTM operation
* that may be sensitive to a concurrent retype operation on another CPU.
*
* @note This function must be called with preemption disabled and will decrement
* the current thread's preemption disable count.
*/
static inline void
pmap_retype_epoch_exit(void)
{
pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
assert(!preemption_enabled());
assert(retype_epoch->local_seq == retype_epoch->next_seq);
/**
* Clear the sequence using a store-release operation to ensure that prior
* SPTM modifications will be visible to remote observers before the absence
* of an epoch is visible.
*/
os_atomic_store(&retype_epoch->local_seq, 0, release);
mp_enable_preemption();
}
/**
* Prepare the local CPU to perform an epoch drain operation by recording the retype
* epoch state of other CPUs.
*
* @note This function increments the current thread's preemption disable count and
* returns with preemption disabled.
*
* @note This function issues all necessary barriers to ensure that the subsequent
* retype operation is not speculated ahead of the epoch sampling.
*
* @note This function does NOT issue any barriers to ensure that prior updates of
* mapping state are globally visible and have proper store-load ordering with
* respect to the scan performed here. In the cases where this function is
* intended to be used, this ordering should be guaranteed automatically by
* the DSBs used to synchronize prior mapping updates issued by the caller.
* If this function is ever used in a situation where that cannot be guaranteed,
* the caller must issue at least the equivalent of 'dmb ish' (a.k.a. a seq_cst
* thread_fence) before calling this function.
*/
static inline void
pmap_retype_epoch_prepare_drain(void)
{
mp_disable_preemption();
pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
assert(retype_epoch->flags == 0);
unsigned int i = 0;
uint8_t flags = PMAP_RETYPE_EPOCH_PREPARED;
/* Sample each CPU's epoch state. */
percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
const uint64_t remote_epoch =
os_atomic_load(&pmap_pcpu->retype_epoch.local_seq, relaxed);
retype_epoch->remote_seq[i] = remote_epoch;
/**
* If the remote CPU has an active epoch, make a note to ourselves that
* we'll need to drain it.
*/
if (remote_epoch != 0) {
flags |= PMAP_RETYPE_EPOCH_DRAIN_REQUIRED;
}
++i;
}
retype_epoch->flags = flags;
/**
* Issue a load-load barrier to ensure subsequent drain or retype operations will
* not be speculated ahead of the sampling we just did.
*/
os_atomic_thread_fence(acquire);
}
/**
* Ensure that all CPUs have advanced beyond any active epoch that was recorded in the
* most recent call to pmap_retype_epoch_prepare_drain().
*
* @note This function expects to be called with preemption disabled and will decrement
* the current thread's preemption disable count.
*
* @note pmap_retype_epoch_prepare_drain() must have been called on the local CPU
* prior to calling this function. This function will return immediately if
* this prior call did not observe any active epochs on remote CPUs.
*
* @note This function issues all necessary barriers to ensure that the subsequent
* retype operation is not speculated ahead of the epoch sampling.
*/
static inline void
pmap_retype_epoch_drain(void)
{
assert(!preemption_enabled());
pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch;
const uint8_t flags = retype_epoch->flags;
assert(flags & PMAP_RETYPE_EPOCH_PREPARED);
retype_epoch->flags = 0;
if (!(flags & PMAP_RETYPE_EPOCH_DRAIN_REQUIRED)) {
mp_enable_preemption();
return;
}
unsigned int i = 0;
percpu_foreach(pmap_pcpu, pmap_sptm_percpu) {
if (retype_epoch->remote_seq[i] != 0) {
assert((pmap_pcpu->retype_epoch.local_seq == 0) ||
(pmap_pcpu->retype_epoch.local_seq >= retype_epoch->remote_seq[i]));
/**
* If the remote CPU was in an epoch, WFE-spin until it either exits the epoch
* or advances to a new epoch.
*/
while ((os_atomic_load_exclusive(&pmap_pcpu->retype_epoch.local_seq, relaxed) ==
retype_epoch->remote_seq[i])) {
__builtin_arm_wfe();
}
/* Clear the monitor if we exclusive-loaded a value that didn't require WFE. */
os_atomic_clear_exclusive();
}
++i;
}
mp_enable_preemption();
/**
* Issue a load-load barrier to ensure subsequent retype operations will
* not be speculated ahead of the sampling we just did.
*/
os_atomic_thread_fence(acquire);
}
/**
* Helper to determine whether a frame type is one that requires automatic
* retyping (by the pmap layer) back to XNU_DEFAULT when all mappings of the
* page are gone.
*
* @param frame_type The SPTM frame type to check.
*
* @return true if the type requires auto-retyping, false otherwise.
*/
static inline bool
pmap_type_requires_retype_on_unmap(sptm_frame_type_t frame_type)
{
return (frame_type == XNU_USER_EXEC) || (frame_type == XNU_USER_DEBUG) ||
(frame_type == XNU_USER_JIT) || (frame_type == XNU_ROZONE) ||
(frame_type == XNU_KERNEL_RESTRICTED);
}
/**
* If necessary, prepare a physical page for being retyped back to XNU_DEFAULT
* after the last CPU mapping has been removed. This is only needed for pages of
* certain special types such as the various executable types and the kernel RO
* zone type.
*
* @note The PVH lock for the physical page that has just been unmapped must
* already be held.
*
* @param pa The physical address of the recently-unmapped page.
*
* @return true if the page will need to be retyped, false otherwise.
*/
static inline bool
pmap_prepare_unmapped_page_for_retype(pmap_paddr_t pa)
{
pvh_assert_locked(pa_index(pa));
const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) {
pmap_retype_epoch_prepare_drain();
return true;
}
return false;
}
/**
* If necessary, retype a physical page back to XNU_DEFAULT after the last CPU
* mapping has been removed. This is only needed for pages of certain special
* types such as the various executable types, the kernel RO zone type,
* and XNU_KERNEL_RESTRICTED.
*
* @note The PVH lock for the physical page that has just been unmapped must
* already be held.
*
* @param pa The physical address of the recently-unmapped page.
*
* @return true if the page needed to be retyped, false otherwise.
*/
static inline bool
pmap_retype_unmapped_page(pmap_paddr_t pa)
{
pvh_assert_locked(pa_index(pa));
const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) {
sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
pmap_retype_epoch_drain();
sptm_retype(pa & ~PAGE_MASK, frame_type, XNU_DEFAULT, retype_params);
return true;
}
return false;
}
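/*
 * Illustrative sketch (hypothetical call site, not part of this header): the
 * two helpers above are meant to bracket the SPTM call that removes the final
 * mapping of a page, with the PVH lock held throughout. 'pa' is a placeholder
 * for the physical address the caller is operating on.
 *
 *     const bool needs_retype = pmap_prepare_unmapped_page_for_retype(pa);
 *     ...issue the SPTM call that removes the last mapping of 'pa'...
 *     if (needs_retype) {
 *         pmap_retype_unmapped_page(pa);
 *     }
 */
/**
 * Return whether the pmap may treat the current context as preemptible. Early
 * boot and hibernation are exempted from the preemption_enabled() check.
 */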
static inline boolean_t
pmap_is_preemptible(void)
{
return preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT) || PMAP_IS_HIBERNATING();
}
/**
* This helper function ensures that potentially-long-running batched operations are
* called in preemptible context before entering the SPTM, so that the SPTM call may
* periodically exit to allow pending urgent ASTs to be taken.
*/
static inline void
pmap_verify_preemptible(void)
{
assert(pmap_is_preemptible());
}
/**
* The minimum number of pages to keep in the PPL page free list.
*
* We define our target as 8 pages: enough for 2 page table pages, a PTD page,
* and a PV page; in essence, twice as many pages as may be necessary to satisfy
* a single pmap_enter request.
*/
#define PMAP_MIN_FREE_PPL_PAGES 8
/**
* Flags passed to various page allocation functions, usually accessed through
* the pmap_page_alloc() API. Each function that accepts these flags as part of
* its options field describes them in its function header.
*/
/* Can be used when no allocation flags are wanted. */
#define PMAP_PAGE_ALLOCATE_NONE 0x0
/**
* Instructs the allocation function to return immediately if no pages are
* currently available. Without this flag, the function will spin and wait for a
* page to become available. This flag can be required in some circumstances
* (for instance, when allocating pages from within the PPL).
*/
#define PMAP_PAGE_ALLOCATE_NOWAIT 0x1
/**
* Instructs an allocation function to fall back to reclaiming a userspace page
* table if it failed to allocate a page from the free lists. This can be useful
* when allocating from within the PPL because refilling the free lists requires
* exiting and re-entering the PPL (which incurs extra latency).
*
* This is a quick way of allocating a page at the expense of having to
* reallocate the table the next time one of its mappings is accessed.
*/
#define PMAP_PAGE_RECLAIM_NOWAIT 0x2
/**
* Instructs an allocation function to avoid zero-filling the newly-allocated
* page. This should be used only if you know the page will be fully initialized
* by some other means on the relevant allocation path.
*/
#define PMAP_PAGE_NOZEROFILL 0x4
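/*
 * Illustrative sketch (hypothetical call site, not part of this header): a
 * caller that cannot afford to spin waiting for a free page would pass
 * PMAP_PAGE_ALLOCATE_NOWAIT and handle the failure itself.
 *
 *     pmap_paddr_t new_page = 0;
 *     if (pmap_page_alloc(&new_page, PMAP_PAGE_ALLOCATE_NOWAIT) != KERN_SUCCESS) {
 *         ...no page was allocated; bail out or retry later...
 *     }
 */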
/**
* Global variables exported to the rest of the internal pmap implementation.
*/
extern pmap_paddr_t sptm_cpu_iommu_scratch_start;
extern pmap_paddr_t sptm_cpu_iommu_scratch_end;
extern unsigned int inuse_pmap_pages_count;
extern vm_object_t pmap_object;
extern uint32_t pv_alloc_initial_target;
extern uint32_t pv_kern_alloc_initial_target;
/**
* Functions exported to the rest of the internal pmap implementation.
*/
extern void pmap_data_bootstrap(void);
extern void pmap_enqueue_pages(vm_page_t);
extern kern_return_t pmap_page_alloc(pmap_paddr_t *, unsigned);
extern void pmap_page_free(pmap_paddr_t);
/**
* The modes in which a pmap lock can be acquired. Note that shared access
* doesn't necessarily mean "read-only". As long as data is atomically updated
* correctly (to account for multi-CPU accesses), data can still be written with
* a shared lock held. Care just needs to be taken not to introduce any race
* conditions when there are multiple writers.
*
* This is here in pmap_data.h because it's a required parameter for pv_alloc()
* and pmap_enter_pv(). This header is always included in pmap_internal.h before
* the rest of the pmap locking code is defined, so there shouldn't be any issues
* with missing types.
*/
OS_ENUM(pmap_lock_mode, uint8_t,
PMAP_LOCK_SHARED,
PMAP_LOCK_EXCLUSIVE,
PMAP_LOCK_HELD);
/**
* Possible return values for pv_alloc(). See the pv_alloc() function header for
* a description of each of these values.
*/
typedef enum {
PV_ALLOC_SUCCESS,
PV_ALLOC_RETRY,
PV_ALLOC_FAIL
} pv_alloc_return_t;
extern pv_alloc_return_t pv_alloc(
pmap_t, pmap_lock_mode_t, unsigned int, pv_entry_t **, locked_pvh_t *, volatile uint16_t *);
extern void pv_free(pv_entry_t *);
extern void pv_list_free(pv_entry_t *, pv_entry_t *, unsigned int);
extern void pmap_compute_pv_targets(void);
extern pv_alloc_return_t pmap_enter_pv(
pmap_t, pt_entry_t *, unsigned int, pmap_lock_mode_t, locked_pvh_t *, pv_entry_t **, int *);
typedef enum {
PV_REMOVE_SUCCESS, /* found a mapping */
PV_REMOVE_FAIL /* no mapping found */
} pv_remove_return_t;
extern pv_remove_return_t pmap_remove_pv(pmap_t, pt_entry_t *, locked_pvh_t *, bool *, bool *);
extern void ptd_bootstrap(pt_desc_t *, unsigned int);
extern pt_desc_t *ptd_alloc_unlinked(unsigned int);
extern pt_desc_t *ptd_alloc(pmap_t, unsigned int);
extern void ptd_deallocate(pt_desc_t *);
extern void ptd_info_init(
pt_desc_t *, pmap_t, vm_map_address_t, unsigned int, pt_entry_t *);
extern kern_return_t pmap_ledger_credit(pmap_t, int, ledger_amount_t);
extern kern_return_t pmap_ledger_debit(pmap_t, int, ledger_amount_t);
extern void validate_pmap_internal(const volatile struct pmap *, const char *);
extern void validate_pmap_mutable_internal(const volatile struct pmap *, const char *);
/**
* Macro function wrappers around pmap validation so that the calling function
* can be printed in the panic strings for easier validation failure debugging.
*/
#define validate_pmap(x) validate_pmap_internal(x, __func__)
#define validate_pmap_mutable(x) validate_pmap_mutable_internal(x, __func__)
/**
* This structure describes an SPTM-owned I/O range.
*
* @note This doesn't necessarily have to represent "I/O" only; it can also
* represent non-kernel-managed DRAM (e.g., iBoot carveouts). Any physical
* address region that isn't considered "kernel-managed" is fair game.
*
* @note The layout of this structure needs to map 1-to-1 with the pmap-io-range
* device tree nodes. Astris (through the LowGlobals) also depends on the
* consistency of this structure.
*
* @note These definitions are copied to the SPTM and need to be kept in sync.
*/
typedef struct pmap_io_range {
/* Physical address of the SPTM-owned I/O range. */
uint64_t addr;
/* Length (in bytes) of the SPTM-owned I/O range. */
uint64_t len;
/* Strong DSB required for pages in this range. */
#define PMAP_IO_RANGE_STRONG_SYNC (1UL << 31)
/* Corresponds to memory carved out by the bootloader. */
#define PMAP_IO_RANGE_CARVEOUT (1UL << 30)
/* Pages in this range need to be included in the hibernation image. */
#define PMAP_IO_RANGE_NEEDS_HIBERNATING (1UL << 29)
/* Marks the range as 'owned' by a given subsystem. */
#define PMAP_IO_RANGE_OWNED (1UL << 28)
/**
* Lower 16 bits treated as pp_attr_t, upper 16 bits contain additional
* mapping flags (defined above).
*/
uint32_t wimg;
/* 4 Character Code (4CC) describing what this range is. */
uint32_t signature;
} pmap_io_range_t;
/* Reminder: be sure to change all relevant device trees if you change the layout of pmap_io_range_t */
_Static_assert(sizeof(pmap_io_range_t) == 24, "unexpected size for pmap_io_range_t");
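/*
 * Illustrative sketch (made-up values, not part of this header): one
 * pmap_io_range_t entry expressed as a C initializer. In practice these
 * records are populated from the pmap-io-range device tree nodes rather
 * than from static initializers.
 *
 *     const pmap_io_range_t example_range = {
 *         .addr      = 0x200000000ULL,            // made-up base address
 *         .len       = 0x4000,                    // made-up length (one 16K page)
 *         .wimg      = PMAP_IO_RANGE_STRONG_SYNC, // upper-16-bit flag; lower 16 bits hold pp_attr_t bits
 *         .signature = 'IODV',                    // made-up 4CC
 *     };
 */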
extern pmap_io_range_t* pmap_find_io_attr(pmap_paddr_t);
/**
* This structure describes a sub-page-size I/O region that is owned by the SPTM
* but that the kernel can write to.
*
* @note I/O filter software will use a collection of such data structures to determine access
* permissions to a page owned by SPTM.
*
* @note The {signature, offset} key is used to index a collection of such data structures to
* optimize for space in the case where one page layout is repeated for many devices, such
* as the memory controller channels.
*/
typedef struct pmap_io_filter_entry {
/* 4 Character Code (4CC) describing what this range (page) is. */
uint32_t signature;
/* Offset within the page. It has to be within [0, PAGE_SIZE). */
uint16_t offset;
/* Length (in bytes) of the range; (offset + length) has to be within [0, PAGE_SIZE). */
uint16_t length;
} pmap_io_filter_entry_t;
_Static_assert(sizeof(pmap_io_filter_entry_t) == 8, "unexpected size for pmap_io_filter_entry_t");
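/*
 * Illustrative sketch (made-up values, not part of this header): an entry
 * describing an 8-byte kernel-writable window at offset 0x100 within any
 * SPTM-owned page carrying the (made-up) 'IODV' signature. The {signature,
 * offset} pair is the lookup key described above.
 *
 *     const pmap_io_filter_entry_t example_filter = {
 *         .signature = 'IODV', // presumably matching the covering pmap_io_range_t's 4CC
 *         .offset    = 0x100,  // byte offset within the page
 *         .length    = 8,      // length of the writable window, in bytes
 *     };
 */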
extern void pmap_cpu_data_init_internal(unsigned int);