/*
* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* @OSF_COPYRIGHT@
*/
/*
* Mach Operating System
* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
* All Rights Reserved.
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie Mellon
* the rights to redistribute these changes.
*/
/*
*/
/*
* File: vm/vm_pageout.c
* Author: Avadis Tevanian, Jr., Michael Wayne Young
* Date: 1985
*
* The proverbial page-out daemon.
*/
#include "mach/kern_return.h"
#include <stdint.h>
#include <ptrauth.h>
#include <debug.h>
#include <mach/mach_types.h>
#include <mach/memory_object.h>
#include <mach/mach_host_server.h>
#include <mach/upl.h>
#include <mach/vm_map.h>
#include <mach/vm_param.h>
#include <mach/vm_statistics.h>
#include <mach/sdt.h>
#include <kern/kern_types.h>
#include <kern/counter.h>
#include <kern/host_statistics.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/sched.h>
#include <kern/thread.h>
#include <kern/kalloc.h>
#include <kern/zalloc_internal.h>
#include <kern/policy_internal.h>
#include <kern/thread_group.h>
#include <os/log.h>
#include <sys/kdebug_triage.h>
#include <machine/vm_tuning.h>
#include <machine/commpage.h>
#include <vm/pmap.h>
#include <vm/vm_compressor_pager_internal.h>
#include <vm/vm_fault_internal.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_object_internal.h>
#include <vm/vm_page_internal.h>
#include <vm/vm_pageout_internal.h>
#include <vm/vm_protos_internal.h> /* must be last */
#include <vm/memory_object.h>
#include <vm/vm_purgeable_internal.h>
#include <vm/vm_shared_region.h>
#include <vm/vm_compressor_internal.h>
#include <vm/vm_kern_xnu.h>
#include <vm/vm_iokit.h>
#include <vm/vm_ubc.h>
#include <san/kasan.h>
#if CONFIG_PHANTOM_CACHE
#include <vm/vm_phantom_cache_internal.h>
#endif
#if UPL_DEBUG
#include <libkern/OSDebug.h>
#endif
extern int cs_debug;
#if CONFIG_MBUF_MCACHE
extern void mbuf_drain(boolean_t);
#endif /* CONFIG_MBUF_MCACHE */
#if VM_PRESSURE_EVENTS
#if CONFIG_JETSAM
extern unsigned int memorystatus_available_pages;
extern unsigned int memorystatus_available_pages_pressure;
extern unsigned int memorystatus_available_pages_critical;
#else /* CONFIG_JETSAM */
extern uint64_t memorystatus_available_pages;
extern uint64_t memorystatus_available_pages_pressure;
extern uint64_t memorystatus_available_pages_critical;
#endif /* CONFIG_JETSAM */
#if CONFIG_FREEZE
extern unsigned int memorystatus_frozen_count;
extern unsigned int memorystatus_suspended_count;
#endif /* CONFIG_FREEZE */
extern vm_pressure_level_t memorystatus_vm_pressure_level;
extern lck_mtx_t memorystatus_jetsam_broadcast_lock;
extern uint32_t memorystatus_jetsam_fg_band_waiters;
extern uint32_t memorystatus_jetsam_bg_band_waiters;
void vm_pressure_response(void);
extern void consider_vm_pressure_events(void);
#define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
#endif /* VM_PRESSURE_EVENTS */
SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
#if CONFIG_VPS_DYNAMIC_PRIO
TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
#else
const bool vps_dynamic_priority_enabled = false;
#endif
boolean_t vps_yield_for_pgqlockwaiters = TRUE;
#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
#if !XNU_TARGET_OS_OSX
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
#endif /* !XNU_TARGET_OS_OSX */
#endif
#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
#define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
#endif
#ifndef VM_PAGE_LAUNDRY_MAX
#define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
#endif /* VM_PAGE_LAUNDRY_MAX */
#ifndef VM_PAGEOUT_BURST_WAIT
#define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */
#endif /* VM_PAGEOUT_BURST_WAIT */
#ifndef VM_PAGEOUT_EMPTY_WAIT
#define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */
#endif /* VM_PAGEOUT_EMPTY_WAIT */
#ifndef VM_PAGEOUT_DEADLOCK_WAIT
#define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */
#endif /* VM_PAGEOUT_DEADLOCK_WAIT */
#ifndef VM_PAGEOUT_IDLE_WAIT
#define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
#endif /* VM_PAGEOUT_IDLE_WAIT */
#ifndef VM_PAGEOUT_SWAP_WAIT
#define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */
#endif /* VM_PAGEOUT_SWAP_WAIT */
/*
* vm_page_max_speculative_age_q should be less than or equal to
* VM_PAGE_RESERVED_SPECULATIVE_AGE_Q, which is the number of allocated
* vm_page_queue_speculative entries.
*/
TUNABLE_DEV_WRITEABLE(unsigned int, vm_page_max_speculative_age_q, "vm_page_max_speculative_age_q", VM_PAGE_DEFAULT_MAX_SPECULATIVE_AGE_Q);
#ifndef VM_PAGE_SPECULATIVE_TARGET
#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
#endif /* VM_PAGE_SPECULATIVE_TARGET */
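/*
* Worked example (illustrative values): with
* vm_page_speculative_percentage = 5 the divisor is 100 / 5 = 20, so
* the speculative target is (total) / 20, i.e. 5% of 'total'.  Note
* the integer division: a percentage that does not divide 100 evenly
* is effectively rounded (e.g. 3 yields (total) / 33).
*/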
/*
* To obtain a reasonable LRU approximation, the inactive queue
* needs to be large enough to give pages on it a chance to be
* referenced a second time. This macro defines the fraction
* of active+inactive pages that should be inactive.
* The pageout daemon uses it to update vm_page_inactive_target.
*
* If vm_page_free_count falls below vm_page_free_target and
* vm_page_inactive_count is below vm_page_inactive_target,
* then the pageout daemon starts running.
*/
#ifndef VM_PAGE_INACTIVE_TARGET
#define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
#endif /* VM_PAGE_INACTIVE_TARGET */
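/*
* Example: with 1,000,000 available (active + inactive) pages, the
* inactive target works out to 500,000 pages.
*/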
/*
* Once the pageout daemon starts running, it keeps going
* until vm_page_free_count meets or exceeds vm_page_free_target.
*/
#ifndef VM_PAGE_FREE_TARGET
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_TARGET */
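/*
* Example arithmetic: for an argument of 100,000 pages this yields
* 15 + 100,000 / 100 = 1,015 pages on non-macOS targets and
* 15 + 100,000 / 80 = 1,265 pages on macOS.
*/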
/*
* The pageout daemon always starts running once vm_page_free_count
* falls below vm_page_free_min.
*/
#ifndef VM_PAGE_FREE_MIN
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_FREE_MIN */
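/*
* Example arithmetic: for an argument of 100,000 pages this yields
* 10 + 100,000 / 200 = 510 pages on non-macOS targets and
* 10 + 100,000 / 100 = 1,010 pages on macOS.
*/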
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_FREE_RESERVED_LIMIT 100
#define VM_PAGE_FREE_MIN_LIMIT 1500
#define VM_PAGE_FREE_TARGET_LIMIT 2000
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_FREE_RESERVED_LIMIT 1700
#define VM_PAGE_FREE_MIN_LIMIT 3500
#define VM_PAGE_FREE_TARGET_LIMIT 4000
#endif /* !XNU_TARGET_OS_OSX */
/*
* When vm_page_free_count falls below vm_page_free_reserved,
* only vm-privileged threads can allocate pages. vm-privilege
* allows the pageout daemon and default pager (and any other
* associated threads needed for default pageout) to continue
* operation by dipping into the reserved pool of pages.
*/
#ifndef VM_PAGE_FREE_RESERVED
#define VM_PAGE_FREE_RESERVED(n) \
((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
#endif /* VM_PAGE_FREE_RESERVED */
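/*
* With VM_PAGE_LAUNDRY_MAX at its default of 128, this is
* 6 * 128 + (n) = 768 + (n) pages.
*/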
/*
* When we dequeue pages from the inactive list, they are
* reactivated (i.e., put back on the active queue) if referenced.
* However, it is possible to starve the free list if other
* processors are referencing pages faster than we can turn off
* the referenced bit. So we limit the number of reactivations
* we will make per call of vm_pageout_scan().
*/
#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
#ifndef VM_PAGE_REACTIVATE_LIMIT
#if !XNU_TARGET_OS_OSX
#define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
#endif /* !XNU_TARGET_OS_OSX */
#endif /* VM_PAGE_REACTIVATE_LIMIT */
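/*
* Example: with 1,000,000 available pages, the limit is
* VM_PAGE_INACTIVE_TARGET(1,000,000) / 2 = 250,000 reactivations on
* non-macOS targets and MAX(1,000,000 / 20, 20,000) = 50,000 on macOS.
*/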
#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
int vm_pageout_protect_realtime = true;
extern boolean_t hibernate_cleaning_in_progress;
struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
struct pgo_iothread_state pgo_iothread_external_state;
#if VM_PRESSURE_EVENTS
void vm_pressure_thread(void);
boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
#endif
static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
extern void vm_pageout_continue(void);
extern void vm_pageout_scan(void);
boolean_t vm_pageout_running = FALSE;
uint32_t vm_page_upl_tainted = 0;
uint32_t vm_page_iopl_tainted = 0;
#if XNU_TARGET_OS_OSX
static boolean_t vm_pageout_waiter = FALSE;
#endif /* XNU_TARGET_OS_OSX */
#if DEVELOPMENT || DEBUG
struct vm_pageout_debug vm_pageout_debug;
#endif
struct vm_pageout_vminfo vm_pageout_vminfo;
struct vm_pageout_state vm_pageout_state;
struct vm_config vm_config;
struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
#if DEVELOPMENT || DEBUG
struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
#endif /* DEVELOPMENT || DEBUG */
int vm_upl_wait_for_pages = 0;
vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
int vm_debug_events = 0;
LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
#if CONFIG_MEMORYSTATUS
extern void memorystatus_kill_on_vps_starvation(void);
uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
#endif
#if __AMP__
/*
* Bind compressor threads to e-cores unless there are multiple non-e clusters
*/
#if (MAX_CPU_CLUSTERS > 2)
#define VM_COMPRESSOR_EBOUND_DEFAULT false
#elif defined(XNU_TARGET_OS_XR)
#define VM_COMPRESSOR_EBOUND_DEFAULT false
#else
#define VM_COMPRESSOR_EBOUND_DEFAULT true
#endif
TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
int vm_pgo_pbound = 0;
extern void thread_bind_cluster_type(thread_t, char, bool);
#endif /* __AMP__ */
/*
* Routine: vm_pageout_object_terminate
* Purpose:
* Destroy the pageout_object, and perform all of the
* required cleanup actions.
*
* In/Out conditions:
* The object must be locked, and will be returned locked.
*/
void
vm_pageout_object_terminate(
vm_object_t object)
{
vm_object_t shadow_object;
/*
* Deal with the deallocation (last reference) of a pageout object
* (used for cleaning-in-place) by dropping the paging references/
* freeing pages in the original object.
*/
assert(object->pageout);
shadow_object = object->shadow;
vm_object_lock(shadow_object);
while (!vm_page_queue_empty(&object->memq)) {
vm_page_t p, m;
vm_object_offset_t offset;
p = (vm_page_t) vm_page_queue_first(&object->memq);
assert(p->vmp_private);
assert(p->vmp_free_when_done);
p->vmp_free_when_done = FALSE;
assert(!p->vmp_cleaning);
assert(!p->vmp_laundry);
offset = p->vmp_offset;
VM_PAGE_FREE(p);
p = VM_PAGE_NULL;
m = vm_page_lookup(shadow_object,
offset + object->vo_shadow_offset);
if (m == VM_PAGE_NULL) {
continue;
}
assert((m->vmp_dirty) || (m->vmp_precious) ||
(m->vmp_busy && m->vmp_cleaning));
/*
* Handle the trusted pager throttle.
* Also decrement the burst throttle (if external).
*/
vm_page_lock_queues();
if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
vm_pageout_throttle_up(m);
}
/*
* Handle the "target" page(s). These pages are to be freed if
* successfully cleaned. Target pages are always busy, and are
* wired exactly once. The initial target pages are not mapped
* (so cannot be referenced or modified), but converted target
* pages may have been modified between the selection as an
* adjacent page and conversion to a target.
*/
if (m->vmp_free_when_done) {
assert(m->vmp_busy);
assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
assert(m->vmp_wire_count == 1);
m->vmp_cleaning = FALSE;
m->vmp_free_when_done = FALSE;
/*
* Revoke all access to the page. Since the object is
* locked, and the page is busy, this prevents the page
* from being dirtied after the pmap_disconnect() call
* returns.
*
* Since the page is left "dirty" but "not modified", we
* can detect whether the page was redirtied during
* pageout by checking the modify state.
*/
if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
SET_PAGE_DIRTY(m, FALSE);
} else {
m->vmp_dirty = FALSE;
}
if (m->vmp_dirty) {
vm_page_unwire(m, TRUE); /* reactivates */
counter_inc(&vm_statistics_reactivations);
vm_page_wakeup_done(object, m);
} else {
vm_page_free(m); /* clears busy, etc. */
}
vm_page_unlock_queues();
continue;
}
/*
* Handle the "adjacent" pages. These pages were cleaned in
* place, and should be left alone.
* If the page was referenced while being cleaned, make it
* active again; otherwise deactivate it.
*/
if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
if (m->vmp_reference) {
vm_page_activate(m);
} else {
vm_page_deactivate(m);
}
}
if (m->vmp_overwriting) {
/*
* the (COPY_OUT_FROM == FALSE) request_page_list case
*/
if (m->vmp_busy) {
/*
* We do not re-set m->vmp_dirty !
* The page was busy so no extraneous activity
* could have occurred. COPY_INTO is a read into the
* new pages. CLEAN_IN_PLACE does actually write
* out the pages but handling outside of this code
* will take care of resetting dirty. We clear the
* modify however for the Programmed I/O case.
*/
pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
m->vmp_busy = FALSE;
m->vmp_absent = FALSE;
} else {
/*
* alternate (COPY_OUT_FROM == FALSE) request_page_list case
* Occurs when the original page was wired
* at the time of the list request
*/
assert(VM_PAGE_WIRED(m));
vm_page_unwire(m, TRUE); /* reactivates */
}
m->vmp_overwriting = FALSE;
} else {
m->vmp_dirty = FALSE;
}
m->vmp_cleaning = FALSE;
/*
* Wake up any thread waiting for the page to finish cleaning.
*/
vm_page_wakeup(object, m);
vm_page_unlock_queues();
}
/*
* Account for the paging reference taken in vm_paging_object_allocate.
*/
vm_object_activity_end(shadow_object);
vm_object_unlock(shadow_object);
assert(object->ref_count == 0);
assert(object->paging_in_progress == 0);
assert(object->activity_in_progress == 0);
assert(object->resident_page_count == 0);
return;
}
/*
* Routine: vm_pageclean_setup
*
* Purpose: set up a page to be cleaned (made non-dirty), but not
* necessarily flushed from the VM page cache.
* This is accomplished by cleaning in place.
*
* The page must not be busy, and new_object
* must be locked.
*
*/
static void
vm_pageclean_setup(
vm_page_t m,
vm_page_t new_m,
vm_object_t new_object,
vm_object_offset_t new_offset)
{
assert(!m->vmp_busy);
#if 0
assert(!m->vmp_cleaning);
#endif
pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
/*
* Mark original page as cleaning in place.
*/
m->vmp_cleaning = TRUE;
SET_PAGE_DIRTY(m, FALSE);
m->vmp_precious = FALSE;
/*
* Convert the fictitious page to a private shadow of
* the real page.
*/
assert(new_m->vmp_fictitious);
assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
new_m->vmp_fictitious = FALSE;
new_m->vmp_private = TRUE;
new_m->vmp_free_when_done = TRUE;
VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
vm_page_lockspin_queues();
vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
vm_page_unlock_queues();
vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
assert(!new_m->vmp_wanted);
new_m->vmp_busy = FALSE;
}
/*
* Routine: vm_pageout_initialize_page
* Purpose:
* Causes the specified page to be initialized in
* the appropriate memory object. This routine is used to push
* pages into a copy-object when they are modified in the
* permanent object.
*
* The page is moved to a temporary object and paged out.
*
* In/out conditions:
* The page in question must not be on any pageout queues.
* The object to which it belongs must be locked.
* The page must be busy, but not hold a paging reference.
*
* Implementation:
* Move this page to a completely new object.
*/
void
vm_pageout_initialize_page(
vm_page_t m)
{
vm_object_t object;
vm_object_offset_t paging_offset;
memory_object_t pager;
assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
object = VM_PAGE_OBJECT(m);
assert(m->vmp_busy);
assert(object->internal);
/*
* Verify that we really want to clean this page
*/
assert(!m->vmp_absent);
assert(m->vmp_dirty);
/*
* Create a paging reference to let us play with the object.
*/
paging_offset = m->vmp_offset + object->paging_offset;
if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
panic("reservation without pageout?"); /* alan */
VM_PAGE_FREE(m);
vm_object_unlock(object);
return;
}
/*
* If there's no pager, then we can't clean the page. This should
* never happen: this is a copy object and therefore not an external
* object, so its pager should always be present.
*/
pager = object->pager;
if (pager == MEMORY_OBJECT_NULL) {
panic("missing pager for copy object");
VM_PAGE_FREE(m);
return;
}
/*
* set up the page for a future call to vm_fault_list_request
*/
pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
SET_PAGE_DIRTY(m, FALSE);
/*
* keep the object from collapsing or terminating
*/
vm_object_paging_begin(object);
vm_object_unlock(object);
/*
* Write the data to its pager.
* Note that the data is passed by naming the new object,
* not a virtual address; the pager interface has been
* manipulated to use the "internal memory" data type.
* [The object reference from its allocation is donated
* to the eventual recipient.]
*/
memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
vm_object_lock(object);
vm_object_paging_end(object);
}
/*
* vm_pageout_cluster:
*
* Given a page, queue it to the appropriate I/O thread,
* which will page it out and attempt to clean adjacent pages
* in the same operation.
*
* The object and queues must be locked. We will take a
* paging reference to prevent deallocation or collapse when we
* release the object lock back at the call site. The I/O thread
* is responsible for consuming this reference.
*
* The page must not be on any pageout queue.
*/
#if DEVELOPMENT || DEBUG
vmct_stats_t vmct_stats;
int32_t vmct_active = 0;
uint64_t vm_compressor_epoch_start = 0;
uint64_t vm_compressor_epoch_stop = 0;
typedef enum vmct_state_t {
VMCT_IDLE,
VMCT_AWAKENED,
VMCT_ACTIVE,
} vmct_state_t;
vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
#endif
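/*
* vm_pageout_cluster_to_queue:
*
* Queue the page on the given pageout queue and wake the matching
* I/O thread: the first internal compressor thread for internal
* objects, the external pager thread otherwise.  The caller must
* hold the page queues lock and the object lock (exclusive).
*/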
static void
vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
{
vm_object_t object = VM_PAGE_OBJECT(m);
VM_PAGE_CHECK(m);
LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
vm_object_lock_assert_exclusive(object);
/*
* Make sure it's OK to page this out.
*/
assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
assert(!m->vmp_cleaning && !m->vmp_laundry);
assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
/*
* protect the object from collapse or termination
*/
vm_object_activity_begin(object);
/*
* pgo_laundry count is tied to the laundry bit
*/
m->vmp_laundry = TRUE;
q->pgo_laundry++;
m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
if (object->internal == TRUE) {
assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
m->vmp_busy = TRUE;
#if DEVELOPMENT || DEBUG
/*
* The benchmark queue will be woken up independently by the benchmark
* itself.
*/
if (q != &vm_pageout_queue_benchmark) {
#else /* DEVELOPMENT || DEBUG */
if (true) {
#endif /* DEVELOPMENT || DEBUG */
/*
* Wake up the first compressor thread. It will wake subsequent
* threads if necessary.
*/
sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
pgo_iothread_internal_state[0].pgo_iothread);
}
} else {
sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
}
VM_PAGE_CHECK(m);
}
void
vm_pageout_cluster(vm_page_t m)
{
struct vm_pageout_queue *q;
vm_object_t object = VM_PAGE_OBJECT(m);
if (object->internal) {
q = &vm_pageout_queue_internal;
} else {
q = &vm_pageout_queue_external;
}
vm_pageout_cluster_to_queue(m, q);
}
/*
* A page is back from laundry or we are stealing it back from
* the laundering state. See if there are some pages waiting to
* go to laundry and if we can let some of them go now.
*
* Object and page queues must be locked.
*/
void
vm_pageout_throttle_up(
vm_page_t m)
{
struct vm_pageout_queue *q;
vm_object_t m_object;
m_object = VM_PAGE_OBJECT(m);
assert(m_object != VM_OBJECT_NULL);
assert(!is_kernel_object(m_object));
LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
vm_object_lock_assert_exclusive(m_object);
if (m_object->internal == TRUE) {
q = &vm_pageout_queue_internal;
} else {
q = &vm_pageout_queue_external;
}
if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
m->vmp_q_state = VM_PAGE_NOT_ON_Q;
VM_PAGE_ZERO_PAGEQ_ENTRY(m);
vm_object_activity_end(m_object);
VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
}
if (m->vmp_laundry == TRUE) {
m->vmp_laundry = FALSE;
q->pgo_laundry--;
if (q->pgo_throttled == TRUE) {
q->pgo_throttled = FALSE;
thread_wakeup((event_t) &q->pgo_laundry);
}
if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
q->pgo_draining = FALSE;
thread_wakeup((event_t) (&q->pgo_laundry + 1));
}
VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
}
}
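/*
* Batched variant of vm_pageout_throttle_up(): credit 'batch_cnt'
* pages back against the queue's laundry count and wake any
* throttled or draining waiters.  Page queues lock must be held.
*/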
static void
vm_pageout_throttle_up_batch(
struct vm_pageout_queue *q,
int batch_cnt)
{
LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
q->pgo_laundry -= batch_cnt;
if (q->pgo_throttled == TRUE) {
q->pgo_throttled = FALSE;
thread_wakeup((event_t) &q->pgo_laundry);
}
if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
q->pgo_draining = FALSE;
thread_wakeup((event_t) (&q->pgo_laundry + 1));
}
}
/*
* VM memory pressure monitoring.
*
* vm_pageout_scan() keeps track of the number of pages it considers and
* reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
*
* record_memory_pressure() (reached via update_vm_info(), called every second
* from compute_averages()) moves "vm_pageout_stat_now" forward, to start
* accumulating the number of reclaimed pages in a new vm_pageout_stat[] bucket.
*
* mach_vm_pressure_monitor() collects past statistics about memory pressure.
* The caller provides the number of seconds ("nsecs") worth of statistics
* it wants, up to 30 seconds.
* It computes the number of pages reclaimed in the past "nsecs" seconds and
* also returns the number of pages the system still needs to reclaim at this
* moment in time.
*/
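/*
* Statistics are kept in eighth-of-a-second buckets (see
* VM_PAGEOUT_STAT_SIZE below), so a request for "nsecs" seconds of
* history is served from 8 * nsecs buckets.  A minimal usage sketch
* (not taken from this file):
*
*	unsigned int reclaimed, wanted;
*	mach_vm_pressure_monitor(FALSE, 5, &reclaimed, &wanted);
*
* reports roughly how many pages were reclaimed over the last
* 5 seconds and how many more the system currently wants to reclaim.
*/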
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE ((1 * 8) + 1)
#endif
struct vm_pageout_stat {
unsigned long vm_page_active_count;
unsigned long vm_page_speculative_count;
unsigned long vm_page_inactive_count;
unsigned long vm_page_anonymous_count;
unsigned long vm_page_free_count;
unsigned long vm_page_wire_count;
unsigned long vm_page_compressor_count;
unsigned long vm_page_pages_compressed;
unsigned long vm_page_pageable_internal_count;
unsigned long vm_page_pageable_external_count;
unsigned long vm_page_xpmapped_external_count;
unsigned int pages_grabbed;
unsigned int pages_freed;
unsigned int pages_compressed;
unsigned int pages_grabbed_by_compressor;
unsigned int failed_compressions;
unsigned int pages_evicted;
unsigned int pages_purged;
unsigned int considered;
unsigned int considered_bq_internal;
unsigned int considered_bq_external;
unsigned int skipped_external;
unsigned int skipped_internal;
unsigned int filecache_min_reactivations;
unsigned int freed_speculative;
unsigned int freed_cleaned;
unsigned int freed_internal;
unsigned int freed_external;
unsigned int cleaned_dirty_external;
unsigned int cleaned_dirty_internal;
unsigned int inactive_referenced;
unsigned int inactive_nolock;
unsigned int reactivation_limit_exceeded;
unsigned int forced_inactive_reclaim;
unsigned int throttled_internal_q;
unsigned int throttled_external_q;
unsigned int phantom_ghosts_found;
unsigned int phantom_ghosts_added;
unsigned int vm_page_realtime_count;
unsigned int forcereclaimed_sharedcache;
unsigned int forcereclaimed_realtime;
unsigned int protected_sharedcache;
unsigned int protected_realtime;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];
unsigned int vm_pageout_stat_now = 0;
#define VM_PAGEOUT_STAT_BEFORE(i) \
(((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
#define VM_PAGEOUT_STAT_AFTER(i) \
(((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
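/*
* The stats array is used as a ring buffer: BEFORE() and AFTER() step
* backwards/forwards with wrap-around, e.g.
* VM_PAGEOUT_STAT_AFTER(VM_PAGEOUT_STAT_SIZE - 1) == 0 and
* VM_PAGEOUT_STAT_BEFORE(0) == VM_PAGEOUT_STAT_SIZE - 1.
*/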
#if VM_PAGE_BUCKETS_CHECK
int vm_page_buckets_check_interval = 80; /* in eighths of a second */
#endif /* VM_PAGE_BUCKETS_CHECK */
void
record_memory_pressure(void);
void
record_memory_pressure(void)
{
unsigned int vm_pageout_next;
#if VM_PAGE_BUCKETS_CHECK
/* check the consistency of VM page buckets at regular interval */
static int counter = 0;
if ((++counter % vm_page_buckets_check_interval) == 0) {
vm_page_buckets_check();
}
#endif /* VM_PAGE_BUCKETS_CHECK */
vm_pageout_state.vm_memory_pressure =
vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
/* move "now" forward */
vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
vm_pageout_stat_now = vm_pageout_next;
}
/*
* IMPORTANT
* mach_vm_ctl_page_free_wanted() is called indirectly, via
* mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
* it must be safe in the restricted stackshot context. Locks and/or
* blocking are not allowable.
*/
unsigned int
mach_vm_ctl_page_free_wanted(void)
{
unsigned int page_free_target, page_free_count, page_free_wanted;
page_free_target = vm_page_free_target;
page_free_count = vm_page_free_count;
if (page_free_target > page_free_count) {
page_free_wanted = page_free_target - page_free_count;
} else {
page_free_wanted = 0;
}
return page_free_wanted;
}
/*
* IMPORTANT:
* mach_vm_pressure_monitor() is called when taking a stackshot, with
* wait_for_pressure FALSE, so that code path must remain safe in the
* restricted stackshot context. No blocking or locks are allowable
* on that code path.
*/
kern_return_t
mach_vm_pressure_monitor(
boolean_t wait_for_pressure,
unsigned int nsecs_monitored,
unsigned int *pages_reclaimed_p,
unsigned int *pages_wanted_p)
{
wait_result_t wr;
unsigned int vm_pageout_then, vm_pageout_now;
unsigned int pages_reclaimed;
unsigned int units_of_monitor;
units_of_monitor = 8 * nsecs_monitored;
/*
* We don't take the vm_page_queue_lock here because we don't want
* mach_vm_pressure_monitor() to get in the way of the vm_pageout_scan()
* thread when it's trying to reclaim memory. We don't need fully
* accurate monitoring anyway...
*/
if (wait_for_pressure) {
/* wait until there's memory pressure */
while (vm_page_free_count >= vm_page_free_target) {
wr = assert_wait((event_t) &vm_page_free_wanted,
THREAD_INTERRUPTIBLE);
if (wr == THREAD_WAITING) {
wr = thread_block(THREAD_CONTINUE_NULL);
}
if (wr == THREAD_INTERRUPTED) {
return KERN_ABORTED;
}
if (wr == THREAD_AWAKENED) {
/*
* The memory pressure might have already
* been relieved but let's not block again
* and let's report that there was memory
* pressure at some point.
*/
break;
}
}
}
/* provide the number of pages the system wants to reclaim */
if (pages_wanted_p != NULL) {
*pages_wanted_p = mach_vm_ctl_page_free_wanted();
}
if (pages_reclaimed_p == NULL) {
return KERN_SUCCESS;
}
/* provide number of pages reclaimed in the last "nsecs_monitored" */
vm_pageout_now = vm_pageout_stat_now;
pages_reclaimed = 0;
for (vm_pageout_then =
VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
vm_pageout_then != vm_pageout_now &&
units_of_monitor-- != 0;
vm_pageout_then =
VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
}
*pages_reclaimed_p = pages_reclaimed;
return KERN_SUCCESS;
}
#if DEVELOPMENT || DEBUG
static void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
/*
* flag used to make sure there is
* only a single sweep going on at a time
*/
bool vm_pageout_disconnect_all_pages_active = false;
void
vm_pageout_disconnect_all_pages()
{
vm_page_lock_queues();
if (vm_pageout_disconnect_all_pages_active) {
vm_page_unlock_queues();
return;
}
vm_pageout_disconnect_all_pages_active = true;
vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
vm_page_throttled_count);
vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
vm_page_anonymous_count);
vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
(vm_page_inactive_count - vm_page_anonymous_count));
vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
vm_page_active_count);
#if CONFIG_SECLUDED_MEMORY
vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
vm_page_secluded_count);
#endif /* CONFIG_SECLUDED_MEMORY */
vm_page_unlock_queues();
vm_pageout_disconnect_all_pages_active = false;
}
/* NB: assumes the page queues lock is held on entry, returns with the page queues lock held */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
vm_page_t m;
vm_object_t t_object = NULL;
vm_object_t l_object = NULL;
vm_object_t m_object = NULL;
int delayed_unlock = 0;
int try_failed_count = 0;
int disconnected_count = 0;
int paused_count = 0;
int object_locked_count = 0;
KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
DBG_FUNC_START),
q, qcount);
while (qcount && !vm_page_queue_empty(q)) {
LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
m = (vm_page_t) vm_page_queue_first(q);
m_object = VM_PAGE_OBJECT(m);
if (m_object == VM_OBJECT_NULL) {
/*
* Bumped into a free page. This should only happen on the
* secluded queue
*/
#if CONFIG_SECLUDED_MEMORY
assert(q == &vm_page_queue_secluded);
#endif /* CONFIG_SECLUDED_MEMORY */
goto reenter_pg_on_q;
}
/*
* check to see if we currently are working
* with the same object... if so, we've
* already got the lock
*/
if (m_object != l_object) {
/*
* the object associated with candidate page is
* different from the one we were just working
* with... dump the lock if we still own it
*/
if (l_object != NULL) {
vm_object_unlock(l_object);
l_object = NULL;
}
if (m_object != t_object) {
try_failed_count = 0;
}
/*
* Try to lock object; since we've already got the
* page queues lock, we can only 'try' for this one.
* If the 'try' fails, we need to do a mutex_pause
* to allow the owner of the object lock a chance to
* run...
*/
if (!vm_object_lock_try_scan(m_object)) {
if (try_failed_count > 20) {
goto reenter_pg_on_q;
}
vm_page_unlock_queues();
mutex_pause(try_failed_count++);
vm_page_lock_queues();
delayed_unlock = 0;
paused_count++;
t_object = m_object;
continue;
}
object_locked_count++;
l_object = m_object;
}
if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
m->vmp_free_when_done) {
/*
* put it back on the head of its queue
*/
goto reenter_pg_on_q;
}
if (m->vmp_pmapped == TRUE) {
pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
disconnected_count++;
}
reenter_pg_on_q:
vm_page_queue_remove(q, m, vmp_pageq);
vm_page_queue_enter(q, m, vmp_pageq);
qcount--;
try_failed_count = 0;
if (delayed_unlock++ > 128) {
if (l_object != NULL) {
vm_object_unlock(l_object);
l_object = NULL;
}
lck_mtx_yield(&vm_page_queue_lock);
delayed_unlock = 0;
}
}
if (l_object != NULL) {
vm_object_unlock(l_object);
l_object = NULL;
}
KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
DBG_FUNC_END),
q, disconnected_count, object_locked_count, paused_count);
}
extern const char *proc_best_name(struct proc* proc);
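/*
* Development/debug helper: toggle the "donates own pages" state of a
* (non-kernel) task and return the new state (1 == donating).  Does
* nothing when VM page donation is disabled system-wide.
*/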
int
vm_toggle_task_selfdonate_pages(task_t task)
{
int state = 0;
if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
printf("VM Donation mode is OFF on the system\n");
return state;
}
if (task != kernel_task) {
task_lock(task);
if (!task->donates_own_pages) {
printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
task->donates_own_pages = true;
state = 1;
} else if (task->donates_own_pages) {
printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
task->donates_own_pages = false;
state = 0;
}
task_unlock(task);
}
return state;
}
#endif /* DEVELOPMENT || DEBUG */
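/*
* Set or clear the "donates own pages" flag on a task.  Page donation
* must be enabled system-wide and the task must not be the kernel task.
*/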
void
vm_task_set_selfdonate_pages(task_t task, bool donate)
{
assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
assert(task != kernel_task);
task_lock(task);
task->donates_own_pages = donate;
task_unlock(task);
}
static size_t
vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
/*
* flag used to make sure there is
* only a single sweep going on at a time
*/
boolean_t vm_pageout_anonymous_pages_active = FALSE;
kern_return_t
vm_pageout_anonymous_pages()
{
if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
size_t throttled_pages_moved, anonymous_pages_moved, active_pages_moved;
vm_page_lock_queues();
if (vm_pageout_anonymous_pages_active == TRUE) {
vm_page_unlock_queues();
return KERN_RESOURCE_SHORTAGE;
}
vm_pageout_anonymous_pages_active = TRUE;
vm_page_unlock_queues();
throttled_pages_moved = vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
anonymous_pages_moved = vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
active_pages_moved = vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
os_log(OS_LOG_DEFAULT,
"%s: throttled pages moved: %zu, anonymous pages moved: %zu, active pages moved: %zu",
__func__, throttled_pages_moved, anonymous_pages_moved, active_pages_moved);
if (VM_CONFIG_SWAP_IS_PRESENT) {
vm_consider_swapping();
}
vm_page_lock_queues();
vm_pageout_anonymous_pages_active = FALSE;
vm_page_unlock_queues();
return KERN_SUCCESS;
} else {
return KERN_NOT_SUPPORTED;
}
}
size_t
vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
{
vm_page_t m;
vm_object_t t_object = NULL;
vm_object_t l_object = NULL;
vm_object_t m_object = NULL;
int delayed_unlock = 0;
int try_failed_count = 0;
int refmod_state;
int pmap_options;
struct vm_pageout_queue *iq;
ppnum_t phys_page;
size_t pages_moved = 0;
iq = &vm_pageout_queue_internal;
vm_page_lock_queues();
#if DEVELOPMENT || DEBUG
if (perf_test) {
iq = &vm_pageout_queue_benchmark;
// ensure the benchmark queue isn't throttled
iq->pgo_maxlaundry = (unsigned int) qcount;
}
#endif /* DEVELOPMENT || DEBUG */
while (qcount && !vm_page_queue_empty(q)) {
LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
if (VM_PAGE_Q_THROTTLED(iq)) {
if (l_object != NULL) {
vm_object_unlock(l_object);
l_object = NULL;
}
iq->pgo_draining = TRUE;
assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
vm_page_unlock_queues();
thread_block(THREAD_CONTINUE_NULL);
vm_page_lock_queues();
delayed_unlock = 0;
continue;
}
m = (vm_page_t) vm_page_queue_first(q);
m_object = VM_PAGE_OBJECT(m);
/*
* check to see if we currently are working
* with the same object... if so, we've
* already got the lock
*/
if (m_object != l_object) {
if (!m_object->internal) {
goto reenter_pg_on_q;
}
/*
* the object associated with candidate page is
* different from the one we were just working
* with... dump the lock if we still own it
*/
if (l_object != NULL) {
vm_object_unlock(l_object);
l_object = NULL;
}
if (m_object != t_object) {
try_failed_count = 0;
}
/*
* Try to lock object; since we've already got the
* page queues lock, we can only 'try' for this one.
* If the 'try' fails, we need to do a mutex_pause
* to allow the owner of the object lock a chance to
* run...
*/
if (!vm_object_lock_try_scan(m_object)) {
if (try_failed_count > 20) {
goto reenter_pg_on_q;
}
vm_page_unlock_queues();
mutex_pause(try_failed_count++);
vm_page_lock_queues();
delayed_unlock = 0;
t_object = m_object;
continue;
}
l_object = m_object;
}
if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
/*
* page is not to be cleaned
* put it back on the head of its queue
*/
goto reenter_pg_on_q;
}
phys_page = VM_PAGE_GET_PHYS_PAGE(m);
if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
refmod_state = pmap_get_refmod(phys_page);
if (refmod_state & VM_MEM_REFERENCED) {
m->vmp_reference = TRUE;
}
if (refmod_state & VM_MEM_MODIFIED) {
SET_PAGE_DIRTY(m, FALSE);
}
}
if (m->vmp_reference == TRUE) {
m->vmp_reference = FALSE;
pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
goto reenter_pg_on_q;
}
if (m->vmp_pmapped == TRUE) {
if (m->vmp_dirty || m->vmp_precious) {
pmap_options = PMAP_OPTIONS_COMPRESSOR;
} else {
pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
}
refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
if (refmod_state & VM_MEM_MODIFIED) {
SET_PAGE_DIRTY(m, FALSE);
}
}
if (!m->vmp_dirty && !m->vmp_precious) {
vm_page_unlock_queues();
VM_PAGE_FREE(m);
vm_page_lock_queues();
delayed_unlock = 0;
goto next_pg;
}
if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
if (!m_object->pager_initialized) {
vm_page_unlock_queues();
vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
if (!m_object->pager_initialized) {
vm_object_compressor_pager_create(m_object);
}
vm_page_lock_queues();
delayed_unlock = 0;
}
if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
goto reenter_pg_on_q;
}
/*
* vm_object_compressor_pager_create will drop the object lock
* which means 'm' may no longer be valid to use
*/
continue;
}
if (!perf_test) {
/*
* we've already factored out pages in the laundry which
* means this page can't be on the pageout queue so it's
* safe to do the vm_page_queues_remove
*/
bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
vm_page_queues_remove(m, TRUE);
if (donate) {
/*
* The compressor needs to see this bit to know
* where this page needs to land. Also if stolen,
* this bit helps put the page back in the right
* special queue where it belongs.
*/
m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
}
} else {
vm_page_queue_remove(q, m, vmp_pageq);
}
LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
vm_pageout_cluster_to_queue(m, iq);
pages_moved++;
goto next_pg;
reenter_pg_on_q:
vm_page_queue_remove(q, m, vmp_pageq);
vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
qcount--;
try_failed_count = 0;
if (delayed_unlock++ > 128) {
if (l_object != NULL) {
vm_object_unlock(l_object);
l_object = NULL;
}
lck_mtx_yield(&vm_page_queue_lock);
delayed_unlock = 0;
}
}
if (l_object != NULL) {
vm_object_unlock(l_object);
l_object = NULL;
}
vm_page_unlock_queues();
return pages_moved;
}
/*
* function in BSD to apply I/O throttle to the pageout thread
*/
extern void vm_pageout_io_throttle(void);
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
MACRO_BEGIN \
/* \
* If a "reusable" page somehow made it back into \
* the active queue, it's been re-used and is not \
* quite re-usable. \
* If the VM object was "all_reusable", consider it \
* as "all re-used" instead of converting it to \
* "partially re-used", which could be expensive. \
*/ \
assert(VM_PAGE_OBJECT((m)) == (obj)); \
if ((m)->vmp_reusable || \
(obj)->all_reusable) { \
vm_object_reuse_pages((obj), \
(m)->vmp_offset, \
(m)->vmp_offset + PAGE_SIZE_64, \
FALSE); \
} \
MACRO_END
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
#define FCS_IDLE 0
#define FCS_DELAYED 1
#define FCS_DEADLOCK_DETECTED 2
struct flow_control {
int state;
mach_timespec_t ts;
};
uint64_t vm_pageout_rejected_bq_internal = 0;
uint64_t vm_pageout_rejected_bq_external = 0;
uint64_t vm_pageout_skipped_bq_internal = 0;
uint64_t vm_pageout_skipped_bq_external = 0;
#define ANONS_GRABBED_LIMIT 2
#if 0
static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
#endif
static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
#define VM_PAGEOUT_PB_NO_ACTION 0
#define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
#define VM_PAGEOUT_PB_THREAD_YIELD 2
#if 0
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
if (*local_freeq) {
vm_page_unlock_queues();
VM_DEBUG_CONSTANT_EVENT(
vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_START,
vm_page_free_count, 0, 0, 1);
vm_page_free_list(*local_freeq, TRUE);
VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, DBG_VM_PAGEOUT_FREELIST, DBG_FUNC_END,
vm_page_free_count, *local_freed, 0, 1);
*local_freeq = NULL;
*local_freed = 0;
vm_page_lock_queues();
} else {
lck_mtx_yield(&vm_page_queue_lock);
}
*delayed_unlock = 1;
}
#endif
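/*
* Drop the page queues lock (and the currently held object lock, if
* any), flush the batched local free list, optionally poke the
* compactor/swapper or yield, then re-take the page queues lock.
*/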
static void
vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
vm_page_t *local_freeq, int *local_freed, int action)
{
vm_page_unlock_queues();
if (*object != NULL) {
vm_object_unlock(*object);
*object = NULL;
}
if (*local_freeq) {
vm_page_free_list(*local_freeq, TRUE);
*local_freeq = NULL;
*local_freed = 0;
}
*delayed_unlock = 1;
switch (action) {
case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
vm_consider_waking_compactor_swapper();
break;
case VM_PAGEOUT_PB_THREAD_YIELD:
thread_yield_internal(1);
break;
case VM_PAGEOUT_PB_NO_ACTION:
default:
break;
}
vm_page_lock_queues();
}
static struct vm_pageout_vminfo last;
uint64_t last_vm_page_pages_grabbed = 0;
extern uint32_t c_segment_pages_compressed;
extern uint64_t shared_region_pager_reclaimed;
extern struct memory_object_pager_ops shared_region_pager_ops;
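/*
* Snapshot the global VM counters into the current
* vm_pageout_stats[vm_pageout_stat_now] bucket, convert the cumulative
* vm_pageout_vminfo counters into per-interval deltas, emit the
* DBG_VM_INFO* tracepoints, and advance the ring via
* record_memory_pressure().
*/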
void
update_vm_info(void)
{
unsigned long tmp;
uint64_t tmp64;
vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
tmp = vm_pageout_vminfo.vm_pageout_considered_page;
vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
last.vm_pageout_considered_page = tmp;
tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
last.vm_pageout_compressions = tmp64;
tmp = vm_pageout_vminfo.vm_compressor_failed;
vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
last.vm_compressor_failed = tmp;
tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
last.vm_compressor_pages_grabbed = tmp64;
tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
last.vm_phantom_cache_found_ghost = tmp;
tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
last.vm_phantom_cache_added_ghost = tmp;
tmp64 = counter_load(&vm_page_grab_count);
vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
last_vm_page_pages_grabbed = tmp64;
tmp = vm_pageout_vminfo.vm_page_pages_freed;
vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
last.vm_page_pages_freed = tmp;
if (vm_pageout_stats[vm_pageout_stat_now].considered) {
tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
last.vm_pageout_pages_evicted = tmp;
tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
last.vm_pageout_pages_purged = tmp;
tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
last.vm_pageout_freed_speculative = tmp;
tmp = vm_pageout_vminfo.vm_pageout_freed_external;
vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
last.vm_pageout_freed_external = tmp;
tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
last.vm_pageout_inactive_referenced = tmp;
tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
last.vm_pageout_scan_inactive_throttled_external = tmp;
tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
last.vm_pageout_inactive_dirty_external = tmp;
tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
last.vm_pageout_freed_cleaned = tmp;
tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
last.vm_pageout_inactive_nolock = tmp;
tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
last.vm_pageout_scan_inactive_throttled_internal = tmp;
tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
last.vm_pageout_skipped_external = tmp;
tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
last.vm_pageout_skipped_internal = tmp;
tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
last.vm_pageout_reactivation_limit_exceeded = tmp;
tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
last.vm_pageout_inactive_force_reclaim = tmp;
tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
last.vm_pageout_freed_internal = tmp;
tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
last.vm_pageout_considered_bq_internal = tmp;
tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
last.vm_pageout_considered_bq_external = tmp;
tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
last.vm_pageout_filecache_min_reactivated = tmp;
tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
last.vm_pageout_inactive_dirty_internal = tmp;
tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
last.vm_pageout_forcereclaimed_sharedcache = tmp;
tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
last.vm_pageout_forcereclaimed_realtime = tmp;
tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
last.vm_pageout_protected_sharedcache = tmp;
tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
last.vm_pageout_protected_realtime = tmp;
}
KDBG((VMDBG_CODE(DBG_VM_INFO1)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count);
KDBG((VMDBG_CODE(DBG_VM_INFO2)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count);
KDBG((VMDBG_CODE(DBG_VM_INFO3)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count);
if (vm_pageout_stats[vm_pageout_stat_now].considered ||
vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
KDBG((VMDBG_CODE(DBG_VM_INFO4)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].considered,
vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
vm_pageout_stats[vm_pageout_stat_now].freed_external,
vm_pageout_stats[vm_pageout_stat_now].inactive_referenced);
KDBG((VMDBG_CODE(DBG_VM_INFO5)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
vm_pageout_stats[vm_pageout_stat_now].inactive_nolock);
KDBG((VMDBG_CODE(DBG_VM_INFO6)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
vm_pageout_stats[vm_pageout_stat_now].skipped_external);
KDBG((VMDBG_CODE(DBG_VM_INFO7)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
vm_pageout_stats[vm_pageout_stat_now].freed_internal);
KDBG((VMDBG_CODE(DBG_VM_INFO8)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal);
KDBG((VMDBG_CODE(DBG_VM_INFO10)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
vm_pageout_stats[vm_pageout_stat_now].protected_realtime);
}
KDBG((VMDBG_CODE(DBG_VM_INFO9)) | DBG_FUNC_NONE,
vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
vm_pageout_stats[vm_pageout_stat_now].pages_freed,
vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added);
record_memory_pressure();
}
extern boolean_t hibernation_vmqueues_inspection;
/*
* Return values for functions called by vm_pageout_scan
* that control its flow.
*
* PROCEED -- vm_pageout_scan will keep making forward progress.
* DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
* NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan, i.e. 'continue'.
*/
#define VM_PAGEOUT_SCAN_PROCEED (0)
#define VM_PAGEOUT_SCAN_DONE_RETURN (1)
#define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
/*
* This function is called only from vm_pageout_scan and
* it moves overflow secluded pages (one at a time) to the
* batched 'local' free Q or active Q.
*/
static void
vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
{
#if CONFIG_SECLUDED_MEMORY
/*
* Deal with secluded_q overflow.
*/
if (vm_page_secluded_count > vm_page_secluded_target) {
vm_page_t secluded_page;
/*
* SECLUDED_AGING_BEFORE_ACTIVE:
* Excess secluded pages go to the active queue and
* will later go to the inactive queue.
*/
assert((vm_page_secluded_count_free +
vm_page_secluded_count_inuse) ==
vm_page_secluded_count);
secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
vm_page_queues_remove(secluded_page, FALSE);
assert(!secluded_page->vmp_fictitious);
assert(!VM_PAGE_WIRED(secluded_page));
if (secluded_page->vmp_object == 0) {
/* transfer to free queue */
assert(secluded_page->vmp_busy);
secluded_page->vmp_snext = *local_freeq;
*local_freeq = secluded_page;
*local_freed += 1;
} else {
/* transfer to head of active queue */
vm_page_enqueue_active(secluded_page, FALSE);
secluded_page = VM_PAGE_NULL;
}
}
#else /* CONFIG_SECLUDED_MEMORY */
#pragma unused(local_freeq)
#pragma unused(local_freed)
return;
#endif /* CONFIG_SECLUDED_MEMORY */
}
/*
* This function is called only from vm_pageout_scan and
* it initializes the loop targets for vm_pageout_scan().
*/
static void
vps_init_page_targets(void)
{
/*
* LD TODO: Other page targets should be calculated here too.
*/
vm_page_anonymous_min = vm_page_inactive_target / 20;
if (vm_pageout_state.vm_page_speculative_percentage > 50) {
vm_pageout_state.vm_page_speculative_percentage = 50;
} else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
vm_pageout_state.vm_page_speculative_percentage = 1;
}
vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
vm_page_inactive_count);
}
/*
* This function is called only from vm_pageout_scan and
* it purges a single VM object at a time and will either
* make vm_pageout_scan() restart the loop or keep moving forward.
*/
static int
vps_purge_object()
{
int force_purge;
assert(available_for_purge >= 0);
force_purge = 0; /* no force-purging */
#if VM_PRESSURE_EVENTS
vm_pressure_level_t pressure_level;
pressure_level = memorystatus_vm_pressure_level;
if (pressure_level > kVMPressureNormal) {
if (pressure_level >= kVMPressureCritical) {
force_purge = vm_pageout_state.memorystatus_purge_on_critical;
} else if (pressure_level >= kVMPressureUrgent) {
force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
} else if (pressure_level >= kVMPressureWarning) {
force_purge = vm_pageout_state.memorystatus_purge_on_warning;
}
}
#endif /* VM_PRESSURE_EVENTS */
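/*
 * A non-zero force_purge lets us attempt a purge even when no volatile
 * objects are formally "ripe" (available_for_purge == 0); the value passed
 * down is whichever memorystatus purge level matches the current pressure.
 */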
if (available_for_purge || force_purge) {
memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
VM_DEBUG_EVENT(vm_pageout_purgeone, DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
memoryshot(DBG_VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
}
return VM_PAGEOUT_SCAN_PROCEED;
}
/*
* This function is called only from vm_pageout_scan and
* it will try to age the next speculative Q if the oldest
* one is empty.
*/
static int
vps_age_speculative_queue(boolean_t force_speculative_aging)
{
#define DELAY_SPECULATIVE_AGE 1000
/*
* try to pull pages from the aging bins...
* see vm_page_internal.h for an explanation of how
* this mechanism works
*/
boolean_t can_steal = FALSE;
int num_scanned_queues;
static int delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop. */
mach_timespec_t ts;
struct vm_speculative_age_q *aq;
struct vm_speculative_age_q *sq;
sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
aq = &vm_page_queue_speculative[speculative_steal_index];
num_scanned_queues = 0;
while (vm_page_queue_empty(&aq->age_q) &&
num_scanned_queues++ != vm_page_max_speculative_age_q) {
speculative_steal_index++;
if (speculative_steal_index > vm_page_max_speculative_age_q) {
speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
}
aq = &vm_page_queue_speculative[speculative_steal_index];
}
if (num_scanned_queues == vm_page_max_speculative_age_q + 1) {
/*
* XXX We've scanned all the speculative
* queues but still haven't found one
* that is not empty, even though
* vm_page_speculative_count is not 0.
*/
if (!vm_page_queue_empty(&sq->age_q)) {
return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
#if DEVELOPMENT || DEBUG
panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
#endif
/* readjust... */
vm_page_speculative_count = 0;
/* ... and continue */
return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
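/*
 * Decide whether the queue indexed by speculative_steal_index can be aged:
 * either we're over the speculative target (or aging is being forced), or
 * enough wall-clock time has elapsed since the queue was stamped (aq->age_ts)
 * for it to be considered fully aged. DELAY_SPECULATIVE_AGE just rate-limits
 * how often the time-based check is redone. If aging is allowed, the queue
 * is handed to vm_page_speculate_ageit() below.
 */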
if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
can_steal = TRUE;
} else {
if (!delay_speculative_age) {
mach_timespec_t ts_fully_aged;
ts_fully_aged.tv_sec = (vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
ts_fully_aged.tv_nsec = ((vm_page_max_speculative_age_q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
* 1000 * NSEC_PER_USEC;
ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
clock_sec_t sec;
clock_nsec_t nsec;
clock_get_system_nanotime(&sec, &nsec);
ts.tv_sec = (unsigned int) sec;
ts.tv_nsec = nsec;
if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
can_steal = TRUE;
} else {
delay_speculative_age++;
}
} else {
delay_speculative_age++;
if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
delay_speculative_age = 0;
}
}
}
if (can_steal == TRUE) {
vm_page_speculate_ageit(aq);
}
return VM_PAGEOUT_SCAN_PROCEED;
}
/*
* This function is called only from vm_pageout_scan and
* it evicts a single VM object from the cache.
*/
static inline int
vps_object_cache_evict(vm_object_t *object_to_unlock)
{
static int cache_evict_throttle = 0;
struct vm_speculative_age_q *sq;
sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
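/*
 * Only attempt an eviction when the aged speculative queue is empty and
 * we aren't in a back-off period. If an eviction attempt frees nothing,
 * cache_evict_throttle makes us skip the next 1000 calls before trying
 * again; the counter ticks down once per call below.
 */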
if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
int pages_evicted;
if (*object_to_unlock != NULL) {
vm_object_unlock(*object_to_unlock);
*object_to_unlock = NULL;
}
KDBG(0x13001ec | DBG_FUNC_START);
pages_evicted = vm_object_cache_evict(100, 10);
KDBG(0x13001ec | DBG_FUNC_END, pages_evicted);
if (pages_evicted) {
vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
VM_DEBUG_EVENT(vm_pageout_cache_evict, DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
memoryshot(DBG_VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
/*
* we just freed up to 100 pages,
* so go back to the top of the main loop
* and re-evaluate the memory situation
*/
return VM_PAGEOUT_SCAN_NEXT_ITERATION;
} else {
cache_evict_throttle = 1000;
}
}
if (cache_evict_throttle) {
cache_evict_throttle--;
}
return VM_PAGEOUT_SCAN_PROCEED;
}
/*
* This function is called only from vm_pageout_scan and
* it calculates the filecache min. that needs to be maintained
* as we start to steal pages.
*/
static void
vps_calculate_filecache_min(void)
{
int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
#if CONFIG_JETSAM
/*
* don't let the filecache_min fall below 15% of available memory
* on systems with an active compressor that isn't nearing its
* limits w/r to accepting new data
*
* on systems w/o the compressor/swapper, the filecache is always
* a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
* since most (if not all) of the anonymous pages are in the
* throttled queue (which isn't counted as available) which
* effectively disables this filter
*/
if (vm_compressor_low_on_space() || divisor == 0) {
vm_pageout_state.vm_page_filecache_min = 0;
} else {
vm_pageout_state.vm_page_filecache_min =
((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
}
#else
if (vm_compressor_out_of_space() || divisor == 0) {
vm_pageout_state.vm_page_filecache_min = 0;
} else {
/*
* don't let the filecache_min fall below the specified critical level
*/
vm_pageout_state.vm_page_filecache_min =
((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
}
#endif
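/*
 * In both configurations the floor works out to (10 / divisor) of
 * AVAILABLE_NON_COMPRESSED_MEMORY -- e.g. a divisor of ~66 corresponds to
 * roughly 15% -- and it is dropped to 0 entirely once the free page count
 * falls below a quarter of vm_page_free_reserved.
 */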
if (vm_page_free_count < (vm_page_free_reserved / 4)) {
vm_pageout_state.vm_page_filecache_min = 0;
}
}
/*
* This function is called only from vm_pageout_scan and
* it updates the flow control time to detect if VM pageout scan
* isn't making progress.
*/
static void
vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
{
mach_timespec_t ts;
clock_sec_t sec;
clock_nsec_t nsec;
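/*
 * Convert vm_pageout_deadlock_wait (milliseconds) into a mach_timespec:
 * whole seconds plus the millisecond remainder expressed in nanoseconds.
 */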
ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
clock_get_system_nanotime(&sec, &nsec);
flow_control->ts.tv_sec = (unsigned int) sec;
flow_control->ts.tv_nsec = nsec;
ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
flow_control->state = FCS_DELAYED;
vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
}
/*
* This function is called only from vm_pageout_scan and
* it is the flow control logic of VM pageout scan which
* controls if it should block and for how long.
* Any blocking of vm_pageout_scan happens ONLY in this function.
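*
* Flow-control state machine (flow_control->state):
*   FCS_IDLE              -- not currently throttled; arms the deadlock timer
*                            the first time the internal queue looks throttled.
*   FCS_DELAYED           -- timer armed; if it expires while the queue is
*                            still throttled we declare a potential deadlock.
*   FCS_DEADLOCK_DETECTED -- move a burst of pages (vm_pageout_deadlock_target)
*                            to try to break the logjam, then re-arm the timer.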
*/
static int
vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
{
boolean_t exceeded_burst_throttle = FALSE;
unsigned int msecs = 0;
uint32_t inactive_external_count;
mach_timespec_t ts;
struct vm_pageout_queue *iq;
struct vm_pageout_queue *eq;
struct vm_speculative_age_q *sq;
iq = &vm_pageout_queue_internal;
eq = &vm_pageout_queue_external;
sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
/*
* Sometimes we have to pause:
* 1) No inactive pages - nothing to do.
* 2) Loop control - no acceptable pages found on the inactive queue
* within the last vm_pageout_burst_inactive_throttle iterations
* 3) Flow control - default pageout queue is full
*/
if (vm_page_queue_empty(&vm_page_queue_inactive) &&
vm_page_queue_empty(&vm_page_queue_anonymous) &&
vm_page_queue_empty(&vm_page_queue_cleaned) &&
vm_page_queue_empty(&sq->age_q)) {
VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
msecs = vm_pageout_state.vm_pageout_empty_wait;
} else if (inactive_burst_count >=
MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
(vm_page_inactive_count +
vm_page_speculative_count))) {
VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
msecs = vm_pageout_state.vm_pageout_burst_wait;
exceeded_burst_throttle = TRUE;
} else if (VM_PAGE_Q_THROTTLED(iq) &&
VM_DYNAMIC_PAGING_ENABLED()) {
clock_sec_t sec;
clock_nsec_t nsec;
switch (flow_control->state) {
case FCS_IDLE:
if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
/*
* since the compressor is running independently of vm_pageout_scan
* let's not wait for it just yet... as long as we have a healthy supply
* of filecache pages to work with, let's keep stealing those.
*/
inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
(inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
*anons_grabbed = ANONS_GRABBED_LIMIT;
VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
return VM_PAGEOUT_SCAN_PROCEED;
}
}
vps_flow_control_reset_deadlock_timer(flow_control);
msecs = vm_pageout_state.vm_pageout_deadlock_wait;
break;
case FCS_DELAYED:
clock_get_system_nanotime(&sec, &nsec);
ts.tv_sec = (unsigned int) sec;
ts.tv_nsec = nsec;
if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
/*
* the pageout thread for the default pager is potentially
* deadlocked since the
* default pager queue has been throttled for more than the
* allowable time... we need to move some clean pages or dirty
* pages belonging to the external pagers if they aren't throttled
* vm_page_free_wanted represents the number of threads currently
* blocked waiting for pages... we'll move one page for each of
* these plus a fixed amount to break the logjam... once we're done
* moving this number of pages, we'll re-enter the FCS_DELAYED state
* with a new timeout target since we have no way of knowing
* whether we've broken the deadlock except through observation
* of the queue associated with the default pager... we need to
* stop moving pages and allow the system to run to see what
* state it settles into.
*/
*vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
vm_page_free_wanted + vm_page_free_wanted_privileged;
VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
flow_control->state = FCS_DEADLOCK_DETECTED;
thread_wakeup(VM_PAGEOUT_GC_EVENT);
return VM_PAGEOUT_SCAN_PROCEED;
}
/*
* just resniff instead of trying
* to compute a new delay time... we're going to be
* awakened immediately upon a laundry completion,
* so we won't wait any longer than necessary
*/
msecs = vm_pageout_state.vm_pageout_idle_wait;
break;
case FCS_DEADLOCK_DETECTED:
if (*vm_pageout_deadlock_target) {
return VM_PAGEOUT_SCAN_PROCEED;
}
vps_flow_control_reset_deadlock_timer(flow_control);
msecs = vm_pageout_state.vm_pageout_deadlock_wait;
break;
}
} else {
/*
* No need to pause...
*/
return VM_PAGEOUT_SCAN_PROCEED;
}
vm_pageout_scan_wants_object = VM_OBJECT_NULL;
vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
if (vm_page_free_count >= vm_page_free_target) {
/*
* we're here because
* 1) someone else freed up some pages while we had
* the queues unlocked above
* and we've hit one of the 3 conditions that
* cause us to pause the pageout scan thread
*
* since we already have enough free pages,
* let's avoid stalling and return normally
*
* before we return, make sure the pageout I/O threads
* are running throttled in case there are still requests
* in the laundry... since we have enough free pages
* we don't need the laundry to be cleaned in a timely
* fashion... so let's avoid interfering with foreground
* activity
*
* we don't want to hold vm_page_queue_free_lock when
* calling vm_pageout_adjust_eq_iothrottle (since it
* may cause other locks to be taken), we do the initial
* check outside of the lock. Once we take the lock,
* we recheck the condition since it may have changed.
* if it has, no problem, we will make the threads
* non-throttled before actually blocking
*/
vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
}
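/*
 * Note: if we return VM_PAGEOUT_SCAN_DONE_RETURN below, we intentionally
 * keep vm_page_queue_free_lock held -- vm_pageout_scan() is expected to
 * return with both the free lock and the page queue lock held.
 */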
vm_free_page_lock();
if (vm_page_free_count >= vm_page_free_target &&
(vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
return VM_PAGEOUT_SCAN_DONE_RETURN;
}
vm_free_page_unlock();
if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
/*
* we're most likely about to block due to one of
* the 3 conditions that cause vm_pageout_scan to
* not be able to make forward progress w/r
* to providing new pages to the free queue,
* so unthrottle the I/O threads in case we
* have laundry to be cleaned... it needs
* to be completed ASAP.
*
* even if we don't block, we want the io threads
* running unthrottled since the sum of free +
* clean pages is still under our free target
*/
vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
}
if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
/*
* if we get here we're below our free target and
* we're stalling due to a full laundry queue or
* we don't have any inactive pages other than
* those in the clean queue...
* however, we have pages on the clean queue that
* can be moved to the free queue, so let's not
* stall the pageout scan
*/
flow_control->state = FCS_IDLE;
return VM_PAGEOUT_SCAN_PROCEED;
}
if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
flow_control->state = FCS_IDLE;
return VM_PAGEOUT_SCAN_PROCEED;
}
VM_CHECK_MEMORYSTATUS;
if (flow_control->state != FCS_IDLE) {
VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
}
iq->pgo_throttled = TRUE;
assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
vm_page_unlock_queues();
assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
thread_block(THREAD_CONTINUE_NULL);
VM_DEBUG_EVENT(vm_pageout_thread_block, DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
memoryshot(DBG_VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
vm_page_lock_queues();
iq->pgo_throttled = FALSE;
vps_init_page_targets();
return VM_PAGEOUT_SCAN_NEXT_ITERATION;
}
extern boolean_t vm_darkwake_mode;
/*
* This function is called only from vm_pageout_scan and
* it will find and return the most appropriate page to be
* reclaimed.
*/
static int
vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
{
vm_page_t m = NULL;
vm_object_t m_object = VM_OBJECT_NULL;
uint32_t inactive_external_count;
struct vm_speculative_age_q *sq;
struct vm_pageout_queue *iq;
int retval = VM_PAGEOUT_SCAN_PROCEED;
sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
iq = &vm_pageout_queue_internal;
*is_page_from_bg_q = FALSE;
m = NULL;
m_object = VM_OBJECT_NULL;
if (VM_DYNAMIC_PAGING_ENABLED()) {
assert(vm_page_throttled_count == 0);
assert(vm_page_queue_empty(&vm_page_queue_throttled));
}
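/*
 * Victim selection order (first available source wins):
 *   1) cleaned queue
 *   2) aged speculative queue (skipped if the page is dirty and
 *      anonymous pages are being forced)
 *   3) donate queue (non-jetsam configs only)
 *   4) background queue, when it is over its target
 *   5) file-backed inactive queue or anonymous queue, depending on
 *      grab_anonymous / anons_grabbed and the filecache floor
 */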
/*
* Try for a clean-queue inactive page.
* These are pages that vm_pageout_scan tried to steal earlier, but
* were dirty and had to be cleaned. Pick them up now that they are clean.
*/
if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
goto found_page;
}
/*
* The next most eligible pages are ones we paged in speculatively,
* but which have not yet been touched and have been aged out.
*/
if (!vm_page_queue_empty(&sq->age_q)) {
m = (vm_page_t) vm_page_queue_first(&sq->age_q);
assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
if (!m->vmp_dirty || force_anonymous == FALSE) {
goto found_page;
} else {
m = NULL;
}
}
#if !CONFIG_JETSAM
if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
goto found_page;
}
}
#endif /* !CONFIG_JETSAM */
if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
vm_object_t bg_m_object = NULL;
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
bg_m_object = VM_PAGE_OBJECT(m);
if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
/*
* This page is on the background queue
* but not on a pageable queue OR is busy during
* darkwake mode when the target is artificially lowered.
* If it is busy during darkwake mode, and we don't skip it,
* we will just swing back around and try again with the same
* queue and might hit the same page or its neighbor in a
* similar state. Both of these are transient states and will
* get resolved, but, at this point let's ignore this page.
*/
if (vm_darkwake_mode && m->vmp_busy) {
if (bg_m_object->internal) {
vm_pageout_skipped_bq_internal++;
} else {
vm_pageout_skipped_bq_external++;
}
}
} else if (force_anonymous == FALSE || bg_m_object->internal) {
if (bg_m_object->internal &&
(VM_PAGE_Q_THROTTLED(iq) ||
vm_compressor_out_of_space() == TRUE ||
vm_page_free_count < (vm_page_free_reserved / 4))) {
vm_pageout_skipped_bq_internal++;
} else {
*is_page_from_bg_q = TRUE;
if (bg_m_object->internal) {
vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
} else {
vm_pageout_vminfo.vm_pageout_considered_bq_external++;
}
goto found_page;
}
}
}
inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
(inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
*grab_anonymous = TRUE;
*anons_grabbed = 0;
if (VM_CONFIG_SWAP_IS_ACTIVE) {
vm_pageout_vminfo.vm_pageout_skipped_external++;
} else {
if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
/*
* No swap and we are in dangerously low levels of free memory.
* If we keep going ahead with anonymous pages, we are going to run into a situation
* where the compressor will be stuck waiting for free pages (if it isn't already).
*
* So, pick a file backed page...
*/
*grab_anonymous = FALSE;
*anons_grabbed = ANONS_GRABBED_LIMIT;
vm_pageout_vminfo.vm_pageout_skipped_internal++;
}
}
goto want_anonymous;
}
*grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
#if CONFIG_JETSAM
/* If the file-backed pool has accumulated
* significantly more pages than the jetsam
* threshold, prefer to reclaim those
* inline to minimise compute overhead of reclaiming
* anonymous pages.
* This calculation does not account for the CPU local
* external page queues, as those are expected to be
* much smaller relative to the global pools.
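* Concretely, we stick with file-backed reclaim (grab_anonymous = FALSE)
* when vm_page_pageable_external_count * vm_pageout_memorystatus_fb_factor_dr
* exceeds memorystatus_available_pages_critical * vm_pageout_memorystatus_fb_factor_nr.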
*/
struct vm_pageout_queue *eq = &vm_pageout_queue_external;
if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
if (vm_page_pageable_external_count >
vm_pageout_state.vm_page_filecache_min) {
if ((vm_page_pageable_external_count *
vm_pageout_memorystatus_fb_factor_dr) >
(memorystatus_available_pages_critical *
vm_pageout_memorystatus_fb_factor_nr)) {
*grab_anonymous = FALSE;
VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
}
}
if (*grab_anonymous) {
VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
}
}
#endif /* CONFIG_JETSAM */
want_anonymous:
if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
*anons_grabbed = 0;
if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
if ((++(*reactivated_this_call) % 100)) {
vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
vm_page_activate(m);
counter_inc(&vm_statistics_reactivations);
#if DEVELOPMENT || DEBUG
if (*is_page_from_bg_q == TRUE) {
if (m_object->internal) {
vm_pageout_rejected_bq_internal++;
} else {
vm_pageout_rejected_bq_external++;
}
}
#endif /* DEVELOPMENT || DEBUG */
vm_pageout_state.vm_pageout_inactive_used++;
m = NULL;
retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
goto found_page;
}
/*
* steal 1 of the file backed pages even if
* we are under the limit that has been set
* for a healthy filecache
*/
}
}
goto found_page;
}
}
if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
*anons_grabbed += 1;
goto found_page;
}
m = NULL;
found_page:
*victim_page = m;
return retval;
}
/*
* This function is called only from vm_pageout_scan and
* it will put a page back on the active/inactive queue
* if we can't reclaim it for some reason.
*/
static void
vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
{
if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
vm_page_enqueue_inactive(m, FALSE);
} else {
vm_page_activate(m);
}
#if DEVELOPMENT || DEBUG
vm_object_t m_object = VM_PAGE_OBJECT(m);
if (page_from_bg_q == TRUE) {
if (m_object->internal) {
vm_pageout_rejected_bq_internal++;
} else {
vm_pageout_rejected_bq_external++;
}
}
#endif /* DEVELOPMENT || DEBUG */
}
/*
* This function is called only from vm_pageout_scan and
* it will try to grab the victim page's VM object (m_object)
* which differs from the previous victim page's object (object).
*/
static int
vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
{
struct vm_speculative_age_q *sq;
sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
/*
* the object associated with candidate page is
* different from the one we were just working
* with... dump the lock if we still own it
*/
if (*object != NULL) {
vm_object_unlock(*object);
*object = NULL;
}
/*
* Try to lock object; since we've already got the
* page queues lock, we can only 'try' for this one.
* if the 'try' fails, we need to do a mutex_pause
* to allow the owner of the object lock a chance to
* run... otherwise, we're likely to trip over this
* object in the same state as we work our way through
* the queue... clumps of pages associated with the same
* object are fairly typical on the inactive and active queues
*/
if (!vm_object_lock_try_scan(m_object)) {
vm_page_t m_want = NULL;
vm_pageout_vminfo.vm_pageout_inactive_nolock++;
if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
}
pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
m->vmp_reference = FALSE;
if (!m_object->object_is_shared_cache) {
/*
* don't apply this optimization if this is the shared cache
* object, it's too easy to get rid of very hot and important
* pages...
* m->vmp_object must be stable since we hold the page queues lock...
* we can update the scan_collisions field sans the object lock
* since it is a separate field and this is the only spot that does
* a read-modify-write operation and it is never executed concurrently...
* we can asynchronously set this field to 0 when creating a UPL, so it
* is possible for the value to be a bit non-deterministic, but that's ok
* since it's only used as a hint
*/
m_object->scan_collisions = 1;
}
if (page_from_bg_q) {
m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
} else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
} else if (!vm_page_queue_empty(&sq->age_q)) {
m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
} else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
!vm_page_queue_empty(&vm_page_queue_inactive)) {
m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
} else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
}
/*
* this is the next object we're going to be interested in
* try to make sure its available after the mutex_pause
* returns control
*/
if (m_want) {
vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
}
vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
return VM_PAGEOUT_SCAN_NEXT_ITERATION;
} else {
*object = m_object;
vm_pageout_scan_wants_object = VM_OBJECT_NULL;
}
return VM_PAGEOUT_SCAN_PROCEED;
}
/*
* This function is called only from vm_pageout_scan and
* it notices that pageout scan may be rendered ineffective
* due to a FS deadlock and will jetsam a process if possible.
* If jetsam isn't supported, it'll move the page to the active
* queue to try and get some different pages pushed onwards so
* we can try to get out of this scenario.
*/
static void
vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
{
struct vm_pageout_queue *eq;
vm_object_t cur_object = VM_OBJECT_NULL;
cur_object = *object;
eq = &vm_pageout_queue_external;
if (cur_object->internal == FALSE) {
/*
* we need to break up the following potential deadlock case...
* a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
* b) The thread doing the writing is waiting for pages while holding the truncate lock
* c) Most of the pages in the inactive queue belong to this file.
*
* we are potentially in this deadlock because...
* a) the external pageout queue is throttled
* b) we're done with the active queue and moved on to the inactive queue
* c) we've got a dirty external page
*
* since we don't know the reason for the external pageout queue being throttled we
* must suspect that we are deadlocked, so move the current page onto the active queue
* in an effort to cause a page from the active queue to 'age' to the inactive queue
*
* if we don't have jetsam configured (i.e. we have a dynamic pager), set
* 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
* pool the next time we select a victim page... if we can make enough new free pages,
* the deadlock will break, the external pageout queue will empty and it will no longer
* be throttled
*
* if we have jetsam configured, keep a count of the pages reactivated this way so
* that we can try to find clean pages in the active/inactive queues before
* deciding to jetsam a process
*/
vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
vm_page_check_pageable_safe(m);
assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
vm_page_active_count++;
vm_page_pageable_external_count++;
vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
#pragma unused(force_anonymous)
*vm_pageout_inactive_external_forced_reactivate_limit -= 1;
if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
*vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
/*
* Possible deadlock scenario so request jetsam action
*/
memorystatus_kill_on_vps_starvation();
VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, DBG_VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
}
#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
#pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
*force_anonymous = TRUE;
#endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
} else {
vm_page_activate(m);
counter_inc(&vm_statistics_reactivations);
#if DEVELOPMENT || DEBUG
if (is_page_from_bg_q == TRUE) {
if (cur_object->internal) {
vm_pageout_rejected_bq_internal++;
} else {
vm_pageout_rejected_bq_external++;
}
}
#endif /* DEVELOPMENT || DEBUG */
vm_pageout_state.vm_pageout_inactive_used++;
}
}
void
vm_page_balance_inactive(int max_to_move)
{
vm_page_t m;
LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
/*
* It is likely that the hibernation code path is
* dealing with these very queues as we are about
* to move pages around in/from them and completely
* change the linkage of the pages.
*
* And so we skip the rebalancing of these queues.
*/
return;
}
vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
vm_page_inactive_count +
vm_page_speculative_count);
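/*
 * Move up to max_to_move pages from the head of the active queue to the
 * inactive queue until the inactive + speculative count reaches the target,
 * clearing each page's referenced bit along the way (without forcing a TLB
 * flush -- see the note below) so a later reference can be detected.
 */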
while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
assert(!m->vmp_laundry);
assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
/*
* by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
*
* a TLB flush isn't really needed here since at worst we'll miss the reference bit being
* updated in the PTE if a remote processor still has this mapping cached in its TLB when the
* new reference happens. If no further references happen on the page after that remote TLB flushes
* we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
* by pageout_scan, which is just fine since the last reference would have happened quite far
* in the past (TLB caches don't hang around for very long), and of course could just as easily
* have happened before we moved the page
*/
if (m->vmp_pmapped == TRUE) {
/*
* We might be holding the page queue lock as a
* spin lock and clearing the "referenced" bit could
* take a while if there are lots of mappings of
* that page, so make sure we acquire the lock as
* a mutex to avoid a spinlock timeout.
*/
vm_page_lockconvert_queues();
pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
}
/*
* The page might be absent or busy,
* but vm_page_deactivate can handle that.
* FALSE indicates that we don't want a H/W clear reference
*/
vm_page_deactivate_internal(m, FALSE);
}
}
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
* It returns with both vm_page_queue_free_lock and vm_page_queue_lock
* held and vm_page_free_wanted == 0.
*/
void
vm_pageout_scan(void)
{
unsigned int loop_count = 0;
unsigned int inactive_burst_count = 0;
unsigned int reactivated_this_call;
unsigned int reactivate_limit;
vm_page_t local_freeq = NULL;
int local_freed = 0;
int delayed_unlock;
int delayed_unlock_limit = 0;
int refmod_state = 0;
int vm_pageout_deadlock_target = 0;
struct vm_pageout_queue *iq;
struct vm_pageout_queue *eq;
struct vm_speculative_age_q *sq;
struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
boolean_t inactive_throttled = FALSE;
vm_object_t object = NULL;
uint32_t inactive_reclaim_run;
boolean_t grab_anonymous = FALSE;
boolean_t force_anonymous = FALSE;
boolean_t force_speculative_aging = FALSE;
int anons_grabbed = 0;
int page_prev_q_state = 0;
boolean_t page_from_bg_q = FALSE;
uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
vm_object_t m_object = VM_OBJECT_NULL;
int retval = 0;
boolean_t lock_yield_check = FALSE;
VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_START,
vm_pageout_vminfo.vm_pageout_freed_speculative,
vm_pageout_state.vm_pageout_inactive_clean,
vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
flow_control.state = FCS_IDLE;
iq = &vm_pageout_queue_internal;
eq = &vm_pageout_queue_external;
sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
/* Ask the pmap layer to return any pages it no longer needs. */
pmap_release_pages_fast();
vm_page_lock_queues();
delayed_unlock = 1;
/*
* Calculate the max number of referenced pages on the inactive
* queue that we will reactivate.
*/
reactivated_this_call = 0;
reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
vm_page_inactive_count);
inactive_reclaim_run = 0;
vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
/*
* We must limit the rate at which we send pages to the pagers
* so that we don't tie up too many pages in the I/O queues.
* We implement a throttling mechanism using the laundry count
* to limit the number of pages outstanding to the default
* and external pagers. We can bypass the throttles and look
* for clean pages if the pageout queues don't drain in a timely
* fashion since this may indicate that the pageout paths are
* stalled waiting for memory, which only we can provide.
*/
vps_init_page_targets();
assert(object == NULL);
assert(delayed_unlock != 0);
for (;;) {
vm_page_t m;
DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
if (lock_yield_check) {
lock_yield_check = FALSE;
if (delayed_unlock++ > delayed_unlock_limit) {
vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
} else if (vm_pageout_scan_wants_object) {
vm_page_unlock_queues();
mutex_pause(0);
vm_page_lock_queues();
} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
}
}
if (vm_upl_wait_for_pages < 0) {
vm_upl_wait_for_pages = 0;
}
delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
}
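/*
 * delayed_unlock_limit bounds how long we hold the page queue lock across
 * iterations before the check at the top of the loop hands it back via
 * vm_pageout_prepare_to_block(); the limit is padded by vm_upl_wait_for_pages
 * and clipped to VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX.
 */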
vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
assert(delayed_unlock);
/*
* maintain our balance
*/
vm_page_balance_inactive(1);
/**********************************************************************
* above this point we're playing with the active and secluded queues
* below this point we're playing with the throttling mechanisms
* and the inactive queue
**********************************************************************/
if (vm_page_free_count + local_freed >= vm_page_free_target) {
vm_pageout_scan_wants_object = VM_OBJECT_NULL;
vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
/*
* make sure the pageout I/O threads are running
* throttled in case there are still requests
* in the laundry... since we have met our targets
* we don't need the laundry to be cleaned in a timely
* fashion... so let's avoid interfering with foreground
* activity
*/
vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
vm_free_page_lock();
if ((vm_page_free_count >= vm_page_free_target) &&
(vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
/*
* done - we have met our target *and*
* there is no one waiting for a page.
*/
return_from_scan:
assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
vm_pageout_state.vm_pageout_inactive,
vm_pageout_state.vm_pageout_inactive_used, 0, 0);
VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, DBG_VM_PAGEOUT_SCAN, DBG_FUNC_END,
vm_pageout_vminfo.vm_pageout_freed_speculative,
vm_pageout_state.vm_pageout_inactive_clean,
vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
return;
}
vm_free_page_unlock();
}
/*
* Before anything, we check if we have any ripe volatile
* objects around. If so, try to purge the first object.
* If the purge fails, fall through to reclaim a page instead.
* If the purge succeeds, go back to the top and re-evaluate
* the new memory situation.
*/
retval = vps_purge_object();
if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
/*
* Success
*/
if (object != NULL) {
vm_object_unlock(object);
object = NULL;
}
lock_yield_check = FALSE;
continue;
}
/*
* If our 'aged' queue is empty and we have some speculative pages
* in the other queues, let's go through and see if we need to age
* them.
*
* If we succeeded in aging a speculative Q or just that everything
* looks normal w.r.t queue age and queue counts, we keep going onward.
*
* If, for some reason, we seem to have a mismatch between the spec.
* page count and the page queues, we reset those variables and
* restart the loop (LD TODO: Track this better?).
*/
if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
retval = vps_age_speculative_queue(force_speculative_aging);
if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
lock_yield_check = FALSE;
continue;
}
}
force_speculative_aging = FALSE;
/*
* Check to see if we need to evict objects from the cache.
*
* Note: 'object' here doesn't have anything to do with
* the eviction part. We just need to make sure we have dropped
* any object lock we might be holding if we need to go down
* into the eviction logic.
*/
retval = vps_object_cache_evict(&object);
if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
lock_yield_check = FALSE;
continue;
}
/*
* Calculate our filecache_min that will affect the loop
* going forward.
*/
vps_calculate_filecache_min();
/*
* LD TODO: Use a structure to hold all state variables for a single
* vm_pageout_scan iteration and pass that structure to this function instead.
*/
retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
&delayed_unlock, &local_freeq, &local_freed,
&vm_pageout_deadlock_target, inactive_burst_count);
if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
if (loop_count >= vm_page_inactive_count) {
loop_count = 0;
}
inactive_burst_count = 0;
assert(object == NULL);
assert(delayed_unlock != 0);
lock_yield_check = FALSE;
continue;
} else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
goto return_from_scan;
}
flow_control.state = FCS_IDLE;
vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
vm_pageout_inactive_external_forced_reactivate_limit);
loop_count++;
inactive_burst_count++;
vm_pageout_state.vm_pageout_inactive++;
/*
* Choose a victim.
*/
m = NULL;
retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
if (m == NULL) {
if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
inactive_burst_count = 0;
if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
}
lock_yield_check = TRUE;
continue;
}
/*
* if we've gotten here, we have no victim page.
* check to see if we've not finished balancing the queues
* or we have a page on the aged speculative queue that we
* skipped due to force_anonymous == TRUE... or we have
* speculative pages that we can prematurely age... in any of
* these cases we'll keep going, else panic
*/
force_anonymous = FALSE;
VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
if (!vm_page_queue_empty(&sq->age_q)) {
lock_yield_check = TRUE;
continue;
}
if (vm_page_speculative_count) {
force_speculative_aging = TRUE;
lock_yield_check = TRUE;
continue;
}
panic("vm_pageout: no victim");
/* NOTREACHED */
}
assert(VM_PAGE_PAGEABLE(m));
m_object = VM_PAGE_OBJECT(m);
force_anonymous = FALSE;
page_prev_q_state = m->vmp_q_state;
/*
* we just found this page on one of our queues...
* it can't also be on the pageout queue, so safe
* to call vm_page_queues_remove
*/
bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
vm_page_queues_remove(m, TRUE);
if (donate) {
/*
* The compressor needs to see this bit to know
* where this page needs to land. Also if stolen,
* this bit helps put the page back in the right
* special queue where it belongs.
*/
m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
}
assert(!m->vmp_laundry);
assert(!m->vmp_private);
assert(!m->vmp_fictitious);
assert(!is_kernel_object(m_object));
assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
vm_pageout_vminfo.vm_pageout_considered_page++;
DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
/*
* check to see if we currently are working
* with the same object... if so, we've
* already got the lock
*/
if (m_object != object) {
boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
/*
* vps_switch_object() will always drop the 'object' lock first
* and then try to acquire the 'm_object' lock. So 'object' has to point to
* either 'm_object' or NULL.
*/
retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
lock_yield_check = TRUE;
continue;
}
}
assert(m_object == object);
assert(VM_PAGE_OBJECT(m) == m_object);
if (m->vmp_busy) {
/*
* Somebody is already playing with this page.
* Put it back on the appropriate queue
*
*/
VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
}
vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
lock_yield_check = TRUE;
continue;
}
/*
* if (m->vmp_cleaning && !m->vmp_free_when_done)
* If already cleaning this page in place
* just leave it off the paging queues.
* We can leave the page mapped, and upl_commit_range
* will put it on the clean queue.
*
* if (m->vmp_free_when_done && !m->vmp_cleaning)
* an msync INVALIDATE is in progress...
* this page has been marked for destruction
* after it has been cleaned,
* but not yet gathered into a UPL
* where 'cleaning' will be set...
* just leave it off the paging queues
*
* if (m->vmp_free_when_done && m->vmp_cleaning)
* an msync INVALIDATE is in progress
* and the UPL has already gathered this page...
* just leave it off the paging queues
*/
if (m->vmp_free_when_done || m->vmp_cleaning) {
lock_yield_check = TRUE;
continue;
}
/*
* If it's absent, in error or the object is no longer alive,
* we can reclaim the page... in the no longer alive case,
* there are 2 states the page can be in that preclude us
* from reclaiming it - busy or cleaning - that we've already
* dealt with
*/
if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
(!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
if (m->vmp_absent) {
VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
} else if (!object->alive ||
(!object->internal &&
object->pager == MEMORY_OBJECT_NULL)) {
VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
} else {
VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
}
reclaim_page:
if (vm_pageout_deadlock_target) {
VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
vm_pageout_deadlock_target--;
}
DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
if (object->internal) {
DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
} else {
DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
}
assert(!m->vmp_cleaning);
assert(!m->vmp_laundry);
if (!object->internal &&
object->pager != NULL &&
object->pager->mo_pager_ops == &shared_region_pager_ops) {
shared_region_pager_reclaimed++;
}
m->vmp_busy = TRUE;
/*
* remove page from object here since we're already
* behind the object lock... defer the rest of the work
* we'd normally do in vm_page_free_prepare_object
* until 'vm_page_free_list' is called
*/
if (m->vmp_tabled) {
vm_page_remove(m, TRUE);
}
assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
m->vmp_snext = local_freeq;
local_freeq = m;
local_freed++;
if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
vm_pageout_vminfo.vm_pageout_freed_speculative++;
} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
vm_pageout_vminfo.vm_pageout_freed_cleaned++;
} else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
vm_pageout_vminfo.vm_pageout_freed_internal++;
} else {
vm_pageout_vminfo.vm_pageout_freed_external++;
}
inactive_burst_count = 0;
lock_yield_check = TRUE;
continue;
}
if (object->vo_copy == VM_OBJECT_NULL) {
/*
* No one else can have any interest in this page.
* If this is an empty purgable object, the page can be
* reclaimed even if dirty.
* If the page belongs to a volatile purgable object, we
* reactivate it if the compressor isn't active.
*/
if (object->purgable == VM_PURGABLE_EMPTY) {
if (m->vmp_pmapped == TRUE) {
/* unmap the page */
refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
if (refmod_state & VM_MEM_MODIFIED) {
SET_PAGE_DIRTY(m, FALSE);
}
}
if (m->vmp_dirty || m->vmp_precious) {
/* we saved the cost of cleaning this page ! */
vm_page_purged_count++;
}
goto reclaim_page;
}
if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
/*
* With the VM compressor, the cost of
* reclaiming a page is much lower (no I/O),
* so if we find a "volatile" page, it's better
* to let it get compressed rather than letting
* it occupy a full page until it gets purged.
* So no need to check for "volatile" here.
*/
} else if (object->purgable == VM_PURGABLE_VOLATILE) {
/*
* Avoid cleaning a "volatile" page which might
* be purged soon.
*/
/* if it's wired, we can't put it on our queue */
assert(!VM_PAGE_WIRED(m));
/* just stick it back on! */
reactivated_this_call++;
if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
}
goto reactivate_page;
}
} /* vo_copy NULL */
/*
* If it's being used, reactivate.
* (Fictitious pages are either busy or absent.)
* First, update the reference and dirty bits
* to make sure the page is unreferenced.
*/
refmod_state = -1;
if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
if (refmod_state & VM_MEM_REFERENCED) {
m->vmp_reference = TRUE;
}
if (refmod_state & VM_MEM_MODIFIED) {
SET_PAGE_DIRTY(m, FALSE);
}
}
if (m->vmp_reference || m->vmp_dirty) {
/* deal with a rogue "reusable" page */
VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
}
if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
vm_pageout_state.vm_page_xpmapped_min = 0;
} else {
vm_pageout_state.vm_page_xpmapped_min = (vm_page_pageable_external_count * 10) /
vm_pageout_state.vm_page_xpmapped_min_divisor;
}
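/*
 * Like vm_page_filecache_min above, this floor is expressed as
 * (10 / divisor) of the file-backed page count; a divisor of 0 disables it.
 * It is used just below: while the number of xpmapped file-backed pages is
 * under the floor, such pages are treated like referenced pages and
 * reactivated rather than stolen.
 */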
if (!m->vmp_no_cache &&
page_from_bg_q == FALSE &&
(m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
(vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
/*
* The page we pulled off the inactive list has
* been referenced. It is possible for other
* processors to be touching pages faster than we
* can clear the referenced bit and traverse the
* inactive queue, so we limit the number of
* reactivations.
*/
if (++reactivated_this_call >= reactivate_limit &&
!object->object_is_shared_cache &&
!((m->vmp_realtime ||
object->for_realtime) &&
vm_pageout_protect_realtime)) {
vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
} else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
if (object->object_is_shared_cache) {
vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
} else if (m->vmp_realtime ||
object->for_realtime) {
vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
}
} else {
uint32_t isinuse;
if (reactivated_this_call >= reactivate_limit) {
if (object->object_is_shared_cache) {
vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
} else if ((m->vmp_realtime ||
object->for_realtime) &&
vm_pageout_protect_realtime) {
vm_pageout_vminfo.vm_pageout_protected_realtime++;
}
}
if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
}
vm_pageout_vminfo.vm_pageout_inactive_referenced++;
reactivate_page:
if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
/*
* no explicit mappings of this object exist
* and it's not open via the filesystem
*/
vm_page_deactivate(m);
VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
} else {
/*
* The page was/is being used, so put back on active list.
*/
vm_page_activate(m);
counter_inc(&vm_statistics_reactivations);
inactive_burst_count = 0;
}
#if DEVELOPMENT || DEBUG
if (page_from_bg_q == TRUE) {
if (m_object->internal) {
vm_pageout_rejected_bq_internal++;
} else {
vm_pageout_rejected_bq_external++;
}
}
#endif /* DEVELOPMENT || DEBUG */
if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
}
vm_pageout_state.vm_pageout_inactive_used++;
lock_yield_check = TRUE;
continue;
}
/*
* Make sure we call pmap_get_refmod() if it
* wasn't already called just above, to update
* the dirty bit.
*/
if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
if (refmod_state & VM_MEM_MODIFIED) {
SET_PAGE_DIRTY(m, FALSE);
}
}
}
/*
* we've got a candidate page to steal...
*
* m->vmp_dirty is up to date courtesy of the
* preceding check for m->vmp_reference... if
* we get here, then m->vmp_reference had to be
* FALSE (or possibly "reactivate_limit" was
* exceeded), but in either case we called
* pmap_get_refmod() and updated both
* m->vmp_reference and m->vmp_dirty
*
* if it's dirty or precious we need to
* see if the target queue is throttled
* if it is, we need to skip over it by moving it back
* to the end of the inactive queue
*/
inactive_throttled = FALSE;
if (m->vmp_dirty || m->vmp_precious) {
if (object->internal) {
if (VM_PAGE_Q_THROTTLED(iq)) {
inactive_throttled = TRUE;
}
} else if (VM_PAGE_Q_THROTTLED(eq)) {
inactive_throttled = TRUE;
}
}
throttle_inactive:
if (!VM_DYNAMIC_PAGING_ENABLED() &&
object->internal && m->vmp_dirty &&
(object->purgable == VM_PURGABLE_DENY ||
object->purgable == VM_PURGABLE_NONVOLATILE ||
object->purgable == VM_PURGABLE_VOLATILE)) {
vm_page_check_pageable_safe(m);
assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
vm_page_throttled_count++;
VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
inactive_burst_count = 0;
lock_yield_check = TRUE;
continue;
}
if (inactive_throttled == TRUE) {
vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
&force_anonymous, page_from_bg_q);
inactive_burst_count = 0;
if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
}
lock_yield_check = TRUE;
continue;
}
/*
* we've got a page that we can steal...
* eliminate all mappings and make sure
* we have the up-to-date modified state
*
* if we need to do a pmap_disconnect then we
* need to re-evaluate m->vmp_dirty since the pmap_disconnect
* provides the true state atomically... the
* page was still mapped up to the pmap_disconnect
* and may have been dirtied at the last microsecond
*
* Note that if 'pmapped' is FALSE then the page is not,
* and has not been, in any map, so there is no point calling
* pmap_disconnect(). m->vmp_dirty could have been set in anticipation
* of likely usage of the page.
*/
if (m->vmp_pmapped == TRUE) {
int pmap_options;
/*
* Don't count this page as going into the compressor
* if any of these are true:
* 1) compressed pager isn't enabled
* 2) Freezer enabled device with compressed pager
* backend (exclusive use) i.e. most of the VM system
* (including vm_pageout_scan) has no knowledge of
* the compressor
* 3) This page belongs to a file and hence will not be
* sent into the compressor
*/
if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
object->internal == FALSE) {
pmap_options = 0;
} else if (m->vmp_dirty || m->vmp_precious) {
/*
* VM knows that this page is dirty (or
* precious) and needs to be compressed
* rather than freed.
* Tell the pmap layer to count this page
* as "compressed".
*/
pmap_options = PMAP_OPTIONS_COMPRESSOR;
} else {
/*
* VM does not know if the page needs to
* be preserved but the pmap layer might tell
* us if any mapping has "modified" it.
* Let the pmap layer count this page
* as compressed if and only if it has been
* modified.
*/
pmap_options =
PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
}
refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
pmap_options,
NULL);
if (refmod_state & VM_MEM_MODIFIED) {
SET_PAGE_DIRTY(m, FALSE);
}
}
/*
* reset our count of pages that have been reclaimed
* since the last page was 'stolen'
*/
inactive_reclaim_run = 0;
/*
* If it's clean and not precious, we can free the page.
*/
if (!m->vmp_dirty && !m->vmp_precious) {
vm_pageout_state.vm_pageout_inactive_clean++;
/*
* OK, at this point we have found a page we are going to free.
*/
#if CONFIG_PHANTOM_CACHE
if (!object->internal) {
vm_phantom_cache_add_ghost(m);
}
#endif
goto reclaim_page;
}
/*
* The page may have been dirtied since the last check
* for a throttled target queue (which may have been skipped
* if the page was clean then). With the dirty page
* disconnected here, we can make one final check.
*/
if (object->internal) {
if (VM_PAGE_Q_THROTTLED(iq)) {
inactive_throttled = TRUE;
}
} else if (VM_PAGE_Q_THROTTLED(eq)) {
inactive_throttled = TRUE;
}
if (inactive_throttled == TRUE) {
goto throttle_inactive;
}
#if VM_PRESSURE_EVENTS
#if CONFIG_JETSAM
/*
* If Jetsam is enabled, then the sending
* of memory pressure notifications is handled
* from the same thread that takes care of high-water
* and other jetsams i.e. the memorystatus_thread.
*/
#else /* CONFIG_JETSAM */
vm_pressure_response();
#endif /* CONFIG_JETSAM */
#endif /* VM_PRESSURE_EVENTS */
if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
}
if (object->internal) {
vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
} else {
vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
}
/*
* internal pages will go to the compressor...
* external pages will go to the appropriate pager to be cleaned
* and upon completion will end up on 'vm_page_queue_cleaned' which
* is a preferred queue to steal from
*/
vm_pageout_cluster(m);
inactive_burst_count = 0;
/*
* back to top of pageout scan loop
*/
}
}
void
vm_page_free_reserve(
int pages)
{
int free_after_reserve;
if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
} else {
vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
}
} else {
if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
} else {
vm_page_free_reserved += pages;
}
}
free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
vm_page_free_min = vm_page_free_reserved +
VM_PAGE_FREE_MIN(free_after_reserve);
if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
}
vm_page_free_target = vm_page_free_reserved +
VM_PAGE_FREE_TARGET(free_after_reserve);
if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
}
if (vm_page_free_target < vm_page_free_min + 5) {
vm_page_free_target = vm_page_free_min + 5;
}
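/*
 * The throttle limit works out to roughly half of vm_page_free_target.
 */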
vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
}
/*
* vm_pageout is the high level pageout daemon.
*/
void
vm_pageout_continue(void)
{
DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
vm_free_page_lock();
vm_pageout_running = TRUE;
vm_free_page_unlock();
vm_pageout_scan();
/*
* we hold both the vm_page_queue_free_lock
* and the vm_page_queues_lock at this point
*/
assert(vm_page_free_wanted == 0);
assert(vm_page_free_wanted_privileged == 0);
assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
vm_pageout_running = FALSE;
#if XNU_TARGET_OS_OSX
if (vm_pageout_waiter) {
vm_pageout_waiter = FALSE;
thread_wakeup((event_t)&vm_pageout_waiter);
}
#endif /* XNU_TARGET_OS_OSX */
vm_free_page_unlock();
vm_page_unlock_queues();
thread_block((thread_continue_t)vm_pageout_continue);
/*NOTREACHED*/
}
#if XNU_TARGET_OS_OSX
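/*
 * vm_pageout_wait:
 *
 * Block the caller until the current pageout pass completes
 * (vm_pageout_running goes FALSE) or the given deadline expires,
 * in which case KERN_OPERATION_TIMED_OUT is returned.
 */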
kern_return_t
vm_pageout_wait(uint64_t deadline)
{
kern_return_t kr;
vm_free_page_lock();
for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
vm_pageout_waiter = TRUE;
if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
&vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
(event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
kr = KERN_OPERATION_TIMED_OUT;
}
}
vm_free_page_unlock();
return kr;
}
#endif /* XNU_TARGET_OS_OSX */
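/*
 * vm_pageout_iothread_external_continue:
 *
 * Main loop of the external pageout I/O thread: pull pages off the
 * external pageout queue, re-lookup each one under its object lock,
 * and hand it to the object's pager via memory_object_data_return().
 * Sleeps on its sched_cond when the queue is empty.
 */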
OS_NORETURN
static void
vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
{
vm_page_t m = NULL;
vm_object_t object;
vm_object_offset_t offset;
memory_object_t pager;
struct vm_pageout_queue *q = ethr->q;
/* On systems with a compressor, the external IO thread clears its
* VM privileged bit to accommodate large allocations (e.g. bulk UPL
* creation)
*/
if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
current_thread()->options &= ~TH_OPT_VMPRIV;
}
sched_cond_ack(&(ethr->pgo_wakeup));
while (true) {
vm_page_lockspin_queues();
while (!vm_page_queue_empty(&q->pgo_pending)) {
q->pgo_busy = TRUE;
vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
VM_PAGE_CHECK(m);
/*
* grab a snapshot of the object and offset this
* page is tabled in so that we can relookup this
* page after we've taken the object lock - these
* fields are stable while we hold the page queues lock
* but as soon as we drop it, there is nothing to keep
* this page in this object... we hold an activity_in_progress
* on this object which will keep it from terminating
*/
object = VM_PAGE_OBJECT(m);
offset = m->vmp_offset;
m->vmp_q_state = VM_PAGE_NOT_ON_Q;
VM_PAGE_ZERO_PAGEQ_ENTRY(m);
vm_page_unlock_queues();
vm_object_lock(object);
m = vm_page_lookup(object, offset);
if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
!m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
/*
* it's either the same page that someone else has
* started cleaning (or it's finished cleaning or
* been put back on the pageout queue), or
* the page has been freed or we have found a
* new page at this offset... in all of these cases
* we merely need to release the activity_in_progress
* we took when we put the page on the pageout queue
*/
vm_object_activity_end(object);
vm_object_unlock(object);
vm_page_lockspin_queues();
continue;
}
pager = object->pager;
if (pager == MEMORY_OBJECT_NULL) {
/*
* This pager has been destroyed by either
* memory_object_destroy or vm_object_destroy, and
* so there is nowhere for the page to go.
*/
if (m->vmp_free_when_done) {
/*
* Just free the page... VM_PAGE_FREE takes
* care of cleaning up all the state...
* including doing the vm_pageout_throttle_up
*/
VM_PAGE_FREE(m);
} else {
vm_page_lockspin_queues();
vm_pageout_throttle_up(m);
vm_page_activate(m);
vm_page_unlock_queues();
/*
* And we are done with it.
*/
}
vm_object_activity_end(object);
vm_object_unlock(object);
vm_page_lockspin_queues();
continue;
}
#if 0
/*
* we don't hold the page queue lock
* so this check isn't safe to make
*/
VM_PAGE_CHECK(m);
#endif
/*
 * give back the activity_in_progress reference we
 * took when we queued up this page and replace it
 * with a paging_in_progress reference that will
 * keep the paging offset from changing and
 * prevent the object from terminating
 */
vm_object_activity_end(object);
vm_object_paging_begin(object);
vm_object_unlock(object);
/*
* Send the data to the pager.
* any pageout clustering happens there
*/
memory_object_data_return(pager,
m->vmp_offset + object->paging_offset,
PAGE_SIZE,
NULL,
NULL,
FALSE,
FALSE,
0);
vm_object_lock(object);
vm_object_paging_end(object);
vm_object_unlock(object);
vm_pageout_io_throttle();
vm_page_lockspin_queues();
}
q->pgo_busy = FALSE;
vm_page_unlock_queues();
sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
}
/*NOTREACHED*/
}
uint32_t vm_compressor_time_thread; /* Set via sysctl 'vm.compressor_timing_enabled' to record time accrued by this thread. */
#if DEVELOPMENT || DEBUG
static void
vm_pageout_record_thread_time(int cqid, int ncomps)
{
if (__improbable(vm_compressor_time_thread)) {
vmct_stats.vmct_runtimes[cqid] = thread_get_runtime_self();
vmct_stats.vmct_pages[cqid] += ncomps;
vmct_stats.vmct_iterations[cqid]++;
if (ncomps > vmct_stats.vmct_maxpages[cqid]) {
vmct_stats.vmct_maxpages[cqid] = ncomps;
}
if (ncomps < vmct_stats.vmct_minpages[cqid]) {
vmct_stats.vmct_minpages[cqid] = ncomps;
}
}
}
#endif
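/*
 * Pick the compressor chead this page should be filled into:
 * pages tagged VM_PAGE_SPECIAL_Q_DONATE go to the donate (early or late
 * swapout) chead, everything else goes to the regular swapout chead.
 */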
static void *
vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m)
{
/*
* Technically we need the pageq locks to manipulate the vmp_on_specialq field.
* However, this page has been removed from all queues and is only
* known to this compressor thread dealing with this local queue.
*
* TODO: Add a second localq that is the early localq and
* put special pages like this one on that queue in the block above
* under the pageq lock to avoid this 'works but not clean' logic.
*/
void *donate_queue_head;
#if XNU_TARGET_OS_OSX /* tag:DONATE */
donate_queue_head = &cq->current_early_swapout_chead;
#else /* XNU_TARGET_OS_OSX */
donate_queue_head = &cq->current_late_swapout_chead;
#endif /* XNU_TARGET_OS_OSX */
if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
return donate_queue_head;
} else {
return &cq->current_regular_swapout_chead;
}
}
#define MAX_FREE_BATCH 32
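/*
 * vm_pageout_iothread_internal_continue:
 *
 * Main loop of a compressor thread: drain up to local_batch_size pages
 * from the internal pageout queue into a local list, compress each page
 * via vm_pageout_compress_page(), and free the compressed-out pages in
 * batches of MAX_FREE_BATCH. Sleeps on its sched_cond when the queue
 * is empty.
 */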
OS_NORETURN
static void
vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
{
struct vm_pageout_queue *q;
vm_page_t m = NULL;
boolean_t pgo_draining;
vm_page_t local_q;
int local_cnt;
vm_page_t local_freeq = NULL;
int local_freed = 0;
int local_batch_size;
#if DEVELOPMENT || DEBUG
int ncomps = 0;
boolean_t marked_active = FALSE;
int num_pages_processed = 0;
#endif
void *chead = NULL;
KDBG_FILTERED(0xe040000c | DBG_FUNC_END);
sched_cond_ack(&(cq->pgo_wakeup));
q = cq->q;
while (true) { /* this top loop is for the compressor_running_perf_test running at full speed without blocking */
#if DEVELOPMENT || DEBUG
bool benchmark_accounting = false;
/* If we're running the compressor perf test, only process the benchmark pages.
* We'll get back to our regular queue once the benchmark is done */
if (compressor_running_perf_test) {
q = cq->benchmark_q;
if (!vm_page_queue_empty(&q->pgo_pending)) {
benchmark_accounting = true;
} else {
q = cq->q;
benchmark_accounting = false;
}
}
#endif /* DEVELOPMENT || DEBUG */
#if __AMP__
if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
local_batch_size = (q->pgo_maxlaundry >> 3);
local_batch_size = MAX(local_batch_size, 16);
} else {
local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
}
#else
local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
#endif
#if RECORD_THE_COMPRESSED_DATA
if (q->pgo_laundry) {
c_compressed_record_init();
}
#endif
while (true) { /* this loop is for working through all the pages in the pending queue */
int pages_left_on_q = 0;
local_cnt = 0;
local_q = NULL;
KDBG_FILTERED(0xe0400014 | DBG_FUNC_START);
vm_page_lock_queues();
#if DEVELOPMENT || DEBUG
if (marked_active == FALSE) {
vmct_active++;
vmct_state[cq->id] = VMCT_ACTIVE;
marked_active = TRUE;
if (vmct_active == 1) {
vm_compressor_epoch_start = mach_absolute_time();
}
}
#endif
KDBG_FILTERED(0xe0400014 | DBG_FUNC_END);
KDBG_FILTERED(0xe0400018 | DBG_FUNC_START, q->pgo_laundry);
/* empty the entire content of the thread input q to local_q, but not more than local_batch_size pages */
while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
VM_PAGE_CHECK(m);
m->vmp_q_state = VM_PAGE_NOT_ON_Q;
VM_PAGE_ZERO_PAGEQ_ENTRY(m);
m->vmp_laundry = FALSE;
m->vmp_snext = local_q;
local_q = m;
local_cnt++;
}
if (local_q == NULL) {
break;
}
q->pgo_busy = TRUE;
if ((pgo_draining = q->pgo_draining) == FALSE) {
vm_pageout_throttle_up_batch(q, local_cnt);
pages_left_on_q = q->pgo_laundry;
} else {
pages_left_on_q = q->pgo_laundry - local_cnt;
}
vm_page_unlock_queues();
#if !RECORD_THE_COMPRESSED_DATA
/* if we have lots to compress, wake up the next compressor thread to help.
 * disabled when recording compressed data, since the recording buffer is not protected by a mutex and this could race */
if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
// wake up the next compressor thread
sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
}
#endif
KDBG_FILTERED(0xe0400018 | DBG_FUNC_END, q->pgo_laundry);
while (local_q) {
KDBG_FILTERED(0xe0400024 | DBG_FUNC_START, local_cnt);
m = local_q;
local_q = m->vmp_snext;
m->vmp_snext = NULL;
chead = vm_pageout_select_filling_chead(cq, m);
if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
#if DEVELOPMENT || DEBUG
ncomps++;
#endif
KDBG_FILTERED(0xe0400024 | DBG_FUNC_END, local_cnt);
m->vmp_snext = local_freeq;
local_freeq = m;
local_freed++;
/* if we gathered enough free pages, free them now */
if (local_freed >= MAX_FREE_BATCH) {
OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
vm_page_free_list(local_freeq, TRUE);
local_freeq = NULL;
local_freed = 0;
}
}
#if DEVELOPMENT || DEBUG
num_pages_processed++;
#endif /* DEVELOPMENT || DEBUG */
#if !CONFIG_JETSAM /* Maybe: if there's no JETSAM, be more proactive in waking up anybody that needs free pages */
while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
kern_return_t wait_result;
int need_wakeup = 0;
if (local_freeq) {
OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
vm_page_free_list(local_freeq, TRUE);
local_freeq = NULL;
local_freed = 0;
continue;
}
vm_free_page_lock_spin();
if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
if (vm_page_free_wanted_privileged++ == 0) {
need_wakeup = 1;
}
wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
vm_free_page_unlock();
if (need_wakeup) {
thread_wakeup((event_t)&vm_page_free_wanted);
}
if (wait_result == THREAD_WAITING) {
thread_block(THREAD_CONTINUE_NULL);
}
} else {
vm_free_page_unlock();
}
}
#endif
} /* while (local_q) */
/* free any leftovers in the freeq */
if (local_freeq) {
OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
vm_page_free_list(local_freeq, TRUE);
local_freeq = NULL;
local_freed = 0;
}
if (pgo_draining == TRUE) {
vm_page_lockspin_queues();
vm_pageout_throttle_up_batch(q, local_cnt);
vm_page_unlock_queues();
}
}
KDBG_FILTERED(0xe040000c | DBG_FUNC_START);
/*
* queue lock is held and our q is empty
*/
q->pgo_busy = FALSE;
#if DEVELOPMENT || DEBUG
if (marked_active == TRUE) {
vmct_active--;
vmct_state[cq->id] = VMCT_IDLE;
if (vmct_active == 0) {
vm_compressor_epoch_stop = mach_absolute_time();
assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
"Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
vm_compressor_epoch_start, vm_compressor_epoch_stop);
/* This interval includes intervals where one or more
* compressor threads were pre-empted
*/
vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
}
}
if (compressor_running_perf_test && benchmark_accounting) {
/*
* We could turn ON compressor_running_perf_test while still processing
* regular non-benchmark pages. We shouldn't count them here else we
* could overshoot. We might also still be populating that benchmark Q
* and be under pressure. So we will go back to the regular queues. And
* benchmark accounting will be off for that case too.
*/
compressor_perf_test_pages_processed += num_pages_processed;
thread_wakeup(&compressor_perf_test_pages_processed);
}
#endif
vm_page_unlock_queues();
#if DEVELOPMENT || DEBUG
vm_pageout_record_thread_time(cq->id, ncomps);
#endif
KDBG_FILTERED(0xe0400018 | DBG_FUNC_END);
#if DEVELOPMENT || DEBUG
if (compressor_running_perf_test && benchmark_accounting) {
/*
* We've been exclusively compressing pages from the benchmark queue,
* do 1 pass over the internal queue before blocking.
*/
continue;
}
#endif
sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
}
/*NOTREACHED*/
}
/* Resolves the pager and maintains stats in the pager and in the vm_object. */
kern_return_t
vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
{
vm_object_t object;
memory_object_t pager;
int compressed_count_delta;
kern_return_t retval;
object = VM_PAGE_OBJECT(m);
assert(!m->vmp_free_when_done);
assert(!m->vmp_laundry);
pager = object->pager;
if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
KDBG_FILTERED(0xe0400010 | DBG_FUNC_START, object, pager);
vm_object_lock(object);
/*
* If there is no memory object for the page, create
* one and hand it to the compression pager.
*/
if (!object->pager_initialized) {
vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
}
if (!object->pager_initialized) {
vm_object_compressor_pager_create(object);
}
pager = object->pager;
if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
/*
* Still no pager for the object,
* or the pager has been destroyed.
* Reactivate the page.
*
* Should only happen if there is no
* compression pager
*/
vm_page_wakeup_done(object, m);
vm_page_lockspin_queues();
vm_page_activate(m);
VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
vm_page_unlock_queues();
/*
* And we are done with it.
*/
vm_object_activity_end(object);
vm_object_unlock(object);
return KERN_FAILURE;
}
vm_object_unlock(object);
KDBG_FILTERED(0xe0400010 | DBG_FUNC_END, object, pager);
}
assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
assert(object->activity_in_progress > 0);
#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
if (m->vmp_unmodified_ro == true) {
os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
}
#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
vm_compressor_options_t flags = 0;
#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
if (m->vmp_unmodified_ro) {
flags |= C_PAGE_UNMODIFIED;
}
#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
retval = vm_compressor_pager_put(
pager,
m->vmp_offset + object->paging_offset,
VM_PAGE_GET_PHYS_PAGE(m),
current_chead,
scratch_buf,
&compressed_count_delta,
flags);
vm_object_lock(object);
assert(object->activity_in_progress > 0);
assert(VM_PAGE_OBJECT(m) == object);
assert( !VM_PAGE_WIRED(m));
vm_compressor_pager_count(pager,
compressed_count_delta,
FALSE, /* shared_lock */
object);
if (retval == KERN_SUCCESS) {
/*
* If the object is purgeable, its owner's
* purgeable ledgers will be updated in
* vm_page_remove() but the page still
* contributes to the owner's memory footprint,
* so account for it as such.
*/
if (m->vmp_tabled) {
vm_page_remove(m, TRUE);
}
if ((object->purgable != VM_PURGABLE_DENY ||
object->vo_ledger_tag) &&
object->vo_owner != NULL) {
/* one more compressed purgeable/tagged page */
vm_object_owner_compressed_update(object,
compressed_count_delta);
}
counter_inc(&vm_statistics_compressions);
} else {
vm_page_wakeup_done(object, m);
vm_page_lockspin_queues();
vm_page_activate(m);
vm_pageout_vminfo.vm_compressor_failed++;
vm_page_unlock_queues();
}
vm_object_activity_end(object);
vm_object_unlock(object);
return retval;
}
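/*
 * Adjust the I/O throttle policy of the external pageout thread.
 * Called with the page queues lock held; it is dropped and retaken
 * around the thread-policy update. Low priority is never applied
 * while hibernate cleaning is in progress.
 */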
static void
vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
{
uint32_t policy;
if (hibernate_cleaning_in_progress == TRUE) {
req_lowpriority = FALSE;
}
if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
vm_page_unlock_queues();
if (req_lowpriority == TRUE) {
policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
DTRACE_VM(laundrythrottle);
} else {
policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
DTRACE_VM(laundryunthrottle);
}
proc_set_thread_policy(ethr->pgo_iothread,
TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
vm_page_lock_queues();
ethr->q->pgo_lowpriority = req_lowpriority;
}
}
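/*
 * One-time setup for the external pageout I/O thread: mark it VM
 * privileged, start it I/O throttled, initialize the external queue
 * state, then enter vm_pageout_iothread_external_continue().
 */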
OS_NORETURN
static void
vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
{
thread_t self = current_thread();
self->options |= TH_OPT_VMPRIV;
DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
vm_page_lock_queues();
vm_pageout_queue_external.pgo_lowpriority = TRUE;
vm_pageout_queue_external.pgo_inited = TRUE;
vm_page_unlock_queues();
#if CONFIG_THREAD_GROUPS
thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
vm_pageout_iothread_external_continue(ethr, 0);
/*NOTREACHED*/
}
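/*
 * One-time setup for a compressor thread: mark it VM privileged,
 * initialize the internal (and benchmark) queue state, apply any
 * CPU/cluster binding, then enter vm_pageout_iothread_internal_continue().
 */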
OS_NORETURN
static void
vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
{
thread_t self = current_thread();
self->options |= TH_OPT_VMPRIV;
vm_page_lock_queues();
vm_pageout_queue_internal.pgo_lowpriority = TRUE;
vm_pageout_queue_internal.pgo_inited = TRUE;
#if DEVELOPMENT || DEBUG
vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
vm_pageout_queue_benchmark.pgo_busy = FALSE;
#endif /* DEVELOPMENT || DEBUG */
vm_page_unlock_queues();
if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
thread_vm_bind_group_add();
}
#if CONFIG_THREAD_GROUPS
thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
#if __AMP__
if (vm_compressor_ebound) {
/*
* Use the soft bound option for vm_compressor to allow it to run on
* P-cores if E-cluster is unavailable.
*/
thread_bind_cluster_type(self, 'E', true);
}
#endif /* __AMP__ */
thread_set_thread_name(current_thread(), "VM_compressor");
#if DEVELOPMENT || DEBUG
vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
#endif
vm_pageout_iothread_internal_continue(cthr, 0);
/*NOTREACHED*/
}
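/*
 * Register the buffer-cache cleanup callout exactly once; returns
 * KERN_FAILURE if a callout has already been installed.
 */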
kern_return_t
vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
{
if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
return KERN_SUCCESS;
} else {
return KERN_FAILURE; /* Already set */
}
}
extern boolean_t memorystatus_manual_testing_on;
extern unsigned int memorystatus_level;
#if VM_PRESSURE_EVENTS
boolean_t vm_pressure_events_enabled = FALSE;
extern uint64_t next_warning_notification_sent_at_ts;
extern uint64_t next_critical_notification_sent_at_ts;
#define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
/*
* The last time there was a change in pressure level OR we forced a check
* because the system is stuck in a non-normal pressure level.
*/
uint64_t vm_pressure_last_level_transition_abs = 0;
/*
* This is how long the system waits 'stuck' in an unchanged non-normal pressure
* level before re-sending notifications for that level.
*/
int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
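/*
 * vm_pressure_response:
 *
 * Recompute memorystatus_level from the available-memory estimate,
 * decide whether the memory pressure level should change (or be
 * re-broadcast if we've been stuck at a non-normal level too long),
 * and wake the pressure thread / level-change waiters as needed.
 */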
void
vm_pressure_response(void)
{
vm_pressure_level_t old_level = kVMPressureNormal;
int new_level = -1;
unsigned int total_pages;
uint64_t available_memory = 0;
uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
bool force_check = false;
int time_in_mins;
if (vm_pressure_events_enabled == FALSE) {
return;
}
#if !XNU_TARGET_OS_OSX
available_memory = (uint64_t) memorystatus_available_pages;
#else /* !XNU_TARGET_OS_OSX */
available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
#endif /* !XNU_TARGET_OS_OSX */
total_pages = (unsigned int) atop_64(max_mem);
#if CONFIG_SECLUDED_MEMORY
total_pages -= vm_page_secluded_count;
#endif /* CONFIG_SECLUDED_MEMORY */
memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
if (memorystatus_manual_testing_on) {
return;
}
curr_ts = mach_absolute_time();
abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
old_level = memorystatus_vm_pressure_level;
switch (memorystatus_vm_pressure_level) {
case kVMPressureNormal:
{
if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
new_level = kVMPressureCritical;
} else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
new_level = kVMPressureWarning;
}
break;
}
case kVMPressureWarning:
case kVMPressureUrgent:
{
if (VM_PRESSURE_WARNING_TO_NORMAL()) {
new_level = kVMPressureNormal;
} else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
new_level = kVMPressureCritical;
} else if (force_check) {
new_level = kVMPressureWarning;
next_warning_notification_sent_at_ts = curr_ts;
}
break;
}
case kVMPressureCritical:
{
if (VM_PRESSURE_WARNING_TO_NORMAL()) {
new_level = kVMPressureNormal;
} else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
new_level = kVMPressureWarning;
} else if (force_check) {
new_level = kVMPressureCritical;
next_critical_notification_sent_at_ts = curr_ts;
}
break;
}
default:
return;
}
if (new_level != -1 || force_check) {
if (new_level != -1) {
memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
if (new_level != (int) old_level) {
VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
new_level, old_level, 0, 0);
}
} else {
VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, DBG_VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
new_level, old_level, force_check, 0);
}
if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
/*
* We don't want to schedule a wakeup while hibernation is in progress
* because that could collide with checks for non-monotonicity in the scheduler.
* We do however do all the updates to memorystatus_vm_pressure_level because
* we _might_ want to use that for decisions regarding which pages or how
* many pages we want to dump in hibernation.
*/
return;
}
if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
thread_wakeup(&vm_pressure_thread);
}
if (old_level != memorystatus_vm_pressure_level) {
thread_wakeup(&vm_pageout_state.vm_pressure_changed);
}
vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
}
}
}
#endif /* VM_PRESSURE_EVENTS */
/**
* Called by a kernel thread to ask if a number of pages may be wired.
*/
kern_return_t
mach_vm_wire_level_monitor(int64_t requested_pages)
{
if (requested_pages <= 0) {
return KERN_INVALID_ARGUMENT;
}
const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
/**
* Available pages can be negative in the case where more system memory is
* wired than the threshold, so we must use a signed integer.
*/
const int64_t available_pages = max_wire_pages - vm_page_wire_count;
if (requested_pages > available_pages) {
return KERN_RESOURCE_SHORTAGE;
}
return KERN_SUCCESS;
}
/*
* Function called by a kernel thread to either get the current pressure level or
* wait until memory pressure changes from a given level.
*/
kern_return_t
mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level)
{
#if !VM_PRESSURE_EVENTS
(void)wait_for_pressure;
(void)pressure_level;
return KERN_NOT_SUPPORTED;
#else /* VM_PRESSURE_EVENTS */
uint32_t *waiters = NULL;
wait_result_t wr = 0;
vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
if (pressure_level == NULL) {
return KERN_INVALID_ARGUMENT;
}
if (!wait_for_pressure && (*pressure_level == kVMPressureBackgroundJetsam ||
*pressure_level == kVMPressureForegroundJetsam)) {
return KERN_INVALID_ARGUMENT;
}
if (wait_for_pressure) {
switch (*pressure_level) {
case kVMPressureForegroundJetsam:
case kVMPressureBackgroundJetsam:
if (*pressure_level == kVMPressureForegroundJetsam) {
waiters = &memorystatus_jetsam_fg_band_waiters;
} else {
/* kVMPressureBackgroundJetsam */
waiters = &memorystatus_jetsam_bg_band_waiters;
}
lck_mtx_lock(&memorystatus_jetsam_broadcast_lock);
wr = assert_wait((event_t)waiters, THREAD_INTERRUPTIBLE);
if (wr == THREAD_WAITING) {
*waiters += 1;
lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
wr = thread_block(THREAD_CONTINUE_NULL);
} else {
lck_mtx_unlock(&memorystatus_jetsam_broadcast_lock);
}
if (wr != THREAD_AWAKENED) {
return KERN_ABORTED;
}
return KERN_SUCCESS;
case kVMPressureNormal:
case kVMPressureWarning:
case kVMPressureUrgent:
case kVMPressureCritical:
while (old_level == *pressure_level) {
wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
THREAD_INTERRUPTIBLE);
if (wr == THREAD_WAITING) {
wr = thread_block(THREAD_CONTINUE_NULL);
}
if (wr == THREAD_INTERRUPTED) {
return KERN_ABORTED;
}
if (wr == THREAD_AWAKENED) {
old_level = memorystatus_vm_pressure_level;
}
}
break;
default:
return KERN_INVALID_ARGUMENT;
}
}
*pressure_level = old_level;
return KERN_SUCCESS;
#endif /* VM_PRESSURE_EVENTS */
}
#if VM_PRESSURE_EVENTS
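/*
 * vm_pressure_thread:
 *
 * Continuation-style thread. The first invocation only performs setup;
 * each subsequent wakeup runs consider_vm_pressure_events() and blocks
 * again on &vm_pressure_thread.
 */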
void
vm_pressure_thread(void)
{
static boolean_t thread_initialized = FALSE;
if (thread_initialized == TRUE) {
vm_pageout_state.vm_pressure_thread_running = TRUE;
consider_vm_pressure_events();
vm_pageout_state.vm_pressure_thread_running = FALSE;
}
#if CONFIG_THREAD_GROUPS
thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
thread_set_thread_name(current_thread(), "VM_pressure");
thread_initialized = TRUE;
assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
thread_block((thread_continue_t)vm_pressure_thread);
}
#endif /* VM_PRESSURE_EVENTS */
/*
* called once per-second via "compute_averages"
*/
void
compute_pageout_gc_throttle(__unused void *arg)
{
if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
thread_wakeup(VM_PAGEOUT_GC_EVENT);
}
}
/*
* vm_pageout_garbage_collect can also be called when the zone allocator needs
* to call zone_gc on a different thread in order to trigger zone-map-exhaustion
* jetsams. We need to check if the zone map size is above its jetsam limit to
* decide if this was indeed the case.
*
* We need to do this on a different thread because of the following reasons:
*
* 1. In the case of synchronous jetsams, the leaking process can try to jetsam
* itself causing the system to hang. We perform synchronous jetsams if we're
* leaking in the VM map entries zone, so the leaking process could be doing a
* zalloc for a VM map entry while holding its vm_map lock, when it decides to
* jetsam itself. We also need the vm_map lock on the process termination path,
* which would now lead the dying process to deadlock against itself.
*
* 2. The jetsam path might need to allocate zone memory itself. We could try
* using the non-blocking variant of zalloc for this path, but we can still
* end up trying to do a kmem_alloc when the zone maps are almost full.
*/
__dead2
void
vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
{
assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
if (step == VM_PAGEOUT_GC_INIT) {
/* first time being called is not about GC */
#if CONFIG_THREAD_GROUPS
thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
} else if (zone_map_nearing_exhaustion()) {
/*
* Woken up by the zone allocator for zone-map-exhaustion jetsams.
*
* Bail out after calling zone_gc (which triggers the
* zone-map-exhaustion jetsams). If we fall through, the subsequent
* operations that clear out a bunch of caches might allocate zone
* memory themselves (e.g. vm_map operations would need VM map
* entries). Since the zone map is almost full at this point, we
* could end up with a panic. We just need to quickly jetsam a
* process and exit here.
*
* It could so happen that we were woken up to relieve memory
* pressure and the zone map also happened to be near its limit at
* the time, in which case we'll skip out early. But that should be
* ok; if memory pressure persists, the thread will simply be woken
* up again.
*/
zone_gc(ZONE_GC_JETSAM);
} else {
/* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
boolean_t buf_large_zfree = FALSE;
boolean_t first_try = TRUE;
stack_collect();
consider_machine_collect();
#if CONFIG_MBUF_MCACHE
mbuf_drain(FALSE);
#endif /* CONFIG_MBUF_MCACHE */
do {
if (consider_buffer_cache_collect != NULL) {
buf_large_zfree = (*consider_buffer_cache_collect)(0);
}
if (first_try == TRUE || buf_large_zfree == TRUE) {
/*
* zone_gc should be last, because the other operations
* might return memory to zones.
*/
zone_gc(ZONE_GC_TRIM);
}
first_try = FALSE;
} while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
consider_machine_adjust();
}
assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
__builtin_unreachable();
}
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */
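/*
 * Decide whether the major VM threads should be restricted to a single
 * processor, either from the "vm_restricted_to_single_processor"
 * boot-arg or, by default, on systems with 3 or fewer CPUs.
 */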
void
vm_set_restrictions(unsigned int num_cpus)
{
int vm_restricted_to_single_processor = 0;
if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
} else {
assert(num_cpus > 0);
if (num_cpus <= 3) {
/*
 * On systems with a limited number of CPUs, bind the
 * 4 major threads that can free memory and that tend to use
 * a fair bit of CPU under pressured conditions to a single processor.
 * This ensures that these threads don't hog all of the available CPUs
 * (important for camera launch), while allowing them to run independently
 * with respect to locks... the 4 threads are
 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
 * vm_compressor_swap_trigger_thread (minor and major compactions), and
 * memorystatus_thread (jetsams).
 *
 * The first time each thread runs, it is responsible for checking the
 * state of vm_restricted_to_single_processor, and if TRUE it calls
 * thread_bind_master... someday this should be replaced with a group
 * scheduling mechanism and KPI.
 */
vm_pageout_state.vm_restricted_to_single_processor = TRUE;
} else {
vm_pageout_state.vm_restricted_to_single_processor = FALSE;
}
}
}
/*
* Set up vm_config based on the vm_compressor_mode.
* Must run BEFORE the pageout thread starts up.
*/
__startup_func
void
vm_config_init(void)
{
bzero(&vm_config, sizeof(vm_config));
switch (vm_compressor_mode) {
case VM_PAGER_DEFAULT:
printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
OS_FALLTHROUGH;
case VM_PAGER_COMPRESSOR_WITH_SWAP:
vm_config.compressor_is_present = TRUE;
vm_config.swap_is_present = TRUE;
vm_config.compressor_is_active = TRUE;
vm_config.swap_is_active = TRUE;
break;
case VM_PAGER_COMPRESSOR_NO_SWAP:
vm_config.compressor_is_present = TRUE;
vm_config.swap_is_present = TRUE;
vm_config.compressor_is_active = TRUE;
break;
case VM_PAGER_FREEZER_DEFAULT:
printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
OS_FALLTHROUGH;
case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
vm_config.compressor_is_present = TRUE;
vm_config.swap_is_present = TRUE;
break;
case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
vm_config.compressor_is_present = TRUE;
vm_config.swap_is_present = TRUE;
vm_config.compressor_is_active = TRUE;
vm_config.freezer_swap_is_active = TRUE;
break;
case VM_PAGER_NOT_CONFIGURED:
break;
default:
printf("unknown compressor mode - %x\n", vm_compressor_mode);
break;
}
}
__startup_func
static void
vm_pageout_create_gc_thread(void)
{
thread_t thread;
if (kernel_thread_create(vm_pageout_garbage_collect,
VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
panic("vm_pageout_garbage_collect: create failed");
}
thread_set_thread_name(thread, "VM_pageout_garbage_collect");
if (thread->reserved_stack == 0) {
assert(thread->kernel_stack);
thread->reserved_stack = thread->kernel_stack;
}
/* thread is started in vm_pageout() */
vm_pageout_gc_thread = thread;
}
STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
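/*
 * vm_pageout:
 *
 * Entry point of the pageout daemon thread: set up scheduling priority
 * and VM privileges, initialize the pageout state and queues, start the
 * external I/O, garbage-collect and pressure threads, then enter
 * vm_pageout_continue(), which never returns.
 */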
void
vm_pageout(void)
{
thread_t self = current_thread();
thread_t thread;
kern_return_t result;
spl_t s;
/*
* Set thread privileges.
*/
s = splsched();
#if CONFIG_VPS_DYNAMIC_PRIO
if (vps_dynamic_priority_enabled) {
sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
thread_set_eager_preempt(self);
} else {
sched_set_kernel_thread_priority(self, BASEPRI_VM);
}
#else /* CONFIG_VPS_DYNAMIC_PRIO */
sched_set_kernel_thread_priority(self, BASEPRI_VM);
#endif /* CONFIG_VPS_DYNAMIC_PRIO */
thread_lock(self);
self->options |= TH_OPT_VMPRIV;
thread_unlock(self);
if (!self->reserved_stack) {
self->reserved_stack = self->kernel_stack;
}
if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
!vps_dynamic_priority_enabled) {
thread_vm_bind_group_add();
}
#if CONFIG_THREAD_GROUPS
thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */
#if __AMP__
PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
if (vm_pgo_pbound) {
/*
* Use the soft bound option for vm pageout to allow it to run on
* E-cores if P-cluster is unavailable.
*/
thread_bind_cluster_type(self, 'P', true);
}
#endif /* __AMP__ */
PE_parse_boot_argn("vmpgo_protect_realtime",
&vm_pageout_protect_realtime,
sizeof(vm_pageout_protect_realtime));
splx(s);
thread_set_thread_name(current_thread(), "VM_pageout_scan");
/*
* Initialize some paging parameters.
*/
vm_pageout_state.vm_pressure_thread_running = FALSE;
vm_pageout_state.vm_pressure_changed = FALSE;
vm_pageout_state.memorystatus_purge_on_warning = 2;
vm_pageout_state.memorystatus_purge_on_urgent = 5;
vm_pageout_state.memorystatus_purge_on_critical = 8;
vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
vm_pageout_state.vm_page_speculative_percentage = 5;
vm_pageout_state.vm_page_speculative_target = 0;
vm_pageout_state.vm_pageout_swap_wait = 0;
vm_pageout_state.vm_pageout_idle_wait = 0;
vm_pageout_state.vm_pageout_empty_wait = 0;
vm_pageout_state.vm_pageout_burst_wait = 0;
vm_pageout_state.vm_pageout_deadlock_wait = 0;
vm_pageout_state.vm_pageout_deadlock_relief = 0;
vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
vm_pageout_state.vm_pageout_inactive = 0;
vm_pageout_state.vm_pageout_inactive_used = 0;
vm_pageout_state.vm_pageout_inactive_clean = 0;
vm_pageout_state.vm_memory_pressure = 0;
vm_pageout_state.vm_page_filecache_min = 0;
#if CONFIG_JETSAM
vm_pageout_state.vm_page_filecache_min_divisor = 70;
vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
#else
vm_pageout_state.vm_page_filecache_min_divisor = 27;
vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
#endif
vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
vm_pageout_state.vm_pageout_considered_page_last = 0;
if (vm_pageout_state.vm_pageout_swap_wait == 0) {
vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
}
if (vm_pageout_state.vm_pageout_idle_wait == 0) {
vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
}
if (vm_pageout_state.vm_pageout_burst_wait == 0) {
vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
}
if (vm_pageout_state.vm_pageout_empty_wait == 0) {
vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
}
if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
}
if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
}
if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
}
/*
 * even if we've already called vm_page_free_reserve,
 * call it again here to ensure that the targets are
 * accurately calculated (it uses vm_page_free_count_init).
 * Calling it with an arg of 0 will not change the reserve
 * but will re-calculate free_min and free_target.
 */
if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
} else {
vm_page_free_reserve(0);
}
bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
#if DEVELOPMENT || DEBUG
bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
#endif /* DEVELOPMENT || DEBUG */
/* internal pageout thread started when default pager registered first time */
/* external pageout and garbage collection threads started here */
struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
ethr->id = 0;
ethr->q = &vm_pageout_queue_external;
/* in external_state these cheads are never used; they are used only in internal_state for the compressor */
ethr->current_early_swapout_chead = NULL;
ethr->current_regular_swapout_chead = NULL;
ethr->current_late_swapout_chead = NULL;
ethr->scratch_buf = NULL;
#if DEVELOPMENT || DEBUG
ethr->benchmark_q = NULL;
#endif /* DEVELOPMENT || DEBUG */
sched_cond_init(&(ethr->pgo_wakeup));
result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
(void *)ethr, BASEPRI_VM,
&(ethr->pgo_iothread));
if (result != KERN_SUCCESS) {
panic("vm_pageout: Unable to create external thread (%d)\n", result);
}
thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
thread_mtx_lock(vm_pageout_gc_thread );
thread_start(vm_pageout_gc_thread );
thread_mtx_unlock(vm_pageout_gc_thread);
#if VM_PRESSURE_EVENTS
result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
BASEPRI_DEFAULT,
&thread);
if (result != KERN_SUCCESS) {
panic("vm_pressure_thread: create failed");
}
thread_deallocate(thread);
#endif
vm_object_reaper_init();
if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
vm_compressor_init();
}
#if VM_PRESSURE_EVENTS
vm_pressure_events_enabled = TRUE;
#endif /* VM_PRESSURE_EVENTS */
#if CONFIG_PHANTOM_CACHE
vm_phantom_cache_init();
#endif
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
(uint64_t) vm_page_fake_buckets_start,
(uint64_t) vm_page_fake_buckets_end);
pmap_protect(kernel_pmap,
vm_page_fake_buckets_start,
vm_page_fake_buckets_end,
VM_PROT_READ);
// *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */
#if VM_OBJECT_TRACKING
vm_object_tracking_init();
#endif /* VM_OBJECT_TRACKING */
#if __arm64__
// vm_tests();
#endif /* __arm64__ */
vm_pageout_continue();
/*
* Unreached code!
*
* The vm_pageout_continue() call above never returns, so the code below is never
* executed. We take advantage of this to declare several DTrace VM related probe
* points that our kernel doesn't have an analog for. These are probe points that
* exist in Solaris and are in the DTrace documentation, so people may have written
* scripts that use them. Declaring the probe points here means their scripts will
* compile and execute which we want for portability of the scripts, but since this
* section of code is never reached, the probe points will simply never fire. Yes,
* this is basically a hack. The problem is the DTrace probe points were chosen with
* Solaris specific VM events in mind, not portability to different VM implementations.
*/
DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
/*NOTREACHED*/
}
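/*
 * vm_pageout_internal_start:
 *
 * Size and launch the compressor (internal pageout) threads: pick a
 * thread count based on the CPU count and boot-args, set the internal
 * queue's maximum laundry, allocate one scratch buffer per thread and
 * start each vm_pageout_iothread_internal instance.
 */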
kern_return_t
vm_pageout_internal_start(void)
{
kern_return_t result = KERN_SUCCESS;
host_basic_info_data_t hinfo;
vm_offset_t buf, bufsize;
assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
#define BSD_HOST 1
host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
assert(hinfo.max_cpus > 0);
#if !XNU_TARGET_OS_OSX
vm_pageout_state.vm_compressor_thread_count = 1;
#else /* !XNU_TARGET_OS_OSX */
if (hinfo.max_cpus > 4) {
vm_pageout_state.vm_compressor_thread_count = 2;
} else {
vm_pageout_state.vm_compressor_thread_count = 1;
}
#endif /* !XNU_TARGET_OS_OSX */
#if __AMP__
if (vm_compressor_ebound) {
vm_pageout_state.vm_compressor_thread_count = 2;
}
#endif
PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
sizeof(vm_pageout_state.vm_compressor_thread_count));
/* did we get from the bootargs an unreasonable number? */
if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
}
if (vm_pageout_state.vm_compressor_thread_count <= 0) {
vm_pageout_state.vm_compressor_thread_count = 1;
} else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
}
vm_pageout_queue_internal.pgo_maxlaundry =
(vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
PE_parse_boot_argn("vmpgoi_maxlaundry",
&vm_pageout_queue_internal.pgo_maxlaundry,
sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
#if DEVELOPMENT || DEBUG
// Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
#endif /* DEVELOPMENT || DEBUG */
bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
kmem_alloc(kernel_map, &buf,
bufsize * vm_pageout_state.vm_compressor_thread_count,
KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
VM_KERN_MEMORY_COMPRESSOR);
for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
iq->id = i;
iq->q = &vm_pageout_queue_internal;
iq->current_early_swapout_chead = NULL;
iq->current_regular_swapout_chead = NULL;
iq->current_late_swapout_chead = NULL;
iq->scratch_buf = (char *)(buf + i * bufsize);
#if DEVELOPMENT || DEBUG
iq->benchmark_q = &vm_pageout_queue_benchmark;
#endif /* DEVELOPMENT || DEBUG */
sched_cond_init(&(iq->pgo_wakeup));
result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
(void *)iq, BASEPRI_VM,
&(iq->pgo_iothread));
if (result != KERN_SUCCESS) {
panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
}
}
return result;
}
#if CONFIG_IOSCHED
/*
* To support I/O Expedite for compressed files we mark the upls with special flags.
* The way decmpfs works is that we create a big upl which marks all the pages needed to
* represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
* then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
* being held in the big original UPL. We mark each of these smaller UPLs with the flag
* UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
* decmp_io_upl field (in the upl structure). This link is protected in the forward direction
* by the req upl lock (the reverse link doesn't need synchronization, since we never inspect this link
* unless the real I/O upl is being destroyed).
*/
static void
upl_set_decmp_info(upl_t upl, upl_t src_upl)
{
assert((src_upl->flags & UPL_DECMP_REQ) != 0);
upl_lock(src_upl);
if (src_upl->decmp_io_upl) {
/*
* If there is already an alive real I/O UPL, ignore this new UPL.
* This case should rarely happen and even if it does, it just means
* that we might issue a spurious expedite which the driver is expected
* to handle.
*/
upl_unlock(src_upl);
return;
}
src_upl->decmp_io_upl = (void *)upl;
src_upl->ref_count++;
upl->flags |= UPL_DECMP_REAL_IO;
upl->decmp_io_upl = (void *)src_upl;
upl_unlock(src_upl);
}
#endif /* CONFIG_IOSCHED */
#if UPL_DEBUG
int upl_debug_enabled = 1;
#else
int upl_debug_enabled = 0;
#endif
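/*
 * Allocate and initialize a UPL of the given type/size. Internal UPLs
 * get an inline page-info array, lite UPLs get a page bitmap, and
 * I/O-tracking / expedite state is set up when requested.
 */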
static upl_t
upl_create(int type, int flags, upl_size_t size)
{
uint32_t pages = (uint32_t)atop(round_page_32(size));
upl_t upl;
assert(page_aligned(size));
/*
* FIXME: this code assumes the allocation always succeeds,
* however `pages` can be up to MAX_UPL_SIZE.
*
* The allocation size is above 32k (resp. 128k)
* on 16k pages (resp. 4k), which kalloc might fail
* to allocate.
*/
upl = kalloc_type(struct upl, struct upl_page_info,
(type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
if (type & UPL_CREATE_INTERNAL) {
flags |= UPL_INTERNAL;
}
if (type & UPL_CREATE_LITE) {
flags |= UPL_LITE;
if (pages) {
upl->lite_list = bitmap_alloc(pages);
}
}
upl->flags = flags;
upl->ref_count = 1;
upl_lock_init(upl);
#if CONFIG_IOSCHED
if (type & UPL_CREATE_IO_TRACKING) {
upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
}
if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
/* Only support expedite on internal UPLs */
thread_t curthread = current_thread();
upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
Z_WAITOK | Z_ZERO);
upl->flags |= UPL_EXPEDITE_SUPPORTED;
if (curthread->decmp_upl != NULL) {
upl_set_decmp_info(upl, curthread->decmp_upl);
}
}
#endif
#if CONFIG_IOSCHED || UPL_DEBUG
if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
upl->upl_creator = current_thread();
upl->flags |= UPL_TRACKED_BY_OBJECT;
}
#endif
#if UPL_DEBUG
upl->uple_create_btref = btref_get(__builtin_frame_address(0), 0);
#endif /* UPL_DEBUG */
return upl;
}
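/*
 * Final teardown of a UPL once its last reference is gone: detach any
 * decmp linkage, remove it from its object's UPL queue, drop the
 * shadow object reference and free the associated allocations.
 */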
static void
upl_destroy(upl_t upl)
{
uint32_t pages;
// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
if (upl->ext_ref_count) {
panic("upl(%p) ext_ref_count", upl);
}
#if CONFIG_IOSCHED
if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
upl_t src_upl;
src_upl = upl->decmp_io_upl;
assert((src_upl->flags & UPL_DECMP_REQ) != 0);
upl_lock(src_upl);
src_upl->decmp_io_upl = NULL;
upl_unlock(src_upl);
upl_deallocate(src_upl);
}
#endif /* CONFIG_IOSCHED */
#if CONFIG_IOSCHED || UPL_DEBUG
if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
!(upl->flags & UPL_VECTOR)) {
vm_object_t object;
if (upl->flags & UPL_SHADOWED) {
object = upl->map_object->shadow;
} else {
object = upl->map_object;
}
vm_object_lock(object);
queue_remove(&object->uplq, upl, upl_t, uplq);
vm_object_activity_end(object);
vm_object_collapse(object, 0, TRUE);
vm_object_unlock(object);
}
#endif
/*
* drop a reference on the map_object whether or
* not a pageout object is inserted
*/
if (upl->flags & UPL_SHADOWED) {
vm_object_deallocate(upl->map_object);
}
if (upl->flags & UPL_DEVICE_MEMORY) {
pages = 1;
} else {
pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
}
upl_lock_destroy(upl);
#if CONFIG_IOSCHED
if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
}
#endif
#if UPL_DEBUG
for (int i = 0; i < upl->upl_commit_index; i++) {
btref_put(upl->upl_commit_records[i].c_btref);
}
btref_put(upl->uple_create_btref);
#endif /* UPL_DEBUG */
if ((upl->flags & UPL_LITE) && pages) {
bitmap_free(upl->lite_list, pages);
}
kfree_type(struct upl, struct upl_page_info,
(upl->flags & UPL_INTERNAL) ? pages : 0, upl);
}
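/*
 * Drop one reference on the UPL; on the last reference, run any
 * registered iodone callout and destroy the UPL.
 */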
void
upl_deallocate(upl_t upl)
{
upl_lock(upl);
if (--upl->ref_count == 0) {
if (vector_upl_is_valid(upl)) {
vector_upl_deallocate(upl);
}
upl_unlock(upl);
if (upl->upl_iodone) {
upl_callout_iodone(upl);
}
upl_destroy(upl);
} else {
upl_unlock(upl);
}
}
#if CONFIG_IOSCHED
void
upl_mark_decmp(upl_t upl)
{
if (upl->flags & UPL_TRACKED_BY_OBJECT) {
upl->flags |= UPL_DECMP_REQ;
upl->upl_creator->decmp_upl = (void *)upl;
}
}
void
upl_unmark_decmp(upl_t upl)
{
if (upl && (upl->flags & UPL_DECMP_REQ)) {
upl->upl_creator->decmp_upl = NULL;
}
}
#endif /* CONFIG_IOSCHED */
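/*
 * A pageout queue is considered to be backing up once its laundry
 * count reaches 80% of pgo_maxlaundry; writes must be throttled when
 * the external queue is backing up and external pageable memory exceeds
 * 60% of available non-compressed memory.
 */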
#define VM_PAGE_Q_BACKING_UP(q) \
((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
boolean_t must_throttle_writes(void);
boolean_t
must_throttle_writes()
{
if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
return TRUE;
}
return FALSE;
}
int vm_page_delayed_work_ctx_needed = 0;
KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
__startup_func
static void
vm_page_delayed_work_init_ctx(void)
{
uint16_t min_delayed_work_ctx_allocated = 16;
/*
* try really hard to always keep NCPU elements around in the zone
* in order for the UPL code to almost always get an element.
*/
if (min_delayed_work_ctx_allocated < zpercpu_count()) {
min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
}
zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
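/*
 * Grab a delayed-work context from the dedicated zone without blocking;
 * returns NULL (and bumps vm_page_delayed_work_ctx_needed) if the zone
 * reserve is exhausted.
 */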
struct vm_page_delayed_work*
vm_page_delayed_work_get_ctx(void)
{
struct vm_page_delayed_work_ctx * dw_ctx = NULL;
dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
if (__probable(dw_ctx)) {
dw_ctx->delayed_owner = current_thread();
} else {
vm_page_delayed_work_ctx_needed++;
}
return dw_ctx ? dw_ctx->dwp : NULL;
}
void
vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
{
struct vm_page_delayed_work_ctx *ldw_ctx;
ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
ldw_ctx->delayed_owner = NULL;
zfree(dw_ctx_zone, ldw_ctx);
}
/*
* Routine: vm_object_upl_request
* Purpose:
* Cause the population of a portion of a vm_object.
* Depending on the nature of the request, the pages
* returned may contain valid data or be uninitialized.
* A page list structure, listing the physical pages
* will be returned upon request.
* This function is called by the file system or any other
* supplier of backing store to a pager.
* IMPORTANT NOTE: The caller must still respect the relationship
* between the vm_object and its backing memory object. The
* caller MUST NOT substitute changes in the backing file
* without first doing a memory_object_lock_request on the
* target range unless it is known that the pages are not
* shared with another entity at the pager level.
* Copy_in_to:
* if a page list structure is present
* return the mapped physical pages, where a
* page is not present, return a non-initialized
* one. If the no_sync bit is turned on, don't
* call the pager unlock to synchronize with other
* possible copies of the page. Leave pages busy
* in the original object, if a page list structure
* was specified. When a commit of the page list
* pages is done, the dirty bit will be set for each one.
* Copy_out_from:
* If a page list structure is present, return
* all mapped pages. Where a page does not exist
* map a zero filled one. Leave pages busy in
* the original object. If a page list structure
* is not specified, this call is a no-op.
*
* Note: access of default pager objects has a rather interesting
* twist. The caller of this routine, presumably the file system
* page cache handling code, will never actually make a request
* against a default pager backed object. Only the default
* pager will make requests on backing store related vm_objects
* In this way the default pager can maintain the relationship
* between backing store files (abstract memory objects) and
* the vm_objects (cache objects) they support.
*
*/
__private_extern__ kern_return_t
vm_object_upl_request(
vm_object_t object,
vm_object_offset_t offset,
upl_size_t size,
upl_t *upl_ptr,
upl_page_info_array_t user_page_list,
unsigned int *page_list_count,
upl_control_flags_t cntrl_flags,
vm_tag_t tag)
{
vm_page_t dst_page = VM_PAGE_NULL;
vm_object_offset_t dst_offset;
upl_size_t xfer_size;
unsigned int size_in_pages;
boolean_t dirty;
boolean_t hw_dirty;
upl_t upl = NULL;
unsigned int entry;
vm_page_t alias_page = NULL;
int refmod_state = 0;
vm_object_t last_copy_object;
uint32_t last_copy_version;
struct vm_page_delayed_work dw_array;
struct vm_page_delayed_work *dwp, *dwp_start;
bool dwp_finish_ctx = TRUE;
int dw_count;
int dw_limit;
int io_tracking_flag = 0;
int grab_options;
int page_grab_count = 0;
ppnum_t phys_page;
pmap_flush_context pmap_flush_context_storage;
boolean_t pmap_flushes_delayed = FALSE;
#if DEVELOPMENT || DEBUG
task_t task = current_task();
#endif /* DEVELOPMENT || DEBUG */
dwp_start = dwp = NULL;
if (cntrl_flags & ~UPL_VALID_FLAGS) {
/*
* For forward compatibility's sake,
* reject any unknown flag.
*/
return KERN_INVALID_VALUE;
}
if ((!object->internal) && (object->paging_offset != 0)) {
panic("vm_object_upl_request: external object with non-zero paging offset");
}
if (object->phys_contiguous) {
panic("vm_object_upl_request: contiguous object specified");
}
assertf(page_aligned(offset) && page_aligned(size),
"offset 0x%llx size 0x%x",
offset, size);
VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
dw_count = 0;
dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
dwp_start = vm_page_delayed_work_get_ctx();
if (dwp_start == NULL) {
dwp_start = &dw_array;
dw_limit = 1;
dwp_finish_ctx = FALSE;
}
dwp = dwp_start;
if (size > MAX_UPL_SIZE_BYTES) {
size = MAX_UPL_SIZE_BYTES;
}
if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
*page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
}
#if CONFIG_IOSCHED || UPL_DEBUG
if (object->io_tracking || upl_debug_enabled) {
io_tracking_flag |= UPL_CREATE_IO_TRACKING;
}
#endif
#if CONFIG_IOSCHED
if (object->io_tracking) {
io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
}
#endif
if (cntrl_flags & UPL_SET_INTERNAL) {
if (cntrl_flags & UPL_SET_LITE) {
upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
} else {
upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
}
user_page_list = size ? upl->page_list : NULL;
} else {
if (cntrl_flags & UPL_SET_LITE) {
upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
} else {
upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
}
}
*upl_ptr = upl;
if (user_page_list) {
user_page_list[0].device = FALSE;
}
if (cntrl_flags & UPL_SET_LITE) {
upl->map_object = object;
} else {
upl->map_object = vm_object_allocate(size);
vm_object_lock(upl->map_object);
/*
* No need to lock the new object: nobody else knows
* about it yet, so it's all ours so far.
*/
upl->map_object->shadow = object;
VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
upl->map_object->vo_shadow_offset = offset;
upl->map_object->wimg_bits = object->wimg_bits;
assertf(page_aligned(upl->map_object->vo_shadow_offset),
"object %p shadow_offset 0x%llx",
upl->map_object, upl->map_object->vo_shadow_offset);
vm_object_unlock(upl->map_object);
alias_page = vm_page_grab_fictitious(TRUE);
upl->flags |= UPL_SHADOWED;
}
if (cntrl_flags & UPL_FOR_PAGEOUT) {
upl->flags |= UPL_PAGEOUT;
}
vm_object_lock(object);
vm_object_activity_begin(object);
grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
if (object->can_grab_secluded) {
grab_options |= VM_PAGE_GRAB_SECLUDED;
}
#endif /* CONFIG_SECLUDED_MEMORY */
/*
* we can lock in the paging_offset once paging_in_progress is set
*/
upl->u_size = size;
upl->u_offset = offset + object->paging_offset;
#if CONFIG_IOSCHED || UPL_DEBUG
if (object->io_tracking || upl_debug_enabled) {
vm_object_activity_begin(object);
queue_enter(&object->uplq, upl, upl_t, uplq);
}
#endif
if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
/*
* Honor copy-on-write obligations
*
* The caller is gathering these pages and
* might modify their contents. We need to
* make sure that the copy object has its own
* private copies of these pages before we let
* the caller modify them.
*/
vm_object_update(object,
offset,
size,
NULL,
NULL,
FALSE, /* should_return */
MEMORY_OBJECT_COPY_SYNC,
VM_PROT_NO_CHANGE);
VM_PAGEOUT_DEBUG(upl_cow, 1);
VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
}
/*
* remember which copy object we synchronized with
*/
last_copy_object = object->vo_copy;
last_copy_version = object->vo_copy_version;
entry = 0;
xfer_size = size;
dst_offset = offset;
size_in_pages = size / PAGE_SIZE;
if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
object->scan_collisions = 0;
}
if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
boolean_t isSSD = FALSE;
#if !XNU_TARGET_OS_OSX
isSSD = TRUE;
#else /* !XNU_TARGET_OS_OSX */
vnode_pager_get_isSSD(object->pager, &isSSD);
#endif /* !XNU_TARGET_OS_OSX */
vm_object_unlock(object);
OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
if (isSSD == TRUE) {
delay(1000 * size_in_pages);
} else {
delay(5000 * size_in_pages);
}
OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
vm_object_lock(object);
}
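/*
 * Walk the requested range one page at a time, gathering pages into
 * the UPL.  UPL_COPYOUT_FROM requests operate on pages that are
 * already resident (pageout); otherwise pages are looked up or
 * allocated and prepared for the caller to modify (pagein).
 */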
while (xfer_size) {
dwp->dw_mask = 0;
if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
vm_object_unlock(object);
alias_page = vm_page_grab_fictitious(TRUE);
vm_object_lock(object);
}
if (cntrl_flags & UPL_COPYOUT_FROM) {
upl->flags |= UPL_PAGE_SYNC_DONE;
if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
dst_page->vmp_fictitious ||
dst_page->vmp_absent ||
VMP_ERROR_GET(dst_page) ||
dst_page->vmp_cleaning ||
(VM_PAGE_WIRED(dst_page))) {
if (user_page_list) {
user_page_list[entry].phys_addr = 0;
}
goto try_next_page;
}
phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
/*
 * grab this up front...
 * a high percentage of the time we're going to
 * need the hardware modification state a bit later
 * anyway... so we can eliminate an extra call into
 * the pmap layer by grabbing it here and recording it
 */
if (dst_page->vmp_pmapped) {
refmod_state = pmap_get_refmod(phys_page);
} else {
refmod_state = 0;
}
if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
/*
* page is on inactive list and referenced...
* reactivate it now... this gets it out of the
* way of vm_pageout_scan which would have to
* reactivate it upon tripping over it
*/
dwp->dw_mask |= DW_vm_page_activate;
}
if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
/*
* we're only asking for DIRTY pages to be returned
*/
if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
/*
 * if this is the page that vm_pageout_scan stole to be
 * cleaned (as opposed to a buddy being clustered in),
 * or this request is not being driven by a PAGEOUT cluster,
 * then we only need to check for the page being dirty or
 * precious to decide whether to return it
 */
if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
goto check_busy;
}
goto dont_return;
}
/*
* this is a request for a PAGEOUT cluster and this page
* is merely along for the ride as a 'buddy'... not only
* does it have to be dirty to be returned, but it also
* can't have been referenced recently...
*/
if ((hibernate_cleaning_in_progress == TRUE ||
(!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
(dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
goto check_busy;
}
dont_return:
/*
* if we reach here, we're not to return
* the page... go on to the next one
*/
if (dst_page->vmp_laundry == TRUE) {
/*
* if we get here, the page is not 'cleaning' (filtered out above).
* since it has been referenced, remove it from the laundry
* so we don't pay the cost of an I/O to clean a page
* we're just going to take back
*/
vm_page_lockspin_queues();
vm_pageout_steal_laundry(dst_page, TRUE);
vm_page_activate(dst_page);
vm_page_unlock_queues();
}
if (user_page_list) {
user_page_list[entry].phys_addr = 0;
}
goto try_next_page;
}
check_busy:
if (dst_page->vmp_busy) {
if (cntrl_flags & UPL_NOBLOCK) {
if (user_page_list) {
user_page_list[entry].phys_addr = 0;
}
dwp->dw_mask = 0;
goto try_next_page;
}
/*
* someone else is playing with the
* page. We will have to wait.
*/
vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
continue;
}
if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
vm_page_lockspin_queues();
if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
/*
* we've buddied up a page for a clustered pageout
* that has already been moved to the pageout
* queue by pageout_scan... we need to remove
* it from the queue and drop the laundry count
* on that queue
*/
vm_pageout_throttle_up(dst_page);
}
vm_page_unlock_queues();
}
hw_dirty = refmod_state & VM_MEM_MODIFIED;
dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
if (phys_page > upl->highest_page) {
upl->highest_page = phys_page;
}
assert(!pmap_is_noencrypt(phys_page));
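/*
 * For a "lite" UPL, record the page in the lite-list bitmap and
 * clean it in place; the modified bit is cleared without an
 * immediate flush, since the flushes are batched through
 * pmap_flush_context and issued once after the loop.
 */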
if (cntrl_flags & UPL_SET_LITE) {
unsigned int pg_num;
pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
bitmap_set(upl->lite_list, pg_num);
if (hw_dirty) {
if (pmap_flushes_delayed == FALSE) {
pmap_flush_context_init(&pmap_flush_context_storage);
pmap_flushes_delayed = TRUE;
}
pmap_clear_refmod_options(phys_page,
VM_MEM_MODIFIED,
PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
&pmap_flush_context_storage);
}
/*
* Mark original page as cleaning
* in place.
*/
dst_page->vmp_cleaning = TRUE;
dst_page->vmp_precious = FALSE;
} else {
/*
 * use pageclean setup; it is more
 * convenient even for the pageout
 * cases here
 */
vm_object_lock(upl->map_object);
vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
vm_object_unlock(upl->map_object);
alias_page->vmp_absent = FALSE;
alias_page = NULL;
}
if (dirty) {
SET_PAGE_DIRTY(dst_page, FALSE);
} else {
dst_page->vmp_dirty = FALSE;
}
if (!dirty) {
dst_page->vmp_precious = TRUE;
}
if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
if (!VM_PAGE_WIRED(dst_page)) {
dst_page->vmp_free_when_done = TRUE;
}
}
} else {
if ((cntrl_flags & UPL_WILL_MODIFY) &&
(object->vo_copy != last_copy_object ||
object->vo_copy_version != last_copy_version)) {
/*
* Honor copy-on-write obligations
*
* The copy object has changed since we
* last synchronized for copy-on-write.
* Another copy object might have been
* inserted while we released the object's
* lock. Since someone could have seen the
* original contents of the remaining pages
* through that new object, we have to
* synchronize with it again for the remaining
* pages only. The previous pages are "busy"
* so they can not be seen through the new
* mapping. The new mapping will see our
* upcoming changes for those previous pages,
* but that's OK since they couldn't see what
* was there before. It's just a race anyway
* and there's no guarantee of consistency or
* atomicity. We just don't want new mappings
* to see both the *before* and *after* pages.
*/
if (object->vo_copy != VM_OBJECT_NULL) {
vm_object_update(
object,
dst_offset,/* current offset */
xfer_size, /* remaining size */
NULL,
NULL,
FALSE, /* should_return */
MEMORY_OBJECT_COPY_SYNC,
VM_PROT_NO_CHANGE);
VM_PAGEOUT_DEBUG(upl_cow_again, 1);
VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
}
/*
* remember the copy object we synced with
*/
last_copy_object = object->vo_copy;
last_copy_version = object->vo_copy_version;
}
dst_page = vm_page_lookup(object, dst_offset);
if (dst_page != VM_PAGE_NULL) {
if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
/*
* skip over pages already present in the cache
*/
if (user_page_list) {
user_page_list[entry].phys_addr = 0;
}
goto try_next_page;
}
if (dst_page->vmp_fictitious) {
panic("need corner case for fictitious page");
}
if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
/*
* someone else is playing with the
* page. We will have to wait.
*/
vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
continue;
}
if (dst_page->vmp_laundry) {
vm_pageout_steal_laundry(dst_page, FALSE);
}
} else {
if (object->private) {
/*
 * This is a nasty wrinkle for users
 * of UPLs who encounter device or
 * private memory. However, it is
 * unavoidable: only a fault can
 * resolve the actual backing
 * physical page by asking the
 * backing device.
 */
if (user_page_list) {
user_page_list[entry].phys_addr = 0;
}
goto try_next_page;
}
if (object->scan_collisions) {
/*
* the pageout_scan thread is trying to steal
* pages from this object, but has run into our
* lock... grab 2 pages from the head of the object...
* the first is freed on behalf of pageout_scan, the
* 2nd is for our own use... we use vm_object_page_grab
* in both cases to avoid taking pages from the free
* list since we are under memory pressure and our
* lock on this object is getting in the way of
* relieving it
*/
dst_page = vm_object_page_grab(object);
if (dst_page != VM_PAGE_NULL) {
vm_page_release(dst_page,
FALSE);
}
dst_page = vm_object_page_grab(object);
}
if (dst_page == VM_PAGE_NULL) {
/*
* need to allocate a page
*/
dst_page = vm_page_grab_options(grab_options);
if (dst_page != VM_PAGE_NULL) {
page_grab_count++;
}
}
if (dst_page == VM_PAGE_NULL) {
if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
/*
 * we don't want to stall waiting for pages to come onto the free list
 * while we're already holding absent pages in this UPL...
 * the caller will deal with the empty slots
 */
if (user_page_list) {
user_page_list[entry].phys_addr = 0;
}
goto try_next_page;
}
/*
* no pages available... wait
* then try again for the same
* offset...
*/
vm_object_unlock(object);
OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
VM_PAGE_WAIT();
OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
VM_DEBUG_EVENT(vm_upl_page_wait, DBG_VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
vm_object_lock(object);
continue;
}
vm_page_insert(dst_page, object, dst_offset);
dst_page->vmp_absent = TRUE;
dst_page->vmp_busy = FALSE;
if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
/*
 * if UPL_RET_ONLY_ABSENT was specified,
 * then we're definitely setting up a
 * UPL for a clustered read/pagein
 * operation... mark the pages as clustered
 * so upl_commit_range can put them on the
 * speculative list
 */
dst_page->vmp_clustered = TRUE;
if (!(cntrl_flags & UPL_FILE_IO)) {
counter_inc(&vm_statistics_pageins);
}
}
}
phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
dst_page->vmp_overwriting = TRUE;
if (dst_page->vmp_pmapped) {
if (!(cntrl_flags & UPL_FILE_IO)) {
/*
 * eliminate all mappings from the
 * original object and its progeny
 */
refmod_state = pmap_disconnect(phys_page);
} else {
refmod_state = pmap_get_refmod(phys_page);
}
} else {
refmod_state = 0;
}
hw_dirty = refmod_state & VM_MEM_MODIFIED;
dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
if (cntrl_flags & UPL_SET_LITE) {
unsigned int pg_num;
pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
bitmap_set(upl->lite_list, pg_num);
if (hw_dirty) {
pmap_clear_modify(phys_page);
}
/*
* Mark original page as cleaning
* in place.
*/
dst_page->vmp_cleaning = TRUE;
dst_page->vmp_precious = FALSE;
} else {
/*
 * use pageclean setup; it is more
 * convenient even for the pageout
 * cases here
 */
vm_object_lock(upl->map_object);
vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
vm_object_unlock(upl->map_object);
alias_page->vmp_absent = FALSE;
alias_page = NULL;
}
if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
upl->flags &= ~UPL_CLEAR_DIRTY;
upl->flags |= UPL_SET_DIRTY;
dirty = TRUE;
/*
* Page belonging to a code-signed object is about to
* be written. Mark it tainted and disconnect it from
* all pmaps so processes have to fault it back in and
* deal with the tainted bit.
*/
if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
vm_page_upl_tainted++;
if (dst_page->vmp_pmapped) {
refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
if (refmod_state & VM_MEM_REFERENCED) {
dst_page->vmp_reference = TRUE;
}
}
}
} else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
/*
 * clean in place for read implies
 * that a write will be done on all
 * the pages that are dirty before
 * a UPL commit is done. The caller
 * is obligated to preserve the
 * contents of all pages marked dirty
 */
upl->flags |= UPL_CLEAR_DIRTY;
}
dst_page->vmp_dirty = dirty;
if (!dirty) {
dst_page->vmp_precious = TRUE;
}
if (!VM_PAGE_WIRED(dst_page)) {
/*
* deny access to the target page while
* it is being worked on
*/
dst_page->vmp_busy = TRUE;
} else {
dwp->dw_mask |= DW_vm_page_wire;
}
/*
* We might be about to satisfy a fault which has been
* requested. So no need for the "restart" bit.
*/
dst_page->vmp_restart = FALSE;
if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
/*
* expect the page to be used
*/
dwp->dw_mask |= DW_set_reference;
}
if (cntrl_flags & UPL_PRECIOUS) {
if (object->internal) {
SET_PAGE_DIRTY(dst_page, FALSE);
dst_page->vmp_precious = FALSE;
} else {
dst_page->vmp_precious = TRUE;
}
} else {
dst_page->vmp_precious = FALSE;
}
}
if (dst_page->vmp_busy) {
upl->flags |= UPL_HAS_BUSY;
}
if (phys_page > upl->highest_page) {
upl->highest_page = phys_page;
}
assert(!pmap_is_noencrypt(phys_page));
if (user_page_list) {
user_page_list[entry].phys_addr = phys_page;
user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
user_page_list[entry].absent = dst_page->vmp_absent;
user_page_list[entry].dirty = dst_page->vmp_dirty;
user_page_list[entry].precious = dst_page->vmp_precious;
user_page_list[entry].device = FALSE;
user_page_list[entry].needed = FALSE;
if (dst_page->vmp_clustered == TRUE) {
user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
} else {
user_page_list[entry].speculative = FALSE;
}
user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
user_page_list[entry].mark = FALSE;
}
/*
 * if UPL_RET_ONLY_ABSENT is set, then
 * we are working with a fresh page and we've
 * just set the clustered flag on it to
 * indicate that it was dragged in as part of a
 * speculative cluster... so leave it alone
 */
if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
/*
 * someone is explicitly grabbing this page...
 * update clustered and speculative state
 */
if (dst_page->vmp_clustered) {
VM_PAGE_CONSUME_CLUSTERED(dst_page);
}
}
try_next_page:
if (dwp->dw_mask) {
if (dwp->dw_mask & DW_vm_page_activate) {
counter_inc(&vm_statistics_reactivations);
}
VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
if (dw_count >= dw_limit) {
vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
dwp = dwp_start;
dw_count = 0;
}
}
entry++;
dst_offset += PAGE_SIZE_64;
xfer_size -= PAGE_SIZE;
}
if (dw_count) {
vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
dwp = dwp_start;
dw_count = 0;
}
if (alias_page != NULL) {
VM_PAGE_FREE(alias_page);
}
if (pmap_flushes_delayed == TRUE) {
pmap_flush(&pmap_flush_context_storage);
}
if (page_list_count != NULL) {
if (upl->flags & UPL_INTERNAL) {
*page_list_count = 0;
} else if (*page_list_count > entry) {
*page_list_count = entry;
}
}
#if UPL_DEBUG
upl->upl_state = 1;
#endif
vm_object_unlock(object);
VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
#if DEVELOPMENT || DEBUG
if (task != NULL) {
ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
}
#endif /* DEVELOPMENT || DEBUG */
if (dwp_start && dwp_finish_ctx) {
vm_page_delayed_work_finish_ctx(dwp_start);
dwp_start = dwp = NULL;
}
return KERN_SUCCESS;
}
/*
 * Routine: vm_object_super_upl_request
 * Purpose:
 * Cause the population of a portion of a vm_object
 * in much the same way as memory_object_upl_request.
 * Depending on the nature of the request, the pages
 * returned may contain valid data or be uninitialized.
 * However, the region may be expanded up to the super
 * cluster size provided.
 */
__private_extern__ kern_return_t
vm_object_super_upl_request(
vm_object_t object,
vm_object_offset_t offset,
upl_size_t size,
upl_size_t super_cluster,
upl_t *upl,
upl_page_info_t *user_page_list,
unsigned int *page_list_count,
upl_control_flags_t cntrl_flags,
vm_tag_t tag)
{
if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
return KERN_FAILURE;
}
assert(object->paging_in_progress);
offset = offset - object->paging_offset;
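/*
 * If a super cluster was specified, expand the request to the
 * enclosing super-cluster boundary (doubling it when the original
 * request straddles that boundary), then trim the result so it
 * does not extend past the end of the object, re-extending only
 * if the original request would otherwise be cut off.
 */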
if (super_cluster > size) {
vm_object_offset_t base_offset;
upl_size_t super_size;
vm_object_size_t super_size_64;
base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
super_size = (upl_size_t) super_size_64;
assert(super_size == super_size_64);
if (offset > (base_offset + super_size)) {
panic("vm_object_super_upl_request: Missed target pageout"
" %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
offset, base_offset, super_size, super_cluster,
size, object->paging_offset);
}
/*
 * apparently there is a case where the vm requests a
 * page to be written out whose offset is beyond the
 * object size
 */
if ((offset + size) > (base_offset + super_size)) {
super_size_64 = (offset + size) - base_offset;
super_size = (upl_size_t) super_size_64;
assert(super_size == super_size_64);
}
offset = base_offset;
size = super_size;
}
return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
}
int cs_executable_create_upl = 0;
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
kern_return_t
vm_map_create_upl(
vm_map_t map,
vm_map_address_t offset,
upl_size_t *upl_size,
upl_t *upl,
upl_page_info_array_t page_list,
unsigned int *count,
upl_control_flags_t *flags,
vm_tag_t tag)
{
vm_map_entry_t entry;
upl_control_flags_t caller_flags;
int force_data_sync;
int sync_cow_data;
vm_object_t local_object;
vm_map_offset_t local_offset;
vm_map_offset_t local_start;
kern_return_t ret;
vm_map_address_t original_offset;
vm_map_size_t original_size, adjusted_size;
vm_map_offset_t local_entry_start;
vm_object_offset_t local_entry_offset;
vm_object_offset_t offset_in_mapped_page;
boolean_t release_map = FALSE;
start_with_map:
original_offset = offset;
original_size = *upl_size;
adjusted_size = original_size;
caller_flags = *flags;
if (caller_flags & ~UPL_VALID_FLAGS) {
/*
* For forward compatibility's sake,
* reject any unknown flag.
*/
ret = KERN_INVALID_VALUE;
goto done;
}
force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
if (upl == NULL) {
ret = KERN_INVALID_ARGUMENT;
goto done;
}
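/*
 * Look up the map entry covering "offset".  Several of the steps
 * below must drop the map lock (to sync with a copy object, force
 * a data sync, or upgrade to a write lock); whenever that happens
 * we jump back here and redo the lookup, since the entry may have
 * changed while the lock was dropped.
 */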
REDISCOVER_ENTRY:
vm_map_lock_read(map);
if (!vm_map_lookup_entry(map, offset, &entry)) {
vm_map_unlock_read(map);
ret = KERN_FAILURE;
goto done;
}
local_entry_start = entry->vme_start;
local_entry_offset = VME_OFFSET(entry);
if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
}
if (entry->vme_end - original_offset < adjusted_size) {
adjusted_size = entry->vme_end - original_offset;
assert(adjusted_size > 0);
*upl_size = (upl_size_t) adjusted_size;
assert(*upl_size == adjusted_size);
}
if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
*flags = 0;
if (!entry->is_sub_map &&
VME_OBJECT(entry) != VM_OBJECT_NULL) {
if (VME_OBJECT(entry)->private) {
*flags = UPL_DEV_MEMORY;
}
if (VME_OBJECT(entry)->phys_contiguous) {
*flags |= UPL_PHYS_CONTIG;
}
}
vm_map_unlock_read(map);
ret = KERN_SUCCESS;
goto done;
}
offset_in_mapped_page = 0;
if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
*upl_size = (upl_size_t)
(vm_map_round_page(original_offset + adjusted_size,
VM_MAP_PAGE_MASK(map))
- offset);
offset_in_mapped_page = original_offset - offset;
assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
}
if (!entry->is_sub_map) {
if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
!VME_OBJECT(entry)->phys_contiguous) {
if (*upl_size > MAX_UPL_SIZE_BYTES) {
*upl_size = MAX_UPL_SIZE_BYTES;
}
}
/*
* Create an object if necessary.
*/
if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
if (vm_map_lock_read_to_write(map)) {
goto REDISCOVER_ENTRY;
}
VME_OBJECT_SET(entry,
vm_object_allocate((vm_size_t)
vm_object_round_page((entry->vme_end - entry->vme_start))),
false, 0);
VME_OFFSET_SET(entry, 0);
assert(entry->use_pmap);
vm_map_lock_write_to_read(map);
}
if (!(caller_flags & UPL_COPYOUT_FROM) &&
!(entry->protection & VM_PROT_WRITE)) {
vm_map_unlock_read(map);
ret = KERN_PROTECTION_FAILURE;
goto done;
}
}
#if !XNU_TARGET_OS_OSX
if (map->pmap != kernel_pmap &&
(caller_flags & UPL_COPYOUT_FROM) &&
(entry->protection & VM_PROT_EXECUTE) &&
!(entry->protection & VM_PROT_WRITE)) {
vm_offset_t kaddr;
vm_size_t ksize;
/*
* We're about to create a read-only UPL backed by
* memory from an executable mapping.
* Wiring the pages would result in the pages being copied
* (due to the "MAP_PRIVATE" mapping) and no longer
* code-signed, so no longer eligible for execution.
* Instead, let's copy the data into a kernel buffer and
* create the UPL from this kernel buffer.
* The kernel buffer is then freed, leaving the UPL holding
* the last reference on the VM object, so the memory will
* be released when the UPL is committed.
*/
vm_map_unlock_read(map);
entry = VM_MAP_ENTRY_NULL;
/* allocate kernel buffer */
ksize = round_page(*upl_size);
kaddr = 0;
ret = kmem_alloc(kernel_map, &kaddr, ksize,
KMA_PAGEABLE | KMA_DATA, tag);
if (ret == KERN_SUCCESS) {
/* copyin the user data */
ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
}
if (ret == KERN_SUCCESS) {
if (ksize > *upl_size) {
/* zero out the extra space in kernel buffer */
memset((void *)(kaddr + *upl_size),
0,
ksize - *upl_size);
}
/* create the UPL from the kernel buffer */
vm_object_offset_t offset_in_object;
vm_object_offset_t offset_in_object_page;
offset_in_object = offset - local_entry_start + local_entry_offset;
offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
assert(offset_in_object_page < PAGE_SIZE);
assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
*upl_size -= offset_in_object_page + offset_in_mapped_page;
ret = vm_map_create_upl(kernel_map,
(vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
upl_size, upl, page_list, count, flags, tag);
}
if (kaddr != 0) {
/* free the kernel buffer */
kmem_free(kernel_map, kaddr, ksize);
kaddr = 0;
ksize = 0;
}
#if DEVELOPMENT || DEBUG
DTRACE_VM4(create_upl_from_executable,
vm_map_t, map,
vm_map_address_t, offset,
upl_size_t, *upl_size,
kern_return_t, ret);
#endif /* DEVELOPMENT || DEBUG */
goto done;
}
#endif /* !XNU_TARGET_OS_OSX */
if (!entry->is_sub_map) {
local_object = VME_OBJECT(entry);
assert(local_object != VM_OBJECT_NULL);
}
if (!entry->is_sub_map &&
!entry->needs_copy &&
*upl_size != 0 &&
local_object->vo_size > *upl_size && /* partial UPL */
entry->wired_count == 0 && /* No COW for entries that are wired */
(map->pmap != kernel_pmap) && /* alias checks */
(vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
||
( /* case 2 */
local_object->internal &&
(local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
local_object->ref_count > 1))) {
vm_prot_t prot;
/*
* Case 1:
* Set up the targeted range for copy-on-write to avoid
* applying true_share/copy_delay to the entire object.
*
* Case 2:
* This map entry covers only part of an internal
* object. There could be other map entries covering
* other areas of this object and some of these map
* entries could be marked as "needs_copy", which
* assumes that the object is COPY_SYMMETRIC.
* To avoid marking this object as COPY_DELAY and
* "true_share", let's shadow it and mark the new
* (smaller) object as "true_share" and COPY_DELAY.
*/
if (vm_map_lock_read_to_write(map)) {
goto REDISCOVER_ENTRY;
}
vm_map_lock_assert_exclusive(map);
assert(VME_OBJECT(entry) == local_object);
vm_map_clip_start(map,
entry,
vm_map_trunc_page(offset,
VM_MAP_PAGE_MASK(map)));
vm_map_clip_end(map,
entry,
vm_map_round_page(offset + *upl_size,
VM_MAP_PAGE_MASK(map)));
if ((entry->vme_end - offset) < *upl_size) {
*upl_size = (upl_size_t) (entry->vme_end - offset);
assert(*upl_size == entry->vme_end - offset);
}
prot = entry->protection & ~VM_PROT_WRITE;
if (override_nx(map, VME_ALIAS(entry)) && prot) {
prot |= VM_PROT_EXECUTE;
}
vm_object_pmap_protect(local_object,
VME_OFFSET(entry),
entry->vme_end - entry->vme_start,
((entry->is_shared ||
map->mapped_in_other_pmaps)
? PMAP_NULL
: map->pmap),
VM_MAP_PAGE_SIZE(map),
entry->vme_start,
prot);
assert(entry->wired_count == 0);
/*
* Lock the VM object and re-check its status: if it's mapped
* in another address space, we could still be racing with
* another thread holding that other VM map exclusively.
*/
vm_object_lock(local_object);
if (local_object->true_share) {
/* object is already in proper state: no COW needed */
assert(local_object->copy_strategy !=
MEMORY_OBJECT_COPY_SYMMETRIC);
} else {
/* not true_share: ask for copy-on-write below */
assert(local_object->copy_strategy ==
MEMORY_OBJECT_COPY_SYMMETRIC);
entry->needs_copy = TRUE;
}
vm_object_unlock(local_object);
vm_map_lock_write_to_read(map);
}
if (entry->needs_copy) {
/*
* Honor copy-on-write for COPY_SYMMETRIC
* strategy.
*/
vm_map_t local_map;
vm_object_t object;
vm_object_offset_t new_offset;
vm_prot_t prot;
boolean_t wired;
vm_map_version_t version;
vm_map_t real_map;
vm_prot_t fault_type;
local_map = map;
if (caller_flags & UPL_COPYOUT_FROM) {
fault_type = VM_PROT_READ | VM_PROT_COPY;
vm_counters.create_upl_extra_cow++;
vm_counters.create_upl_extra_cow_pages +=
(entry->vme_end - entry->vme_start) / PAGE_SIZE;
} else {
fault_type = VM_PROT_WRITE;
}
if (vm_map_lookup_and_lock_object(&local_map,
offset, fault_type,
OBJECT_LOCK_EXCLUSIVE,
&version, &object,
&new_offset, &prot, &wired,
NULL,
&real_map, NULL) != KERN_SUCCESS) {
if (fault_type == VM_PROT_WRITE) {
vm_counters.create_upl_lookup_failure_write++;
} else {
vm_counters.create_upl_lookup_failure_copy++;
}
vm_map_unlock_read(local_map);
ret = KERN_FAILURE;
goto done;
}
if (real_map != local_map) {
vm_map_unlock(real_map);
}
vm_map_unlock_read(local_map);
vm_object_unlock(object);
goto REDISCOVER_ENTRY;
}
if (entry->is_sub_map) {
vm_map_t submap;
submap = VME_SUBMAP(entry);
local_start = entry->vme_start;
local_offset = (vm_map_offset_t)VME_OFFSET(entry);
vm_map_reference(submap);
vm_map_unlock_read(map);
DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
offset += offset_in_mapped_page;
*upl_size -= offset_in_mapped_page;
if (release_map) {
vm_map_deallocate(map);
}
map = submap;
release_map = TRUE;
offset = local_offset + (offset - local_start);
goto start_with_map;
}
if (sync_cow_data &&
(VME_OBJECT(entry)->shadow ||
VME_OBJECT(entry)->vo_copy)) {
local_object = VME_OBJECT(entry);
local_start = entry->vme_start;
local_offset = (vm_map_offset_t)VME_OFFSET(entry);
vm_object_reference(local_object);
vm_map_unlock_read(map);
if (local_object->shadow && local_object->vo_copy) {
vm_object_lock_request(local_object->shadow,
((vm_object_offset_t)
((offset - local_start) +
local_offset) +
local_object->vo_shadow_offset),
*upl_size, FALSE,
MEMORY_OBJECT_DATA_SYNC,
VM_PROT_NO_CHANGE);
}
sync_cow_data = FALSE;
vm_object_deallocate(local_object);
goto REDISCOVER_ENTRY;
}
if (force_data_sync) {
local_object = VME_OBJECT(entry);
local_start = entry->vme_start;
local_offset = (vm_map_offset_t)VME_OFFSET(entry);
vm_object_reference(local_object);
vm_map_unlock_read(map);
vm_object_lock_request(local_object,
((vm_object_offset_t)
((offset - local_start) +
local_offset)),
(vm_object_size_t)*upl_size,
FALSE,
MEMORY_OBJECT_DATA_SYNC,
VM_PROT_NO_CHANGE);
force_data_sync = FALSE;
vm_object_deallocate(local_object);
goto REDISCOVER_ENTRY;
}
if (VME_OBJECT(entry)->private) {
*flags = UPL_DEV_MEMORY;
} else {
*flags = 0;
}
if (VME_OBJECT(entry)->phys_contiguous) {
*flags |= UPL_PHYS_CONTIG;
}
local_object = VME_OBJECT(entry);
local_offset = (vm_map_offset_t)VME_OFFSET(entry);
local_start = entry->vme_start;
/*
* Wiring will copy the pages to the shadow object.
* The shadow object will not be code-signed so
* attempting to execute code from these copied pages
* would trigger a code-signing violation.
*/
if (entry->protection & VM_PROT_EXECUTE) {
#if MACH_ASSERT
printf("pid %d[%s] create_upl out of executable range from "
"0x%llx to 0x%llx: side effects may include "
"code-signing violations later on\n",
proc_selfpid(),
(get_bsdtask_info(current_task())
? proc_name_address(get_bsdtask_info(current_task()))
: "?"),
(uint64_t) entry->vme_start,
(uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
DTRACE_VM2(cs_executable_create_upl,
uint64_t, (uint64_t)entry->vme_start,
uint64_t, (uint64_t)entry->vme_end);
cs_executable_create_upl++;
}
vm_object_lock(local_object);
/*
* Ensure that this object is "true_share" and "copy_delay" now,
* while we're still holding the VM map lock. After we unlock the map,
* anything could happen to that mapping, including some copy-on-write
* activity. We need to make sure that the IOPL will point at the
* same memory as the mapping.
*/
if (local_object->true_share) {
assert(local_object->copy_strategy !=
MEMORY_OBJECT_COPY_SYMMETRIC);
} else if (!is_kernel_object(local_object) &&
local_object != compressor_object &&
!local_object->phys_contiguous) {
#if VM_OBJECT_TRACKING_OP_TRUESHARE
if (!local_object->true_share &&
vm_object_tracking_btlog) {
btlog_record(vm_object_tracking_btlog, local_object,
VM_OBJECT_TRACKING_OP_TRUESHARE,
btref_get(__builtin_frame_address(0), 0));
}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
if (local_object->copy_strategy ==
MEMORY_OBJECT_COPY_SYMMETRIC) {
local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
}
}
vm_object_reference_locked(local_object);
vm_object_unlock(local_object);
vm_map_unlock_read(map);
offset += offset_in_mapped_page;
assert(*upl_size > offset_in_mapped_page);
*upl_size -= offset_in_mapped_page;
ret = vm_object_iopl_request(local_object,
((vm_object_offset_t)
((offset - local_start) + local_offset)),
*upl_size,
upl,
page_list,
count,
caller_flags,
tag);
vm_object_deallocate(local_object);
done:
if (release_map) {
vm_map_deallocate(map);
}
return ret;
}
/*
* Internal routine to enter a UPL into a VM map.
*
* JMM - This should just be doable through the standard
* vm_map_enter() API.
*/
kern_return_t
vm_map_enter_upl_range(
vm_map_t map,
upl_t upl,
vm_object_offset_t offset_to_map,
vm_size_t size_to_map,
vm_prot_t prot_to_map,
vm_map_offset_t *dst_addr)
{
vm_map_size_t size;
vm_object_offset_t offset;
vm_map_offset_t addr;
vm_page_t m;
kern_return_t kr;
int isVectorUPL = 0, curr_upl = 0;
upl_t vector_upl = NULL;
mach_vm_offset_t vector_upl_dst_addr = 0;
vm_map_t vector_upl_submap = NULL;
upl_offset_t subupl_offset = 0;
upl_size_t subupl_size = 0;
if (upl == UPL_NULL) {
return KERN_INVALID_ARGUMENT;
}
DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%lx (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
assert(map == kernel_map);
if ((isVectorUPL = vector_upl_is_valid(upl))) {
int mapped = 0, valid_upls = 0;
vector_upl = upl;
upl_lock(vector_upl);
for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
if (upl == NULL) {
continue;
}
valid_upls++;
if (UPL_PAGE_LIST_MAPPED & upl->flags) {
mapped++;
}
}
if (mapped) {
if (mapped != valid_upls) {
panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped", mapped, valid_upls);
} else {
upl_unlock(vector_upl);
return KERN_FAILURE;
}
}
if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
panic("TODO4K: vector UPL not implemented");
}
vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
VM_KERN_MEMORY_NONE).kmr_submap;
map = vector_upl_submap;
vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
curr_upl = 0;
} else {
upl_lock(upl);
}
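/*
 * For a vector UPL, iterate over each sub-UPL and map it at its
 * recorded offset within the submap set up above; for a regular
 * UPL, this block runs exactly once.
 */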
process_upl_to_enter:
if (isVectorUPL) {
if (curr_upl == vector_upl_max_upls(vector_upl)) {
*dst_addr = vector_upl_dst_addr;
upl_unlock(vector_upl);
return KERN_SUCCESS;
}
upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
if (upl == NULL) {
goto process_upl_to_enter;
}
vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
*dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
} else {
/*
* check to see if already mapped
*/
if (UPL_PAGE_LIST_MAPPED & upl->flags) {
upl_unlock(upl);
return KERN_FAILURE;
}
}
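/*
 * If the UPL has busy pages, or is not backed by device memory,
 * an I/O-wired request, or a physically contiguous object, we
 * can't map the original object directly: build a shadow object
 * populated with wired "alias" pages that reference the same
 * physical pages, and map that instead.
 */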
if ((!(upl->flags & UPL_SHADOWED)) &&
((upl->flags & UPL_HAS_BUSY) ||
!((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
vm_object_t object;
vm_page_t alias_page;
vm_object_offset_t new_offset;
unsigned int pg_num;
size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
object = upl->map_object;
upl->map_object = vm_object_allocate(vm_object_round_page(size));
vm_object_lock(upl->map_object);
upl->map_object->shadow = object;
VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
assertf(page_aligned(upl->map_object->vo_shadow_offset),
"object %p shadow_offset 0x%llx",
upl->map_object,
(uint64_t)upl->map_object->vo_shadow_offset);
upl->map_object->wimg_bits = object->wimg_bits;
offset = upl->map_object->vo_shadow_offset;
new_offset = 0;
upl->flags |= UPL_SHADOWED;
while (size) {
pg_num = (unsigned int) (new_offset / PAGE_SIZE);
assert(pg_num == new_offset / PAGE_SIZE);
if (bitmap_test(upl->lite_list, pg_num)) {
alias_page = vm_page_grab_fictitious(TRUE);
vm_object_lock(object);
m = vm_page_lookup(object, offset);
if (m == VM_PAGE_NULL) {
panic("vm_upl_map: page missing");
}
/*
* Convert the fictitious page to a private
* shadow of the real page.
*/
assert(alias_page->vmp_fictitious);
alias_page->vmp_fictitious = FALSE;
alias_page->vmp_private = TRUE;
alias_page->vmp_free_when_done = TRUE;
/*
* since m is a page in the upl it must
* already be wired or BUSY, so it's
* safe to assign the underlying physical
* page to the alias
*/
VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
vm_object_unlock(object);
vm_page_lockspin_queues();
vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
vm_page_unlock_queues();
vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
assert(!alias_page->vmp_wanted);
alias_page->vmp_busy = FALSE;
alias_page->vmp_absent = FALSE;
}
size -= PAGE_SIZE;
offset += PAGE_SIZE_64;
new_offset += PAGE_SIZE_64;
}
vm_object_unlock(upl->map_object);
}
if (upl->flags & UPL_SHADOWED) {
if (isVectorUPL) {
offset = 0;
} else {
offset = offset_to_map;
}
} else {
offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
if (!isVectorUPL) {
offset += offset_to_map;
}
}
if (isVectorUPL) {
size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
} else {
size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
}
vm_object_reference(upl->map_object);
if (!isVectorUPL) {
*dst_addr = 0;
/*
* NEED A UPL_MAP ALIAS
*/
kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
upl->map_object, offset, FALSE,
prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
if (kr != KERN_SUCCESS) {
vm_object_deallocate(upl->map_object);
upl_unlock(upl);
return kr;
}
} else {
kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
upl->map_object, offset, FALSE,
prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
if (kr) {
panic("vm_map_enter failed for a Vector UPL");
}
}
upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
/* this will have to be an increment rather than */
/* an assignment. */
vm_object_lock(upl->map_object);
for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
m = vm_page_lookup(upl->map_object, offset);
if (m) {
m->vmp_pmapped = TRUE;
/*
* CODE SIGNING ENFORCEMENT: page has been wpmapped,
* but only in kernel space. If this was on a user map,
* we'd have to set the wpmapped bit.
*/
/* m->vmp_wpmapped = TRUE; */
assert(map->pmap == kernel_pmap);
kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE);
assert(kr == KERN_SUCCESS);
#if KASAN
kasan_notify_address(addr, PAGE_SIZE_64);
#endif
}
offset += PAGE_SIZE_64;
}
vm_object_unlock(upl->map_object);
/*
* hold a reference for the mapping
*/
upl->ref_count++;
upl->flags |= UPL_PAGE_LIST_MAPPED;
upl->kaddr = (vm_offset_t) *dst_addr;
assert(upl->kaddr == *dst_addr);
if (isVectorUPL) {
goto process_upl_to_enter;
}
if (!isVectorUPL) {
vm_map_offset_t addr_adjustment;
addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
if (addr_adjustment) {
assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
*dst_addr += addr_adjustment;
}
}
upl_unlock(upl);
return KERN_SUCCESS;
}
kern_return_t
vm_map_enter_upl(
vm_map_t map,
upl_t upl,
vm_map_offset_t *dst_addr)
{
upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
}
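/*
 * Example usage (illustrative sketch only; assumes a valid,
 * currently unmapped UPL "upl" and the kernel map, which is the
 * only map accepted above):
 *
 *	vm_map_offset_t kva;
 *
 *	if (vm_map_enter_upl(kernel_map, upl, &kva) == KERN_SUCCESS) {
 *		... access the UPL's pages through "kva" ...
 *		(void) vm_map_remove_upl(kernel_map, upl);
 *	}
 */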
/*
* Internal routine to remove a UPL mapping from a VM map.
*
* XXX - This should just be doable through a standard
* vm_map_remove() operation. Otherwise, implicit clean-up
* of the target map won't be able to correctly remove
* these (and release the reference on the UPL). Having
* to do this means we can't map these into user-space
* maps yet.
*/
kern_return_t
vm_map_remove_upl_range(
vm_map_t map,
upl_t upl,
__unused vm_object_offset_t offset_to_unmap,
__unused vm_size_t size_to_unmap)
{
vm_address_t addr;
upl_size_t size;
int isVectorUPL = 0, curr_upl = 0;
upl_t vector_upl = NULL;
if (upl == UPL_NULL) {
return KERN_INVALID_ARGUMENT;
}
if ((isVectorUPL = vector_upl_is_valid(upl))) {
int unmapped = 0, valid_upls = 0;
vector_upl = upl;
upl_lock(vector_upl);
for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
if (upl == NULL) {
continue;
}
valid_upls++;
if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
unmapped++;
}
}
if (unmapped) {
if (unmapped != valid_upls) {
panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
} else {
upl_unlock(vector_upl);
return KERN_FAILURE;
}
}
curr_upl = 0;
} else {
upl_lock(upl);
}
process_upl_to_remove:
if (isVectorUPL) {
if (curr_upl == vector_upl_max_upls(vector_upl)) {
vm_map_t v_upl_submap;
vm_offset_t v_upl_submap_dst_addr;
vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
kmem_free_guard(map, v_upl_submap_dst_addr,
vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
vm_map_deallocate(v_upl_submap);
upl_unlock(vector_upl);
return KERN_SUCCESS;
}
upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
if (upl == NULL) {
goto process_upl_to_remove;
}
}
if (upl->flags & UPL_PAGE_LIST_MAPPED) {
addr = upl->kaddr;
size = upl->u_mapped_size;
assert(upl->ref_count > 1);
upl->ref_count--; /* removing mapping ref */
upl->flags &= ~UPL_PAGE_LIST_MAPPED;
upl->kaddr = (vm_offset_t) 0;
upl->u_mapped_size = 0;
if (isVectorUPL) {
/*
 * If it's a Vectored UPL, we'll be removing the entire
 * submap anyway, so no need to remove individual UPL
 * element mappings from within the submap
 */
goto process_upl_to_remove;
}
upl_unlock(upl);
vm_map_remove(map,
vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
return KERN_SUCCESS;
}
upl_unlock(upl);
return KERN_FAILURE;
}
kern_return_t
vm_map_remove_upl(
vm_map_t map,
upl_t upl)
{
upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
return vm_map_remove_upl_range(map, upl, 0, upl_size);
}
void
iopl_valid_data(
upl_t upl,
vm_tag_t tag)
{
vm_object_t object;
vm_offset_t offset;
vm_page_t m, nxt_page = VM_PAGE_NULL;
upl_size_t size;
int wired_count = 0;
if (upl == NULL) {
panic("iopl_valid_data: NULL upl");
}
if (vector_upl_is_valid(upl)) {
panic("iopl_valid_data: vector upl");
}
if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
}
object = upl->map_object;
if (is_kernel_object(object) || object == compressor_object) {
panic("iopl_valid_data: object == kernel or compressor");
}
if (object->purgable == VM_PURGABLE_VOLATILE ||
object->purgable == VM_PURGABLE_EMPTY) {
panic("iopl_valid_data: object %p purgable %d",
object, object->purgable);
}
size = upl_adjusted_size(upl, PAGE_MASK);
vm_object_lock(object);
VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
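/*
 * If the UPL spans the entire object and every page is resident,
 * walk the object's page list directly; otherwise look pages up
 * by offset within the UPL's range.
 */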
bool whole_object;
if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
whole_object = true;
} else {
offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
whole_object = false;
}
while (size) {
if (whole_object) {
if (nxt_page != VM_PAGE_NULL) {
m = nxt_page;
nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
}
} else {
m = vm_page_lookup(object, offset);
offset += PAGE_SIZE;
if (m == VM_PAGE_NULL) {
panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
}
}
if (m->vmp_busy) {
if (!m->vmp_absent) {
panic("iopl_valid_data: busy page w/o absent");
}
if (m->vmp_pageq.next || m->vmp_pageq.prev) {
panic("iopl_valid_data: busy+absent page on page queue");
}
if (m->vmp_reusable) {
panic("iopl_valid_data: %p is reusable", m);
}
m->vmp_absent = FALSE;
m->vmp_dirty = TRUE;
assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
assert(m->vmp_wire_count == 0);
m->vmp_wire_count++;
assert(m->vmp_wire_count);
if (m->vmp_wire_count == 1) {
m->vmp_q_state = VM_PAGE_IS_WIRED;
wired_count++;
} else {
panic("iopl_valid_data: %p already wired", m);
}
vm_page_wakeup_done(object, m);
}
size -= PAGE_SIZE;
}
if (wired_count) {
VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
assert(object->resident_page_count >= object->wired_page_count);
/* no need to adjust purgeable accounting for this object: */
assert(object->purgable != VM_PURGABLE_VOLATILE);
assert(object->purgable != VM_PURGABLE_EMPTY);
vm_page_lockspin_queues();
vm_page_wire_count += wired_count;
vm_page_unlock_queues();
}
VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
vm_object_unlock(object);
}
void
vm_object_set_pmap_cache_attr(
vm_object_t object,
upl_page_info_array_t user_page_list,
unsigned int num_pages,
boolean_t batch_pmap_op)
{
unsigned int cache_attr = 0;
cache_attr = object->wimg_bits & VM_WIMG_MASK;
assert(user_page_list);
if (cache_attr != VM_WIMG_USE_DEFAULT) {
PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
}
}
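/*
 * Fast path for an IOPL against an object whose pages are all
 * resident: walk the resident page list, wiring each page and
 * filling in the UPL's lite list and page info.  Returns FALSE
 * (without undoing the pages already wired) if any page is in a
 * state that requires the slow path; the caller then falls back
 * to vm_object_iopl_request's main loop, which skips the pages
 * already recorded in the lite list.
 */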
static bool
vm_object_iopl_wire_full(
vm_object_t object,
upl_t upl,
upl_page_info_array_t user_page_list,
upl_control_flags_t cntrl_flags,
vm_tag_t tag)
{
vm_page_t dst_page;
unsigned int entry;
int page_count;
int delayed_unlock = 0;
boolean_t retval = TRUE;
ppnum_t phys_page;
vm_object_lock_assert_exclusive(object);
assert(object->purgable != VM_PURGABLE_VOLATILE);
assert(object->purgable != VM_PURGABLE_EMPTY);
assert(object->pager == NULL);
assert(object->vo_copy == NULL);
assert(object->shadow == NULL);
page_count = object->resident_page_count;
dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
vm_page_lock_queues();
while (page_count--) {
if (dst_page->vmp_busy ||
dst_page->vmp_fictitious ||
dst_page->vmp_absent ||
VMP_ERROR_GET(dst_page) ||
dst_page->vmp_cleaning ||
dst_page->vmp_restart ||
dst_page->vmp_laundry) {
retval = FALSE;
goto done;
}
if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
retval = FALSE;
goto done;
}
dst_page->vmp_reference = TRUE;
vm_page_wire(dst_page, tag, FALSE);
if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
SET_PAGE_DIRTY(dst_page, FALSE);
}
entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
assert(entry >= 0 && entry < object->resident_page_count);
bitmap_set(upl->lite_list, entry);
phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
if (phys_page > upl->highest_page) {
upl->highest_page = phys_page;
}
if (user_page_list) {
user_page_list[entry].phys_addr = phys_page;
user_page_list[entry].absent = dst_page->vmp_absent;
user_page_list[entry].dirty = dst_page->vmp_dirty;
user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
user_page_list[entry].precious = dst_page->vmp_precious;
user_page_list[entry].device = FALSE;
user_page_list[entry].speculative = FALSE;
user_page_list[entry].cs_validated = FALSE;
user_page_list[entry].cs_tainted = FALSE;
user_page_list[entry].cs_nx = FALSE;
user_page_list[entry].needed = FALSE;
user_page_list[entry].mark = FALSE;
}
if (delayed_unlock++ > 256) {
delayed_unlock = 0;
lck_mtx_yield(&vm_page_queue_lock);
VM_CHECK_MEMORYSTATUS;
}
dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
}
done:
vm_page_unlock_queues();
VM_CHECK_MEMORYSTATUS;
return retval;
}
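/*
 * Fast path for an IOPL against an object with no resident pages:
 * grab fresh pages (zero-filling them unless the caller asked for
 * no zero fill), insert them into the object at the requested
 * offsets, wiring the zero-filled ones, and fill in the UPL's
 * lite list and page info.  Deferred ledger accounting is applied
 * once at the end.
 */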
static kern_return_t
vm_object_iopl_wire_empty(
vm_object_t object,
upl_t upl,
upl_page_info_array_t user_page_list,
upl_control_flags_t cntrl_flags,
vm_tag_t tag,
vm_object_offset_t *dst_offset,
int page_count,
int *page_grab_count)
{
vm_page_t dst_page;
boolean_t no_zero_fill = FALSE;
int interruptible;
int pages_wired = 0;
int pages_inserted = 0;
int entry = 0;
uint64_t delayed_ledger_update = 0;
kern_return_t ret = KERN_SUCCESS;
int grab_options;
ppnum_t phys_page;
vm_object_lock_assert_exclusive(object);
assert(object->purgable != VM_PURGABLE_VOLATILE);
assert(object->purgable != VM_PURGABLE_EMPTY);
assert(object->pager == NULL);
assert(object->vo_copy == NULL);
assert(object->shadow == NULL);
if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
interruptible = THREAD_ABORTSAFE;
} else {
interruptible = THREAD_UNINT;
}
if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
no_zero_fill = TRUE;
}
grab_options = 0;
#if CONFIG_SECLUDED_MEMORY
if (object->can_grab_secluded) {
grab_options |= VM_PAGE_GRAB_SECLUDED;
}
#endif /* CONFIG_SECLUDED_MEMORY */
while (page_count--) {
while ((dst_page = vm_page_grab_options(grab_options))
== VM_PAGE_NULL) {
OSAddAtomic(page_count, &vm_upl_wait_for_pages);
VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
if (vm_page_wait(interruptible) == FALSE) {
/*
* interrupted case
*/
OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
ret = MACH_SEND_INTERRUPTED;
goto done;
}
OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
}
if (no_zero_fill == FALSE) {
vm_page_zero_fill(dst_page);
} else {
dst_page->vmp_absent = TRUE;
}
dst_page->vmp_reference = TRUE;
if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
SET_PAGE_DIRTY(dst_page, FALSE);
}
if (dst_page->vmp_absent == FALSE) {
assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
assert(dst_page->vmp_wire_count == 0);
dst_page->vmp_wire_count++;
dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
assert(dst_page->vmp_wire_count);
pages_wired++;
vm_page_wakeup_done(object, dst_page);
}
pages_inserted++;
vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
bitmap_set(upl->lite_list, entry);
phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
if (phys_page > upl->highest_page) {
upl->highest_page = phys_page;
}
if (user_page_list) {
user_page_list[entry].phys_addr = phys_page;
user_page_list[entry].absent = dst_page->vmp_absent;
user_page_list[entry].dirty = dst_page->vmp_dirty;
user_page_list[entry].free_when_done = FALSE;
user_page_list[entry].precious = FALSE;
user_page_list[entry].device = FALSE;
user_page_list[entry].speculative = FALSE;
user_page_list[entry].cs_validated = FALSE;
user_page_list[entry].cs_tainted = FALSE;
user_page_list[entry].cs_nx = FALSE;
user_page_list[entry].needed = FALSE;
user_page_list[entry].mark = FALSE;
}
entry++;
*dst_offset += PAGE_SIZE_64;
}
done:
if (pages_wired) {
vm_page_lockspin_queues();
vm_page_wire_count += pages_wired;
vm_page_unlock_queues();
}
if (pages_inserted) {
if (object->internal) {
OSAddAtomic(pages_inserted, &vm_page_internal_count);
} else {
OSAddAtomic(pages_inserted, &vm_page_external_count);
}
}
if (delayed_ledger_update) {
task_t owner;
int ledger_idx_volatile;
int ledger_idx_nonvolatile;
int ledger_idx_volatile_compressed;
int ledger_idx_nonvolatile_compressed;
int ledger_idx_composite;
int ledger_idx_external_wired;
boolean_t do_footprint;
owner = VM_OBJECT_OWNER(object);
assert(owner);
vm_object_ledger_tag_ledgers(object,
&ledger_idx_volatile,
&ledger_idx_nonvolatile,
&ledger_idx_volatile_compressed,
&ledger_idx_nonvolatile_compressed,
&ledger_idx_composite,
&ledger_idx_external_wired,
&do_footprint);
if (object->internal) {
/* more non-volatile bytes */
ledger_credit(owner->ledger,
ledger_idx_nonvolatile,
delayed_ledger_update);
if (do_footprint) {
/* more footprint */
ledger_credit(owner->ledger,
task_ledgers.phys_footprint,
delayed_ledger_update);
} else if (ledger_idx_composite != -1) {
ledger_credit(owner->ledger,
ledger_idx_composite,
delayed_ledger_update);
}
} else {
/* more external wired bytes */
ledger_credit(owner->ledger,
ledger_idx_external_wired,
delayed_ledger_update);
if (do_footprint) {
/* more footprint */
ledger_credit(owner->ledger,
task_ledgers.phys_footprint,
delayed_ledger_update);
} else if (ledger_idx_composite != -1) {
ledger_credit(owner->ledger,
ledger_idx_composite,
delayed_ledger_update);
}
}
}
assert(page_grab_count);
*page_grab_count = pages_inserted;
return ret;
}
kern_return_t
vm_object_iopl_request(
vm_object_t object,
vm_object_offset_t offset,
upl_size_t size,
upl_t *upl_ptr,
upl_page_info_array_t user_page_list,
unsigned int *page_list_count,
upl_control_flags_t cntrl_flags,
vm_tag_t tag)
{
vm_page_t dst_page;
vm_object_offset_t dst_offset;
upl_size_t xfer_size;
upl_t upl = NULL;
unsigned int entry;
int no_zero_fill = FALSE;
unsigned int size_in_pages;
int page_grab_count = 0;
u_int32_t psize;
kern_return_t ret;
vm_prot_t prot;
struct vm_object_fault_info fault_info = {};
struct vm_page_delayed_work dw_array;
struct vm_page_delayed_work *dwp, *dwp_start;
bool dwp_finish_ctx = TRUE;
int dw_count;
int dw_limit;
int dw_index;
boolean_t caller_lookup;
int io_tracking_flag = 0;
int interruptible;
ppnum_t phys_page;
boolean_t set_cache_attr_needed = FALSE;
boolean_t free_wired_pages = FALSE;
boolean_t fast_path_empty_req = FALSE;
boolean_t fast_path_full_req = FALSE;
#if DEVELOPMENT || DEBUG
task_t task = current_task();
#endif /* DEVELOPMENT || DEBUG */
dwp_start = dwp = NULL;
vm_object_offset_t original_offset = offset;
upl_size_t original_size = size;
// DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
offset = vm_object_trunc_page(offset);
if (size != original_size || offset != original_offset) {
DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
}
if (cntrl_flags & ~UPL_VALID_FLAGS) {
/*
* For forward compatibility's sake,
* reject any unknown flag.
*/
return KERN_INVALID_VALUE;
}
if (vm_lopage_needed == FALSE) {
cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
}
if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
return KERN_INVALID_VALUE;
}
if (object->phys_contiguous) {
if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
return KERN_INVALID_ADDRESS;
}
if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
return KERN_INVALID_ADDRESS;
}
}
}
if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
no_zero_fill = TRUE;
}
if (cntrl_flags & UPL_COPYOUT_FROM) {
prot = VM_PROT_READ;
} else {
prot = VM_PROT_READ | VM_PROT_WRITE;
}
if ((!object->internal) && (object->paging_offset != 0)) {
panic("vm_object_iopl_request: external object with non-zero paging offset");
}
VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
#if CONFIG_IOSCHED || UPL_DEBUG
if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
io_tracking_flag |= UPL_CREATE_IO_TRACKING;
}
#endif
#if CONFIG_IOSCHED
if (object->io_tracking) {
/* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
if (!is_kernel_object(object)) {
io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
}
}
#endif
if (object->phys_contiguous) {
psize = PAGE_SIZE;
} else {
psize = size;
dw_count = 0;
dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
dwp_start = vm_page_delayed_work_get_ctx();
if (dwp_start == NULL) {
dwp_start = &dw_array;
dw_limit = 1;
dwp_finish_ctx = FALSE;
}
dwp = dwp_start;
}
if (cntrl_flags & UPL_SET_INTERNAL) {
upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
user_page_list = size ? upl->page_list : NULL;
} else {
upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
}
if (user_page_list) {
user_page_list[0].device = FALSE;
}
*upl_ptr = upl;
if (cntrl_flags & UPL_NOZEROFILLIO) {
DTRACE_VM4(upl_nozerofillio,
vm_object_t, object,
vm_object_offset_t, offset,
upl_size_t, size,
upl_t, upl);
}
upl->map_object = object;
upl->u_offset = original_offset;
upl->u_size = original_size;
size_in_pages = size / PAGE_SIZE;
if (is_kernel_object(object) &&
!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
upl->flags |= UPL_KERNEL_OBJECT;
#if UPL_DEBUG
vm_object_lock(object);
#else
vm_object_lock_shared(object);
#endif
} else {
vm_object_lock(object);
vm_object_activity_begin(object);
}
/*
* paging in progress also protects the paging_offset
*/
upl->u_offset = original_offset + object->paging_offset;
if (cntrl_flags & UPL_BLOCK_ACCESS) {
/*
 * The user requested that access to the pages in this UPL
 * be blocked until the UPL is committed or aborted.
 */
upl->flags |= UPL_ACCESS_BLOCKED;
}
#if CONFIG_IOSCHED || UPL_DEBUG
if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
vm_object_activity_begin(object);
queue_enter(&object->uplq, upl, upl_t, uplq);
}
#endif
if (object->phys_contiguous) {
if (upl->flags & UPL_ACCESS_BLOCKED) {
assert(!object->blocked_access);
object->blocked_access = TRUE;
}
vm_object_unlock(object);
/*
* don't need any shadow mappings for this one
* since it is already I/O memory
*/
upl->flags |= UPL_DEVICE_MEMORY;
upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
if (user_page_list) {
user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
user_page_list[0].device = TRUE;
}
if (page_list_count != NULL) {
if (upl->flags & UPL_INTERNAL) {
*page_list_count = 0;
} else {
*page_list_count = 1;
}
}
VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
if (task != NULL) {
ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
}
#endif /* DEVELOPMENT || DEBUG */
return KERN_SUCCESS;
}
if (!is_kernel_object(object) && object != compressor_object) {
/*
* Protect user space from future COW operations
*/
#if VM_OBJECT_TRACKING_OP_TRUESHARE
if (!object->true_share &&
vm_object_tracking_btlog) {
btlog_record(vm_object_tracking_btlog, object,
VM_OBJECT_TRACKING_OP_TRUESHARE,
btref_get(__builtin_frame_address(0), 0));
}
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
vm_object_lock_assert_exclusive(object);
VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
}
}
if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
object->vo_copy != VM_OBJECT_NULL) {
/*
* Honor copy-on-write obligations
*
* The caller is gathering these pages and
* might modify their contents. We need to
* make sure that the copy object has its own
* private copies of these pages before we let
* the caller modify them.
*
* NOTE: someone else could map the original object
* after we've done this copy-on-write here, and they
* could then see an inconsistent picture of the memory
* while it's being modified via the UPL. To prevent this,
* we would have to block access to these pages until the
* UPL is released. We could use the UPL_BLOCK_ACCESS
* code path for that...
*/
vm_object_update(object,
offset,
size,
NULL,
NULL,
FALSE, /* should_return */
MEMORY_OBJECT_COPY_SYNC,
VM_PROT_NO_CHANGE);
VM_PAGEOUT_DEBUG(iopl_cow, 1);
VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
}
if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
object->purgable != VM_PURGABLE_VOLATILE &&
object->purgable != VM_PURGABLE_EMPTY &&
object->vo_copy == NULL &&
size == object->vo_size &&
offset == 0 &&
object->shadow == NULL &&
object->pager == NULL) {
if (object->resident_page_count == size_in_pages) {
assert(object != compressor_object);
assert(!is_kernel_object(object));
fast_path_full_req = TRUE;
} else if (object->resident_page_count == 0) {
assert(object != compressor_object);
assert(!is_kernel_object(object));
fast_path_empty_req = TRUE;
set_cache_attr_needed = TRUE;
}
}
if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
interruptible = THREAD_ABORTSAFE;
} else {
interruptible = THREAD_UNINT;
}
entry = 0;
xfer_size = size;
dst_offset = offset;
if (fast_path_full_req) {
if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
goto finish;
}
/*
* we couldn't complete the processing of this request on the fast path
* so fall through to the slow path and finish up
*/
} else if (fast_path_empty_req) {
if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
ret = KERN_MEMORY_ERROR;
goto return_err;
}
ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
if (ret) {
free_wired_pages = TRUE;
goto return_err;
}
goto finish;
}
fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
fault_info.lo_offset = offset;
fault_info.hi_offset = offset + xfer_size;
fault_info.mark_zf_absent = TRUE;
fault_info.interruptible = interruptible;
fault_info.batch_pmap_op = TRUE;
while (xfer_size) {
vm_fault_return_t result;
dwp->dw_mask = 0;
if (fast_path_full_req) {
/*
* if we get here, it means that we ran into a page
* state we couldn't handle in the fast path and
* bailed out to the slow path... since the order
* we look at pages is different between the 2 paths,
* the following check is needed to determine whether
* this page was already processed in the fast path
*/
if (bitmap_test(upl->lite_list, entry)) {
goto skip_page;
}
}
dst_page = vm_page_lookup(object, dst_offset);
if (dst_page == VM_PAGE_NULL ||
dst_page->vmp_busy ||
VMP_ERROR_GET(dst_page) ||
dst_page->vmp_restart ||
dst_page->vmp_absent ||
dst_page->vmp_fictitious) {
if (is_kernel_object(object)) {
panic("vm_object_iopl_request: missing/bad page in kernel object");
}
if (object == compressor_object) {
panic("vm_object_iopl_request: missing/bad page in compressor object");
}
if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
ret = KERN_MEMORY_ERROR;
goto return_err;
}
set_cache_attr_needed = TRUE;
/*
* We just looked up the page and the result remains valid
* until the object lock is released, so send it to
* vm_fault_page() (as "dst_page"), to avoid having to
* look it up again there.
*/
caller_lookup = TRUE;
do {
vm_page_t top_page;
kern_return_t error_code;
fault_info.cluster_size = xfer_size;
vm_object_paging_begin(object);
result = vm_fault_page(object, dst_offset,
prot | VM_PROT_WRITE, FALSE,
caller_lookup,
&prot, &dst_page, &top_page,
(int *)0,
&error_code, no_zero_fill,
&fault_info);
/* our lookup is no longer valid at this point */
caller_lookup = FALSE;
switch (result) {
case VM_FAULT_SUCCESS:
page_grab_count++;
if (!dst_page->vmp_absent) {
vm_page_wakeup_done(object, dst_page);
} else {
/*
* we only get back an absent page if we
* requested that it not be zero-filled
* because we are about to fill it via I/O
*
* absent pages should be left BUSY
* to prevent them from being faulted
* into an address space before we've
* had a chance to complete the I/O on
* them since they may contain info that
* shouldn't be seen by the faulting task
*/
}
/*
* Release paging references and
* top-level placeholder page, if any.
*/
if (top_page != VM_PAGE_NULL) {
vm_object_t local_object;
local_object = VM_PAGE_OBJECT(top_page);
/*
* comparing 2 packed pointers
*/
if (top_page->vmp_object != dst_page->vmp_object) {
vm_object_lock(local_object);
VM_PAGE_FREE(top_page);
vm_object_paging_end(local_object);
vm_object_unlock(local_object);
} else {
VM_PAGE_FREE(top_page);
vm_object_paging_end(local_object);
}
}
vm_object_paging_end(object);
break;
case VM_FAULT_RETRY:
vm_object_lock(object);
break;
case VM_FAULT_MEMORY_SHORTAGE:
OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
if (vm_page_wait(interruptible)) {
OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
vm_object_lock(object);
break;
}
OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
VM_DEBUG_EVENT(vm_iopl_page_wait, DBG_VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
OS_FALLTHROUGH;
case VM_FAULT_INTERRUPTED:
error_code = MACH_SEND_INTERRUPTED;
OS_FALLTHROUGH;
case VM_FAULT_MEMORY_ERROR:
memory_error:
ret = (error_code ? error_code : KERN_MEMORY_ERROR);
vm_object_lock(object);
goto return_err;
case VM_FAULT_SUCCESS_NO_VM_PAGE:
/* success but no page: fail */
vm_object_paging_end(object);
vm_object_unlock(object);
goto memory_error;
default:
panic("vm_object_iopl_request: unexpected error"
" 0x%x from vm_fault_page()\n", result);
}
} while (result != VM_FAULT_SUCCESS);
}
phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
if (upl->flags & UPL_KERNEL_OBJECT) {
goto record_phys_addr;
}
if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
dst_page->vmp_busy = TRUE;
goto record_phys_addr;
}
if (dst_page->vmp_cleaning) {
/*
* Someone else is cleaning this page in place.
* In theory, we could proceed and use this page, but the
* cleaner will probably clear the "busy" bit on it in
* upl_commit_range() even though they didn't set it; that
* would clear our "busy" bit and open us up to race
* conditions.
* We'd better wait for the cleaning to complete and
* then try again.
*/
VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
vm_page_sleep(object, dst_page, THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
continue;
}
if (dst_page->vmp_laundry) {
vm_pageout_steal_laundry(dst_page, FALSE);
}
if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
vm_page_t low_page;
int refmod;
/*
* Support devices that can't DMA above 32 bits by
* substituting pages from a pool of low-address memory
* for any pages we find above the 4G mark.
* We can't substitute if the page is already wired, because
* we don't know whether that physical address has already
* been handed out to some other 64-bit-capable DMA device.
*/
if (VM_PAGE_WIRED(dst_page)) {
ret = KERN_PROTECTION_FAILURE;
goto return_err;
}
low_page = vm_page_grablo();
if (low_page == VM_PAGE_NULL) {
ret = KERN_RESOURCE_SHORTAGE;
goto return_err;
}
/*
* from here until the vm_page_replace completes
* we mustn't drop the object lock... we don't
* want anyone refaulting this page in and using
* it after we disconnect it... we want the fault
* to find the new page being substituted.
*/
if (dst_page->vmp_pmapped) {
refmod = pmap_disconnect(phys_page);
} else {
refmod = 0;
}
if (!dst_page->vmp_absent) {
vm_page_copy(dst_page, low_page);
}
low_page->vmp_reference = dst_page->vmp_reference;
low_page->vmp_dirty = dst_page->vmp_dirty;
low_page->vmp_absent = dst_page->vmp_absent;
if (refmod & VM_MEM_REFERENCED) {
low_page->vmp_reference = TRUE;
}
if (refmod & VM_MEM_MODIFIED) {
SET_PAGE_DIRTY(low_page, FALSE);
}
vm_page_replace(low_page, object, dst_offset);
dst_page = low_page;
/*
* vm_page_grablo returned the page marked
* BUSY... we don't need a vm_page_wakeup_done()
* here, because we've never dropped the object lock
*/
if (!dst_page->vmp_absent) {
dst_page->vmp_busy = FALSE;
}
phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
}
if (!dst_page->vmp_busy) {
dwp->dw_mask |= DW_vm_page_wire;
}
if (cntrl_flags & UPL_BLOCK_ACCESS) {
/*
* Mark the page "busy" to block any future page fault
* on this page in addition to wiring it.
* We'll also remove the mapping
* of all these pages before leaving this routine.
*/
assert(!dst_page->vmp_fictitious);
dst_page->vmp_busy = TRUE;
}
/*
* expect the page to be used
* page queues lock must be held to set 'reference'
*/
dwp->dw_mask |= DW_set_reference;
if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
SET_PAGE_DIRTY(dst_page, TRUE);
/*
* Page belonging to a code-signed object is about to
* be written. Mark it tainted and disconnect it from
* all pmaps so processes have to fault it back in and
* deal with the tainted bit.
*/
if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
vm_page_iopl_tainted++;
if (dst_page->vmp_pmapped) {
int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
if (refmod & VM_MEM_REFERENCED) {
dst_page->vmp_reference = TRUE;
}
}
}
}
if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
pmap_sync_page_attributes_phys(phys_page);
dst_page->vmp_written_by_kernel = FALSE;
}
record_phys_addr:
if (dst_page->vmp_busy) {
upl->flags |= UPL_HAS_BUSY;
}
bitmap_set(upl->lite_list, entry);
if (phys_page > upl->highest_page) {
upl->highest_page = phys_page;
}
if (user_page_list) {
user_page_list[entry].phys_addr = phys_page;
user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
user_page_list[entry].absent = dst_page->vmp_absent;
user_page_list[entry].dirty = dst_page->vmp_dirty;
user_page_list[entry].precious = dst_page->vmp_precious;
user_page_list[entry].device = FALSE;
user_page_list[entry].needed = FALSE;
if (dst_page->vmp_clustered == TRUE) {
user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
} else {
user_page_list[entry].speculative = FALSE;
}
user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
user_page_list[entry].mark = FALSE;
}
if (!is_kernel_object(object) && object != compressor_object) {
/*
* someone is explicitly grabbing this page...
* update clustered and speculative state
*
*/
if (dst_page->vmp_clustered) {
VM_PAGE_CONSUME_CLUSTERED(dst_page);
}
}
skip_page:
entry++;
dst_offset += PAGE_SIZE_64;
xfer_size -= PAGE_SIZE;
if (dwp->dw_mask) {
VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
if (dw_count >= dw_limit) {
vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
dwp = dwp_start;
dw_count = 0;
}
}
}
assert(entry == size_in_pages);
if (dw_count) {
vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
dwp = dwp_start;
dw_count = 0;
}
finish:
if (user_page_list && set_cache_attr_needed == TRUE) {
vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
}
if (page_list_count != NULL) {
if (upl->flags & UPL_INTERNAL) {
*page_list_count = 0;
} else if (*page_list_count > size_in_pages) {
*page_list_count = size_in_pages;
}
}
vm_object_unlock(object);
if (cntrl_flags & UPL_BLOCK_ACCESS) {
/*
* We've marked all the pages "busy" so that future
* page faults will block.
* Now remove the mapping for these pages, so that they
* can't be accessed without causing a page fault.
*/
vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
PMAP_NULL,
PAGE_SIZE,
0, VM_PROT_NONE);
assert(!object->blocked_access);
object->blocked_access = TRUE;
}
VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
#if DEVELOPMENT || DEBUG
if (task != NULL) {
ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
}
#endif /* DEVELOPMENT || DEBUG */
if (dwp_start && dwp_finish_ctx) {
vm_page_delayed_work_finish_ctx(dwp_start);
dwp_start = dwp = NULL;
}
return KERN_SUCCESS;
return_err:
dw_index = 0;
for (; offset < dst_offset; offset += PAGE_SIZE) {
boolean_t need_unwire;
dst_page = vm_page_lookup(object, offset);
if (dst_page == VM_PAGE_NULL) {
panic("vm_object_iopl_request: Wired page missing.");
}
/*
* if we've already processed this page in an earlier
* dw_do_work, we need to undo the wiring... we will
* leave the dirty and reference bits on if they
* were set, since we don't have a good way of knowing
* what the previous state was and we won't get here
* under any normal circumstances... we will always
* clear BUSY and wake up any waiters via vm_page_free
* or vm_page_wakeup_done
*/
need_unwire = TRUE;
if (dw_count) {
if ((dwp_start)[dw_index].dw_m == dst_page) {
/*
* still in the deferred work list
* which means we haven't yet called
* vm_page_wire on this page
*/
need_unwire = FALSE;
dw_index++;
dw_count--;
}
}
vm_page_lock_queues();
if (dst_page->vmp_absent || free_wired_pages == TRUE) {
vm_page_free(dst_page);
need_unwire = FALSE;
} else {
if (need_unwire == TRUE) {
vm_page_unwire(dst_page, TRUE);
}
vm_page_wakeup_done(object, dst_page);
}
vm_page_unlock_queues();
if (need_unwire == TRUE) {
counter_inc(&vm_statistics_reactivations);
}
}
#if UPL_DEBUG
upl->upl_state = 2;
#endif
if (!(upl->flags & UPL_KERNEL_OBJECT)) {
vm_object_activity_end(object);
vm_object_collapse(object, 0, TRUE);
}
vm_object_unlock(object);
upl_destroy(upl);
VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
#if DEVELOPMENT || DEBUG
if (task != NULL) {
ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
}
#endif /* DEVELOPMENT || DEBUG */
if (dwp_start && dwp_finish_ctx) {
vm_page_delayed_work_finish_ctx(dwp_start);
dwp_start = dwp = NULL;
}
return ret;
}
kern_return_t
upl_transpose(
upl_t upl1,
upl_t upl2)
{
kern_return_t retval;
boolean_t upls_locked;
vm_object_t object1, object2;
/* LD: Should mapped UPLs be eligible for a transpose? */
if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
return KERN_INVALID_ARGUMENT;
}
upls_locked = FALSE;
/*
* Since we need to lock both UPLs at the same time,
* avoid deadlocks by always taking locks in the same order.
*/
if (upl1 < upl2) {
upl_lock(upl1);
upl_lock(upl2);
} else {
upl_lock(upl2);
upl_lock(upl1);
}
upls_locked = TRUE; /* the UPLs will need to be unlocked */
object1 = upl1->map_object;
object2 = upl2->map_object;
if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
upl1->u_size != upl2->u_size) {
/*
* We deal only with full objects, not subsets.
* That's because we exchange the entire backing store info
* for the objects: pager, resident pages, etc... We can't do
* only part of it.
*/
retval = KERN_INVALID_VALUE;
goto done;
}
/*
* Transpose the VM objects' backing store.
*/
retval = vm_object_transpose(object1, object2,
upl_adjusted_size(upl1, PAGE_MASK));
if (retval == KERN_SUCCESS) {
/*
* Make each UPL point to the correct VM object, i.e. the
* object holding the pages that the UPL refers to...
*/
#if CONFIG_IOSCHED || UPL_DEBUG
if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
vm_object_lock(object1);
vm_object_lock(object2);
}
if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
queue_remove(&object1->uplq, upl1, upl_t, uplq);
}
if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
queue_remove(&object2->uplq, upl2, upl_t, uplq);
}
#endif
upl1->map_object = object2;
upl2->map_object = object1;
#if CONFIG_IOSCHED || UPL_DEBUG
if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
queue_enter(&object2->uplq, upl1, upl_t, uplq);
}
if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
queue_enter(&object1->uplq, upl2, upl_t, uplq);
}
if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
vm_object_unlock(object2);
vm_object_unlock(object1);
}
#endif
}
done:
/*
* Cleanup.
*/
if (upls_locked) {
upl_unlock(upl1);
upl_unlock(upl2);
upls_locked = FALSE;
}
return retval;
}
void
upl_range_needed(
upl_t upl,
int index,
int count)
{
int size_in_pages;
if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
return;
}
size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
while (count-- && index < size_in_pages) {
upl->page_list[index++].needed = TRUE;
}
}
/*
* Reserve of virtual addresses in the kernel address space.
* We need to map the physical pages in the kernel, so that we
* can call the code-signing or slide routines with a kernel
* virtual address. We keep this pool of pre-allocated kernel
* virtual addresses so that we don't have to scan the kernel's
* virtual address space each time we need to work with
* a physical page.
*/
SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
#define VM_PAGING_NUM_PAGES 64
SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
bool vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
int vm_paging_max_index = 0;
int vm_paging_page_waiter = 0;
int vm_paging_page_waiter_total = 0;
unsigned long vm_paging_no_kernel_page = 0;
unsigned long vm_paging_objects_mapped = 0;
unsigned long vm_paging_pages_mapped = 0;
unsigned long vm_paging_objects_mapped_slow = 0;
unsigned long vm_paging_pages_mapped_slow = 0;
__startup_func
static void
vm_paging_map_init(void)
{
kmem_alloc(kernel_map, &vm_paging_base_address,
ptoa(VM_PAGING_NUM_PAGES),
KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
VM_KERN_MEMORY_NONE);
}
STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
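/*
 * Descriptive note on the pool layout: each in-use slot i of this pool
 * corresponds to the kernel virtual address
 *	vm_paging_base_address + (i * PAGE_SIZE)
 * and vm_paging_unmap_object() recovers the slot index with the inverse,
 *	i = (start - vm_paging_base_address) >> PAGE_SHIFT
 * so vm_paging_page_inuse[] is the only per-slot state that needs to be kept.
 */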
/*
* vm_paging_map_object:
* Maps part of a VM object's pages in the kernel
* virtual address space, using the pre-allocated
* kernel virtual addresses, if possible.
* Context:
* The VM object is locked. This lock will get
* dropped and re-acquired though, so the caller
* must make sure the VM object is kept alive
* (by holding a VM map that has a reference
* on it, for example, or taking an extra reference).
* The page should also be kept busy to prevent
* it from being reclaimed.
*/
kern_return_t
vm_paging_map_object(
vm_page_t page,
vm_object_t object,
vm_object_offset_t offset,
vm_prot_t protection,
boolean_t can_unlock_object,
vm_map_size_t *size, /* IN/OUT */
vm_map_offset_t *address, /* OUT */
boolean_t *need_unmap) /* OUT */
{
kern_return_t kr;
vm_map_offset_t page_map_offset;
vm_map_size_t map_size;
vm_object_offset_t object_offset;
int i;
if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
/* use permanent 1-to-1 kernel mapping of physical memory ? */
*address = (vm_map_offset_t)
phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
*need_unmap = FALSE;
return KERN_SUCCESS;
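/*
 * NOTE: the early return above means the pre-allocated pool path that
 * follows is unreachable here; single-page requests are always satisfied
 * through the permanent physical-memory mapping.
 */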
assert(page->vmp_busy);
/*
* Use one of the pre-allocated kernel virtual addresses
* and just enter the VM page in the kernel address space
* at that virtual address.
*/
simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
/*
* Try to find an available kernel virtual address
* from our pre-allocated pool.
*/
page_map_offset = 0;
for (;;) {
for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
if (vm_paging_page_inuse[i] == FALSE) {
page_map_offset =
vm_paging_base_address +
(i * PAGE_SIZE);
break;
}
}
if (page_map_offset != 0) {
/* found a space to map our page ! */
break;
}
if (can_unlock_object) {
/*
* If we can afford to unlock the VM object,
* let's take the slow path now...
*/
break;
}
/*
* We can't afford to unlock the VM object, so
* let's wait for a space to become available...
*/
vm_paging_page_waiter_total++;
vm_paging_page_waiter++;
kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
if (kr == THREAD_WAITING) {
simple_unlock(&vm_paging_lock);
kr = thread_block(THREAD_CONTINUE_NULL);
simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
}
vm_paging_page_waiter--;
/* ... and try again */
}
if (page_map_offset != 0) {
/*
* We found a kernel virtual address;
* map the physical page to that virtual address.
*/
if (i > vm_paging_max_index) {
vm_paging_max_index = i;
}
vm_paging_page_inuse[i] = TRUE;
simple_unlock(&vm_paging_lock);
page->vmp_pmapped = TRUE;
/*
* Keep the VM object locked over the PMAP_ENTER
* and the actual use of the page by the kernel,
* or this pmap mapping might get undone by a
* vm_object_pmap_protect() call...
*/
kr = pmap_enter_check(kernel_pmap,
page_map_offset,
page,
protection,
VM_PROT_NONE,
0,
TRUE);
assert(kr == KERN_SUCCESS);
vm_paging_objects_mapped++;
vm_paging_pages_mapped++;
*address = page_map_offset;
*need_unmap = TRUE;
#if KASAN
kasan_notify_address(page_map_offset, PAGE_SIZE);
#endif
/* all done and mapped, ready to use ! */
return KERN_SUCCESS;
}
/*
* We ran out of pre-allocated kernel virtual
* addresses. Just map the page in the kernel
* the slow and regular way.
*/
vm_paging_no_kernel_page++;
simple_unlock(&vm_paging_lock);
}
if (!can_unlock_object) {
*address = 0;
*size = 0;
*need_unmap = FALSE;
return KERN_NOT_SUPPORTED;
}
object_offset = vm_object_trunc_page(offset);
map_size = vm_map_round_page(*size,
VM_MAP_PAGE_MASK(kernel_map));
/*
* Try to map the required range of the object
* in the kernel_map. Given that allocation is
* for pageable memory, it shouldn't contain
* pointers and is mapped into the data range.
*/
vm_object_reference_locked(object); /* for the map entry */
vm_object_unlock(object);
kr = vm_map_enter(kernel_map,
address,
map_size,
0,
VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
object,
object_offset,
FALSE,
protection,
VM_PROT_ALL,
VM_INHERIT_NONE);
if (kr != KERN_SUCCESS) {
*address = 0;
*size = 0;
*need_unmap = FALSE;
vm_object_deallocate(object); /* for the map entry */
vm_object_lock(object);
return kr;
}
*size = map_size;
/*
* Enter the mapped pages in the page table now.
*/
vm_object_lock(object);
/*
* VM object must be kept locked from before PMAP_ENTER()
* until after the kernel is done accessing the page(s).
* Otherwise, the pmap mappings in the kernel could be
* undone by a call to vm_object_pmap_protect().
*/
for (page_map_offset = 0;
map_size != 0;
map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
page = vm_page_lookup(object, offset + page_map_offset);
if (page == VM_PAGE_NULL) {
printf("vm_paging_map_object: no page !?");
vm_object_unlock(object);
vm_map_remove(kernel_map, *address, *size);
*address = 0;
*size = 0;
*need_unmap = FALSE;
vm_object_lock(object);
return KERN_MEMORY_ERROR;
}
page->vmp_pmapped = TRUE;
kr = pmap_enter_check(kernel_pmap,
*address + page_map_offset,
page,
protection,
VM_PROT_NONE,
0,
TRUE);
assert(kr == KERN_SUCCESS);
#if KASAN
kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
#endif
}
vm_paging_objects_mapped_slow++;
vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
*need_unmap = TRUE;
return KERN_SUCCESS;
}
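/*
 * Sketch of the expected caller pattern for the two routines around this
 * point (illustrative only; "kr", "object", "page" and "offset" are assumed
 * to be set up by a hypothetical caller):
 *
 *	vm_map_size_t	size = PAGE_SIZE;
 *	vm_map_offset_t	kva;
 *	boolean_t	need_unmap;
 *
 *	vm_object_lock(object);
 *	kr = vm_paging_map_object(page, object, offset, VM_PROT_READ,
 *	    FALSE, &size, &kva, &need_unmap);
 *	if (kr == KERN_SUCCESS) {
 *		... access the page through "kva" with the object locked ...
 *		if (need_unmap) {
 *			vm_paging_unmap_object(object, kva, kva + size);
 *		}
 *	}
 *	vm_object_unlock(object);
 */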
/*
* vm_paging_unmap_object:
* Unmaps part of a VM object's pages from the kernel
* virtual address space.
* Context:
* The VM object is locked. This lock will get
* dropped and re-acquired though.
*/
void
vm_paging_unmap_object(
vm_object_t object,
vm_map_offset_t start,
vm_map_offset_t end)
{
int i;
if ((vm_paging_base_address == 0) ||
(start < vm_paging_base_address) ||
(end > (vm_paging_base_address
+ (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
/*
* We didn't use our pre-allocated pool of
* kernel virtual addresses. Deallocate the
* virtual memory.
*/
if (object != VM_OBJECT_NULL) {
vm_object_unlock(object);
}
vm_map_remove(kernel_map, start, end);
if (object != VM_OBJECT_NULL) {
vm_object_lock(object);
}
} else {
/*
* We used a kernel virtual address from our
* pre-allocated pool. Put it back in the pool
* for next time.
*/
assert(end - start == PAGE_SIZE);
i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
/* undo the pmap mapping */
pmap_remove(kernel_pmap, start, end);
simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
vm_paging_page_inuse[i] = FALSE;
if (vm_paging_page_waiter) {
thread_wakeup(&vm_paging_page_waiter);
}
simple_unlock(&vm_paging_lock);
}
}
/*
* page->vmp_object must be locked
*/
void
vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
{
if (!queues_locked) {
vm_page_lockspin_queues();
}
page->vmp_free_when_done = FALSE;
/*
* need to drop the laundry count...
* we may also need to remove it
* from the I/O paging queue...
* vm_pageout_throttle_up handles both cases
*
* the laundry and pageout_queue flags are cleared...
*/
vm_pageout_throttle_up(page);
if (!queues_locked) {
vm_page_unlock_queues();
}
}
#define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
upl_t
vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
{
int i = 0;
upl_t upl;
assert(max_upls > 0);
if (max_upls == 0) {
return NULL;
}
if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
}
vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
upl = upl_create(0, UPL_VECTOR, 0);
upl->vector_upl = vector_upl;
upl->u_offset = upl_offset;
vector_upl->size = 0;
vector_upl->offset = upl_offset;
vector_upl->invalid_upls = 0;
vector_upl->num_upls = 0;
vector_upl->pagelist = NULL;
vector_upl->max_upls = max_upls;
for (i = 0; i < max_upls; i++) {
vector_upl->upls[i].iostate.size = 0;
vector_upl->upls[i].iostate.offset = 0;
}
return upl;
}
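/*
 * Illustrative lifecycle of a vector UPL, pieced together from the routines
 * below (the calling sequence is a sketch, not lifted from an actual caller;
 * "subupl" and the offsets/sizes are hypothetical):
 *
 *	upl_t vupl = vector_upl_create(upl_offset, 8);
 *
 *	// attach each sub-UPL as it is created, growing the vector
 *	vector_upl_set_subupl(vupl, subupl, subupl_io_size);
 *	vector_upl_set_iostate(vupl, subupl, io_offset, io_size);
 *
 *	// once all sub-UPLs are attached, build the combined page list
 *	vector_upl_set_pagelist(vupl);
 *
 *	// committing/aborting a sub-UPL removes it by passing io_size == 0;
 *	// a TRUE return means the last sub-UPL is gone and the vector UPL
 *	// can be torn down (upl_destroy() ends up in vector_upl_deallocate())
 *	if (vector_upl_set_subupl(vupl, subupl, 0) == TRUE) {
 *		... tear down vupl ...
 *	}
 */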
upl_size_t
vector_upl_get_size(const upl_t upl)
{
if (!vector_upl_is_valid(upl)) {
return upl_get_size(upl);
} else {
return round_page_32(upl->vector_upl->size);
}
}
uint32_t
vector_upl_max_upls(const upl_t upl)
{
if (!vector_upl_is_valid(upl)) {
return 0;
}
return ((vector_upl_t)(upl->vector_upl))->max_upls;
}
void
vector_upl_deallocate(upl_t upl)
{
vector_upl_t vector_upl = upl->vector_upl;
assert(vector_upl_is_valid(upl));
if (vector_upl->invalid_upls != vector_upl->num_upls) {
panic("Deallocating non-empty Vectored UPL");
}
uint32_t max_upls = vector_upl->max_upls;
kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
upl->vector_upl = NULL;
}
boolean_t
vector_upl_is_valid(upl_t upl)
{
return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
}
boolean_t
vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
{
if (vector_upl_is_valid(upl)) {
vector_upl_t vector_upl = upl->vector_upl;
if (vector_upl) {
if (subupl) {
if (io_size) {
if (io_size < PAGE_SIZE) {
io_size = PAGE_SIZE;
}
subupl->vector_upl = (void*)vector_upl;
vector_upl->upls[vector_upl->num_upls++].elem = subupl;
vector_upl->size += io_size;
upl->u_size += io_size;
} else {
uint32_t i = 0, invalid_upls = 0;
for (i = 0; i < vector_upl->num_upls; i++) {
if (vector_upl->upls[i].elem == subupl) {
break;
}
}
if (i == vector_upl->num_upls) {
panic("Trying to remove sub-upl when none exists");
}
vector_upl->upls[i].elem = NULL;
invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
relaxed);
if (invalid_upls == vector_upl->num_upls) {
return TRUE;
} else {
return FALSE;
}
}
} else {
panic("vector_upl_set_subupl was passed a NULL upl element");
}
} else {
panic("vector_upl_set_subupl was passed a non-vectored upl");
}
} else {
panic("vector_upl_set_subupl was passed a NULL upl");
}
return FALSE;
}
void
vector_upl_set_pagelist(upl_t upl)
{
if (vector_upl_is_valid(upl)) {
uint32_t i = 0;
vector_upl_t vector_upl = upl->vector_upl;
if (vector_upl) {
vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
vector_upl->pagelist = kalloc_type(struct upl_page_info,
atop(vector_upl->size), Z_WAITOK);
for (i = 0; i < vector_upl->num_upls; i++) {
cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
pagelist_size += cur_upl_pagelist_size;
if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
upl->highest_page = vector_upl->upls[i].elem->highest_page;
}
}
assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
} else {
panic("vector_upl_set_pagelist was passed a non-vectored upl");
}
} else {
panic("vector_upl_set_pagelist was passed a NULL upl");
}
}
upl_t
vector_upl_subupl_byindex(upl_t upl, uint32_t index)
{
if (vector_upl_is_valid(upl)) {
vector_upl_t vector_upl = upl->vector_upl;
if (vector_upl) {
if (index < vector_upl->num_upls) {
return vector_upl->upls[index].elem;
}
} else {
panic("vector_upl_subupl_byindex was passed a non-vectored upl");
}
}
return NULL;
}
upl_t
vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
{
if (vector_upl_is_valid(upl)) {
uint32_t i = 0;
vector_upl_t vector_upl = upl->vector_upl;
if (vector_upl) {
upl_t subupl = NULL;
vector_upl_iostates_t subupl_state;
for (i = 0; i < vector_upl->num_upls; i++) {
subupl = vector_upl->upls[i].elem;
subupl_state = vector_upl->upls[i].iostate;
if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
/* We could have been passed an offset/size pair that belongs
* to a UPL element that has already been committed/aborted.
* If so, return NULL.
*/
if (subupl == NULL) {
return NULL;
}
if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
*upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
if (*upl_size > subupl_state.size) {
*upl_size = subupl_state.size;
}
}
if (*upl_offset >= subupl_state.offset) {
*upl_offset -= subupl_state.offset;
} else if (i) {
panic("Vector UPL offset miscalculation");
}
return subupl;
}
}
} else {
panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
}
}
return NULL;
}
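/*
 * Worked example (hypothetical iostates): with two sub-UPLs whose iostates
 * are {offset 0x0000, size 0x4000} and {offset 0x4000, size 0x8000}, a call
 * with *upl_offset == 0x5000 and *upl_size == 0x2000 skips the first element
 * (0x5000 > 0x3fff), matches the second, leaves *upl_size at 0x2000 because
 * the sub-UPL's span (ending at 0xc000) covers the whole request (ending at
 * 0x7000), rebases *upl_offset to 0x1000 and returns the second sub-UPL.
 */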
void
vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
{
*v_upl_submap = NULL;
if (vector_upl_is_valid(upl)) {
vector_upl_t vector_upl = upl->vector_upl;
if (vector_upl) {
*v_upl_submap = vector_upl->submap;
*submap_dst_addr = vector_upl->submap_dst_addr;
} else {
panic("vector_upl_get_submap was passed a non-vectored UPL");
}
} else {
panic("vector_upl_get_submap was passed a null UPL");
}
}
void
vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
{
if (vector_upl_is_valid(upl)) {
vector_upl_t vector_upl = upl->vector_upl;
if (vector_upl) {
vector_upl->submap = submap;
vector_upl->submap_dst_addr = submap_dst_addr;
} else {
panic("vector_upl_get_submap was passed a non-vectored UPL");
}
} else {
panic("vector_upl_get_submap was passed a NULL UPL");
}
}
void
vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
{
if (vector_upl_is_valid(upl)) {
uint32_t i = 0;
vector_upl_t vector_upl = upl->vector_upl;
if (vector_upl) {
for (i = 0; i < vector_upl->num_upls; i++) {
if (vector_upl->upls[i].elem == subupl) {
break;
}
}
if (i == vector_upl->num_upls) {
panic("setting sub-upl iostate when none exists");
}
vector_upl->upls[i].iostate.offset = offset;
if (size < PAGE_SIZE) {
size = PAGE_SIZE;
}
vector_upl->upls[i].iostate.size = size;
} else {
panic("vector_upl_set_iostate was passed a non-vectored UPL");
}
} else {
panic("vector_upl_set_iostate was passed a NULL UPL");
}
}
void
vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
{
if (vector_upl_is_valid(upl)) {
uint32_t i = 0;
vector_upl_t vector_upl = upl->vector_upl;
if (vector_upl) {
for (i = 0; i < vector_upl->num_upls; i++) {
if (vector_upl->upls[i].elem == subupl) {
break;
}
}
if (i == vector_upl->num_upls) {
panic("getting sub-upl iostate when none exists");
}
*offset = vector_upl->upls[i].iostate.offset;
*size = vector_upl->upls[i].iostate.size;
} else {
panic("vector_upl_get_iostate was passed a non-vectored UPL");
}
} else {
panic("vector_upl_get_iostate was passed a NULL UPL");
}
}
void
vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
{
if (vector_upl_is_valid(upl)) {
vector_upl_t vector_upl = upl->vector_upl;
if (vector_upl) {
if (index < vector_upl->num_upls) {
*offset = vector_upl->upls[index].iostate.offset;
*size = vector_upl->upls[index].iostate.size;
} else {
*offset = *size = 0;
}
} else {
panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
}
} else {
panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
}
}
void *
upl_get_internal_vectorupl(upl_t upl)
{
return upl->vector_upl;
}
upl_page_info_t *
upl_get_internal_vectorupl_pagelist(upl_t upl)
{
return upl->vector_upl->pagelist;
}
upl_page_info_t *
upl_get_internal_page_list(upl_t upl)
{
return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
}
void
upl_clear_dirty(
upl_t upl,
boolean_t value)
{
if (value) {
upl->flags |= UPL_CLEAR_DIRTY;
} else {
upl->flags &= ~UPL_CLEAR_DIRTY;
}
}
void
upl_set_referenced(
upl_t upl,
boolean_t value)
{
upl_lock(upl);
if (value) {
upl->ext_ref_count++;
} else {
if (!upl->ext_ref_count) {
panic("upl_set_referenced not %p", upl);
}
upl->ext_ref_count--;
}
upl_unlock(upl);
}
void
upl_set_map_exclusive(upl_t upl)
{
upl_lock(upl);
while (upl->map_addr_owner) {
upl->flags |= UPL_MAP_EXCLUSIVE_WAIT;
upl_lock_sleep(upl, &upl->map_addr_owner, ctid_get_thread(upl->map_addr_owner));
}
upl->map_addr_owner = thread_get_ctid(current_thread());
upl_unlock(upl);
}
void
upl_clear_map_exclusive(upl_t upl)
{
assert(upl->map_addr_owner == thread_get_ctid(current_thread()));
upl_lock(upl);
if (upl->flags & UPL_MAP_EXCLUSIVE_WAIT) {
upl->flags &= ~UPL_MAP_EXCLUSIVE_WAIT;
upl_wakeup(&upl->map_addr_owner);
}
upl->map_addr_owner = 0;
upl_unlock(upl);
}
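/*
 * Descriptive note: upl_set_map_exclusive() blocks until no other thread
 * owns the UPL's mapping address and then records the caller (by ctid) as
 * the owner; upl_clear_map_exclusive() must be called by that same thread,
 * as the assert above enforces, to drop ownership and wake any waiters.
 */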
#if CONFIG_IOSCHED
void
upl_set_blkno(
upl_t upl,
vm_offset_t upl_offset,
int io_size,
int64_t blkno)
{
int i, j;
if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
return;
}
assert(upl->upl_reprio_info != 0);
for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
}
}
#endif
void inline
memoryshot(unsigned int event, unsigned int control)
{
if (vm_debug_events) {
KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
vm_page_active_count, vm_page_inactive_count,
vm_page_free_count, vm_page_speculative_count,
vm_page_throttled_count);
} else {
(void) event;
(void) control;
}
}
#ifdef MACH_BSD
boolean_t
upl_device_page(upl_page_info_t *upl)
{
return UPL_DEVICE_PAGE(upl);
}
boolean_t
upl_page_present(upl_page_info_t *upl, int index)
{
return UPL_PAGE_PRESENT(upl, index);
}
boolean_t
upl_speculative_page(upl_page_info_t *upl, int index)
{
return UPL_SPECULATIVE_PAGE(upl, index);
}
boolean_t
upl_dirty_page(upl_page_info_t *upl, int index)
{
return UPL_DIRTY_PAGE(upl, index);
}
boolean_t
upl_valid_page(upl_page_info_t *upl, int index)
{
return UPL_VALID_PAGE(upl, index);
}
ppnum_t
upl_phys_page(upl_page_info_t *upl, int index)
{
return UPL_PHYS_PAGE(upl, index);
}
void
upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
{
upl[index].mark = v;
}
boolean_t
upl_page_get_mark(upl_page_info_t *upl, int index)
{
return upl[index].mark;
}
void
vm_countdirtypages(void)
{
vm_page_t m;
int dpages;
int pgopages;
int precpages;
dpages = 0;
pgopages = 0;
precpages = 0;
vm_page_lock_queues();
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
do {
if (m == (vm_page_t)0) {
break;
}
if (m->vmp_dirty) {
dpages++;
}
if (m->vmp_free_when_done) {
pgopages++;
}
if (m->vmp_precious) {
precpages++;
}
assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
if (m == (vm_page_t)0) {
break;
}
} while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
vm_page_unlock_queues();
vm_page_lock_queues();
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
do {
if (m == (vm_page_t)0) {
break;
}
dpages++;
assert(m->vmp_dirty);
assert(!m->vmp_free_when_done);
assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
if (m == (vm_page_t)0) {
break;
}
} while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
vm_page_unlock_queues();
vm_page_lock_queues();
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
do {
if (m == (vm_page_t)0) {
break;
}
if (m->vmp_dirty) {
dpages++;
}
if (m->vmp_free_when_done) {
pgopages++;
}
if (m->vmp_precious) {
precpages++;
}
assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
if (m == (vm_page_t)0) {
break;
}
} while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
vm_page_unlock_queues();
printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
dpages = 0;
pgopages = 0;
precpages = 0;
vm_page_lock_queues();
m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
do {
if (m == (vm_page_t)0) {
break;
}
if (m->vmp_dirty) {
dpages++;
}
if (m->vmp_free_when_done) {
pgopages++;
}
if (m->vmp_precious) {
precpages++;
}
assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
if (m == (vm_page_t)0) {
break;
}
} while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
vm_page_unlock_queues();
printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
}
#endif /* MACH_BSD */
#if CONFIG_IOSCHED
int
upl_get_cached_tier(upl_t upl)
{
assert(upl);
if (upl->flags & UPL_TRACKED_BY_OBJECT) {
return upl->upl_priority;
}
return -1;
}
#endif /* CONFIG_IOSCHED */
void
upl_callout_iodone(upl_t upl)
{
struct upl_io_completion *upl_ctx = upl->upl_iodone;
if (upl_ctx) {
void (*iodone_func)(void *, int) = upl_ctx->io_done;
assert(upl_ctx->io_done);
(*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
}
}
void
upl_set_iodone(upl_t upl, void *upl_iodone)
{
upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
}
void
upl_set_iodone_error(upl_t upl, int error)
{
struct upl_io_completion *upl_ctx = upl->upl_iodone;
if (upl_ctx) {
upl_ctx->io_error = error;
}
}
ppnum_t
upl_get_highest_page(
upl_t upl)
{
return upl->highest_page;
}
upl_size_t
upl_get_size(
upl_t upl)
{
return upl_adjusted_size(upl, PAGE_MASK);
}
upl_size_t
upl_adjusted_size(
upl_t upl,
vm_map_offset_t pgmask)
{
vm_object_offset_t start_offset, end_offset;
start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
return (upl_size_t)(end_offset - start_offset);
}
vm_object_offset_t
upl_adjusted_offset(
upl_t upl,
vm_map_offset_t pgmask)
{
return trunc_page_mask_64(upl->u_offset, pgmask);
}
vm_object_offset_t
upl_get_data_offset(
upl_t upl)
{
return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
}
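/*
 * Worked example (hypothetical values): for a UPL with u_offset == 0x1800
 * and u_size == 0x2000, using PAGE_MASK == 0xfff:
 *
 *	upl_adjusted_offset(upl, PAGE_MASK) == 0x1000  (trunc of 0x1800)
 *	upl_adjusted_size(upl, PAGE_MASK)   == 0x3000  (round(0x3800) - 0x1000)
 *	upl_get_data_offset(upl)            == 0x0800  (0x1800 - 0x1000)
 *
 * i.e. the adjusted offset/size describe the page-aligned span covering the
 * request, and the data offset is where the caller's data starts within it.
 */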
upl_t
upl_associated_upl(upl_t upl)
{
return upl->associated_upl;
}
void
upl_set_associated_upl(upl_t upl, upl_t associated_upl)
{
upl->associated_upl = associated_upl;
}
struct vnode *
upl_lookup_vnode(upl_t upl)
{
if (!upl->map_object->internal) {
return vnode_pager_lookup_vnode(upl->map_object->pager);
} else {
return NULL;
}
}
#if UPL_DEBUG
kern_return_t
upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
{
upl->ubc_alias1 = alias1;
upl->ubc_alias2 = alias2;
return KERN_SUCCESS;
}
int
upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
{
if (al) {
*al = upl->ubc_alias1;
}
if (al2) {
*al2 = upl->ubc_alias2;
}
return KERN_SUCCESS;
}
#endif /* UPL_DEBUG */
#if VM_PRESSURE_EVENTS
/*
* Upward trajectory.
*/
boolean_t
VM_PRESSURE_NORMAL_TO_WARNING(void)
{
if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
/* Available pages below our threshold */
if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
#if CONFIG_FREEZE
/* No frozen processes to kill */
if (memorystatus_frozen_count == 0) {
/* Not enough suspended processes available. */
if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
return TRUE;
}
}
#else /* CONFIG_FREEZE */
return TRUE;
#endif /* CONFIG_FREEZE */
}
return FALSE;
} else {
return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
}
}
boolean_t
VM_PRESSURE_WARNING_TO_CRITICAL(void)
{
if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
/* Available pages below our threshold */
if (memorystatus_available_pages < memorystatus_available_pages_critical) {
return TRUE;
}
return FALSE;
} else {
return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
}
}
/*
* Downward trajectory.
*/
boolean_t
VM_PRESSURE_WARNING_TO_NORMAL(void)
{
if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
/* Available pages above our threshold */
unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
if (memorystatus_available_pages > target_threshold) {
return TRUE;
}
return FALSE;
} else {
return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
}
}
boolean_t
VM_PRESSURE_CRITICAL_TO_WARNING(void)
{
if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
/* Available pages above our threshold */
unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
if (memorystatus_available_pages > target_threshold) {
return TRUE;
}
return FALSE;
} else {
return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
}
}
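/*
 * The downward transitions above build in hysteresis so the pressure level
 * doesn't flap. In the non-compressor path the band is 15%: with a made-up
 * memorystatus_available_pages_pressure of 1000, the system enters the
 * warning level once available pages drop below 1000 (subject to the freezer
 * checks above) but doesn't report normal again until they climb back above
 * 1150. The compressor path uses analogous 12/10 and 14/10 factors around
 * its own thresholds.
 */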
#endif /* VM_PRESSURE_EVENTS */
#if DEVELOPMENT || DEBUG
bool compressor_running_perf_test;
uint64_t compressor_perf_test_pages_processed;
static kern_return_t
move_pages_to_queue(
vm_map_t map,
user_addr_t start_addr,
size_t buffer_size,
vm_page_queue_head_t *queue,
size_t *pages_moved)
{
kern_return_t err = KERN_SUCCESS;
vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
boolean_t addr_in_map = FALSE;
user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
vm_object_t curr_object = VM_OBJECT_NULL;
*pages_moved = 0;
if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
/*
* We don't currently support benchmarking maps with a different page size
* than the kernel's.
*/
return KERN_INVALID_ARGUMENT;
}
if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
return KERN_INVALID_ARGUMENT;
}
vm_map_lock_read(map);
curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
while (curr_addr < end_addr) {
addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
if (!addr_in_map) {
err = KERN_INVALID_ARGUMENT;
break;
}
curr_object = VME_OBJECT(curr_entry);
if (curr_object) {
vm_object_lock(curr_object);
/* We really only want anonymous memory that's in the top level map and object here. */
if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
err = KERN_INVALID_ARGUMENT;
vm_object_unlock(curr_object);
break;
}
vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
(curr_entry->vme_start + VME_OFFSET(curr_entry));
vm_map_offset_t curr_offset = start_offset;
vm_page_t curr_page;
while (curr_offset < end_offset) {
curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
if (curr_page != VM_PAGE_NULL) {
vm_page_lock_queues();
if (curr_page->vmp_laundry) {
vm_pageout_steal_laundry(curr_page, TRUE);
}
/*
* we've already factored out pages in the laundry, which
* means this page can't be on the pageout queue, so it's
* safe to do the vm_page_queues_remove
*/
bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
vm_page_queues_remove(curr_page, TRUE);
if (donate) {
/*
* The compressor needs to see this bit to know
* where this page needs to land. Also if stolen,
* this bit helps put the page back in the right
* special queue where it belongs.
*/
curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
}
// Clear the referenced bit so we ensure this gets paged out
curr_page->vmp_reference = false;
if (curr_page->vmp_pmapped) {
pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
}
vm_page_queue_enter(queue, curr_page, vmp_pageq);
vm_page_unlock_queues();
*pages_moved += 1;
}
curr_offset += PAGE_SIZE_64;
curr_addr += PAGE_SIZE_64;
}
}
vm_object_unlock(curr_object);
}
vm_map_unlock_read(map);
return err;
}
/*
* Local queue for processing benchmark pages.
* Can't be allocated on the stack because the pointer has to
* be packable.
*/
vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
kern_return_t
run_compressor_perf_test(
user_addr_t buf,
size_t buffer_size,
uint64_t *time,
uint64_t *bytes_compressed,
uint64_t *compressor_growth)
{
kern_return_t err = KERN_SUCCESS;
if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
return KERN_NOT_SUPPORTED;
}
if (current_task() == kernel_task) {
return KERN_INVALID_ARGUMENT;
}
vm_page_lock_queues();
if (compressor_running_perf_test) {
/* Only run one instance of the benchmark at a time. */
vm_page_unlock_queues();
return KERN_RESOURCE_SHORTAGE;
}
vm_page_unlock_queues();
size_t page_count = 0;
vm_map_t map;
vm_page_t p, next;
uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
*bytes_compressed = *compressor_growth = 0;
vm_page_queue_init(&compressor_perf_test_queue);
map = current_task()->map;
err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
if (err != KERN_SUCCESS) {
goto out;
}
vm_page_lock_queues();
compressor_running_perf_test = true;
compressor_perf_test_pages_processed = 0;
/*
* At this point the compressor threads should only process the benchmark queue
* so we can look at the difference in c_segment_compressed_bytes while the perf test is running
* to determine how many compressed bytes we ended up using.
*/
compressed_bytes_start = os_atomic_load(&c_segment_compressed_bytes, relaxed);
vm_page_unlock_queues();
page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
vm_page_lock_queues();
compressor_perf_test_start = mach_absolute_time();
// Wake up the compressor thread(s)
sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
pgo_iothread_internal_state[0].pgo_iothread);
/*
* Depending on when this test is run, we could overshoot or be right on the mark
* with our page_count. So the comparison is of the _less than_ variety.
*/
while (compressor_perf_test_pages_processed < page_count) {
assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
vm_page_unlock_queues();
thread_block(THREAD_CONTINUE_NULL);
vm_page_lock_queues();
}
compressor_perf_test_end = mach_absolute_time();
compressed_bytes_end = os_atomic_load(&c_segment_compressed_bytes, relaxed);
vm_page_unlock_queues();
out:
/*
* If we errored out above, then we could still have some pages
* on the local queue. Make sure to put them back on the active queue before
* returning so they're not orphaned.
*/
vm_page_lock_queues();
absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
vm_page_enqueue_active(p, FALSE);
p = next;
}
compressor_running_perf_test = false;
vm_page_unlock_queues();
if (err == KERN_SUCCESS) {
*bytes_compressed = page_count * PAGE_SIZE_64;
*compressor_growth = compressed_bytes_end - compressed_bytes_start;
}
/*
* pageout_scan will consider waking the compactor swapper
* before it blocks. Do the same thing here before we return
* to ensure that back to back benchmark runs can't overly fragment the
* compressor pool.
*/
vm_consider_waking_compactor_swapper();
return err;
}
#endif /* DEVELOPMENT || DEBUG */