/*
* Copyright (c) 2024 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
#include <kern/cpu_data.h>
#include <kern/kalloc.h>
#include <kern/locks.h>
#include <kern/mem_acct.h>
#include <kern/percpu.h>
#include <os/atomic_private.h>
#include <os/log.h>
#include <os/ptrtools.h>
#include <sys/mem_acct_private.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <net/net_sysctl.h>
struct mem_acct {
	int64_t _Atomic ma_allocated;	/* Memory accounted to this subsystem (excluding amounts still pending in the per-CPU counters below) */
	int32_t *__zpercpu ma_percpu;	/* Per-CPU "bounce-buffer" of accounting that gets folded into `ma_allocated` */
	uint64_t ma_hardlimit;		/* Hard limit; mem_acct_limited() reports MEMACCT_HARDLIMIT once it is exceeded */
	uint8_t ma_percent;		/* Percent of the hard limit at which soft-limiting starts (if != 100 && != 0) */
	uint64_t _Atomic ma_peak;	/* High-watermark of `ma_allocated` */
	char ma_name[MEM_ACCT_NAME_LENGTH];	/* Name of the subsystem using this instance of the memory-accounting module */
};
#define MEM_ACCT_PCPU_MAX (1024 * 1024) /* Fold the per-CPU delta into the global counter once it exceeds 1MB */
static struct mem_acct *memacct[MEM_ACCT_MAX];
static uint64_t
mem_acct_softlimit(uint64_t hardlimit, uint8_t percent)
{
return (hardlimit * percent) / 100;
}
static uint64_t
mem_acct_presoftlimit(uint64_t hardlimit, uint8_t percent)
{
return (mem_acct_softlimit(hardlimit, percent) * percent) / 100;
}
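/*
 * Worked example of the two thresholds above (the numbers are illustrative):
 * with hardlimit = 100MB and percent = 90, the soft limit is
 * (100MB * 90) / 100 = 90MB and the pre-soft limit is
 * (90MB * 90) / 100 = 81MB, so soft-limiting ramps up in two stages before
 * the hard limit is reached.
 */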
int
mem_acct_limited(const struct mem_acct *macct)
{
uint64_t hardlimit;
int64_t allocated;
uint8_t percent;
allocated = os_atomic_load(&macct->ma_allocated, relaxed);
if (allocated < 0) {
return 0;
}
hardlimit = os_access_once(macct->ma_hardlimit);
if (hardlimit && allocated > hardlimit) {
return MEMACCT_HARDLIMIT;
}
	percent = os_access_once(macct->ma_percent);
	/* The soft limits are percentages of the hard limit, so they only apply when a hard limit is set */
	if (hardlimit && percent) {
if (allocated > mem_acct_softlimit(hardlimit, percent)) {
return MEMACCT_SOFTLIMIT;
}
if (allocated > mem_acct_presoftlimit(hardlimit, percent)) {
return MEMACCT_PRESOFTLIMIT;
}
}
return 0;
}
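/*
 * Caller-side sketch of consuming mem_acct_limited(); what to do at each
 * level is up to the subsystem, and reclaim_caches() is a hypothetical
 * helper, not part of this file:
 *
 *	switch (mem_acct_limited(acct)) {
 *	case MEMACCT_HARDLIMIT:
 *		return ENOMEM;          // refuse the allocation outright
 *	case MEMACCT_SOFTLIMIT:
 *	case MEMACCT_PRESOFTLIMIT:
 *		reclaim_caches();       // hypothetical: start freeing early
 *		break;
 *	default:
 *		break;
 *	}
 */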
void
_mem_acct_add(struct mem_acct *macct, int size)
{
int *pcpu;
	/*
	 * The accounting is not 100% accurate, because the per-CPU
	 * "bounce-buffers" hold intermediate results. For example, we may
	 * report "hard-limit" even though folding in all the per-CPU counters
	 * would bring us back below the limit. That imprecision is acceptable:
	 * once we hit the hard limit, the system is going to be in a bad state
	 * anyway until enough memory has been given back.
	 *
	 * The same goes for the soft limit, except that while soft-limited we
	 * still account memory and merely become more aggressive about
	 * freeing it.
	 */
	/* Add the size to the per-CPU variable */
disable_preemption();
pcpu = zpercpu_get(macct->ma_percpu);
*pcpu += size;
/* If we added enough to the pcpu variable, fold it into the global variable */
if (*pcpu > MEM_ACCT_PCPU_MAX || *pcpu < -MEM_ACCT_PCPU_MAX) {
int limited, newlimited;
int64_t allocated;
limited = mem_acct_limited(macct);
allocated = os_atomic_add(&macct->ma_allocated, *pcpu, relaxed);
/*
* Can be temporarily < 0 if the CPU freeing memory hits
* MEM_ACCT_PCPU_MAX first.
*/
if (allocated > 0) {
os_atomic_max(&macct->ma_peak, allocated, relaxed);
}
newlimited = mem_acct_limited(macct);
if (limited != newlimited) {
			os_log(OS_LOG_DEFAULT,
			    "memacct: %s limit state changed from %d to %d",
			    macct->ma_name, limited, newlimited);
}
*pcpu = 0;
}
enable_preemption();
}
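/*
 * Typical accounting usage (sketch; `acct` is the handle returned by
 * mem_acct_register() below and `size` the number of bytes involved):
 *
 *	_mem_acct_add(acct, size);      // after allocating `size` bytes
 *	...
 *	_mem_acct_add(acct, -size);     // when freeing them again
 */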
static LCK_GRP_DECLARE(mem_acct_mtx_grp, "mem_acct");
static LCK_MTX_DECLARE(mem_acct_mtx, &mem_acct_mtx_grp);
struct mem_acct *
mem_acct_register(const char *__null_terminated name,
uint64_t hardlimit, uint8_t percent)
{
struct mem_acct *acct = NULL;
int i, index = -1;
if (percent > 100) {
os_log(OS_LOG_DEFAULT,
"memacct: percentage for softlimit is out-of-bounds\n");
return NULL;
}
lck_mtx_lock(&mem_acct_mtx);
/* Find an empty slot in the accounting array and check for name uniqueness */
for (i = 0; i < MEM_ACCT_MAX; i++) {
if (memacct[i] == NULL) {
if (index == -1) {
index = i;
}
continue;
}
if (strlcmp(memacct[i]->ma_name, name, MEM_ACCT_NAME_LENGTH - 1) == 0) {
os_log(OS_LOG_DEFAULT,
"memacct: subsystem %s already exists", name);
goto exit;
}
}
if (index == -1) {
os_log(OS_LOG_DEFAULT, "memacct: No space for additional subsystem");
goto exit;
}
	acct = kalloc_type(struct mem_acct, Z_WAITOK_ZERO_NOFAIL);
	strlcpy(acct->ma_name, name, MEM_ACCT_NAME_LENGTH);
	acct->ma_hardlimit = hardlimit;
	acct->ma_percent = percent;
	acct->ma_percpu = zalloc_percpu_permanent_type(int32_t);
	/* Publish only once the struct is fully initialized */
	memacct[index] = acct;
exit:
lck_mtx_unlock(&mem_acct_mtx);
return acct;
}
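/*
 * Example registration (a minimal sketch; the name and limits are
 * illustrative): a 64MB hard limit with soft-limiting from 90% of it.
 *
 *	struct mem_acct *acct;
 *
 *	acct = mem_acct_register("mysubsys", 64 * 1024 * 1024, 90);
 *	if (acct == NULL) {
 *		// invalid percent, duplicate name, or no free slot
 *	}
 */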
/*
* Memory Accounting sysctl handlers
*/
struct walkarg {
int w_op, w_sub;
struct sysctl_req *w_req;
};
/* sysctls on a per-subsystem basis */
static int sysctl_subsystem_peak(struct walkarg *w);
static int sysctl_subsystem_soft_limit(struct walkarg *w);
static int sysctl_subsystem_hard_limit(struct walkarg *w);
static int sysctl_subsystem_allocated(struct walkarg *w);
static int sysctl_all_subsystem_statistics(struct walkarg *w);
/* sysctls for all active subsystems */
static int sysctl_all_statistics(struct sysctl_req *);
static int sysctl_mem_acct_subsystems(struct sysctl_req *);
/* Handler function for all Memory Accounting sysctls */
static int sysctl_mem_acct SYSCTL_HANDLER_ARGS;
/* Helper functions */
static void memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a);
SYSCTL_NODE(_kern, OID_AUTO, memacct,
CTLFLAG_RW | CTLFLAG_LOCKED, sysctl_mem_acct, "Memory Accounting");
static int
sysctl_mem_acct SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
DECLARE_SYSCTL_HANDLER_ARG_ARRAY(int, 2, name, namelen);
int error = EINVAL;
struct walkarg w;
/* Verify the specified subsystem index is valid */
if (name[1] >= MEM_ACCT_MAX || name[1] < 0) {
return EINVAL;
}
bzero(&w, sizeof(w));
w.w_req = req;
w.w_op = name[0];
w.w_sub = name[1];
switch (w.w_op) {
case MEM_ACCT_PEAK:
error = sysctl_subsystem_peak(&w);
break;
case MEM_ACCT_SOFT_LIMIT:
error = sysctl_subsystem_soft_limit(&w);
break;
case MEM_ACCT_HARD_LIMIT:
error = sysctl_subsystem_hard_limit(&w);
break;
case MEM_ACCT_ALLOCATED:
error = sysctl_subsystem_allocated(&w);
break;
case MEM_ACCT_SUBSYSTEMS:
error = sysctl_mem_acct_subsystems(req);
break;
case MEM_ACCT_ALL_SUBSYSTEM_STATISTICS:
error = sysctl_all_subsystem_statistics(&w);
break;
case MEM_ACCT_ALL_STATISTICS:
error = sysctl_all_statistics(req);
break;
}
return error;
}
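/*
 * Userspace side, as a sketch (assumes the standard sysctl(3) and
 * sysctlnametomib(3) interfaces and that the node is exported as
 * kern.memacct per the SYSCTL_NODE above):
 *
 *	int mib[4];
 *	size_t miblen = 4;
 *	uint64_t peak;
 *	size_t len = sizeof(peak);
 *
 *	sysctlnametomib("kern.memacct", mib, &miblen);  // fills mib[0..1]
 *	mib[2] = MEM_ACCT_PEAK;                         // w_op
 *	mib[3] = 0;                                     // w_sub, the subsystem index
 *	sysctl(mib, 4, &peak, &len, NULL, 0);
 */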
static int
sysctl_subsystem_peak(struct walkarg *w)
{
int error;
uint64_t value;
int changed = 0;
struct mem_acct *acct = memacct[w->w_sub];
if (acct == NULL) {
return ENOENT;
}
value = os_atomic_load(&acct->ma_peak, relaxed);
error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
if (error || !changed) {
return error;
}
os_atomic_store(&acct->ma_peak, value, relaxed);
return 0;
}
static int
sysctl_subsystem_soft_limit(struct walkarg *w)
{
int error;
uint64_t hardlimit, value;
int changed = 0;
struct mem_acct *acct = memacct[w->w_sub];
if (acct == NULL) {
return ENOENT;
}
hardlimit = os_atomic_load(&acct->ma_hardlimit, relaxed);
if (acct->ma_percent) {
value = mem_acct_softlimit(hardlimit, acct->ma_percent);
} else {
value = hardlimit;
}
error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
if (error || !changed) {
return error;
}
return EPERM;
}
static int
sysctl_subsystem_hard_limit(struct walkarg *w)
{
int error;
uint64_t value;
int changed = 0;
struct mem_acct *acct = memacct[w->w_sub];
if (acct == NULL) {
return ENOENT;
}
value = os_atomic_load(&acct->ma_hardlimit, relaxed);
error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed);
if (error || !changed) {
return error;
}
	os_atomic_store(&acct->ma_hardlimit, value, relaxed);
return 0;
}
static int
sysctl_subsystem_allocated(struct walkarg *w)
{
	int64_t value;
	struct mem_acct *acct;
	lck_mtx_lock(&mem_acct_mtx);
	acct = memacct[w->w_sub];
	if (acct == NULL) {
		lck_mtx_unlock(&mem_acct_mtx);
		return ENOENT;
	}
value = os_atomic_load(&acct->ma_allocated, relaxed);
zpercpu_foreach(v, acct->ma_percpu) {
value += *v;
}
lck_mtx_unlock(&mem_acct_mtx);
return sysctl_io_number(w->w_req, value, sizeof(value), NULL, NULL);
}
static int
sysctl_all_subsystem_statistics(struct walkarg *w)
{
/* Returns a single memacct_statistics struct for the specified subsystem */
struct memacct_statistics stats = {};
struct mem_acct *acct = memacct[w->w_sub];
lck_mtx_lock(&mem_acct_mtx);
if (acct == NULL) {
return ENOENT;
}
memacct_copy_stats(&stats, acct);
lck_mtx_unlock(&mem_acct_mtx);
return sysctl_io_opaque(w->w_req, &stats, sizeof(stats), NULL);
}
static int
sysctl_all_statistics(struct sysctl_req *req)
{
/* Returns an array of memacct_statistics structs for all active subsystems */
int i, error;
int count = 0;
lck_mtx_lock(&mem_acct_mtx);
	for (i = 0; i < MEM_ACCT_MAX; i++) {
		if (memacct[i] == NULL) {
			/* Subsystems are never unregistered, so the first
			 * NULL slot ends the active range */
			break;
		}
		count++;
	}
	if (count == 0) {
		lck_mtx_unlock(&mem_acct_mtx);
		return 0;
	}
	struct memacct_statistics *memstats = kalloc_data(
	    sizeof(struct memacct_statistics) * count, Z_WAITOK_ZERO_NOFAIL);
for (i = 0; i < count; i++) {
struct mem_acct *acct;
struct memacct_statistics *stats;
acct = memacct[i];
stats = &memstats[i];
memacct_copy_stats(stats, acct);
}
lck_mtx_unlock(&mem_acct_mtx);
	error = sysctl_io_opaque(req, memstats,
	    sizeof(struct memacct_statistics) * count, NULL);
	kfree_data(memstats, sizeof(struct memacct_statistics) * count);
	return error;
}
static int
sysctl_mem_acct_subsystems(struct sysctl_req *req)
{
	/* Returns an array of names for all active subsystems */
	int i, error;
	int count = 0;
	lck_mtx_lock(&mem_acct_mtx);
	for (i = 0; i < MEM_ACCT_MAX; i++) {
		if (memacct[i] == NULL) {
			break;
		}
		count++;
	}
	if (count == 0) {
		lck_mtx_unlock(&mem_acct_mtx);
		return 0;
	}
	char *names = kalloc_data(count * MEM_ACCT_NAME_LENGTH, Z_WAITOK_ZERO_NOFAIL);
	for (i = 0; i < count; i++) {
		/* `names` was allocated zero-filled, so copy only the string
		 * itself; the tail of each fixed-size slot stays zeroed rather
		 * than being filled from an uninitialized stack buffer */
		strlcpy(names + (i * MEM_ACCT_NAME_LENGTH),
		    memacct[i]->ma_name, MEM_ACCT_NAME_LENGTH);
	}
lck_mtx_unlock(&mem_acct_mtx);
	error = sysctl_io_opaque(req, names, count * MEM_ACCT_NAME_LENGTH, NULL);
	kfree_data(names, count * MEM_ACCT_NAME_LENGTH);
	return error;
}
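/*
 * Snapshot helper for the statistics sysctls above; both callers hold
 * mem_acct_mtx while the per-CPU counters are folded into the snapshot.
 */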
static void
memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a)
{
s->peak = os_atomic_load(&a->ma_peak, relaxed);
s->allocated = os_atomic_load(&a->ma_allocated, relaxed);
zpercpu_foreach(v, a->ma_percpu) {
s->allocated += *v;
}
	uint64_t hardlimit = os_atomic_load(&a->ma_hardlimit, relaxed);
	if (a->ma_percent) {
		s->softlimit = mem_acct_softlimit(hardlimit, a->ma_percent);
	} else {
		s->softlimit = hardlimit;
	}
	s->hardlimit = hardlimit;
strbufcpy(s->ma_name, a->ma_name);
}