/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	ipc/ipc_pset.c
 *	Author:	Rich Draves
 *	Date:	1989
 *
 *	Functions to manipulate IPC port sets.
 */

#include <mach/port.h>
#include <mach/kern_return.h>
#include <mach/message.h>
#include <ipc/ipc_mqueue.h>
#include <ipc/ipc_object.h>
#include <ipc/ipc_policy.h>
#include <ipc/ipc_pset.h>
#include <ipc/ipc_right.h>
#include <ipc/ipc_space.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_kmsg.h>
#include <kern/policy_internal.h>

#include <kern/kern_types.h>

#include <vm/vm_map.h>
#include <libkern/section_keywords.h>
#include <pthread/priority_private.h>

/* processor_set stole ipc_pset_init */
static void
ipc_port_set_init(ipc_pset_t pset, mach_port_name_t name, int policy)
{
	waitq_init(&pset->ips_wqset, WQT_PORT_SET, policy | SYNC_POLICY_FIFO);
	klist_init(&pset->ips_klist);
	pset->ips_wqset.wqset_index = MACH_PORT_INDEX(name);
}

/*
 *	Routine:	ipc_pset_alloc
 *	Purpose:
 *		Allocate a port set.
 *	Conditions:
 *		Nothing locked.  If successful, the port set is returned
 *		locked.  (The caller doesn't have a reference.)
 *	Returns:
 *		KERN_SUCCESS		The port set is allocated.
 *		KERN_INVALID_TASK	The space is dead.
 *		KERN_NO_SPACE		No room for an entry in the space.
 */

kern_return_t
ipc_pset_alloc(
	ipc_space_t             space,
	mach_port_name_t        *namep,
	ipc_pset_t              *psetp)
{
	ipc_pset_t pset;
	mach_port_name_t name;
	kern_return_t kr;

	kr = ipc_object_alloc(space, IOT_PORT_SET,
	    MACH_PORT_TYPE_PORT_SET, 0,
	    &name, (ipc_object_t *) &pset);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* space is locked */

	ipc_port_set_init(pset, name, SYNC_POLICY_INIT_LOCKED);
	/* port set is locked */

	is_write_unlock(space);

	*namep = name;
	*psetp = pset;
	return KERN_SUCCESS;
}

/*
 *	Routine:	ipc_pset_alloc_name
 *	Purpose:
 *		Allocate a port set, with a specific name.
 *	Conditions:
 *		Nothing locked.  If successful, the port set is returned
 *		locked.  (The caller doesn't have a reference.)
 *	Returns:
 *		KERN_SUCCESS		The port set is allocated.
 *		KERN_INVALID_TASK	The space is dead.
 *		KERN_NAME_EXISTS	The name already denotes a right.
 */

kern_return_t
ipc_pset_alloc_name(
	ipc_space_t             space,
	mach_port_name_t        name,
	ipc_pset_t              *psetp)
{
	return ipc_object_alloc_name(space, IOT_PORT_SET,
	           MACH_PORT_TYPE_PORT_SET, 0,
	           name, (ipc_object_t *)psetp, ^(ipc_object_t object){
		ipc_port_set_init(ips_object_to_pset(object), name,
		    SYNC_POLICY_INIT_LOCKED);
	});
}


/*
 *	Routine:	ipc_pset_alloc_special
 *	Purpose:
 *		Allocate a port set in a special space.
 *		The new port set is returned with one ref.
 *		If unsuccessful, IPS_NULL is returned.
 *	Conditions:
 *		Nothing locked.
 */
ipc_pset_t
ipc_pset_alloc_special(
	__assert_only ipc_space_t space)
{
	ipc_pset_t pset;

	assert(space != IS_NULL);
	assert(!is_active(space));

	pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO));
	if (pset == IPS_NULL) {
		return IPS_NULL;
	}

	os_atomic_init(&pset->ips_object.io_bits, io_makebits(IOT_PORT_SET));
	os_atomic_init(&pset->ips_object.io_references, 1);

	ipc_port_set_init(pset, MACH_PORT_SPECIAL_DEFAULT, 0);

	return pset;
}


/*
 *	Routine:	ipc_pset_destroy
 *	Purpose:
 *		Destroys a port_set.
 *	Conditions:
 *		The port_set is locked and alive.
 *		The caller has a reference, which is consumed.
 *		Afterwards, the port_set is unlocked and dead.
 */

void
ipc_pset_destroy(
	ipc_space_t     space,
	ipc_pset_t      pset)
{
	waitq_link_list_t free_l = { };

	assert(ips_active(pset));

	io_bits_andnot(ips_to_object(pset), IO_BITS_ACTIVE);

	/*
	 * Set all waiters on the portset running to
	 * discover the change.
	 *
	 * Then under the same lock hold, deinit the waitq-set,
	 * which will remove all the member message queues,
	 * linkages and clean up preposts.
	 */
	ipc_mqueue_changed(space, &pset->ips_wqset);
	waitq_invalidate(&pset->ips_wqset);
	waitq_set_unlink_all_locked(&pset->ips_wqset, &free_l);

	ips_mq_unlock(pset);

	ips_release(pset);       /* consume the ref our caller gave us */

	waitq_link_free_list(WQT_PORT_SET, &free_l);
}

/*
 *	Routine:	ipc_pset_finalize
 *	Purpose:
 *		Called on last reference deallocate to
 *		free any remaining data associated with the pset.
 *	Conditions:
 *		Nothing locked.
 */
void
ipc_pset_finalize(
	ipc_pset_t              pset)
{
	waitq_deinit(&pset->ips_wqset);
}


#pragma mark - kevent support

/*
 * Kqueue EVFILT_MACHPORT support
 *
 * - kn_ipc_{port,pset} points to the monitored ipc port or pset. If the knote
 *   is using a kqwl, it is eligible to participate in sync IPC overrides.
 *
 *   For the first such sync IPC message in the port, we set up the port's
 *   turnstile to directly push on the kqwl's turnstile (which is in turn set up
 *   during filt_machportattach). If userspace responds to the message, the
 *   turnstile push is severed at the point of reply. If userspace returns without
 *   responding to the message, we sever the turnstile push at the
 *   point of reenabling the knote to deliver the next message. This is why the
 *   knote needs to remember the port. For more details, see also
 *   filt_machport_turnstile_complete.
 *
 *   If there are multiple other sync IPC messages in the port, messages 2 to n
 *   redirect their turnstile push to the kqwl through an intermediary "knote"
 *   turnstile which, in turn, pushes on the kqwl turnstile. This knote turnstile
 *   is stored in the kn_hook. See also filt_machport_turnstile_prepare_lazily.
 *
 * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
 *   that can be used to direct-deliver messages when
 *   MACH_RCV_MSG is set in kn_sfflags
 *
 * - (in/out) ext[1] holds a mach_msg_size_t representing the size
 *   of the userspace buffer held in ext[0].
 *
 * - (out)    ext[2] is used to deliver qos information
 *   about the send queue to userspace.
 *
 * - (abused) ext[3] is used in kernel to hold a reference to the first port
 *   with a turnstile that participates in sync IPC override. For more details,
 *   see filt_machport_stash_port.
 *
 * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor
 *   of turnstiles for rights copied out as part of direct message delivery
 *   when they can participate in sync IPC override.
 *
 *   It is used to atomically neuter the sync IPC override when the knote is
 *   re-enabled.
 *
 */
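
/*
 * Illustrative userspace sketch (not compiled here, and not a definitive
 * recipe): arming an EVFILT_MACHPORT knote for direct message delivery,
 * with ext[0]/ext[1] carrying the receive buffer described above. The port
 * name, buffer and kqueue below are hypothetical; the real clients of this
 * filter are libdispatch/libpthread.
 *
 *	struct kevent_qos_s kev = {
 *		.ident  = recv_name,                       // receive right or port-set name
 *		.filter = EVFILT_MACHPORT,
 *		.flags  = EV_ADD | EV_ENABLE | EV_DISPATCH,
 *		.fflags = MACH_RCV_MSG | MACH_RCV_VOUCHER,
 *		.ext    = { (uint64_t)buf, buf_size, 0, 0 },
 *	};
 *	kevent_qos(kq, &kev, 1, NULL, 0, NULL, NULL, 0);
 */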

#include <sys/event.h>
#include <sys/errno.h>

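/*
 * Compute the filter result for a port-set knote: FILTER_ACTIVE when the
 * set has at least one preposted message, 0 otherwise (or when the set has
 * been invalidated).  Called with the pset lock held.
 */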
static int
filt_pset_filter_result(ipc_pset_t pset)
{
	ips_mq_lock_held(pset);

	if (!waitq_is_valid(&pset->ips_wqset)) {
		return 0;
	}

	return waitq_set_first_prepost(&pset->ips_wqset, WQS_PREPOST_PEEK) ?
	       FILTER_ACTIVE : 0;
}

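/*
 * Compute the filter result for a port knote: FILTER_ACTIVE when a message
 * is queued (with a QoS adjustment from the first message if MACH_RCV_MSG
 * is set), FILTER_RESET_EVENT_QOS when direct receive is armed but the
 * queue is empty, plus an IOTIER adjustment when the port's kernel iotier
 * override changed.  Called with the port lock held.
 */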
static int
filt_port_filter_result(struct knote *kn, ipc_port_t port)
{
	struct kqueue *kqwl = knote_get_kq(kn);
	ipc_kmsg_t first;
	int result = 0;

	ip_mq_lock_held(port);

	if (kn->kn_sfflags & MACH_RCV_MSG) {
		result = FILTER_RESET_EVENT_QOS;
	}

	if (!waitq_is_valid(&port->ip_waitq)) {
		return result;
	}

	if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
		kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
		result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
	}

	first = ipc_kmsg_queue_first(&port->ip_messages.imq_messages);
	if (!first) {
		return result;
	}

	result = FILTER_ACTIVE;
	if (kn->kn_sfflags & MACH_RCV_MSG) {
		result |= FILTER_ADJUST_EVENT_QOS(first->ikm_qos_override);
	}

#if CONFIG_PREADOPT_TG
	struct thread_group *tg = ipc_kmsg_get_thread_group(first);
	if (tg) {
		struct kqueue *kq = knote_get_kq(kn);
		kqueue_set_preadopted_thread_group(kq, tg,
		    first->ikm_qos_override);
	}
#endif

	return result;
}

struct turnstile *
filt_ipc_kqueue_turnstile(struct knote *kn)
{
	assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP);
	return kqueue_turnstile(knote_get_kq(kn));
}

bool
filt_machport_kqueue_has_turnstile(struct knote *kn)
{
	assert(kn->kn_filter == EVFILT_MACHPORT);
	return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK))
	       && (kn->kn_flags & EV_DISPATCH);
}

/*
 * Stashes a port that participates in sync IPC override on the knote until the
 * knote is re-enabled.
 *
 * It returns:
 * - the turnstile to use as an inheritor for the stashed port
 * - the kind of stash that happened, as a PORT_SYNC_* value among:
 *   o not stashed (no sync IPC support)
 *   o stashed in the knote (in kn_ext[3])
 *   o to be hooked to the kn_hook knote
 */
struct turnstile *
filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
{
	struct turnstile *ts = TURNSTILE_NULL;

	if (kn->kn_filter == EVFILT_WORKLOOP) {
		assert(kn->kn_ipc_port == NULL);
		kn->kn_ipc_port = port;
		ip_reference(port);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
		ts = filt_ipc_kqueue_turnstile(kn);
	} else if (!filt_machport_kqueue_has_turnstile(kn)) {
		if (link) {
			*link = PORT_SYNC_LINK_NO_LINKAGE;
		}
	} else if (kn->kn_ext[3] == 0) {
		ip_reference(port);
		kn->kn_ext[3] = (uintptr_t)port;
		ts = filt_ipc_kqueue_turnstile(kn);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
	} else {
		ts = (struct turnstile *)knote_kn_hook_get_raw(kn);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_STASH;
		}
	}

	return ts;
}

/*
 * Lazily prepare a turnstile so that filt_machport_stash_port()
 * can be called with the mqueue lock held.
 *
 * It will allocate a turnstile in kn_hook if:
 * - the knote supports sync IPC override,
 * - we already stashed a port in kn_ext[3],
 * - the object that will be copied out has a chance to ask to be stashed.
 *
 * It is set up so that its inheritor is the workloop turnstile that has been
 * allocated when this knote was attached.
 */
void
filt_machport_turnstile_prepare_lazily(
	struct knote *kn,
	mach_msg_type_name_t msgt_name,
	ipc_port_t port)
{
	/* This is called from within filt_machportprocess */
	assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));

	if (!filt_machport_kqueue_has_turnstile(kn)) {
		return;
	}

	if (kn->kn_ext[3] == 0 || knote_kn_hook_get_raw(kn)) {
		return;
	}

	struct turnstile *ts = filt_ipc_kqueue_turnstile(kn);
	if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
	    (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
		struct turnstile *kn_ts = turnstile_alloc();
		struct turnstile *ts_store;
		kn_ts = turnstile_prepare((uintptr_t)kn, &ts_store, kn_ts, TURNSTILE_KNOTE);
		knote_kn_hook_set_raw(kn, ts_store);

		turnstile_update_inheritor(kn_ts, ts,
		    TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
		turnstile_cleanup();
	}
}

static void
filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port)
{
	struct turnstile *ts = TURNSTILE_NULL;

	ip_mq_lock(port);
	if (port->ip_specialreply) {
		/*
		 * If the reply has been sent to the special reply port already,
		 * then the special reply port may already be reused to do something
		 * entirely different.
		 *
		 * However, the only reason for it to still point to this knote is
		 * that it's still waiting for a reply, so when this is the case,
		 * neuter the linkage.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_sync_inheritor_knote == kn) {
			ipc_port_adjust_special_reply_port_locked(port, NULL,
			    (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
			/* port unlocked */
		} else {
			ip_mq_unlock(port);
		}
	} else {
		/*
		 * For receive rights, if their IMQ_KNOTE() is still this
		 * knote, then sever the link.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_messages.imq_inheritor_knote == kn) {
			ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
			ts = port_send_turnstile(port);
		}
		if (ts) {
			turnstile_reference(ts);
			turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
			    TURNSTILE_IMMEDIATE_UPDATE);
		}
		ip_mq_unlock(port);

		if (ts) {
			turnstile_update_inheritor_complete(ts,
			    TURNSTILE_INTERLOCK_NOT_HELD);
			turnstile_deallocate(ts);
		}
	}

	ip_release(port);
}

void
filt_wldetach_sync_ipc(struct knote *kn)
{
	ipc_port_t port = kn->kn_ipc_port;
	filt_machport_turnstile_complete_port(kn, port);
	kn->kn_ipc_port = IP_NULL;
}

/*
 * Other half of filt_machport_turnstile_prepare_lazily()
 *
 * This is serialized by the knote state machine.
 */
static void
filt_machport_turnstile_complete(struct knote *kn)
{
	if (kn->kn_ext[3]) {
		ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
		filt_machport_turnstile_complete_port(kn, port);
		kn->kn_ext[3] = 0;
	}

	struct turnstile *ts = knote_kn_hook_get_raw(kn);
	if (ts) {
		turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
		    TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);

		struct turnstile *ts_store = ts;
		turnstile_complete((uintptr_t)kn, (struct turnstile **)&ts_store, &ts, TURNSTILE_KNOTE);
		knote_kn_hook_set_raw(kn, ts_store);

		turnstile_cleanup();

		assert(ts);
		turnstile_deallocate(ts);
	}
}

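/*
 * Maintain the per-port/per-pset klist ordering: a newly attached knote that
 * can carry a sync IPC push (see filt_machport_kqueue_has_turnstile()) is
 * never inserted ahead of the current head, and filt_machport_unlink()
 * promotes such a knote back to the head after a removal, so that whenever
 * possible the first knote is one we can push on.
 */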
static void
filt_machport_link(struct klist *klist, struct knote *kn)
{
	struct knote *hd = SLIST_FIRST(klist);

	if (hd && filt_machport_kqueue_has_turnstile(kn)) {
		SLIST_INSERT_AFTER(hd, kn, kn_selnext);
	} else {
		SLIST_INSERT_HEAD(klist, kn, kn_selnext);
	}
}

static void
filt_machport_unlink(struct klist *klist, struct knote *kn)
{
	struct knote **knprev;

	KNOTE_DETACH(klist, kn);

	/* make sure the first knote is a knote we can push on */
	SLIST_FOREACH_PREVPTR(kn, knprev, klist, kn_selnext) {
		if (filt_machport_kqueue_has_turnstile(kn)) {
			*knprev = SLIST_NEXT(kn, kn_selnext);
			SLIST_INSERT_HEAD(klist, kn, kn_selnext);
			break;
		}
	}
}

int
filt_wlattach_sync_ipc(struct knote *kn)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	ipc_port_t port = IP_NULL;
	int error = 0;

	if (ipc_right_lookup_read(space, name, &bits, &object) != KERN_SUCCESS) {
		return ENOENT;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_RECEIVE) {
		port = ip_object_to_port(object);
		if (port->ip_specialreply || ip_is_kobject(port)) {
			error = ENOENT;
		}
	} else if (bits & MACH_PORT_TYPE_SEND_ONCE) {
		port = ip_object_to_port(object);
		if (!port->ip_specialreply) {
			error = ENOENT;
		}
	} else {
		error = ENOENT;
	}
	if (error) {
		io_unlock(object);
		return error;
	}

	if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
		io_unlock(object);
		/*
		 * We cannot start a sync IPC inheritance chain, only further an existing one.
		 * Note: this can also happen if the inheritance chain broke
		 * because the original requestor died.
		 */
		return ENOENT;
	}

	if (port->ip_specialreply) {
		ipc_port_adjust_special_reply_port_locked(port, kn,
		    IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
	} else {
		ipc_port_adjust_port_locked(port, kn, FALSE);
	}

	/* make sure the port was stashed */
	assert(kn->kn_ipc_port == port);

	/* port has been unlocked by ipc_port_adjust_* */

	return 0;
}

static int
filt_psetattach(struct knote *kn, ipc_pset_t pset)
{
	int result = 0;

	ips_reference(pset);
	kn->kn_ipc_pset = pset;

	filt_machport_link(&pset->ips_klist, kn);
	result = filt_pset_filter_result(pset);
	ips_mq_unlock(pset);

	return result;
}

static int
filt_portattach(struct knote *kn, ipc_port_t port)
{
	struct turnstile *send_turnstile = TURNSTILE_NULL;
	int result = 0;

	if (port->ip_specialreply) {
		/*
		 * Registering for kevents on special reply ports
		 * isn't supported for two reasons:
		 *
		 * 1. it really makes very little sense for a port that
		 *    is supposed to be used synchronously
		 *
		 * 2. their port's ip_klist field will be used to
		 *    store the receive turnstile, so we can't possibly
		 *    attach them anyway.
		 */
		ip_mq_unlock(port);
		knote_set_error(kn, ENOTSUP);
		return 0;
	}

	ip_reference(port);
	kn->kn_ipc_port = port;
	if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
		/*
		 * We're attaching a port that used to have an IMQ_KNOTE,
		 * clobber this state, we'll fixup its turnstile inheritor below.
		 */
		ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
	}

	filt_machport_link(&port->ip_klist, kn);
	result = filt_port_filter_result(kn, port);

	/*
	 * Update the port's turnstile inheritor
	 *
	 * Unlike filt_machportdetach(), we don't have to care about races for
	 * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect
	 * already pushing knotes, and if the current one becomes the new
	 * pusher, it'll only be visible when turnstile_workloop_pusher_info()
	 * returns.
	 */
	send_turnstile = port_send_turnstile(port);
	if (send_turnstile) {
		turnstile_reference(send_turnstile);
		ipc_port_send_update_inheritor(port, send_turnstile,
		    TURNSTILE_IMMEDIATE_UPDATE);

		/*
		 * rdar://problem/48861190
		 *
		 * When a listener connection resumes a peer,
		 * updating the inheritor above has moved the push
		 * from the current thread to the workloop.
		 *
		 * However, we haven't told the workloop yet
		 * that it needs a thread request, and we risk
		 * being preempted as soon as we drop the space
		 * lock below.
		 *
		 * To avoid this disable preemption and let kevent
		 * reenable it after it takes the kqlock.
		 */
		disable_preemption();
		result |= FILTER_THREADREQ_NODEFEER;
	}

	ip_mq_unlock(port);

	if (send_turnstile) {
		turnstile_update_inheritor_complete(send_turnstile,
		    TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(send_turnstile);
	}

	return result;
}

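/*
 * Attach entry point for EVFILT_MACHPORT: look up the right denoted by
 * kn_id in the current space and fan out to the port-set or port specific
 * attach routine, re-typing the knote to EVFILTID_MACH_PORT_SET or
 * EVFILTID_MACH_PORT.  Other right types (or a failed lookup) attach the
 * knote in error state.
 */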
static int
filt_machportattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	kern_return_t kr;

	kn->kn_flags &= ~EV_EOF;
	kn->kn_ext[3] = 0;

	if (filt_machport_kqueue_has_turnstile(kn)) {
		/*
		 * If the filter is likely to support sync IPC override,
		 * and it happens to be attaching to a workloop,
		 * make sure the workloop has an allocated turnstile.
		 */
		kqueue_alloc_turnstile(knote_get_kq(kn));
	}

	kr = ipc_right_lookup_read(space, name, &bits, &object);

	if (kr != KERN_SUCCESS) {
		knote_set_error(kn, ENOENT);
		return 0;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_PORT_SET) {
		kn->kn_filtid = EVFILTID_MACH_PORT_SET;
		return filt_psetattach(kn, ips_object_to_pset(object));
	}

	if (bits & MACH_PORT_TYPE_RECEIVE) {
		kn->kn_filtid = EVFILTID_MACH_PORT;
		return filt_portattach(kn, ip_object_to_port(object));
	}

	io_unlock(object);
	knote_set_error(kn, ENOTSUP);
	return 0;
}

static void
filt_psetdetach(struct knote *kn)
{
	ipc_pset_t pset = kn->kn_ipc_pset;

	filt_machport_turnstile_complete(kn);

	ips_mq_lock(pset);

	if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
		/*
		 * ipc_mqueue_changed() already unhooked this knote from the waitq.
		 */
	} else {
		filt_machport_unlink(&pset->ips_klist, kn);
	}

	kn->kn_ipc_pset = IPS_NULL;
	ips_mq_unlock(pset);
	ips_release(pset);
}

static void
filt_portdetach(struct knote *kn)
{
	ipc_port_t port = kn->kn_ipc_port;
	struct turnstile *send_turnstile = TURNSTILE_NULL;

	filt_machport_turnstile_complete(kn);

	ip_mq_lock(port);
	if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
		/*
		 * ipc_mqueue_changed() already unhooked this knote from the waitq.
		 */
	} else {
		/*
		 * When the knote being detached is the first one in the list,
		 * then unlinking the knote *and* updating the turnstile inheritor
		 * need to happen atomically with respect to the callers of
		 * turnstile_workloop_pusher_info().
		 *
		 * The caller of turnstile_workloop_pusher_info() will use the kq req
		 * lock (and hence the kqlock), so we just need to hold the kqlock too.
		 */
		assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
		if (kn == SLIST_FIRST(&port->ip_klist)) {
			send_turnstile = port_send_turnstile(port);
		}
		filt_machport_unlink(&port->ip_klist, kn);
		struct kqueue *kq = knote_get_kq(kn);
		kqueue_set_iotier_override(kq, THROTTLE_LEVEL_END);
	}

	if (send_turnstile) {
		turnstile_reference(send_turnstile);
		ipc_port_send_update_inheritor(port, send_turnstile,
		    TURNSTILE_IMMEDIATE_UPDATE);
	}

	/* Clear the knote pointer once the knote has been removed from turnstile */
	kn->kn_ipc_port = IP_NULL;
	ip_mq_unlock(port);

	if (send_turnstile) {
		turnstile_update_inheritor_complete(send_turnstile,
		    TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(send_turnstile);
	}

	ip_release(port);
}

/*
 * filt_{pset,port}event - deliver events into the mach port filter
 *
 * Mach port message arrival events are currently only posted via the
 * kqueue filter routine for ports.
 *
 * If there is a message at the head of the queue,
 * we indicate that the knote should go active.  If
 * the message is to be direct-received, we adjust the
 * QoS of the knote according to the requested and override
 * QoS of that first message.
 *
 * When the knote is for a port set, the hint is non-zero
 * and is the waitq that is posting.
 */
static int
filt_psetevent(struct knote *kn __unused, long hint __assert_only)
{
	/*
	 * When called for a port-set,
	 * the posting port waitq is locked.
	 *
	 * waitq_set_first_prepost()
	 * in filt_pset_filter_result()
	 * would try to lock it and be very sad.
	 *
	 * Just trust what we know to be true.
	 */
	assert(hint != 0);
	return FILTER_ACTIVE;
}

static int
filt_portevent(struct knote *kn, long hint __assert_only)
{
	assert(hint == 0);
	return filt_port_filter_result(kn, kn->kn_ipc_port);
}

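/*
 * Called by the waitq subsystem when a member port preposts to the set:
 * fire the set's knotes, passing the posting waitq as the (non-zero) hint
 * consumed by filt_psetevent().
 */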
void
ipc_pset_prepost(struct waitq_set *wqs, struct waitq *waitq)
{
	KNOTE(&ips_from_waitq(wqs)->ips_klist, (long)waitq);
}

static void
filt_machporttouch(struct knote *kn, struct kevent_qos_s *kev)
{
	/*
	 * Specifying MACH_RCV_MSG or MACH_RCV_SYNC_PEEK during attach results in
	 * allocation of a turnstile. Modifying the filter flags to include these
	 * flags later, without a turnstile being allocated, leads to
	 * inconsistencies.
	 */
	if ((kn->kn_sfflags ^ kev->fflags) & (MACH_RCV_MSG | MACH_RCV_SYNC_PEEK)) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return;
	}

	/* copy in new settings and save off new input fflags */
	kn->kn_sfflags = kev->fflags;
	kn->kn_ext[0] = kev->ext[0];
	kn->kn_ext[1] = kev->ext[1];

	if (kev->flags & EV_ENABLE) {
		/*
		 * If the knote is being enabled, make sure there's no lingering
		 * IPC overrides from the previous message delivery.
		 */
		filt_machport_turnstile_complete(kn);
	}
}

static int
filt_psettouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_pset_t pset = kn->kn_ipc_pset;
	int result = 0;

	filt_machporttouch(kn, kev);
	if (kev->flags & EV_ERROR) {
		return 0;
	}

	ips_mq_lock(pset);
	result = filt_pset_filter_result(pset);
	ips_mq_unlock(pset);

	return result;
}

static int
filt_porttouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_port_t port = kn->kn_ipc_port;
	int result = 0;

	filt_machporttouch(kn, kev);
	if (kev->flags & EV_ERROR) {
		return 0;
	}

	ip_mq_lock(port);
	result = filt_port_filter_result(kn, port);
	ip_mq_unlock(port);

	return result;
}

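/*
 * Process an EVFILT_MACHPORT event for a port or port-set knote.
 *
 * If MACH_RCV_MSG was requested, attempt to receive the first pending
 * message directly into the buffer described by kn_ext[0]/kn_ext[1] (or
 * into space carved from the filter processing data), reporting the receive
 * status in fflags and the resulting sizes/QoS in ext[1], ext[2] and ext[3].
 * Otherwise only peek at the waiting message and return the port name in
 * the data field.  The receive never blocks (MACH_RCV_TIMEOUT is forced).
 */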
static int
filt_machportprocess(
	struct knote           *kn,
	struct kevent_qos_s    *kev,
	ipc_object_t            object,
	ipc_object_type_t       otype)
{
	thread_t self = current_thread();
	kevent_ctx_t kectx = NULL;

	wait_result_t wresult;
	mach_msg_option64_t option64;
	mach_vm_address_t msg_addr;
	mach_msg_size_t max_msg_size;
	mach_msg_recv_result_t msgr;

	int result = FILTER_ACTIVE;

	/* Capture current state */
	knote_fill_kevent(kn, kev, MACH_PORT_NULL);

	/* Clear port reference, use ext3 as size of msg aux data */
	kev->ext[3] = 0;

	/* If already deallocated/moved return one last EOF event */
	if (kev->flags & EV_EOF) {
		return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
	}

	/*
	 * Only honor supported receive options. If no options are
	 * provided, just force a MACH_RCV_LARGE to detect the
	 * name of the port and size of the waiting message.
	 *
	 * Extend kn_sfflags to 64 bits.
	 *
	 * Add MACH_RCV_TIMEOUT to never wait (in case someone concurrently
	 * dequeued the message that made this knote active already).
	 */
	option64 = kn->kn_sfflags & (MACH_RCV_MSG | MACH_RCV_LARGE |
	    MACH_RCV_LARGE_IDENTITY | MACH_RCV_TRAILER_MASK |
	    MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY);
	option64 = ipc_current_user_policy(current_task(), option64);

	if (option64 & MACH_RCV_MSG) {
		msg_addr = (mach_vm_address_t) kn->kn_ext[0];
		max_msg_size = (mach_msg_size_t) kn->kn_ext[1];

		/*
		 * Copy out the incoming message as vector, and append aux data
		 * immediately after the message proper (if any) and report its
		 * size on ext3.
		 *
		 * Note: MACH64_RCV_LINEAR_VECTOR is how the receive machinery
		 *       knows this comes from kevent (see comment in
		 *       mach_msg_receive_too_large()).
		 */
		option64 |= (MACH64_MSG_VECTOR | MACH64_RCV_LINEAR_VECTOR);

		/*
		 * If the kevent didn't specify a buffer and length, carve a buffer
		 * from the filter processing data according to the flags.
		 */
		if (max_msg_size == 0) {
			kectx = kevent_get_context(self);
			msg_addr  = (mach_vm_address_t)kectx->kec_data_out;
			max_msg_size  = (mach_msg_size_t)kectx->kec_data_resid;
			option64 |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
			/* Receive vector linearly onto stack */
			if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) {
				option64 |= MACH64_RCV_STACK;
			}
		}
	} else {
		/* just detect the port name (if a set) and size of the first message */
		option64 = MACH_RCV_LARGE;
		msg_addr = 0;
		max_msg_size = 0;
	}
	option64 |= MACH_RCV_TIMEOUT; /* never wait */

	/*
	 * Set up to receive a message or the notification of a
	 * too large message.  But never allow this call to wait.
	 * If the user provided additional options, like trailer
	 * options, pass those through here.  But we don't support
	 * scatter lists through this interface.
	 *
	 * Note: while in filt_machportprocess(),
	 *       the knote has a reference on `object` that we can borrow.
	 */

	/* Set up message proper receive params on thread */
	bzero(&self->ith_receive, sizeof(self->ith_receive));
	self->ith_recv_bufs = (mach_msg_recv_bufs_t){
		.recv_msg_addr = msg_addr,
		.recv_msg_size = max_msg_size,
	};
	self->ith_object = object;
	self->ith_option = option64;
	self->ith_knote  = kn;

	ipc_object_lock(object, otype);

	wresult = ipc_mqueue_receive_on_thread_and_unlock(io_waitq(object),
	    MACH_MSG_TIMEOUT_NONE, THREAD_INTERRUPTIBLE, self);
	/* port unlocked */

	/* If we timed out, or the process is exiting, just return 0. */
	if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
		assert(self->turnstile != TURNSTILE_NULL);
		self->ith_knote = ITH_KNOTE_NULL;
		return 0;
	}

	assert(wresult == THREAD_NOT_WAITING);
	assert(self->ith_state != MACH_RCV_IN_PROGRESS);

	/*
	 * If we weren't attempting to receive a message
	 * directly, we need to return the port name in
	 * the kevent structure.
	 */
	if ((option64 & MACH_RCV_MSG) != MACH_RCV_MSG) {
		assert(self->ith_state == MACH_RCV_TOO_LARGE);
		assert(self->ith_kmsg == IKM_NULL);
		kev->data = self->ith_receiver_name;
		self->ith_knote = ITH_KNOTE_NULL;
		return result;
	}

#if CONFIG_PREADOPT_TG
	/*
	 * If we're the first EVFILT_MACHPORT knote that is being processed for
	 * this kqwl, then make sure to preadopt the thread group from the kmsg
	 * we're about to receive. This is to make sure that we fix up the
	 * preadoption thread group correctly on the receive side for the first
	 * message.
	 */
	struct kqueue *kq = knote_get_kq(kn);

	if (self->ith_kmsg) {
		struct thread_group *tg = ipc_kmsg_get_thread_group(self->ith_kmsg);

		kqueue_process_preadopt_thread_group(self, kq, tg);
	}
#endif
	if (otype == IOT_PORT) {
		ipc_port_t port = ip_object_to_port(object);
		struct kqueue *kqwl = knote_get_kq(kn);
		if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
			/*
			 * Lock the port to make sure port->ip_kernel_iotier_override does
			 * not change while updating the kqueue override, else kqueue could
			 * have old iotier value.
			 */
			ip_mq_lock(port);
			kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
			ip_mq_unlock(port);
			result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
		}
	}

	/*
	 * Attempt to receive the message directly, returning
	 * the results in the fflags field.
	 */
	io_reference(object);
	kev->fflags = mach_msg_receive_results(&msgr);

	/* kmsg and object reference consumed */

	/*
	 * if the user asked for the identity of the port containing
	 * a too-large message, return it in the data field (as we
	 * do for messages we didn't try to receive).
	 */
	kev->ext[1] = msgr.msgr_msg_size + msgr.msgr_trailer_size;
	kev->ext[3] = msgr.msgr_aux_size;  /* Only lower 32 bits of ext3 are used */
	if (kev->fflags == MACH_RCV_TOO_LARGE &&
	    (option64 & MACH_RCV_LARGE_IDENTITY)) {
		kev->data = msgr.msgr_recv_name;
	} else {
		kev->data = MACH_PORT_NULL;
	}

	/*
	 * If we used a data buffer carved out from the filt_process data,
	 * store the address used in the knote and adjust the residual and
	 * other parameters for future use.
	 */
	if (kectx && kev->fflags != MACH_RCV_TOO_LARGE) {
		mach_vm_size_t size = msgr.msgr_msg_size +
		    msgr.msgr_trailer_size + msgr.msgr_aux_size;

		assert(kectx->kec_data_resid >= size);
		kectx->kec_data_resid -= size;
		if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) {
			kev->ext[0] = kectx->kec_data_out;
			kectx->kec_data_out += size;
		} else {
			assert(option64 & MACH64_RCV_STACK);
			kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid;
		}
	}

	/*
	 * Apply message-based QoS values to output kevent as prescribed.
	 * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
	 */
	if (kev->fflags == MACH_MSG_SUCCESS) {
		kev->ext[2] = ((uint64_t)msgr.msgr_priority << 32) |
		    _pthread_priority_make_from_thread_qos(msgr.msgr_qos_ovrd, 0, 0);
	}

	self->ith_knote = ITH_KNOTE_NULL;
	return result;
}

static int
filt_psetprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_object_t io = ips_to_object(kn->kn_ipc_pset);

	return filt_machportprocess(kn, kev, io, IOT_PORT_SET);
}

static int
filt_portprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_object_t io = ip_to_object(kn->kn_ipc_port);

	return filt_machportprocess(kn, kev, io, IOT_PORT);
}

static void
filt_machportsanitizedcopyout(struct knote *kn, struct kevent_qos_s *kev)
{
	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;

	/*
	 * We may have stashed the address of the port that is pushing on the
	 * sync IPC, so clear it out.
	 */
	kev->ext[3] = 0;
}

const struct filterops machport_attach_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_attach = filt_machportattach,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};

const struct filterops mach_port_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_detach = filt_portdetach,
	.f_event = filt_portevent,
	.f_touch = filt_porttouch,
	.f_process = filt_portprocess,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};

const struct filterops mach_port_set_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_detach = filt_psetdetach,
	.f_event = filt_psetevent,
	.f_touch = filt_psettouch,
	.f_process = filt_psetprocess,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};